{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5153532316942238,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005726147018824708,
"grad_norm": 7.761023998260498,
"learning_rate": 0.0,
"loss": 6.0592,
"step": 1
},
{
"epoch": 0.0011452294037649416,
"grad_norm": 7.8541951179504395,
"learning_rate": 5.714285714285715e-07,
"loss": 6.0156,
"step": 2
},
{
"epoch": 0.0022904588075298832,
"grad_norm": 7.347611904144287,
"learning_rate": 1.7142857142857145e-06,
"loss": 6.0103,
"step": 4
},
{
"epoch": 0.003435688211294825,
"grad_norm": 5.382428169250488,
"learning_rate": 2.8571428571428573e-06,
"loss": 5.9221,
"step": 6
},
{
"epoch": 0.0045809176150597665,
"grad_norm": 5.063406467437744,
"learning_rate": 4.000000000000001e-06,
"loss": 6.0365,
"step": 8
},
{
"epoch": 0.005726147018824708,
"grad_norm": 9.779157638549805,
"learning_rate": 5.142857142857143e-06,
"loss": 6.0336,
"step": 10
},
{
"epoch": 0.00687137642258965,
"grad_norm": 7.555446147918701,
"learning_rate": 6.285714285714287e-06,
"loss": 6.0328,
"step": 12
},
{
"epoch": 0.008016605826354592,
"grad_norm": 6.790043354034424,
"learning_rate": 7.428571428571429e-06,
"loss": 5.7848,
"step": 14
},
{
"epoch": 0.009161835230119533,
"grad_norm": 4.4132208824157715,
"learning_rate": 8.571428571428573e-06,
"loss": 5.8207,
"step": 16
},
{
"epoch": 0.010307064633884476,
"grad_norm": 4.064995765686035,
"learning_rate": 9.714285714285715e-06,
"loss": 5.6497,
"step": 18
},
{
"epoch": 0.011452294037649417,
"grad_norm": 3.357184410095215,
"learning_rate": 1.0857142857142858e-05,
"loss": 5.7758,
"step": 20
},
{
"epoch": 0.012597523441414358,
"grad_norm": 2.742230176925659,
"learning_rate": 1.2e-05,
"loss": 5.6173,
"step": 22
},
{
"epoch": 0.0137427528451793,
"grad_norm": 2.491459369659424,
"learning_rate": 1.3142857142857143e-05,
"loss": 5.6681,
"step": 24
},
{
"epoch": 0.014887982248944241,
"grad_norm": 2.7569029331207275,
"learning_rate": 1.4285714285714285e-05,
"loss": 5.6393,
"step": 26
},
{
"epoch": 0.016033211652709184,
"grad_norm": 2.208378791809082,
"learning_rate": 1.5428571428571428e-05,
"loss": 5.5768,
"step": 28
},
{
"epoch": 0.017178441056474127,
"grad_norm": 3.2770133018493652,
"learning_rate": 1.657142857142857e-05,
"loss": 5.484,
"step": 30
},
{
"epoch": 0.018323670460239066,
"grad_norm": 3.177299976348877,
"learning_rate": 1.7714285714285713e-05,
"loss": 5.528,
"step": 32
},
{
"epoch": 0.01946889986400401,
"grad_norm": 2.1981537342071533,
"learning_rate": 1.885714285714286e-05,
"loss": 5.6327,
"step": 34
},
{
"epoch": 0.02061412926776895,
"grad_norm": 3.265881061553955,
"learning_rate": 2e-05,
"loss": 5.6288,
"step": 36
},
{
"epoch": 0.02175935867153389,
"grad_norm": 3.6059298515319824,
"learning_rate": 2.1142857142857144e-05,
"loss": 5.4789,
"step": 38
},
{
"epoch": 0.022904588075298833,
"grad_norm": 2.4080026149749756,
"learning_rate": 2.2285714285714287e-05,
"loss": 5.4046,
"step": 40
},
{
"epoch": 0.024049817479063776,
"grad_norm": 2.142902135848999,
"learning_rate": 2.342857142857143e-05,
"loss": 5.4738,
"step": 42
},
{
"epoch": 0.025195046882828715,
"grad_norm": 2.4021224975585938,
"learning_rate": 2.4571428571428572e-05,
"loss": 5.4649,
"step": 44
},
{
"epoch": 0.026340276286593658,
"grad_norm": 2.172009229660034,
"learning_rate": 2.5714285714285714e-05,
"loss": 5.4302,
"step": 46
},
{
"epoch": 0.0274855056903586,
"grad_norm": 2.9737730026245117,
"learning_rate": 2.6857142857142857e-05,
"loss": 5.3045,
"step": 48
},
{
"epoch": 0.028630735094123543,
"grad_norm": 3.0378615856170654,
"learning_rate": 2.8000000000000003e-05,
"loss": 5.2185,
"step": 50
},
{
"epoch": 0.029775964497888482,
"grad_norm": 3.4448676109313965,
"learning_rate": 2.9142857142857146e-05,
"loss": 5.1838,
"step": 52
},
{
"epoch": 0.030921193901653425,
"grad_norm": 2.469245672225952,
"learning_rate": 3.0285714285714288e-05,
"loss": 5.1637,
"step": 54
},
{
"epoch": 0.03206642330541837,
"grad_norm": 3.58486008644104,
"learning_rate": 3.142857142857143e-05,
"loss": 5.2063,
"step": 56
},
{
"epoch": 0.03321165270918331,
"grad_norm": 3.0815446376800537,
"learning_rate": 3.257142857142857e-05,
"loss": 5.2317,
"step": 58
},
{
"epoch": 0.03435688211294825,
"grad_norm": 3.6842119693756104,
"learning_rate": 3.3714285714285716e-05,
"loss": 5.2695,
"step": 60
},
{
"epoch": 0.03550211151671319,
"grad_norm": 2.9440791606903076,
"learning_rate": 3.485714285714286e-05,
"loss": 5.2686,
"step": 62
},
{
"epoch": 0.03664734092047813,
"grad_norm": 3.9632568359375,
"learning_rate": 3.6e-05,
"loss": 5.1262,
"step": 64
},
{
"epoch": 0.037792570324243074,
"grad_norm": 4.045065402984619,
"learning_rate": 3.7142857142857143e-05,
"loss": 5.1546,
"step": 66
},
{
"epoch": 0.03893779972800802,
"grad_norm": 3.5707085132598877,
"learning_rate": 3.8285714285714286e-05,
"loss": 5.0036,
"step": 68
},
{
"epoch": 0.04008302913177296,
"grad_norm": 3.014404535293579,
"learning_rate": 3.942857142857143e-05,
"loss": 5.026,
"step": 70
},
{
"epoch": 0.0412282585355379,
"grad_norm": 2.708796977996826,
"learning_rate": 4.057142857142857e-05,
"loss": 4.9442,
"step": 72
},
{
"epoch": 0.04237348793930284,
"grad_norm": 2.5384011268615723,
"learning_rate": 4.1714285714285714e-05,
"loss": 5.0223,
"step": 74
},
{
"epoch": 0.04351871734306778,
"grad_norm": 3.006281852722168,
"learning_rate": 4.2857142857142856e-05,
"loss": 4.9827,
"step": 76
},
{
"epoch": 0.044663946746832724,
"grad_norm": 2.5772130489349365,
"learning_rate": 4.4000000000000006e-05,
"loss": 4.9675,
"step": 78
},
{
"epoch": 0.045809176150597666,
"grad_norm": 3.456017255783081,
"learning_rate": 4.514285714285714e-05,
"loss": 5.0341,
"step": 80
},
{
"epoch": 0.04695440555436261,
"grad_norm": 3.3163113594055176,
"learning_rate": 4.628571428571429e-05,
"loss": 4.9867,
"step": 82
},
{
"epoch": 0.04809963495812755,
"grad_norm": 3.7568469047546387,
"learning_rate": 4.742857142857143e-05,
"loss": 4.8652,
"step": 84
},
{
"epoch": 0.049244864361892494,
"grad_norm": 4.19318151473999,
"learning_rate": 4.8571428571428576e-05,
"loss": 5.0602,
"step": 86
},
{
"epoch": 0.05039009376565743,
"grad_norm": 5.1034064292907715,
"learning_rate": 4.971428571428572e-05,
"loss": 4.9757,
"step": 88
},
{
"epoch": 0.05153532316942237,
"grad_norm": 4.0827484130859375,
"learning_rate": 5.085714285714286e-05,
"loss": 4.8486,
"step": 90
},
{
"epoch": 0.052680552573187316,
"grad_norm": 4.6189446449279785,
"learning_rate": 5.2000000000000004e-05,
"loss": 4.9595,
"step": 92
},
{
"epoch": 0.05382578197695226,
"grad_norm": 3.988513469696045,
"learning_rate": 5.314285714285715e-05,
"loss": 4.9035,
"step": 94
},
{
"epoch": 0.0549710113807172,
"grad_norm": 3.857276678085327,
"learning_rate": 5.428571428571428e-05,
"loss": 4.8277,
"step": 96
},
{
"epoch": 0.056116240784482144,
"grad_norm": 3.5372354984283447,
"learning_rate": 5.542857142857143e-05,
"loss": 4.7718,
"step": 98
},
{
"epoch": 0.057261470188247086,
"grad_norm": 3.3853676319122314,
"learning_rate": 5.6571428571428574e-05,
"loss": 4.8098,
"step": 100
},
{
"epoch": 0.05840669959201202,
"grad_norm": 2.1142077445983887,
"learning_rate": 5.771428571428572e-05,
"loss": 4.7975,
"step": 102
},
{
"epoch": 0.059551928995776965,
"grad_norm": 3.2275538444519043,
"learning_rate": 5.885714285714285e-05,
"loss": 4.8509,
"step": 104
},
{
"epoch": 0.06069715839954191,
"grad_norm": 3.5413126945495605,
"learning_rate": 6e-05,
"loss": 4.6069,
"step": 106
},
{
"epoch": 0.06184238780330685,
"grad_norm": 2.755648374557495,
"learning_rate": 6.114285714285714e-05,
"loss": 4.6951,
"step": 108
},
{
"epoch": 0.06298761720707179,
"grad_norm": 2.980039596557617,
"learning_rate": 6.22857142857143e-05,
"loss": 4.7012,
"step": 110
},
{
"epoch": 0.06413284661083674,
"grad_norm": 4.890020370483398,
"learning_rate": 6.342857142857143e-05,
"loss": 4.8008,
"step": 112
},
{
"epoch": 0.06527807601460167,
"grad_norm": 4.35846471786499,
"learning_rate": 6.457142857142856e-05,
"loss": 4.8587,
"step": 114
},
{
"epoch": 0.06642330541836662,
"grad_norm": 3.6171813011169434,
"learning_rate": 6.571428571428571e-05,
"loss": 4.7473,
"step": 116
},
{
"epoch": 0.06756853482213156,
"grad_norm": 2.4927010536193848,
"learning_rate": 6.685714285714286e-05,
"loss": 4.7113,
"step": 118
},
{
"epoch": 0.0687137642258965,
"grad_norm": 3.3327009677886963,
"learning_rate": 6.800000000000001e-05,
"loss": 4.6105,
"step": 120
},
{
"epoch": 0.06985899362966144,
"grad_norm": 3.1123206615448,
"learning_rate": 6.914285714285715e-05,
"loss": 4.5968,
"step": 122
},
{
"epoch": 0.07100422303342638,
"grad_norm": 2.6985421180725098,
"learning_rate": 7.028571428571428e-05,
"loss": 4.6323,
"step": 124
},
{
"epoch": 0.07214945243719133,
"grad_norm": 2.058084011077881,
"learning_rate": 7.142857142857143e-05,
"loss": 4.5721,
"step": 126
},
{
"epoch": 0.07329468184095626,
"grad_norm": 2.144658327102661,
"learning_rate": 7.257142857142858e-05,
"loss": 4.6125,
"step": 128
},
{
"epoch": 0.07443991124472121,
"grad_norm": 2.477219820022583,
"learning_rate": 7.371428571428572e-05,
"loss": 4.4727,
"step": 130
},
{
"epoch": 0.07558514064848615,
"grad_norm": 3.8517298698425293,
"learning_rate": 7.485714285714285e-05,
"loss": 4.5696,
"step": 132
},
{
"epoch": 0.0767303700522511,
"grad_norm": 3.0253565311431885,
"learning_rate": 7.6e-05,
"loss": 4.4838,
"step": 134
},
{
"epoch": 0.07787559945601603,
"grad_norm": 3.397569179534912,
"learning_rate": 7.714285714285715e-05,
"loss": 4.6431,
"step": 136
},
{
"epoch": 0.07902082885978097,
"grad_norm": 2.435197114944458,
"learning_rate": 7.828571428571429e-05,
"loss": 4.4681,
"step": 138
},
{
"epoch": 0.08016605826354592,
"grad_norm": 2.6476476192474365,
"learning_rate": 7.942857142857143e-05,
"loss": 4.4462,
"step": 140
},
{
"epoch": 0.08131128766731086,
"grad_norm": 2.1929690837860107,
"learning_rate": 8.057142857142857e-05,
"loss": 4.5136,
"step": 142
},
{
"epoch": 0.0824565170710758,
"grad_norm": 2.4533395767211914,
"learning_rate": 8.171428571428572e-05,
"loss": 4.5572,
"step": 144
},
{
"epoch": 0.08360174647484074,
"grad_norm": 2.601806879043579,
"learning_rate": 8.285714285714287e-05,
"loss": 4.4121,
"step": 146
},
{
"epoch": 0.08474697587860568,
"grad_norm": 3.233973741531372,
"learning_rate": 8.4e-05,
"loss": 4.4599,
"step": 148
},
{
"epoch": 0.08589220528237063,
"grad_norm": 2.6353538036346436,
"learning_rate": 8.514285714285714e-05,
"loss": 4.4533,
"step": 150
},
{
"epoch": 0.08703743468613556,
"grad_norm": 2.8465511798858643,
"learning_rate": 8.62857142857143e-05,
"loss": 4.5246,
"step": 152
},
{
"epoch": 0.08818266408990051,
"grad_norm": 2.8642711639404297,
"learning_rate": 8.742857142857144e-05,
"loss": 4.4659,
"step": 154
},
{
"epoch": 0.08932789349366545,
"grad_norm": 2.793112277984619,
"learning_rate": 8.857142857142857e-05,
"loss": 4.5107,
"step": 156
},
{
"epoch": 0.0904731228974304,
"grad_norm": 3.43472957611084,
"learning_rate": 8.971428571428571e-05,
"loss": 4.4079,
"step": 158
},
{
"epoch": 0.09161835230119533,
"grad_norm": 2.9260294437408447,
"learning_rate": 9.085714285714286e-05,
"loss": 4.4047,
"step": 160
},
{
"epoch": 0.09276358170496027,
"grad_norm": 2.6336724758148193,
"learning_rate": 9.200000000000001e-05,
"loss": 4.4777,
"step": 162
},
{
"epoch": 0.09390881110872522,
"grad_norm": 2.8348231315612793,
"learning_rate": 9.314285714285715e-05,
"loss": 4.3445,
"step": 164
},
{
"epoch": 0.09505404051249015,
"grad_norm": 4.271595478057861,
"learning_rate": 9.428571428571429e-05,
"loss": 4.4234,
"step": 166
},
{
"epoch": 0.0961992699162551,
"grad_norm": 3.4789109230041504,
"learning_rate": 9.542857142857143e-05,
"loss": 4.2872,
"step": 168
},
{
"epoch": 0.09734449932002004,
"grad_norm": 2.57273530960083,
"learning_rate": 9.657142857142858e-05,
"loss": 4.4177,
"step": 170
},
{
"epoch": 0.09848972872378499,
"grad_norm": 2.185086250305176,
"learning_rate": 9.771428571428572e-05,
"loss": 4.3568,
"step": 172
},
{
"epoch": 0.09963495812754992,
"grad_norm": 2.771744966506958,
"learning_rate": 9.885714285714286e-05,
"loss": 4.3392,
"step": 174
},
{
"epoch": 0.10078018753131486,
"grad_norm": 1.950353741645813,
"learning_rate": 0.0001,
"loss": 4.1931,
"step": 176
},
{
"epoch": 0.10192541693507981,
"grad_norm": 2.4709694385528564,
"learning_rate": 9.999991040472416e-05,
"loss": 4.2936,
"step": 178
},
{
"epoch": 0.10307064633884475,
"grad_norm": 2.140997886657715,
"learning_rate": 9.999964161921776e-05,
"loss": 4.1653,
"step": 180
},
{
"epoch": 0.1042158757426097,
"grad_norm": 2.491321563720703,
"learning_rate": 9.999919364444403e-05,
"loss": 4.3202,
"step": 182
},
{
"epoch": 0.10536110514637463,
"grad_norm": 2.5410189628601074,
"learning_rate": 9.999856648200845e-05,
"loss": 4.2657,
"step": 184
},
{
"epoch": 0.10650633455013958,
"grad_norm": 2.1820590496063232,
"learning_rate": 9.999776013415866e-05,
"loss": 4.2282,
"step": 186
},
{
"epoch": 0.10765156395390452,
"grad_norm": 1.7251808643341064,
"learning_rate": 9.999677460378444e-05,
"loss": 4.3421,
"step": 188
},
{
"epoch": 0.10879679335766945,
"grad_norm": 2.002145290374756,
"learning_rate": 9.999560989441779e-05,
"loss": 4.1361,
"step": 190
},
{
"epoch": 0.1099420227614344,
"grad_norm": 1.9663431644439697,
"learning_rate": 9.999426601023274e-05,
"loss": 4.201,
"step": 192
},
{
"epoch": 0.11108725216519934,
"grad_norm": 2.1406776905059814,
"learning_rate": 9.999274295604558e-05,
"loss": 4.1086,
"step": 194
},
{
"epoch": 0.11223248156896429,
"grad_norm": 3.3888607025146484,
"learning_rate": 9.999104073731458e-05,
"loss": 4.2723,
"step": 196
},
{
"epoch": 0.11337771097272922,
"grad_norm": 2.371840715408325,
"learning_rate": 9.998915936014024e-05,
"loss": 4.1893,
"step": 198
},
{
"epoch": 0.11452294037649417,
"grad_norm": 2.0502302646636963,
"learning_rate": 9.998709883126502e-05,
"loss": 4.1395,
"step": 200
},
{
"epoch": 0.11566816978025911,
"grad_norm": 1.6674678325653076,
"learning_rate": 9.998485915807347e-05,
"loss": 4.071,
"step": 202
},
{
"epoch": 0.11681339918402404,
"grad_norm": 1.7829004526138306,
"learning_rate": 9.998244034859219e-05,
"loss": 4.1107,
"step": 204
},
{
"epoch": 0.117958628587789,
"grad_norm": 1.763493299484253,
"learning_rate": 9.997984241148967e-05,
"loss": 4.1142,
"step": 206
},
{
"epoch": 0.11910385799155393,
"grad_norm": 2.069258213043213,
"learning_rate": 9.997706535607649e-05,
"loss": 4.047,
"step": 208
},
{
"epoch": 0.12024908739531888,
"grad_norm": 2.4262139797210693,
"learning_rate": 9.997410919230505e-05,
"loss": 4.0396,
"step": 210
},
{
"epoch": 0.12139431679908382,
"grad_norm": 1.820494532585144,
"learning_rate": 9.997097393076971e-05,
"loss": 4.1548,
"step": 212
},
{
"epoch": 0.12253954620284876,
"grad_norm": 2.1332643032073975,
"learning_rate": 9.996765958270664e-05,
"loss": 4.1384,
"step": 214
},
{
"epoch": 0.1236847756066137,
"grad_norm": 2.1329920291900635,
"learning_rate": 9.996416615999384e-05,
"loss": 4.0315,
"step": 216
},
{
"epoch": 0.12483000501037864,
"grad_norm": 2.29955792427063,
"learning_rate": 9.996049367515108e-05,
"loss": 4.0963,
"step": 218
},
{
"epoch": 0.12597523441414357,
"grad_norm": 2.225827693939209,
"learning_rate": 9.995664214133983e-05,
"loss": 4.1247,
"step": 220
},
{
"epoch": 0.12712046381790854,
"grad_norm": 1.794838786125183,
"learning_rate": 9.99526115723633e-05,
"loss": 4.0449,
"step": 222
},
{
"epoch": 0.12826569322167347,
"grad_norm": 1.7548491954803467,
"learning_rate": 9.994840198266626e-05,
"loss": 3.927,
"step": 224
},
{
"epoch": 0.1294109226254384,
"grad_norm": 1.487001895904541,
"learning_rate": 9.994401338733508e-05,
"loss": 3.9714,
"step": 226
},
{
"epoch": 0.13055615202920334,
"grad_norm": 1.9811242818832397,
"learning_rate": 9.993944580209768e-05,
"loss": 4.0094,
"step": 228
},
{
"epoch": 0.13170138143296828,
"grad_norm": 1.4257248640060425,
"learning_rate": 9.99346992433234e-05,
"loss": 4.0213,
"step": 230
},
{
"epoch": 0.13284661083673324,
"grad_norm": 1.545812726020813,
"learning_rate": 9.992977372802302e-05,
"loss": 4.0076,
"step": 232
},
{
"epoch": 0.13399184024049818,
"grad_norm": 1.8193179368972778,
"learning_rate": 9.992466927384865e-05,
"loss": 4.0536,
"step": 234
},
{
"epoch": 0.1351370696442631,
"grad_norm": 2.329951763153076,
"learning_rate": 9.991938589909369e-05,
"loss": 3.9284,
"step": 236
},
{
"epoch": 0.13628229904802805,
"grad_norm": 1.928336501121521,
"learning_rate": 9.991392362269276e-05,
"loss": 3.9462,
"step": 238
},
{
"epoch": 0.137427528451793,
"grad_norm": 1.4073456525802612,
"learning_rate": 9.990828246422164e-05,
"loss": 3.9525,
"step": 240
},
{
"epoch": 0.13857275785555795,
"grad_norm": 1.6663973331451416,
"learning_rate": 9.990246244389713e-05,
"loss": 3.9685,
"step": 242
},
{
"epoch": 0.13971798725932288,
"grad_norm": 1.8091737031936646,
"learning_rate": 9.989646358257715e-05,
"loss": 3.9284,
"step": 244
},
{
"epoch": 0.14086321666308782,
"grad_norm": 1.5511283874511719,
"learning_rate": 9.989028590176044e-05,
"loss": 3.9289,
"step": 246
},
{
"epoch": 0.14200844606685276,
"grad_norm": 1.5394625663757324,
"learning_rate": 9.988392942358664e-05,
"loss": 3.9849,
"step": 248
},
{
"epoch": 0.14315367547061772,
"grad_norm": 1.680882453918457,
"learning_rate": 9.98773941708362e-05,
"loss": 3.9452,
"step": 250
},
{
"epoch": 0.14429890487438266,
"grad_norm": 1.6341670751571655,
"learning_rate": 9.98706801669302e-05,
"loss": 3.8317,
"step": 252
},
{
"epoch": 0.1454441342781476,
"grad_norm": 1.9933757781982422,
"learning_rate": 9.986378743593036e-05,
"loss": 3.9665,
"step": 254
},
{
"epoch": 0.14658936368191253,
"grad_norm": 2.2253994941711426,
"learning_rate": 9.985671600253894e-05,
"loss": 3.9239,
"step": 256
},
{
"epoch": 0.14773459308567746,
"grad_norm": 2.2543365955352783,
"learning_rate": 9.984946589209862e-05,
"loss": 3.8639,
"step": 258
},
{
"epoch": 0.14887982248944243,
"grad_norm": 1.8106629848480225,
"learning_rate": 9.984203713059241e-05,
"loss": 3.9178,
"step": 260
},
{
"epoch": 0.15002505189320736,
"grad_norm": 1.638542652130127,
"learning_rate": 9.983442974464362e-05,
"loss": 3.9169,
"step": 262
},
{
"epoch": 0.1511702812969723,
"grad_norm": 1.3521384000778198,
"learning_rate": 9.982664376151564e-05,
"loss": 3.8682,
"step": 264
},
{
"epoch": 0.15231551070073723,
"grad_norm": 1.6458699703216553,
"learning_rate": 9.981867920911201e-05,
"loss": 3.9566,
"step": 266
},
{
"epoch": 0.1534607401045022,
"grad_norm": 1.7851066589355469,
"learning_rate": 9.981053611597615e-05,
"loss": 3.9085,
"step": 268
},
{
"epoch": 0.15460596950826713,
"grad_norm": 1.6740517616271973,
"learning_rate": 9.980221451129137e-05,
"loss": 3.8899,
"step": 270
},
{
"epoch": 0.15575119891203207,
"grad_norm": 1.117129921913147,
"learning_rate": 9.979371442488073e-05,
"loss": 3.7544,
"step": 272
},
{
"epoch": 0.156896428315797,
"grad_norm": 1.5676058530807495,
"learning_rate": 9.978503588720694e-05,
"loss": 3.7753,
"step": 274
},
{
"epoch": 0.15804165771956194,
"grad_norm": 1.6609163284301758,
"learning_rate": 9.977617892937223e-05,
"loss": 3.8463,
"step": 276
},
{
"epoch": 0.1591868871233269,
"grad_norm": 1.7229987382888794,
"learning_rate": 9.976714358311828e-05,
"loss": 3.8446,
"step": 278
},
{
"epoch": 0.16033211652709184,
"grad_norm": 1.6770962476730347,
"learning_rate": 9.975792988082603e-05,
"loss": 3.8684,
"step": 280
},
{
"epoch": 0.16147734593085677,
"grad_norm": 1.215281367301941,
"learning_rate": 9.974853785551568e-05,
"loss": 3.7788,
"step": 282
},
{
"epoch": 0.1626225753346217,
"grad_norm": 1.208257794380188,
"learning_rate": 9.973896754084646e-05,
"loss": 3.8338,
"step": 284
},
{
"epoch": 0.16376780473838665,
"grad_norm": 1.4068255424499512,
"learning_rate": 9.972921897111658e-05,
"loss": 3.8583,
"step": 286
},
{
"epoch": 0.1649130341421516,
"grad_norm": 1.4898021221160889,
"learning_rate": 9.971929218126306e-05,
"loss": 3.8051,
"step": 288
},
{
"epoch": 0.16605826354591655,
"grad_norm": 1.6303211450576782,
"learning_rate": 9.970918720686164e-05,
"loss": 3.8598,
"step": 290
},
{
"epoch": 0.16720349294968148,
"grad_norm": 1.6599496603012085,
"learning_rate": 9.969890408412665e-05,
"loss": 3.7214,
"step": 292
},
{
"epoch": 0.16834872235344642,
"grad_norm": 1.1958950757980347,
"learning_rate": 9.968844284991086e-05,
"loss": 3.7042,
"step": 294
},
{
"epoch": 0.16949395175721135,
"grad_norm": 1.3099420070648193,
"learning_rate": 9.967780354170533e-05,
"loss": 3.7405,
"step": 296
},
{
"epoch": 0.17063918116097632,
"grad_norm": 1.5054072141647339,
"learning_rate": 9.966698619763936e-05,
"loss": 3.7827,
"step": 298
},
{
"epoch": 0.17178441056474125,
"grad_norm": 1.444757103919983,
"learning_rate": 9.965599085648025e-05,
"loss": 3.7361,
"step": 300
},
{
"epoch": 0.1729296399685062,
"grad_norm": 0.9423370361328125,
"learning_rate": 9.964481755763322e-05,
"loss": 3.7063,
"step": 302
},
{
"epoch": 0.17407486937227112,
"grad_norm": 1.044169306755066,
"learning_rate": 9.963346634114128e-05,
"loss": 3.7999,
"step": 304
},
{
"epoch": 0.1752200987760361,
"grad_norm": 1.578296184539795,
"learning_rate": 9.962193724768503e-05,
"loss": 3.7448,
"step": 306
},
{
"epoch": 0.17636532817980102,
"grad_norm": 1.4953491687774658,
"learning_rate": 9.961023031858258e-05,
"loss": 3.7625,
"step": 308
},
{
"epoch": 0.17751055758356596,
"grad_norm": 1.295817494392395,
"learning_rate": 9.959834559578934e-05,
"loss": 3.7042,
"step": 310
},
{
"epoch": 0.1786557869873309,
"grad_norm": 1.4001609086990356,
"learning_rate": 9.95862831218979e-05,
"loss": 3.7272,
"step": 312
},
{
"epoch": 0.17980101639109583,
"grad_norm": 1.8881722688674927,
"learning_rate": 9.95740429401379e-05,
"loss": 3.6904,
"step": 314
},
{
"epoch": 0.1809462457948608,
"grad_norm": 1.919791340827942,
"learning_rate": 9.956162509437584e-05,
"loss": 3.7071,
"step": 316
},
{
"epoch": 0.18209147519862573,
"grad_norm": 1.758253574371338,
"learning_rate": 9.954902962911494e-05,
"loss": 3.7906,
"step": 318
},
{
"epoch": 0.18323670460239067,
"grad_norm": 1.480323314666748,
"learning_rate": 9.953625658949494e-05,
"loss": 3.7697,
"step": 320
},
{
"epoch": 0.1843819340061556,
"grad_norm": 1.5573948621749878,
"learning_rate": 9.952330602129202e-05,
"loss": 3.752,
"step": 322
},
{
"epoch": 0.18552716340992054,
"grad_norm": 1.3204878568649292,
"learning_rate": 9.951017797091858e-05,
"loss": 3.6479,
"step": 324
},
{
"epoch": 0.1866723928136855,
"grad_norm": 1.5514147281646729,
"learning_rate": 9.949687248542303e-05,
"loss": 3.7199,
"step": 326
},
{
"epoch": 0.18781762221745044,
"grad_norm": 1.2910770177841187,
"learning_rate": 9.948338961248977e-05,
"loss": 3.7427,
"step": 328
},
{
"epoch": 0.18896285162121537,
"grad_norm": 1.1663178205490112,
"learning_rate": 9.946972940043882e-05,
"loss": 3.6616,
"step": 330
},
{
"epoch": 0.1901080810249803,
"grad_norm": 1.3439650535583496,
"learning_rate": 9.945589189822584e-05,
"loss": 3.7385,
"step": 332
},
{
"epoch": 0.19125331042874527,
"grad_norm": 1.1256877183914185,
"learning_rate": 9.94418771554418e-05,
"loss": 3.6056,
"step": 334
},
{
"epoch": 0.1923985398325102,
"grad_norm": 1.1813896894454956,
"learning_rate": 9.942768522231289e-05,
"loss": 3.5544,
"step": 336
},
{
"epoch": 0.19354376923627514,
"grad_norm": 1.2541157007217407,
"learning_rate": 9.941331614970031e-05,
"loss": 3.6401,
"step": 338
},
{
"epoch": 0.19468899864004008,
"grad_norm": 1.237069010734558,
"learning_rate": 9.939876998910012e-05,
"loss": 3.7564,
"step": 340
},
{
"epoch": 0.19583422804380501,
"grad_norm": 1.1157530546188354,
"learning_rate": 9.938404679264301e-05,
"loss": 3.6164,
"step": 342
},
{
"epoch": 0.19697945744756998,
"grad_norm": 1.149465560913086,
"learning_rate": 9.936914661309412e-05,
"loss": 3.6968,
"step": 344
},
{
"epoch": 0.1981246868513349,
"grad_norm": 0.9530683755874634,
"learning_rate": 9.93540695038529e-05,
"loss": 3.6194,
"step": 346
},
{
"epoch": 0.19926991625509985,
"grad_norm": 1.1686296463012695,
"learning_rate": 9.933881551895281e-05,
"loss": 3.7604,
"step": 348
},
{
"epoch": 0.20041514565886479,
"grad_norm": 1.2699095010757446,
"learning_rate": 9.93233847130613e-05,
"loss": 3.6371,
"step": 350
},
{
"epoch": 0.20156037506262972,
"grad_norm": 1.1345208883285522,
"learning_rate": 9.930777714147945e-05,
"loss": 3.6146,
"step": 352
},
{
"epoch": 0.20270560446639468,
"grad_norm": 1.3319895267486572,
"learning_rate": 9.929199286014185e-05,
"loss": 3.6443,
"step": 354
},
{
"epoch": 0.20385083387015962,
"grad_norm": 1.6053088903427124,
"learning_rate": 9.927603192561637e-05,
"loss": 3.6277,
"step": 356
},
{
"epoch": 0.20499606327392456,
"grad_norm": 1.2149386405944824,
"learning_rate": 9.925989439510398e-05,
"loss": 3.5555,
"step": 358
},
{
"epoch": 0.2061412926776895,
"grad_norm": 1.0859287977218628,
"learning_rate": 9.924358032643855e-05,
"loss": 3.6253,
"step": 360
},
{
"epoch": 0.20728652208145446,
"grad_norm": 0.9613994359970093,
"learning_rate": 9.922708977808663e-05,
"loss": 3.5826,
"step": 362
},
{
"epoch": 0.2084317514852194,
"grad_norm": 1.0509222745895386,
"learning_rate": 9.921042280914721e-05,
"loss": 3.6263,
"step": 364
},
{
"epoch": 0.20957698088898433,
"grad_norm": 1.3777049779891968,
"learning_rate": 9.919357947935156e-05,
"loss": 3.6187,
"step": 366
},
{
"epoch": 0.21072221029274926,
"grad_norm": 1.3364644050598145,
"learning_rate": 9.9176559849063e-05,
"loss": 3.5946,
"step": 368
},
{
"epoch": 0.2118674396965142,
"grad_norm": 1.4562104940414429,
"learning_rate": 9.915936397927665e-05,
"loss": 3.6099,
"step": 370
},
{
"epoch": 0.21301266910027916,
"grad_norm": 1.066383719444275,
"learning_rate": 9.91419919316193e-05,
"loss": 3.5395,
"step": 372
},
{
"epoch": 0.2141578985040441,
"grad_norm": 1.6498733758926392,
"learning_rate": 9.912444376834903e-05,
"loss": 3.6083,
"step": 374
},
{
"epoch": 0.21530312790780903,
"grad_norm": 0.9828553795814514,
"learning_rate": 9.910671955235518e-05,
"loss": 3.5409,
"step": 376
},
{
"epoch": 0.21644835731157397,
"grad_norm": 1.178269624710083,
"learning_rate": 9.908881934715798e-05,
"loss": 3.6018,
"step": 378
},
{
"epoch": 0.2175935867153389,
"grad_norm": 1.3328818082809448,
"learning_rate": 9.907074321690838e-05,
"loss": 3.5718,
"step": 380
},
{
"epoch": 0.21873881611910387,
"grad_norm": 1.1077896356582642,
"learning_rate": 9.905249122638783e-05,
"loss": 3.581,
"step": 382
},
{
"epoch": 0.2198840455228688,
"grad_norm": 1.220638394355774,
"learning_rate": 9.903406344100798e-05,
"loss": 3.5813,
"step": 384
},
{
"epoch": 0.22102927492663374,
"grad_norm": 1.5574766397476196,
"learning_rate": 9.901545992681057e-05,
"loss": 3.5785,
"step": 386
},
{
"epoch": 0.22217450433039868,
"grad_norm": 1.013902187347412,
"learning_rate": 9.899668075046706e-05,
"loss": 3.6156,
"step": 388
},
{
"epoch": 0.2233197337341636,
"grad_norm": 1.197936773300171,
"learning_rate": 9.897772597927848e-05,
"loss": 3.5428,
"step": 390
},
{
"epoch": 0.22446496313792857,
"grad_norm": 0.9838180541992188,
"learning_rate": 9.895859568117512e-05,
"loss": 3.534,
"step": 392
},
{
"epoch": 0.2256101925416935,
"grad_norm": 1.0316840410232544,
"learning_rate": 9.893928992471639e-05,
"loss": 3.5691,
"step": 394
},
{
"epoch": 0.22675542194545845,
"grad_norm": 0.9378739595413208,
"learning_rate": 9.891980877909045e-05,
"loss": 3.5368,
"step": 396
},
{
"epoch": 0.22790065134922338,
"grad_norm": 1.4947346448898315,
"learning_rate": 9.890015231411404e-05,
"loss": 3.5709,
"step": 398
},
{
"epoch": 0.22904588075298835,
"grad_norm": 0.9118148684501648,
"learning_rate": 9.888032060023225e-05,
"loss": 3.527,
"step": 400
},
{
"epoch": 0.23019111015675328,
"grad_norm": 1.2407753467559814,
"learning_rate": 9.886031370851816e-05,
"loss": 3.5301,
"step": 402
},
{
"epoch": 0.23133633956051822,
"grad_norm": 1.7163093090057373,
"learning_rate": 9.88401317106727e-05,
"loss": 3.5828,
"step": 404
},
{
"epoch": 0.23248156896428315,
"grad_norm": 1.0757009983062744,
"learning_rate": 9.881977467902434e-05,
"loss": 3.4831,
"step": 406
},
{
"epoch": 0.2336267983680481,
"grad_norm": 0.9473862648010254,
"learning_rate": 9.879924268652885e-05,
"loss": 3.5196,
"step": 408
},
{
"epoch": 0.23477202777181305,
"grad_norm": 1.199771761894226,
"learning_rate": 9.877853580676897e-05,
"loss": 3.574,
"step": 410
},
{
"epoch": 0.235917257175578,
"grad_norm": 0.9006698131561279,
"learning_rate": 9.875765411395428e-05,
"loss": 3.5348,
"step": 412
},
{
"epoch": 0.23706248657934292,
"grad_norm": 1.1242282390594482,
"learning_rate": 9.873659768292081e-05,
"loss": 3.5249,
"step": 414
},
{
"epoch": 0.23820771598310786,
"grad_norm": 1.0675747394561768,
"learning_rate": 9.871536658913082e-05,
"loss": 3.5086,
"step": 416
},
{
"epoch": 0.2393529453868728,
"grad_norm": 0.8544116616249084,
"learning_rate": 9.869396090867255e-05,
"loss": 3.546,
"step": 418
},
{
"epoch": 0.24049817479063776,
"grad_norm": 1.3136742115020752,
"learning_rate": 9.867238071825992e-05,
"loss": 3.4937,
"step": 420
},
{
"epoch": 0.2416434041944027,
"grad_norm": 1.3740772008895874,
"learning_rate": 9.865062609523223e-05,
"loss": 3.4303,
"step": 422
},
{
"epoch": 0.24278863359816763,
"grad_norm": 1.342213749885559,
"learning_rate": 9.862869711755397e-05,
"loss": 3.4982,
"step": 424
},
{
"epoch": 0.24393386300193257,
"grad_norm": 1.0677942037582397,
"learning_rate": 9.860659386381443e-05,
"loss": 3.4288,
"step": 426
},
{
"epoch": 0.24507909240569753,
"grad_norm": 0.9615838527679443,
"learning_rate": 9.858431641322749e-05,
"loss": 3.4787,
"step": 428
},
{
"epoch": 0.24622432180946247,
"grad_norm": 1.0572890043258667,
"learning_rate": 9.856186484563134e-05,
"loss": 3.5314,
"step": 430
},
{
"epoch": 0.2473695512132274,
"grad_norm": 1.158275842666626,
"learning_rate": 9.853923924148815e-05,
"loss": 3.5504,
"step": 432
},
{
"epoch": 0.24851478061699234,
"grad_norm": 1.171581745147705,
"learning_rate": 9.851643968188383e-05,
"loss": 3.5478,
"step": 434
},
{
"epoch": 0.24966001002075727,
"grad_norm": 1.0333714485168457,
"learning_rate": 9.849346624852764e-05,
"loss": 3.5497,
"step": 436
},
{
"epoch": 0.2508052394245222,
"grad_norm": 0.9459155797958374,
"learning_rate": 9.847031902375207e-05,
"loss": 3.5074,
"step": 438
},
{
"epoch": 0.25195046882828714,
"grad_norm": 1.0424790382385254,
"learning_rate": 9.84469980905124e-05,
"loss": 3.4961,
"step": 440
},
{
"epoch": 0.25309569823205214,
"grad_norm": 1.0463571548461914,
"learning_rate": 9.842350353238642e-05,
"loss": 3.4405,
"step": 442
},
{
"epoch": 0.25424092763581707,
"grad_norm": 1.000319242477417,
"learning_rate": 9.839983543357421e-05,
"loss": 3.4595,
"step": 444
},
{
"epoch": 0.255386157039582,
"grad_norm": 1.2526150941848755,
"learning_rate": 9.837599387889773e-05,
"loss": 3.5012,
"step": 446
},
{
"epoch": 0.25653138644334694,
"grad_norm": 1.3148843050003052,
"learning_rate": 9.835197895380065e-05,
"loss": 3.4767,
"step": 448
},
{
"epoch": 0.2576766158471119,
"grad_norm": 1.3939634561538696,
"learning_rate": 9.83277907443479e-05,
"loss": 3.3783,
"step": 450
},
{
"epoch": 0.2588218452508768,
"grad_norm": 1.0367929935455322,
"learning_rate": 9.830342933722545e-05,
"loss": 3.4289,
"step": 452
},
{
"epoch": 0.25996707465464175,
"grad_norm": 0.9439120888710022,
"learning_rate": 9.827889481974e-05,
"loss": 3.4728,
"step": 454
},
{
"epoch": 0.2611123040584067,
"grad_norm": 1.2146074771881104,
"learning_rate": 9.82541872798186e-05,
"loss": 3.4257,
"step": 456
},
{
"epoch": 0.2622575334621716,
"grad_norm": 1.0530729293823242,
"learning_rate": 9.822930680600841e-05,
"loss": 3.4681,
"step": 458
},
{
"epoch": 0.26340276286593656,
"grad_norm": 1.1026678085327148,
"learning_rate": 9.820425348747637e-05,
"loss": 3.4298,
"step": 460
},
{
"epoch": 0.26454799226970155,
"grad_norm": 1.2520779371261597,
"learning_rate": 9.817902741400879e-05,
"loss": 3.4191,
"step": 462
},
{
"epoch": 0.2656932216734665,
"grad_norm": 1.1041593551635742,
"learning_rate": 9.815362867601121e-05,
"loss": 3.466,
"step": 464
},
{
"epoch": 0.2668384510772314,
"grad_norm": 0.881693422794342,
"learning_rate": 9.812805736450786e-05,
"loss": 3.4929,
"step": 466
},
{
"epoch": 0.26798368048099636,
"grad_norm": 1.3125033378601074,
"learning_rate": 9.810231357114152e-05,
"loss": 3.4592,
"step": 468
},
{
"epoch": 0.2691289098847613,
"grad_norm": 1.2968268394470215,
"learning_rate": 9.807639738817307e-05,
"loss": 3.4851,
"step": 470
},
{
"epoch": 0.2702741392885262,
"grad_norm": 0.9855544567108154,
"learning_rate": 9.805030890848119e-05,
"loss": 3.4487,
"step": 472
},
{
"epoch": 0.27141936869229116,
"grad_norm": 1.3063323497772217,
"learning_rate": 9.802404822556209e-05,
"loss": 3.4961,
"step": 474
},
{
"epoch": 0.2725645980960561,
"grad_norm": 1.0567957162857056,
"learning_rate": 9.79976154335291e-05,
"loss": 3.3975,
"step": 476
},
{
"epoch": 0.27370982749982103,
"grad_norm": 0.9473979473114014,
"learning_rate": 9.797101062711231e-05,
"loss": 3.4573,
"step": 478
},
{
"epoch": 0.274855056903586,
"grad_norm": 1.2931294441223145,
"learning_rate": 9.794423390165837e-05,
"loss": 3.3732,
"step": 480
},
{
"epoch": 0.27600028630735096,
"grad_norm": 1.233302116394043,
"learning_rate": 9.791728535312998e-05,
"loss": 3.419,
"step": 482
},
{
"epoch": 0.2771455157111159,
"grad_norm": 0.9638918042182922,
"learning_rate": 9.789016507810564e-05,
"loss": 3.4119,
"step": 484
},
{
"epoch": 0.27829074511488083,
"grad_norm": 1.105643391609192,
"learning_rate": 9.786287317377929e-05,
"loss": 3.3909,
"step": 486
},
{
"epoch": 0.27943597451864577,
"grad_norm": 0.9666796922683716,
"learning_rate": 9.783540973795998e-05,
"loss": 3.4194,
"step": 488
},
{
"epoch": 0.2805812039224107,
"grad_norm": 1.3533586263656616,
"learning_rate": 9.780777486907146e-05,
"loss": 3.3789,
"step": 490
},
{
"epoch": 0.28172643332617564,
"grad_norm": 1.1253416538238525,
"learning_rate": 9.777996866615186e-05,
"loss": 3.4385,
"step": 492
},
{
"epoch": 0.2828716627299406,
"grad_norm": 0.7198868989944458,
"learning_rate": 9.775199122885339e-05,
"loss": 3.4038,
"step": 494
},
{
"epoch": 0.2840168921337055,
"grad_norm": 0.9696770310401917,
"learning_rate": 9.772384265744188e-05,
"loss": 3.4576,
"step": 496
},
{
"epoch": 0.28516212153747045,
"grad_norm": 1.321269154548645,
"learning_rate": 9.76955230527965e-05,
"loss": 3.4348,
"step": 498
},
{
"epoch": 0.28630735094123544,
"grad_norm": 1.3119802474975586,
"learning_rate": 9.766703251640934e-05,
"loss": 3.3848,
"step": 500
},
{
"epoch": 0.2874525803450004,
"grad_norm": 1.0199967622756958,
"learning_rate": 9.763837115038513e-05,
"loss": 3.4108,
"step": 502
},
{
"epoch": 0.2885978097487653,
"grad_norm": 0.9925194382667542,
"learning_rate": 9.760953905744075e-05,
"loss": 3.31,
"step": 504
},
{
"epoch": 0.28974303915253025,
"grad_norm": 0.9447107315063477,
"learning_rate": 9.758053634090502e-05,
"loss": 3.3598,
"step": 506
},
{
"epoch": 0.2908882685562952,
"grad_norm": 1.052873134613037,
"learning_rate": 9.755136310471817e-05,
"loss": 3.3704,
"step": 508
},
{
"epoch": 0.2920334979600601,
"grad_norm": 1.061514139175415,
"learning_rate": 9.752201945343156e-05,
"loss": 3.3642,
"step": 510
},
{
"epoch": 0.29317872736382505,
"grad_norm": 0.8627074956893921,
"learning_rate": 9.74925054922073e-05,
"loss": 3.367,
"step": 512
},
{
"epoch": 0.29432395676759,
"grad_norm": 1.0214530229568481,
"learning_rate": 9.746282132681785e-05,
"loss": 3.3266,
"step": 514
},
{
"epoch": 0.2954691861713549,
"grad_norm": 1.1223275661468506,
"learning_rate": 9.743296706364565e-05,
"loss": 3.4194,
"step": 516
},
{
"epoch": 0.2966144155751199,
"grad_norm": 0.9849138259887695,
"learning_rate": 9.740294280968273e-05,
"loss": 3.3664,
"step": 518
},
{
"epoch": 0.29775964497888485,
"grad_norm": 0.7025099396705627,
"learning_rate": 9.737274867253034e-05,
"loss": 3.3772,
"step": 520
},
{
"epoch": 0.2989048743826498,
"grad_norm": 0.936536967754364,
"learning_rate": 9.734238476039858e-05,
"loss": 3.3196,
"step": 522
},
{
"epoch": 0.3000501037864147,
"grad_norm": 1.113277792930603,
"learning_rate": 9.731185118210598e-05,
"loss": 3.4606,
"step": 524
},
{
"epoch": 0.30119533319017966,
"grad_norm": 1.0153186321258545,
"learning_rate": 9.728114804707909e-05,
"loss": 3.4079,
"step": 526
},
{
"epoch": 0.3023405625939446,
"grad_norm": 1.1675206422805786,
"learning_rate": 9.725027546535215e-05,
"loss": 3.4111,
"step": 528
},
{
"epoch": 0.30348579199770953,
"grad_norm": 0.9518959522247314,
"learning_rate": 9.721923354756665e-05,
"loss": 3.3905,
"step": 530
},
{
"epoch": 0.30463102140147447,
"grad_norm": 0.9693425297737122,
"learning_rate": 9.718802240497098e-05,
"loss": 3.4364,
"step": 532
},
{
"epoch": 0.3057762508052394,
"grad_norm": 1.1249076128005981,
"learning_rate": 9.715664214941997e-05,
"loss": 3.3373,
"step": 534
},
{
"epoch": 0.3069214802090044,
"grad_norm": 0.8406875133514404,
"learning_rate": 9.712509289337453e-05,
"loss": 3.321,
"step": 536
},
{
"epoch": 0.30806670961276933,
"grad_norm": 0.9538395404815674,
"learning_rate": 9.709337474990121e-05,
"loss": 3.4007,
"step": 538
},
{
"epoch": 0.30921193901653427,
"grad_norm": 0.8003599047660828,
"learning_rate": 9.706148783267187e-05,
"loss": 3.3798,
"step": 540
},
{
"epoch": 0.3103571684202992,
"grad_norm": 0.8605026602745056,
"learning_rate": 9.702943225596316e-05,
"loss": 3.2908,
"step": 542
},
{
"epoch": 0.31150239782406414,
"grad_norm": 0.7349815964698792,
"learning_rate": 9.699720813465625e-05,
"loss": 3.408,
"step": 544
},
{
"epoch": 0.3126476272278291,
"grad_norm": 1.1622780561447144,
"learning_rate": 9.696481558423628e-05,
"loss": 3.3212,
"step": 546
},
{
"epoch": 0.313792856631594,
"grad_norm": 0.9829496145248413,
"learning_rate": 9.693225472079204e-05,
"loss": 3.4067,
"step": 548
},
{
"epoch": 0.31493808603535894,
"grad_norm": 1.1378313302993774,
"learning_rate": 9.689952566101548e-05,
"loss": 3.3556,
"step": 550
},
{
"epoch": 0.3160833154391239,
"grad_norm": 0.9355561137199402,
"learning_rate": 9.686662852220142e-05,
"loss": 3.3281,
"step": 552
},
{
"epoch": 0.3172285448428888,
"grad_norm": 0.9328277111053467,
"learning_rate": 9.683356342224694e-05,
"loss": 3.313,
"step": 554
},
{
"epoch": 0.3183737742466538,
"grad_norm": 1.277377724647522,
"learning_rate": 9.680033047965114e-05,
"loss": 3.3499,
"step": 556
},
{
"epoch": 0.31951900365041874,
"grad_norm": 1.0239235162734985,
"learning_rate": 9.67669298135146e-05,
"loss": 3.3936,
"step": 558
},
{
"epoch": 0.3206642330541837,
"grad_norm": 0.6908963322639465,
"learning_rate": 9.673336154353899e-05,
"loss": 3.3584,
"step": 560
},
{
"epoch": 0.3218094624579486,
"grad_norm": 0.8835290670394897,
"learning_rate": 9.669962579002664e-05,
"loss": 3.3728,
"step": 562
},
{
"epoch": 0.32295469186171355,
"grad_norm": 1.0561710596084595,
"learning_rate": 9.666572267388013e-05,
"loss": 3.3579,
"step": 564
},
{
"epoch": 0.3240999212654785,
"grad_norm": 0.8400120735168457,
"learning_rate": 9.663165231660181e-05,
"loss": 3.3224,
"step": 566
},
{
"epoch": 0.3252451506692434,
"grad_norm": 0.8960584998130798,
"learning_rate": 9.659741484029341e-05,
"loss": 3.3434,
"step": 568
},
{
"epoch": 0.32639038007300836,
"grad_norm": 0.9615944027900696,
"learning_rate": 9.656301036765558e-05,
"loss": 3.2587,
"step": 570
},
{
"epoch": 0.3275356094767733,
"grad_norm": 0.983391523361206,
"learning_rate": 9.652843902198743e-05,
"loss": 3.2396,
"step": 572
},
{
"epoch": 0.3286808388805383,
"grad_norm": 0.7758197784423828,
"learning_rate": 9.649370092718615e-05,
"loss": 3.2948,
"step": 574
},
{
"epoch": 0.3298260682843032,
"grad_norm": 0.9714862704277039,
"learning_rate": 9.64587962077465e-05,
"loss": 3.3381,
"step": 576
},
{
"epoch": 0.33097129768806816,
"grad_norm": 0.8628116846084595,
"learning_rate": 9.64237249887604e-05,
"loss": 3.294,
"step": 578
},
{
"epoch": 0.3321165270918331,
"grad_norm": 0.9794777035713196,
"learning_rate": 9.638848739591646e-05,
"loss": 3.3119,
"step": 580
},
{
"epoch": 0.333261756495598,
"grad_norm": 0.8179820775985718,
"learning_rate": 9.635308355549957e-05,
"loss": 3.3009,
"step": 582
},
{
"epoch": 0.33440698589936296,
"grad_norm": 0.8732323050498962,
"learning_rate": 9.63175135943904e-05,
"loss": 3.3207,
"step": 584
},
{
"epoch": 0.3355522153031279,
"grad_norm": 1.0355788469314575,
"learning_rate": 9.628177764006497e-05,
"loss": 3.2889,
"step": 586
},
{
"epoch": 0.33669744470689283,
"grad_norm": 0.8974720239639282,
"learning_rate": 9.624587582059417e-05,
"loss": 3.3089,
"step": 588
},
{
"epoch": 0.33784267411065777,
"grad_norm": 0.7800531387329102,
"learning_rate": 9.620980826464335e-05,
"loss": 3.2999,
"step": 590
},
{
"epoch": 0.3389879035144227,
"grad_norm": 0.7294676899909973,
"learning_rate": 9.617357510147182e-05,
"loss": 3.3634,
"step": 592
},
{
"epoch": 0.3401331329181877,
"grad_norm": 0.7799131274223328,
"learning_rate": 9.613717646093239e-05,
"loss": 3.308,
"step": 594
},
{
"epoch": 0.34127836232195263,
"grad_norm": 0.9899328947067261,
"learning_rate": 9.610061247347091e-05,
"loss": 3.3191,
"step": 596
},
{
"epoch": 0.34242359172571757,
"grad_norm": 1.0520347356796265,
"learning_rate": 9.606388327012579e-05,
"loss": 3.389,
"step": 598
},
{
"epoch": 0.3435688211294825,
"grad_norm": 0.9768466353416443,
"learning_rate": 9.602698898252756e-05,
"loss": 3.2905,
"step": 600
},
{
"epoch": 0.34471405053324744,
"grad_norm": 0.9359555244445801,
"learning_rate": 9.598992974289837e-05,
"loss": 3.3022,
"step": 602
},
{
"epoch": 0.3458592799370124,
"grad_norm": 0.7487738728523254,
"learning_rate": 9.595270568405156e-05,
"loss": 3.2234,
"step": 604
},
{
"epoch": 0.3470045093407773,
"grad_norm": 0.8295655846595764,
"learning_rate": 9.591531693939109e-05,
"loss": 3.3506,
"step": 606
},
{
"epoch": 0.34814973874454225,
"grad_norm": 0.9020605683326721,
"learning_rate": 9.587776364291117e-05,
"loss": 3.3026,
"step": 608
},
{
"epoch": 0.3492949681483072,
"grad_norm": 0.7868961095809937,
"learning_rate": 9.58400459291957e-05,
"loss": 3.2393,
"step": 610
},
{
"epoch": 0.3504401975520722,
"grad_norm": 0.9779835939407349,
"learning_rate": 9.580216393341785e-05,
"loss": 3.3254,
"step": 612
},
{
"epoch": 0.3515854269558371,
"grad_norm": 0.8962246179580688,
"learning_rate": 9.576411779133956e-05,
"loss": 3.2486,
"step": 614
},
{
"epoch": 0.35273065635960205,
"grad_norm": 0.9166551828384399,
"learning_rate": 9.572590763931097e-05,
"loss": 3.2193,
"step": 616
},
{
"epoch": 0.353875885763367,
"grad_norm": 0.7779364585876465,
"learning_rate": 9.568753361427009e-05,
"loss": 3.2469,
"step": 618
},
{
"epoch": 0.3550211151671319,
"grad_norm": 0.750092089176178,
"learning_rate": 9.564899585374214e-05,
"loss": 3.2532,
"step": 620
},
{
"epoch": 0.35616634457089685,
"grad_norm": 1.0269392728805542,
"learning_rate": 9.561029449583919e-05,
"loss": 3.3331,
"step": 622
},
{
"epoch": 0.3573115739746618,
"grad_norm": 0.7937965989112854,
"learning_rate": 9.557142967925956e-05,
"loss": 3.314,
"step": 624
},
{
"epoch": 0.3584568033784267,
"grad_norm": 1.1338940858840942,
"learning_rate": 9.553240154328744e-05,
"loss": 3.3375,
"step": 626
},
{
"epoch": 0.35960203278219166,
"grad_norm": 0.7937076091766357,
"learning_rate": 9.549321022779229e-05,
"loss": 3.2691,
"step": 628
},
{
"epoch": 0.36074726218595665,
"grad_norm": 0.8552340865135193,
"learning_rate": 9.545385587322839e-05,
"loss": 3.3107,
"step": 630
},
{
"epoch": 0.3618924915897216,
"grad_norm": 1.0279617309570312,
"learning_rate": 9.541433862063429e-05,
"loss": 3.2552,
"step": 632
},
{
"epoch": 0.3630377209934865,
"grad_norm": 0.9652466177940369,
"learning_rate": 9.537465861163237e-05,
"loss": 3.242,
"step": 634
},
{
"epoch": 0.36418295039725146,
"grad_norm": 0.9129723310470581,
"learning_rate": 9.533481598842827e-05,
"loss": 3.3131,
"step": 636
},
{
"epoch": 0.3653281798010164,
"grad_norm": 0.9316424131393433,
"learning_rate": 9.529481089381042e-05,
"loss": 3.3288,
"step": 638
},
{
"epoch": 0.36647340920478133,
"grad_norm": 0.7987300753593445,
"learning_rate": 9.525464347114953e-05,
"loss": 3.2832,
"step": 640
},
{
"epoch": 0.36761863860854627,
"grad_norm": 0.7103368043899536,
"learning_rate": 9.521431386439807e-05,
"loss": 3.2339,
"step": 642
},
{
"epoch": 0.3687638680123112,
"grad_norm": 0.7420955896377563,
"learning_rate": 9.517382221808969e-05,
"loss": 3.1662,
"step": 644
},
{
"epoch": 0.36990909741607614,
"grad_norm": 0.8201749324798584,
"learning_rate": 9.513316867733883e-05,
"loss": 3.2837,
"step": 646
},
{
"epoch": 0.3710543268198411,
"grad_norm": 0.8581364154815674,
"learning_rate": 9.509235338784009e-05,
"loss": 3.2949,
"step": 648
},
{
"epoch": 0.37219955622360607,
"grad_norm": 0.956118643283844,
"learning_rate": 9.505137649586775e-05,
"loss": 3.316,
"step": 650
},
{
"epoch": 0.373344785627371,
"grad_norm": 0.708759069442749,
"learning_rate": 9.501023814827524e-05,
"loss": 3.1951,
"step": 652
},
{
"epoch": 0.37449001503113594,
"grad_norm": 0.8143038153648376,
"learning_rate": 9.496893849249464e-05,
"loss": 3.2738,
"step": 654
},
{
"epoch": 0.3756352444349009,
"grad_norm": 0.6578754782676697,
"learning_rate": 9.492747767653611e-05,
"loss": 3.2809,
"step": 656
},
{
"epoch": 0.3767804738386658,
"grad_norm": 0.8550508618354797,
"learning_rate": 9.488585584898738e-05,
"loss": 3.2668,
"step": 658
},
{
"epoch": 0.37792570324243074,
"grad_norm": 0.795080304145813,
"learning_rate": 9.48440731590132e-05,
"loss": 3.28,
"step": 660
},
{
"epoch": 0.3790709326461957,
"grad_norm": 0.9933105707168579,
"learning_rate": 9.480212975635486e-05,
"loss": 3.3104,
"step": 662
},
{
"epoch": 0.3802161620499606,
"grad_norm": 1.224338412284851,
"learning_rate": 9.476002579132957e-05,
"loss": 3.29,
"step": 664
},
{
"epoch": 0.38136139145372555,
"grad_norm": 0.8564585447311401,
"learning_rate": 9.471776141483e-05,
"loss": 3.2,
"step": 666
},
{
"epoch": 0.38250662085749054,
"grad_norm": 1.160684585571289,
"learning_rate": 9.467533677832365e-05,
"loss": 3.2226,
"step": 668
},
{
"epoch": 0.3836518502612555,
"grad_norm": 0.8671857714653015,
"learning_rate": 9.463275203385244e-05,
"loss": 3.2453,
"step": 670
},
{
"epoch": 0.3847970796650204,
"grad_norm": 1.0225045680999756,
"learning_rate": 9.459000733403205e-05,
"loss": 3.2283,
"step": 672
},
{
"epoch": 0.38594230906878535,
"grad_norm": 0.8350477814674377,
"learning_rate": 9.454710283205139e-05,
"loss": 3.2584,
"step": 674
},
{
"epoch": 0.3870875384725503,
"grad_norm": 0.8098021745681763,
"learning_rate": 9.450403868167208e-05,
"loss": 3.2836,
"step": 676
},
{
"epoch": 0.3882327678763152,
"grad_norm": 0.8174638748168945,
"learning_rate": 9.446081503722792e-05,
"loss": 3.1896,
"step": 678
},
{
"epoch": 0.38937799728008016,
"grad_norm": 0.6904940009117126,
"learning_rate": 9.441743205362426e-05,
"loss": 3.2464,
"step": 680
},
{
"epoch": 0.3905232266838451,
"grad_norm": 0.692864716053009,
"learning_rate": 9.437388988633752e-05,
"loss": 3.2277,
"step": 682
},
{
"epoch": 0.39166845608761003,
"grad_norm": 0.7014842629432678,
"learning_rate": 9.433018869141464e-05,
"loss": 3.2372,
"step": 684
},
{
"epoch": 0.39281368549137496,
"grad_norm": 0.6166806817054749,
"learning_rate": 9.428632862547237e-05,
"loss": 3.2501,
"step": 686
},
{
"epoch": 0.39395891489513996,
"grad_norm": 0.7060846090316772,
"learning_rate": 9.424230984569696e-05,
"loss": 3.2881,
"step": 688
},
{
"epoch": 0.3951041442989049,
"grad_norm": 0.7771391272544861,
"learning_rate": 9.419813250984337e-05,
"loss": 3.2149,
"step": 690
},
{
"epoch": 0.3962493737026698,
"grad_norm": 0.6290923953056335,
"learning_rate": 9.415379677623485e-05,
"loss": 3.1555,
"step": 692
},
{
"epoch": 0.39739460310643476,
"grad_norm": 0.7270971536636353,
"learning_rate": 9.410930280376225e-05,
"loss": 3.2554,
"step": 694
},
{
"epoch": 0.3985398325101997,
"grad_norm": 0.681962788105011,
"learning_rate": 9.40646507518836e-05,
"loss": 3.1671,
"step": 696
},
{
"epoch": 0.39968506191396463,
"grad_norm": 0.5727997422218323,
"learning_rate": 9.40198407806234e-05,
"loss": 3.237,
"step": 698
},
{
"epoch": 0.40083029131772957,
"grad_norm": 0.7687988877296448,
"learning_rate": 9.39748730505721e-05,
"loss": 3.2357,
"step": 700
},
{
"epoch": 0.4019755207214945,
"grad_norm": 0.7813317179679871,
"learning_rate": 9.392974772288558e-05,
"loss": 3.2101,
"step": 702
},
{
"epoch": 0.40312075012525944,
"grad_norm": 0.8766132593154907,
"learning_rate": 9.388446495928446e-05,
"loss": 3.2852,
"step": 704
},
{
"epoch": 0.40426597952902443,
"grad_norm": 0.7857736349105835,
"learning_rate": 9.383902492205363e-05,
"loss": 3.2113,
"step": 706
},
{
"epoch": 0.40541120893278937,
"grad_norm": 0.9073331356048584,
"learning_rate": 9.379342777404159e-05,
"loss": 3.2478,
"step": 708
},
{
"epoch": 0.4065564383365543,
"grad_norm": 0.8033682107925415,
"learning_rate": 9.374767367865989e-05,
"loss": 3.3159,
"step": 710
},
{
"epoch": 0.40770166774031924,
"grad_norm": 0.7821508646011353,
"learning_rate": 9.370176279988256e-05,
"loss": 3.2362,
"step": 712
},
{
"epoch": 0.4088468971440842,
"grad_norm": 0.8257923126220703,
"learning_rate": 9.365569530224554e-05,
"loss": 3.1832,
"step": 714
},
{
"epoch": 0.4099921265478491,
"grad_norm": 0.8349987864494324,
"learning_rate": 9.360947135084603e-05,
"loss": 3.1995,
"step": 716
},
{
"epoch": 0.41113735595161405,
"grad_norm": 0.8590210676193237,
"learning_rate": 9.356309111134191e-05,
"loss": 3.2119,
"step": 718
},
{
"epoch": 0.412282585355379,
"grad_norm": 0.8512969017028809,
"learning_rate": 9.351655474995122e-05,
"loss": 3.2323,
"step": 720
},
{
"epoch": 0.4134278147591439,
"grad_norm": 0.6388457417488098,
"learning_rate": 9.346986243345149e-05,
"loss": 3.1677,
"step": 722
},
{
"epoch": 0.4145730441629089,
"grad_norm": 0.8811210989952087,
"learning_rate": 9.342301432917912e-05,
"loss": 3.2307,
"step": 724
},
{
"epoch": 0.41571827356667385,
"grad_norm": 0.9297654628753662,
"learning_rate": 9.337601060502891e-05,
"loss": 3.1838,
"step": 726
},
{
"epoch": 0.4168635029704388,
"grad_norm": 0.750491201877594,
"learning_rate": 9.332885142945329e-05,
"loss": 3.23,
"step": 728
},
{
"epoch": 0.4180087323742037,
"grad_norm": 0.8282638192176819,
"learning_rate": 9.328153697146186e-05,
"loss": 3.1789,
"step": 730
},
{
"epoch": 0.41915396177796865,
"grad_norm": 0.7395208477973938,
"learning_rate": 9.323406740062068e-05,
"loss": 3.2881,
"step": 732
},
{
"epoch": 0.4202991911817336,
"grad_norm": 0.5959879755973816,
"learning_rate": 9.318644288705172e-05,
"loss": 3.1879,
"step": 734
},
{
"epoch": 0.4214444205854985,
"grad_norm": 0.6063298583030701,
"learning_rate": 9.313866360143227e-05,
"loss": 3.273,
"step": 736
},
{
"epoch": 0.42258964998926346,
"grad_norm": 0.6868070960044861,
"learning_rate": 9.309072971499422e-05,
"loss": 3.2145,
"step": 738
},
{
"epoch": 0.4237348793930284,
"grad_norm": 0.6153081655502319,
"learning_rate": 9.304264139952356e-05,
"loss": 3.0791,
"step": 740
},
{
"epoch": 0.42488010879679333,
"grad_norm": 0.6345932483673096,
"learning_rate": 9.299439882735977e-05,
"loss": 3.1991,
"step": 742
},
{
"epoch": 0.4260253382005583,
"grad_norm": 0.7605310082435608,
"learning_rate": 9.294600217139506e-05,
"loss": 3.1272,
"step": 744
},
{
"epoch": 0.42717056760432326,
"grad_norm": 0.6695173382759094,
"learning_rate": 9.289745160507395e-05,
"loss": 3.1482,
"step": 746
},
{
"epoch": 0.4283157970080882,
"grad_norm": 0.8121134638786316,
"learning_rate": 9.284874730239244e-05,
"loss": 3.2122,
"step": 748
},
{
"epoch": 0.42946102641185313,
"grad_norm": 0.8771198391914368,
"learning_rate": 9.279988943789759e-05,
"loss": 3.1768,
"step": 750
},
{
"epoch": 0.43060625581561807,
"grad_norm": 0.7993550300598145,
"learning_rate": 9.275087818668675e-05,
"loss": 3.1944,
"step": 752
},
{
"epoch": 0.431751485219383,
"grad_norm": 0.6639721393585205,
"learning_rate": 9.270171372440697e-05,
"loss": 3.1418,
"step": 754
},
{
"epoch": 0.43289671462314794,
"grad_norm": 0.7494943737983704,
"learning_rate": 9.265239622725438e-05,
"loss": 3.1956,
"step": 756
},
{
"epoch": 0.4340419440269129,
"grad_norm": 0.7307000160217285,
"learning_rate": 9.26029258719736e-05,
"loss": 3.133,
"step": 758
},
{
"epoch": 0.4351871734306778,
"grad_norm": 0.7357375621795654,
"learning_rate": 9.255330283585701e-05,
"loss": 3.1898,
"step": 760
},
{
"epoch": 0.4363324028344428,
"grad_norm": 0.6649693250656128,
"learning_rate": 9.250352729674422e-05,
"loss": 3.2147,
"step": 762
},
{
"epoch": 0.43747763223820774,
"grad_norm": 0.6873495578765869,
"learning_rate": 9.245359943302133e-05,
"loss": 3.2341,
"step": 764
},
{
"epoch": 0.43862286164197267,
"grad_norm": 0.7320956587791443,
"learning_rate": 9.240351942362038e-05,
"loss": 3.1241,
"step": 766
},
{
"epoch": 0.4397680910457376,
"grad_norm": 0.6137463450431824,
"learning_rate": 9.235328744801868e-05,
"loss": 3.1529,
"step": 768
},
{
"epoch": 0.44091332044950254,
"grad_norm": 0.8658304214477539,
"learning_rate": 9.230290368623809e-05,
"loss": 3.2168,
"step": 770
},
{
"epoch": 0.4420585498532675,
"grad_norm": 0.7436694502830505,
"learning_rate": 9.225236831884454e-05,
"loss": 3.1798,
"step": 772
},
{
"epoch": 0.4432037792570324,
"grad_norm": 0.9040384888648987,
"learning_rate": 9.220168152694722e-05,
"loss": 3.2241,
"step": 774
},
{
"epoch": 0.44434900866079735,
"grad_norm": 0.7236924171447754,
"learning_rate": 9.215084349219801e-05,
"loss": 3.183,
"step": 776
},
{
"epoch": 0.4454942380645623,
"grad_norm": 0.8633347153663635,
"learning_rate": 9.209985439679081e-05,
"loss": 3.1776,
"step": 778
},
{
"epoch": 0.4466394674683272,
"grad_norm": 0.730910062789917,
"learning_rate": 9.204871442346091e-05,
"loss": 3.1633,
"step": 780
},
{
"epoch": 0.4477846968720922,
"grad_norm": 0.809923529624939,
"learning_rate": 9.199742375548432e-05,
"loss": 3.1736,
"step": 782
},
{
"epoch": 0.44892992627585715,
"grad_norm": 0.7229586839675903,
"learning_rate": 9.194598257667711e-05,
"loss": 3.1813,
"step": 784
},
{
"epoch": 0.4500751556796221,
"grad_norm": 0.6999960541725159,
"learning_rate": 9.189439107139472e-05,
"loss": 3.1125,
"step": 786
},
{
"epoch": 0.451220385083387,
"grad_norm": 0.7234693169593811,
"learning_rate": 9.184264942453138e-05,
"loss": 3.137,
"step": 788
},
{
"epoch": 0.45236561448715196,
"grad_norm": 0.7283908724784851,
"learning_rate": 9.179075782151936e-05,
"loss": 3.1672,
"step": 790
},
{
"epoch": 0.4535108438909169,
"grad_norm": 0.793543815612793,
"learning_rate": 9.173871644832834e-05,
"loss": 3.1925,
"step": 792
},
{
"epoch": 0.45465607329468183,
"grad_norm": 0.7263696789741516,
"learning_rate": 9.168652549146481e-05,
"loss": 3.1609,
"step": 794
},
{
"epoch": 0.45580130269844676,
"grad_norm": 0.7698031663894653,
"learning_rate": 9.163418513797126e-05,
"loss": 3.2547,
"step": 796
},
{
"epoch": 0.4569465321022117,
"grad_norm": 0.908698320388794,
"learning_rate": 9.158169557542566e-05,
"loss": 3.2165,
"step": 798
},
{
"epoch": 0.4580917615059767,
"grad_norm": 0.9588857293128967,
"learning_rate": 9.152905699194065e-05,
"loss": 3.1743,
"step": 800
},
{
"epoch": 0.4592369909097416,
"grad_norm": 0.7442302107810974,
"learning_rate": 9.1476269576163e-05,
"loss": 3.1088,
"step": 802
},
{
"epoch": 0.46038222031350656,
"grad_norm": 0.7421006560325623,
"learning_rate": 9.14233335172728e-05,
"loss": 3.1497,
"step": 804
},
{
"epoch": 0.4615274497172715,
"grad_norm": 0.8878415822982788,
"learning_rate": 9.13702490049829e-05,
"loss": 3.1924,
"step": 806
},
{
"epoch": 0.46267267912103643,
"grad_norm": 0.6878317594528198,
"learning_rate": 9.131701622953816e-05,
"loss": 3.1366,
"step": 808
},
{
"epoch": 0.46381790852480137,
"grad_norm": 0.7945599555969238,
"learning_rate": 9.126363538171478e-05,
"loss": 3.1926,
"step": 810
},
{
"epoch": 0.4649631379285663,
"grad_norm": 0.7997886538505554,
"learning_rate": 9.121010665281964e-05,
"loss": 3.1521,
"step": 812
},
{
"epoch": 0.46610836733233124,
"grad_norm": 0.715614378452301,
"learning_rate": 9.115643023468958e-05,
"loss": 3.1904,
"step": 814
},
{
"epoch": 0.4672535967360962,
"grad_norm": 0.7846017479896545,
"learning_rate": 9.110260631969077e-05,
"loss": 3.1338,
"step": 816
},
{
"epoch": 0.46839882613986117,
"grad_norm": 0.6939677596092224,
"learning_rate": 9.10486351007179e-05,
"loss": 3.1635,
"step": 818
},
{
"epoch": 0.4695440555436261,
"grad_norm": 0.7764283418655396,
"learning_rate": 9.099451677119366e-05,
"loss": 3.1922,
"step": 820
},
{
"epoch": 0.47068928494739104,
"grad_norm": 0.753666877746582,
"learning_rate": 9.094025152506788e-05,
"loss": 3.0827,
"step": 822
},
{
"epoch": 0.471834514351156,
"grad_norm": 0.6793937683105469,
"learning_rate": 9.088583955681699e-05,
"loss": 3.1235,
"step": 824
},
{
"epoch": 0.4729797437549209,
"grad_norm": 0.645055890083313,
"learning_rate": 9.08312810614432e-05,
"loss": 3.1758,
"step": 826
},
{
"epoch": 0.47412497315868585,
"grad_norm": 0.7241025567054749,
"learning_rate": 9.077657623447379e-05,
"loss": 3.1636,
"step": 828
},
{
"epoch": 0.4752702025624508,
"grad_norm": 0.762117862701416,
"learning_rate": 9.07217252719606e-05,
"loss": 3.1423,
"step": 830
},
{
"epoch": 0.4764154319662157,
"grad_norm": 0.7575943470001221,
"learning_rate": 9.066672837047907e-05,
"loss": 3.1304,
"step": 832
},
{
"epoch": 0.47756066136998065,
"grad_norm": 0.8326764106750488,
"learning_rate": 9.061158572712769e-05,
"loss": 3.1807,
"step": 834
},
{
"epoch": 0.4787058907737456,
"grad_norm": 0.7815741300582886,
"learning_rate": 9.055629753952731e-05,
"loss": 3.2113,
"step": 836
},
{
"epoch": 0.4798511201775106,
"grad_norm": 0.7716583609580994,
"learning_rate": 9.050086400582033e-05,
"loss": 3.1791,
"step": 838
},
{
"epoch": 0.4809963495812755,
"grad_norm": 0.6160004734992981,
"learning_rate": 9.044528532467006e-05,
"loss": 3.1696,
"step": 840
},
{
"epoch": 0.48214157898504045,
"grad_norm": 0.8025004267692566,
"learning_rate": 9.038956169525998e-05,
"loss": 3.2002,
"step": 842
},
{
"epoch": 0.4832868083888054,
"grad_norm": 0.733741819858551,
"learning_rate": 9.033369331729307e-05,
"loss": 3.1661,
"step": 844
},
{
"epoch": 0.4844320377925703,
"grad_norm": 0.7210118770599365,
"learning_rate": 9.027768039099103e-05,
"loss": 3.1492,
"step": 846
},
{
"epoch": 0.48557726719633526,
"grad_norm": 0.6915583610534668,
"learning_rate": 9.02215231170936e-05,
"loss": 3.1892,
"step": 848
},
{
"epoch": 0.4867224966001002,
"grad_norm": 0.6812649965286255,
"learning_rate": 9.016522169685783e-05,
"loss": 3.1404,
"step": 850
},
{
"epoch": 0.48786772600386513,
"grad_norm": 0.7272056341171265,
"learning_rate": 9.010877633205738e-05,
"loss": 3.1935,
"step": 852
},
{
"epoch": 0.48901295540763007,
"grad_norm": 0.7162798643112183,
"learning_rate": 9.005218722498177e-05,
"loss": 3.1949,
"step": 854
},
{
"epoch": 0.49015818481139506,
"grad_norm": 0.6110600829124451,
"learning_rate": 8.999545457843568e-05,
"loss": 3.1217,
"step": 856
},
{
"epoch": 0.49130341421516,
"grad_norm": 0.657370924949646,
"learning_rate": 8.993857859573818e-05,
"loss": 3.1381,
"step": 858
},
{
"epoch": 0.49244864361892493,
"grad_norm": 0.8181600570678711,
"learning_rate": 8.988155948072203e-05,
"loss": 3.1527,
"step": 860
},
{
"epoch": 0.49359387302268987,
"grad_norm": 0.586644172668457,
"learning_rate": 8.9824397437733e-05,
"loss": 3.1328,
"step": 862
},
{
"epoch": 0.4947391024264548,
"grad_norm": 0.8710150718688965,
"learning_rate": 8.976709267162903e-05,
"loss": 3.1509,
"step": 864
},
{
"epoch": 0.49588433183021974,
"grad_norm": 0.7185545563697815,
"learning_rate": 8.970964538777957e-05,
"loss": 3.0628,
"step": 866
},
{
"epoch": 0.4970295612339847,
"grad_norm": 0.7242484092712402,
"learning_rate": 8.965205579206483e-05,
"loss": 3.0603,
"step": 868
},
{
"epoch": 0.4981747906377496,
"grad_norm": 0.7996972799301147,
"learning_rate": 8.959432409087504e-05,
"loss": 3.2346,
"step": 870
},
{
"epoch": 0.49932002004151455,
"grad_norm": 0.6038782000541687,
"learning_rate": 8.953645049110971e-05,
"loss": 3.0751,
"step": 872
},
{
"epoch": 0.5004652494452795,
"grad_norm": 0.7712786197662354,
"learning_rate": 8.94784352001769e-05,
"loss": 3.1086,
"step": 874
},
{
"epoch": 0.5016104788490444,
"grad_norm": 0.6952617168426514,
"learning_rate": 8.94202784259924e-05,
"loss": 3.13,
"step": 876
},
{
"epoch": 0.5027557082528094,
"grad_norm": 0.7420851588249207,
"learning_rate": 8.936198037697916e-05,
"loss": 3.1094,
"step": 878
},
{
"epoch": 0.5039009376565743,
"grad_norm": 0.6883806586265564,
"learning_rate": 8.930354126206634e-05,
"loss": 3.0722,
"step": 880
},
{
"epoch": 0.5050461670603392,
"grad_norm": 0.7546491026878357,
"learning_rate": 8.92449612906887e-05,
"loss": 3.1571,
"step": 882
},
{
"epoch": 0.5061913964641043,
"grad_norm": 0.7471094727516174,
"learning_rate": 8.918624067278576e-05,
"loss": 3.1842,
"step": 884
},
{
"epoch": 0.5073366258678692,
"grad_norm": 0.8344042897224426,
"learning_rate": 8.912737961880116e-05,
"loss": 3.1709,
"step": 886
},
{
"epoch": 0.5084818552716341,
"grad_norm": 0.6555135250091553,
"learning_rate": 8.906837833968174e-05,
"loss": 3.1777,
"step": 888
},
{
"epoch": 0.5096270846753991,
"grad_norm": 0.799281120300293,
"learning_rate": 8.900923704687697e-05,
"loss": 3.176,
"step": 890
},
{
"epoch": 0.510772314079164,
"grad_norm": 0.8266319632530212,
"learning_rate": 8.894995595233809e-05,
"loss": 3.1353,
"step": 892
},
{
"epoch": 0.511917543482929,
"grad_norm": 0.7263309955596924,
"learning_rate": 8.889053526851729e-05,
"loss": 3.0824,
"step": 894
},
{
"epoch": 0.5130627728866939,
"grad_norm": 0.7665941119194031,
"learning_rate": 8.88309752083671e-05,
"loss": 3.1808,
"step": 896
},
{
"epoch": 0.5142080022904588,
"grad_norm": 0.7014003396034241,
"learning_rate": 8.877127598533952e-05,
"loss": 3.1158,
"step": 898
},
{
"epoch": 0.5153532316942238,
"grad_norm": 0.6320556998252869,
"learning_rate": 8.871143781338529e-05,
"loss": 3.1276,
"step": 900
}
],
"logging_steps": 2,
"max_steps": 3494,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0132651008589824e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}