{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.19173923464088838, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00031956539106814733, "grad_norm": 5.807275295257568, "learning_rate": 0.0, "loss": 5.0454, "step": 1 }, { "epoch": 0.0006391307821362947, "grad_norm": 6.20149564743042, "learning_rate": 6.369426751592357e-07, "loss": 5.1424, "step": 2 }, { "epoch": 0.0012782615642725893, "grad_norm": 5.7567291259765625, "learning_rate": 1.910828025477707e-06, "loss": 5.0835, "step": 4 }, { "epoch": 0.001917392346408884, "grad_norm": 5.017141819000244, "learning_rate": 3.1847133757961785e-06, "loss": 5.0733, "step": 6 }, { "epoch": 0.0025565231285451786, "grad_norm": 3.2059810161590576, "learning_rate": 4.45859872611465e-06, "loss": 5.0714, "step": 8 }, { "epoch": 0.003195653910681473, "grad_norm": 6.303244113922119, "learning_rate": 5.732484076433121e-06, "loss": 4.9915, "step": 10 }, { "epoch": 0.003834784692817768, "grad_norm": 4.852840423583984, "learning_rate": 7.006369426751593e-06, "loss": 4.9307, "step": 12 }, { "epoch": 0.004473915474954062, "grad_norm": 3.78067946434021, "learning_rate": 8.280254777070064e-06, "loss": 4.8924, "step": 14 }, { "epoch": 0.005113046257090357, "grad_norm": 3.5641331672668457, "learning_rate": 9.554140127388536e-06, "loss": 4.8244, "step": 16 }, { "epoch": 0.005752177039226652, "grad_norm": 2.191957712173462, "learning_rate": 1.0828025477707008e-05, "loss": 4.6179, "step": 18 }, { "epoch": 0.006391307821362946, "grad_norm": 2.0675458908081055, "learning_rate": 1.2101910828025478e-05, "loss": 4.5827, "step": 20 }, { "epoch": 0.007030438603499241, "grad_norm": 1.6559146642684937, "learning_rate": 1.337579617834395e-05, "loss": 4.4544, "step": 22 }, { "epoch": 0.007669569385635536, "grad_norm": 1.3731284141540527, "learning_rate": 1.464968152866242e-05, "loss": 4.3748, "step": 24 }, { "epoch": 0.00830870016777183, "grad_norm": 1.3962030410766602, "learning_rate": 1.592356687898089e-05, "loss": 4.3269, "step": 26 }, { "epoch": 0.008947830949908125, "grad_norm": 1.2659896612167358, "learning_rate": 1.7197452229299362e-05, "loss": 4.2133, "step": 28 }, { "epoch": 0.00958696173204442, "grad_norm": 0.9881806373596191, "learning_rate": 1.8471337579617834e-05, "loss": 4.0961, "step": 30 }, { "epoch": 0.010226092514180714, "grad_norm": 0.9945515394210815, "learning_rate": 1.974522292993631e-05, "loss": 4.0158, "step": 32 }, { "epoch": 0.01086522329631701, "grad_norm": 0.9396588802337646, "learning_rate": 2.1019108280254778e-05, "loss": 3.8763, "step": 34 }, { "epoch": 0.011504354078453304, "grad_norm": 1.0665779113769531, "learning_rate": 2.229299363057325e-05, "loss": 3.8635, "step": 36 }, { "epoch": 0.012143484860589597, "grad_norm": 1.077245831489563, "learning_rate": 2.356687898089172e-05, "loss": 3.802, "step": 38 }, { "epoch": 0.012782615642725892, "grad_norm": 0.8040191531181335, "learning_rate": 2.4840764331210193e-05, "loss": 3.7284, "step": 40 }, { "epoch": 0.013421746424862187, "grad_norm": 1.4325759410858154, "learning_rate": 2.6114649681528662e-05, "loss": 3.665, "step": 42 }, { "epoch": 0.014060877206998482, "grad_norm": 1.3450332880020142, "learning_rate": 2.7388535031847134e-05, "loss": 3.6242, "step": 44 }, { "epoch": 0.014700007989134777, "grad_norm": 0.8203895688056946, "learning_rate": 2.8662420382165606e-05, "loss": 3.5576, "step": 46 }, { "epoch": 0.015339138771271072, "grad_norm": 1.1661335229873657, "learning_rate": 2.9936305732484078e-05, "loss": 3.522, "step": 48 }, { "epoch": 0.015978269553407365, "grad_norm": 1.0148671865463257, "learning_rate": 3.121019108280255e-05, "loss": 3.4594, "step": 50 }, { "epoch": 0.01661740033554366, "grad_norm": 0.6624857187271118, "learning_rate": 3.248407643312102e-05, "loss": 3.465, "step": 52 }, { "epoch": 0.017256531117679955, "grad_norm": 0.943125307559967, "learning_rate": 3.375796178343949e-05, "loss": 3.4021, "step": 54 }, { "epoch": 0.01789566189981625, "grad_norm": 0.9854550957679749, "learning_rate": 3.503184713375796e-05, "loss": 3.3361, "step": 56 }, { "epoch": 0.018534792681952544, "grad_norm": 1.2242411375045776, "learning_rate": 3.630573248407643e-05, "loss": 3.3283, "step": 58 }, { "epoch": 0.01917392346408884, "grad_norm": 0.9556372761726379, "learning_rate": 3.7579617834394906e-05, "loss": 3.2914, "step": 60 }, { "epoch": 0.019813054246225134, "grad_norm": 1.3133809566497803, "learning_rate": 3.885350318471338e-05, "loss": 3.3126, "step": 62 }, { "epoch": 0.02045218502836143, "grad_norm": 0.9322234392166138, "learning_rate": 4.012738853503185e-05, "loss": 3.2443, "step": 64 }, { "epoch": 0.021091315810497724, "grad_norm": 1.4383481740951538, "learning_rate": 4.1401273885350325e-05, "loss": 3.2428, "step": 66 }, { "epoch": 0.02173044659263402, "grad_norm": 1.0156841278076172, "learning_rate": 4.267515923566879e-05, "loss": 3.1735, "step": 68 }, { "epoch": 0.022369577374770314, "grad_norm": 1.1754450798034668, "learning_rate": 4.394904458598726e-05, "loss": 3.1788, "step": 70 }, { "epoch": 0.02300870815690661, "grad_norm": 1.0960084199905396, "learning_rate": 4.522292993630574e-05, "loss": 3.1963, "step": 72 }, { "epoch": 0.023647838939042903, "grad_norm": 1.054401159286499, "learning_rate": 4.6496815286624206e-05, "loss": 3.1604, "step": 74 }, { "epoch": 0.024286969721179195, "grad_norm": 1.1957581043243408, "learning_rate": 4.777070063694268e-05, "loss": 3.1648, "step": 76 }, { "epoch": 0.02492610050331549, "grad_norm": 0.7756203413009644, "learning_rate": 4.904458598726115e-05, "loss": 3.1066, "step": 78 }, { "epoch": 0.025565231285451784, "grad_norm": 1.0459190607070923, "learning_rate": 5.031847133757962e-05, "loss": 3.1571, "step": 80 }, { "epoch": 0.02620436206758808, "grad_norm": 0.9746761322021484, "learning_rate": 5.159235668789809e-05, "loss": 3.1026, "step": 82 }, { "epoch": 0.026843492849724374, "grad_norm": 1.0770882368087769, "learning_rate": 5.286624203821656e-05, "loss": 3.1125, "step": 84 }, { "epoch": 0.02748262363186067, "grad_norm": 0.9542138576507568, "learning_rate": 5.414012738853504e-05, "loss": 3.059, "step": 86 }, { "epoch": 0.028121754413996964, "grad_norm": 1.3454134464263916, "learning_rate": 5.5414012738853505e-05, "loss": 3.0645, "step": 88 }, { "epoch": 0.02876088519613326, "grad_norm": 1.0354089736938477, "learning_rate": 5.6687898089171974e-05, "loss": 3.04, "step": 90 }, { "epoch": 0.029400015978269554, "grad_norm": 1.1339548826217651, "learning_rate": 5.796178343949045e-05, "loss": 3.0625, "step": 92 }, { "epoch": 0.03003914676040585, "grad_norm": 1.200062870979309, "learning_rate": 5.923566878980892e-05, "loss": 3.057, "step": 94 }, { "epoch": 0.030678277542542143, "grad_norm": 1.395698070526123, "learning_rate": 6.0509554140127386e-05, "loss": 3.0341, "step": 96 }, { "epoch": 0.031317408324678435, "grad_norm": 0.9392653703689575, "learning_rate": 6.178343949044585e-05, "loss": 3.0087, "step": 98 }, { "epoch": 0.03195653910681473, "grad_norm": 1.1301568746566772, "learning_rate": 6.305732484076433e-05, "loss": 3.0294, "step": 100 }, { "epoch": 0.032595669888951025, "grad_norm": 0.9571443796157837, "learning_rate": 6.43312101910828e-05, "loss": 3.0522, "step": 102 }, { "epoch": 0.03323480067108732, "grad_norm": 0.9494081735610962, "learning_rate": 6.560509554140127e-05, "loss": 3.0012, "step": 104 }, { "epoch": 0.033873931453223614, "grad_norm": 1.3672889471054077, "learning_rate": 6.687898089171974e-05, "loss": 3.0188, "step": 106 }, { "epoch": 0.03451306223535991, "grad_norm": 1.2122056484222412, "learning_rate": 6.815286624203822e-05, "loss": 2.9497, "step": 108 }, { "epoch": 0.035152193017496204, "grad_norm": 1.2184698581695557, "learning_rate": 6.942675159235669e-05, "loss": 2.9739, "step": 110 }, { "epoch": 0.0357913237996325, "grad_norm": 1.09404456615448, "learning_rate": 7.070063694267515e-05, "loss": 3.0241, "step": 112 }, { "epoch": 0.036430454581768794, "grad_norm": 1.1653715372085571, "learning_rate": 7.197452229299363e-05, "loss": 2.9606, "step": 114 }, { "epoch": 0.03706958536390509, "grad_norm": 1.050194501876831, "learning_rate": 7.32484076433121e-05, "loss": 2.9582, "step": 116 }, { "epoch": 0.037708716146041384, "grad_norm": 1.1262322664260864, "learning_rate": 7.452229299363057e-05, "loss": 2.9462, "step": 118 }, { "epoch": 0.03834784692817768, "grad_norm": 1.1232227087020874, "learning_rate": 7.579617834394906e-05, "loss": 2.9784, "step": 120 }, { "epoch": 0.03898697771031397, "grad_norm": 0.9088072776794434, "learning_rate": 7.707006369426753e-05, "loss": 2.944, "step": 122 }, { "epoch": 0.03962610849245027, "grad_norm": 0.8985419869422913, "learning_rate": 7.834394904458599e-05, "loss": 2.9003, "step": 124 }, { "epoch": 0.04026523927458656, "grad_norm": 1.2419854402542114, "learning_rate": 7.961783439490447e-05, "loss": 2.9753, "step": 126 }, { "epoch": 0.04090437005672286, "grad_norm": 1.4533154964447021, "learning_rate": 8.089171974522294e-05, "loss": 2.9069, "step": 128 }, { "epoch": 0.04154350083885915, "grad_norm": 1.475258231163025, "learning_rate": 8.21656050955414e-05, "loss": 2.9402, "step": 130 }, { "epoch": 0.04218263162099545, "grad_norm": 1.0348827838897705, "learning_rate": 8.343949044585988e-05, "loss": 2.9295, "step": 132 }, { "epoch": 0.04282176240313174, "grad_norm": 0.9143719673156738, "learning_rate": 8.471337579617836e-05, "loss": 2.9408, "step": 134 }, { "epoch": 0.04346089318526804, "grad_norm": 1.1310492753982544, "learning_rate": 8.598726114649682e-05, "loss": 2.875, "step": 136 }, { "epoch": 0.04410002396740433, "grad_norm": 1.0483386516571045, "learning_rate": 8.726114649681529e-05, "loss": 2.9142, "step": 138 }, { "epoch": 0.04473915474954063, "grad_norm": 0.921519935131073, "learning_rate": 8.853503184713377e-05, "loss": 2.9188, "step": 140 }, { "epoch": 0.04537828553167692, "grad_norm": 1.3271907567977905, "learning_rate": 8.980891719745223e-05, "loss": 2.9075, "step": 142 }, { "epoch": 0.04601741631381322, "grad_norm": 1.7488983869552612, "learning_rate": 9.10828025477707e-05, "loss": 2.9201, "step": 144 }, { "epoch": 0.04665654709594951, "grad_norm": 1.4263213872909546, "learning_rate": 9.235668789808918e-05, "loss": 2.9045, "step": 146 }, { "epoch": 0.04729567787808581, "grad_norm": 0.8777288794517517, "learning_rate": 9.363057324840766e-05, "loss": 2.8959, "step": 148 }, { "epoch": 0.047934808660222095, "grad_norm": 1.3402196168899536, "learning_rate": 9.490445859872612e-05, "loss": 2.8893, "step": 150 }, { "epoch": 0.04857393944235839, "grad_norm": 1.0943351984024048, "learning_rate": 9.617834394904459e-05, "loss": 2.9137, "step": 152 }, { "epoch": 0.049213070224494684, "grad_norm": 1.0603907108306885, "learning_rate": 9.745222929936307e-05, "loss": 2.8677, "step": 154 }, { "epoch": 0.04985220100663098, "grad_norm": 1.010772705078125, "learning_rate": 9.872611464968153e-05, "loss": 2.8374, "step": 156 }, { "epoch": 0.050491331788767274, "grad_norm": 1.2628934383392334, "learning_rate": 0.0001, "loss": 2.9009, "step": 158 }, { "epoch": 0.05113046257090357, "grad_norm": 1.146183729171753, "learning_rate": 9.999988833687822e-05, "loss": 2.8633, "step": 160 }, { "epoch": 0.051769593353039864, "grad_norm": 0.8704808354377747, "learning_rate": 9.99995533480116e-05, "loss": 2.8464, "step": 162 }, { "epoch": 0.05240872413517616, "grad_norm": 1.044418454170227, "learning_rate": 9.999899503489641e-05, "loss": 2.8695, "step": 164 }, { "epoch": 0.053047854917312454, "grad_norm": 0.833791196346283, "learning_rate": 9.999821340002636e-05, "loss": 2.8605, "step": 166 }, { "epoch": 0.05368698569944875, "grad_norm": 0.922815203666687, "learning_rate": 9.99972084468926e-05, "loss": 2.8737, "step": 168 }, { "epoch": 0.05432611648158504, "grad_norm": 0.9120809435844421, "learning_rate": 9.999598017998384e-05, "loss": 2.8753, "step": 170 }, { "epoch": 0.05496524726372134, "grad_norm": 1.0272431373596191, "learning_rate": 9.999452860478611e-05, "loss": 2.8907, "step": 172 }, { "epoch": 0.05560437804585763, "grad_norm": 0.7777165174484253, "learning_rate": 9.999285372778295e-05, "loss": 2.8517, "step": 174 }, { "epoch": 0.05624350882799393, "grad_norm": 0.7110999822616577, "learning_rate": 9.999095555645523e-05, "loss": 2.8211, "step": 176 }, { "epoch": 0.05688263961013022, "grad_norm": 0.7857067584991455, "learning_rate": 9.998883409928117e-05, "loss": 2.8463, "step": 178 }, { "epoch": 0.05752177039226652, "grad_norm": 0.8582798838615417, "learning_rate": 9.998648936573629e-05, "loss": 2.8197, "step": 180 }, { "epoch": 0.05816090117440281, "grad_norm": 0.9790541529655457, "learning_rate": 9.998392136629345e-05, "loss": 2.8193, "step": 182 }, { "epoch": 0.05880003195653911, "grad_norm": 1.1599719524383545, "learning_rate": 9.998113011242264e-05, "loss": 2.8206, "step": 184 }, { "epoch": 0.0594391627386754, "grad_norm": 0.8326631188392639, "learning_rate": 9.99781156165911e-05, "loss": 2.8349, "step": 186 }, { "epoch": 0.0600782935208117, "grad_norm": 0.8876377940177917, "learning_rate": 9.997487789226312e-05, "loss": 2.8225, "step": 188 }, { "epoch": 0.06071742430294799, "grad_norm": 0.9899202585220337, "learning_rate": 9.997141695390009e-05, "loss": 2.7875, "step": 190 }, { "epoch": 0.06135655508508429, "grad_norm": 1.0686557292938232, "learning_rate": 9.996773281696037e-05, "loss": 2.8024, "step": 192 }, { "epoch": 0.06199568586722058, "grad_norm": 0.8899752497673035, "learning_rate": 9.996382549789926e-05, "loss": 2.8225, "step": 194 }, { "epoch": 0.06263481664935687, "grad_norm": 0.7781797647476196, "learning_rate": 9.995969501416891e-05, "loss": 2.8046, "step": 196 }, { "epoch": 0.06327394743149317, "grad_norm": 0.6428512930870056, "learning_rate": 9.995534138421818e-05, "loss": 2.7693, "step": 198 }, { "epoch": 0.06391307821362946, "grad_norm": 0.7047809958457947, "learning_rate": 9.995076462749273e-05, "loss": 2.766, "step": 200 }, { "epoch": 0.06455220899576576, "grad_norm": 0.6256312131881714, "learning_rate": 9.99459647644347e-05, "loss": 2.8071, "step": 202 }, { "epoch": 0.06519133977790205, "grad_norm": 0.699400007724762, "learning_rate": 9.994094181648283e-05, "loss": 2.8347, "step": 204 }, { "epoch": 0.06583047056003835, "grad_norm": 0.7256817817687988, "learning_rate": 9.993569580607225e-05, "loss": 2.8074, "step": 206 }, { "epoch": 0.06646960134217464, "grad_norm": 0.573846161365509, "learning_rate": 9.993022675663437e-05, "loss": 2.7413, "step": 208 }, { "epoch": 0.06710873212431094, "grad_norm": 0.7314406037330627, "learning_rate": 9.992453469259685e-05, "loss": 2.7983, "step": 210 }, { "epoch": 0.06774786290644723, "grad_norm": 0.7307546734809875, "learning_rate": 9.991861963938342e-05, "loss": 2.8026, "step": 212 }, { "epoch": 0.06838699368858353, "grad_norm": 0.6367102861404419, "learning_rate": 9.991248162341384e-05, "loss": 2.7424, "step": 214 }, { "epoch": 0.06902612447071982, "grad_norm": 0.8630378246307373, "learning_rate": 9.99061206721037e-05, "loss": 2.7395, "step": 216 }, { "epoch": 0.06966525525285612, "grad_norm": 0.7586290240287781, "learning_rate": 9.989953681386433e-05, "loss": 2.7624, "step": 218 }, { "epoch": 0.07030438603499241, "grad_norm": 0.7091168761253357, "learning_rate": 9.989273007810271e-05, "loss": 2.7719, "step": 220 }, { "epoch": 0.07094351681712871, "grad_norm": 0.684183657169342, "learning_rate": 9.98857004952213e-05, "loss": 2.7806, "step": 222 }, { "epoch": 0.071582647599265, "grad_norm": 0.920498788356781, "learning_rate": 9.987844809661791e-05, "loss": 2.7626, "step": 224 }, { "epoch": 0.0722217783814013, "grad_norm": 0.730060875415802, "learning_rate": 9.987097291468552e-05, "loss": 2.8107, "step": 226 }, { "epoch": 0.07286090916353759, "grad_norm": 0.8606828451156616, "learning_rate": 9.986327498281227e-05, "loss": 2.7814, "step": 228 }, { "epoch": 0.07350003994567389, "grad_norm": 0.8068298101425171, "learning_rate": 9.985535433538113e-05, "loss": 2.7775, "step": 230 }, { "epoch": 0.07413917072781018, "grad_norm": 0.6887542009353638, "learning_rate": 9.984721100776989e-05, "loss": 2.784, "step": 232 }, { "epoch": 0.07477830150994648, "grad_norm": 0.84773850440979, "learning_rate": 9.98388450363509e-05, "loss": 2.7333, "step": 234 }, { "epoch": 0.07541743229208277, "grad_norm": 0.7914923429489136, "learning_rate": 9.9830256458491e-05, "loss": 2.7363, "step": 236 }, { "epoch": 0.07605656307421906, "grad_norm": 0.8284217715263367, "learning_rate": 9.982144531255127e-05, "loss": 2.7389, "step": 238 }, { "epoch": 0.07669569385635536, "grad_norm": 0.7706480622291565, "learning_rate": 9.981241163788694e-05, "loss": 2.7377, "step": 240 }, { "epoch": 0.07733482463849164, "grad_norm": 0.6147120594978333, "learning_rate": 9.980315547484711e-05, "loss": 2.7862, "step": 242 }, { "epoch": 0.07797395542062795, "grad_norm": 0.6364494562149048, "learning_rate": 9.979367686477469e-05, "loss": 2.762, "step": 244 }, { "epoch": 0.07861308620276423, "grad_norm": 0.6944818496704102, "learning_rate": 9.978397585000611e-05, "loss": 2.7624, "step": 246 }, { "epoch": 0.07925221698490054, "grad_norm": 1.2648204565048218, "learning_rate": 9.977405247387119e-05, "loss": 2.7544, "step": 248 }, { "epoch": 0.07989134776703682, "grad_norm": 1.0054659843444824, "learning_rate": 9.976390678069295e-05, "loss": 2.7523, "step": 250 }, { "epoch": 0.08053047854917313, "grad_norm": 0.715492308139801, "learning_rate": 9.975353881578738e-05, "loss": 2.7341, "step": 252 }, { "epoch": 0.08116960933130941, "grad_norm": 0.7963582277297974, "learning_rate": 9.974294862546325e-05, "loss": 2.7484, "step": 254 }, { "epoch": 0.08180874011344572, "grad_norm": 0.7069251537322998, "learning_rate": 9.97321362570219e-05, "loss": 2.7719, "step": 256 }, { "epoch": 0.082447870895582, "grad_norm": 0.5716209411621094, "learning_rate": 9.972110175875706e-05, "loss": 2.8079, "step": 258 }, { "epoch": 0.0830870016777183, "grad_norm": 0.65562903881073, "learning_rate": 9.970984517995456e-05, "loss": 2.7642, "step": 260 }, { "epoch": 0.0837261324598546, "grad_norm": 0.647085964679718, "learning_rate": 9.969836657089225e-05, "loss": 2.7139, "step": 262 }, { "epoch": 0.0843652632419909, "grad_norm": 0.6401609778404236, "learning_rate": 9.968666598283955e-05, "loss": 2.7278, "step": 264 }, { "epoch": 0.08500439402412718, "grad_norm": 0.5514021515846252, "learning_rate": 9.967474346805746e-05, "loss": 2.7332, "step": 266 }, { "epoch": 0.08564352480626349, "grad_norm": 0.5908826589584351, "learning_rate": 9.96625990797982e-05, "loss": 2.741, "step": 268 }, { "epoch": 0.08628265558839977, "grad_norm": 0.5510653853416443, "learning_rate": 9.965023287230497e-05, "loss": 2.7025, "step": 270 }, { "epoch": 0.08692178637053607, "grad_norm": 0.5656317472457886, "learning_rate": 9.963764490081176e-05, "loss": 2.7184, "step": 272 }, { "epoch": 0.08756091715267236, "grad_norm": 0.5132441520690918, "learning_rate": 9.962483522154302e-05, "loss": 2.7632, "step": 274 }, { "epoch": 0.08820004793480866, "grad_norm": 0.6730588674545288, "learning_rate": 9.961180389171352e-05, "loss": 2.7705, "step": 276 }, { "epoch": 0.08883917871694495, "grad_norm": 0.5657472610473633, "learning_rate": 9.959855096952804e-05, "loss": 2.7191, "step": 278 }, { "epoch": 0.08947830949908125, "grad_norm": 0.8265955448150635, "learning_rate": 9.958507651418106e-05, "loss": 2.7718, "step": 280 }, { "epoch": 0.09011744028121754, "grad_norm": 0.8996996879577637, "learning_rate": 9.957138058585658e-05, "loss": 2.7124, "step": 282 }, { "epoch": 0.09075657106335384, "grad_norm": 0.6458889842033386, "learning_rate": 9.955746324572781e-05, "loss": 2.7403, "step": 284 }, { "epoch": 0.09139570184549013, "grad_norm": 0.7175470590591431, "learning_rate": 9.954332455595689e-05, "loss": 2.7188, "step": 286 }, { "epoch": 0.09203483262762643, "grad_norm": 0.6640183329582214, "learning_rate": 9.952896457969463e-05, "loss": 2.7223, "step": 288 }, { "epoch": 0.09267396340976272, "grad_norm": 0.6551202535629272, "learning_rate": 9.951438338108022e-05, "loss": 2.7189, "step": 290 }, { "epoch": 0.09331309419189902, "grad_norm": 0.6980673670768738, "learning_rate": 9.949958102524093e-05, "loss": 2.7183, "step": 292 }, { "epoch": 0.09395222497403531, "grad_norm": 0.5926324129104614, "learning_rate": 9.948455757829187e-05, "loss": 2.7476, "step": 294 }, { "epoch": 0.09459135575617161, "grad_norm": 0.5434746742248535, "learning_rate": 9.946931310733565e-05, "loss": 2.7368, "step": 296 }, { "epoch": 0.0952304865383079, "grad_norm": 0.6466372609138489, "learning_rate": 9.945384768046206e-05, "loss": 2.7307, "step": 298 }, { "epoch": 0.09586961732044419, "grad_norm": 0.6376985311508179, "learning_rate": 9.943816136674782e-05, "loss": 2.7239, "step": 300 }, { "epoch": 0.09650874810258049, "grad_norm": 0.6092653274536133, "learning_rate": 9.942225423625624e-05, "loss": 2.7678, "step": 302 }, { "epoch": 0.09714787888471678, "grad_norm": 0.7219493389129639, "learning_rate": 9.94061263600369e-05, "loss": 2.723, "step": 304 }, { "epoch": 0.09778700966685308, "grad_norm": 0.5244786143302917, "learning_rate": 9.93897778101254e-05, "loss": 2.7329, "step": 306 }, { "epoch": 0.09842614044898937, "grad_norm": 0.5384829044342041, "learning_rate": 9.937320865954289e-05, "loss": 2.661, "step": 308 }, { "epoch": 0.09906527123112567, "grad_norm": 0.624033510684967, "learning_rate": 9.935641898229594e-05, "loss": 2.7177, "step": 310 }, { "epoch": 0.09970440201326196, "grad_norm": 0.6381804347038269, "learning_rate": 9.933940885337602e-05, "loss": 2.7616, "step": 312 }, { "epoch": 0.10034353279539826, "grad_norm": 0.7671799659729004, "learning_rate": 9.932217834875934e-05, "loss": 2.7256, "step": 314 }, { "epoch": 0.10098266357753455, "grad_norm": 0.5695899128913879, "learning_rate": 9.930472754540634e-05, "loss": 2.6975, "step": 316 }, { "epoch": 0.10162179435967085, "grad_norm": 0.6461712121963501, "learning_rate": 9.92870565212615e-05, "loss": 2.7121, "step": 318 }, { "epoch": 0.10226092514180714, "grad_norm": 0.6111094355583191, "learning_rate": 9.926916535525283e-05, "loss": 2.6964, "step": 320 }, { "epoch": 0.10290005592394344, "grad_norm": 0.6368963718414307, "learning_rate": 9.925105412729175e-05, "loss": 2.6793, "step": 322 }, { "epoch": 0.10353918670607973, "grad_norm": 0.6973994374275208, "learning_rate": 9.923272291827245e-05, "loss": 2.6862, "step": 324 }, { "epoch": 0.10417831748821603, "grad_norm": 0.6717987656593323, "learning_rate": 9.921417181007175e-05, "loss": 2.686, "step": 326 }, { "epoch": 0.10481744827035232, "grad_norm": 0.6282898783683777, "learning_rate": 9.919540088554862e-05, "loss": 2.6807, "step": 328 }, { "epoch": 0.10545657905248862, "grad_norm": 0.6404539942741394, "learning_rate": 9.91764102285439e-05, "loss": 2.659, "step": 330 }, { "epoch": 0.10609570983462491, "grad_norm": 0.679418683052063, "learning_rate": 9.915719992387979e-05, "loss": 2.662, "step": 332 }, { "epoch": 0.10673484061676121, "grad_norm": 0.7185142040252686, "learning_rate": 9.913777005735963e-05, "loss": 2.7208, "step": 334 }, { "epoch": 0.1073739713988975, "grad_norm": 0.5328919887542725, "learning_rate": 9.911812071576736e-05, "loss": 2.6428, "step": 336 }, { "epoch": 0.1080131021810338, "grad_norm": 0.6135143637657166, "learning_rate": 9.909825198686729e-05, "loss": 2.6543, "step": 338 }, { "epoch": 0.10865223296317009, "grad_norm": 0.6830089092254639, "learning_rate": 9.907816395940359e-05, "loss": 2.677, "step": 340 }, { "epoch": 0.10929136374530639, "grad_norm": 0.6469766497612, "learning_rate": 9.90578567230999e-05, "loss": 2.726, "step": 342 }, { "epoch": 0.10993049452744268, "grad_norm": 0.5899373888969421, "learning_rate": 9.903733036865903e-05, "loss": 2.7208, "step": 344 }, { "epoch": 0.11056962530957898, "grad_norm": 0.82301926612854, "learning_rate": 9.901658498776246e-05, "loss": 2.6925, "step": 346 }, { "epoch": 0.11120875609171527, "grad_norm": 0.8507819771766663, "learning_rate": 9.899562067306989e-05, "loss": 2.6905, "step": 348 }, { "epoch": 0.11184788687385157, "grad_norm": 0.6785141229629517, "learning_rate": 9.897443751821902e-05, "loss": 2.6643, "step": 350 }, { "epoch": 0.11248701765598786, "grad_norm": 0.6389050483703613, "learning_rate": 9.89530356178249e-05, "loss": 2.6769, "step": 352 }, { "epoch": 0.11312614843812416, "grad_norm": 0.5903960466384888, "learning_rate": 9.893141506747967e-05, "loss": 2.6793, "step": 354 }, { "epoch": 0.11376527922026045, "grad_norm": 0.583307147026062, "learning_rate": 9.890957596375206e-05, "loss": 2.676, "step": 356 }, { "epoch": 0.11440441000239673, "grad_norm": 0.6372009515762329, "learning_rate": 9.888751840418695e-05, "loss": 2.6567, "step": 358 }, { "epoch": 0.11504354078453304, "grad_norm": 0.7056903839111328, "learning_rate": 9.886524248730497e-05, "loss": 2.6973, "step": 360 }, { "epoch": 0.11568267156666932, "grad_norm": 0.5459578633308411, "learning_rate": 9.88427483126021e-05, "loss": 2.6522, "step": 362 }, { "epoch": 0.11632180234880563, "grad_norm": 0.5186561346054077, "learning_rate": 9.882003598054907e-05, "loss": 2.6567, "step": 364 }, { "epoch": 0.11696093313094191, "grad_norm": 0.5469943881034851, "learning_rate": 9.879710559259114e-05, "loss": 2.6586, "step": 366 }, { "epoch": 0.11760006391307821, "grad_norm": 0.6790450215339661, "learning_rate": 9.877395725114742e-05, "loss": 2.6874, "step": 368 }, { "epoch": 0.1182391946952145, "grad_norm": 0.624920129776001, "learning_rate": 9.875059105961056e-05, "loss": 2.6777, "step": 370 }, { "epoch": 0.1188783254773508, "grad_norm": 0.6039037704467773, "learning_rate": 9.872700712234624e-05, "loss": 2.6881, "step": 372 }, { "epoch": 0.11951745625948709, "grad_norm": 0.6653264760971069, "learning_rate": 9.87032055446927e-05, "loss": 2.6388, "step": 374 }, { "epoch": 0.1201565870416234, "grad_norm": 0.7718141078948975, "learning_rate": 9.867918643296025e-05, "loss": 2.6686, "step": 376 }, { "epoch": 0.12079571782375968, "grad_norm": 0.6357402801513672, "learning_rate": 9.865494989443092e-05, "loss": 2.6611, "step": 378 }, { "epoch": 0.12143484860589598, "grad_norm": 0.560418963432312, "learning_rate": 9.863049603735775e-05, "loss": 2.6944, "step": 380 }, { "epoch": 0.12207397938803227, "grad_norm": 0.5758490562438965, "learning_rate": 9.860582497096452e-05, "loss": 2.6589, "step": 382 }, { "epoch": 0.12271311017016857, "grad_norm": 0.6144497990608215, "learning_rate": 9.858093680544516e-05, "loss": 2.6839, "step": 384 }, { "epoch": 0.12335224095230486, "grad_norm": 0.5986223816871643, "learning_rate": 9.855583165196329e-05, "loss": 2.6778, "step": 386 }, { "epoch": 0.12399137173444116, "grad_norm": 0.5350797176361084, "learning_rate": 9.853050962265169e-05, "loss": 2.6539, "step": 388 }, { "epoch": 0.12463050251657745, "grad_norm": 0.5589949488639832, "learning_rate": 9.850497083061183e-05, "loss": 2.6536, "step": 390 }, { "epoch": 0.12526963329871374, "grad_norm": 0.5695136189460754, "learning_rate": 9.847921538991339e-05, "loss": 2.6615, "step": 392 }, { "epoch": 0.12590876408085006, "grad_norm": 0.5739374756813049, "learning_rate": 9.845324341559366e-05, "loss": 2.6883, "step": 394 }, { "epoch": 0.12654789486298634, "grad_norm": 0.528075098991394, "learning_rate": 9.84270550236571e-05, "loss": 2.6944, "step": 396 }, { "epoch": 0.12718702564512263, "grad_norm": 0.6400613188743591, "learning_rate": 9.840065033107483e-05, "loss": 2.6596, "step": 398 }, { "epoch": 0.12782615642725892, "grad_norm": 0.6734158992767334, "learning_rate": 9.837402945578406e-05, "loss": 2.6562, "step": 400 }, { "epoch": 0.12846528720939523, "grad_norm": 0.6197201013565063, "learning_rate": 9.834719251668761e-05, "loss": 2.6971, "step": 402 }, { "epoch": 0.12910441799153152, "grad_norm": 0.5766332745552063, "learning_rate": 9.832013963365332e-05, "loss": 2.6355, "step": 404 }, { "epoch": 0.1297435487736678, "grad_norm": 0.7926291823387146, "learning_rate": 9.829287092751357e-05, "loss": 2.6438, "step": 406 }, { "epoch": 0.1303826795558041, "grad_norm": 0.7527420520782471, "learning_rate": 9.826538652006469e-05, "loss": 2.6695, "step": 408 }, { "epoch": 0.13102181033794041, "grad_norm": 0.7154802083969116, "learning_rate": 9.823768653406652e-05, "loss": 2.6158, "step": 410 }, { "epoch": 0.1316609411200767, "grad_norm": 0.5435774326324463, "learning_rate": 9.820977109324169e-05, "loss": 2.6843, "step": 412 }, { "epoch": 0.132300071902213, "grad_norm": 0.5893809199333191, "learning_rate": 9.818164032227522e-05, "loss": 2.6607, "step": 414 }, { "epoch": 0.13293920268434928, "grad_norm": 0.5635148882865906, "learning_rate": 9.815329434681392e-05, "loss": 2.658, "step": 416 }, { "epoch": 0.13357833346648557, "grad_norm": 0.4904562830924988, "learning_rate": 9.812473329346578e-05, "loss": 2.6616, "step": 418 }, { "epoch": 0.13421746424862188, "grad_norm": 0.5800766944885254, "learning_rate": 9.809595728979945e-05, "loss": 2.6657, "step": 420 }, { "epoch": 0.13485659503075817, "grad_norm": 0.5110253691673279, "learning_rate": 9.806696646434367e-05, "loss": 2.6192, "step": 422 }, { "epoch": 0.13549572581289446, "grad_norm": 0.5567732453346252, "learning_rate": 9.803776094658668e-05, "loss": 2.6475, "step": 424 }, { "epoch": 0.13613485659503075, "grad_norm": 0.5255835056304932, "learning_rate": 9.800834086697566e-05, "loss": 2.6644, "step": 426 }, { "epoch": 0.13677398737716706, "grad_norm": 0.4851606786251068, "learning_rate": 9.797870635691613e-05, "loss": 2.6628, "step": 428 }, { "epoch": 0.13741311815930335, "grad_norm": 0.4904446005821228, "learning_rate": 9.794885754877135e-05, "loss": 2.6222, "step": 430 }, { "epoch": 0.13805224894143964, "grad_norm": 0.47077298164367676, "learning_rate": 9.791879457586178e-05, "loss": 2.5875, "step": 432 }, { "epoch": 0.13869137972357592, "grad_norm": 0.4484720528125763, "learning_rate": 9.788851757246443e-05, "loss": 2.6279, "step": 434 }, { "epoch": 0.13933051050571224, "grad_norm": 0.5684689283370972, "learning_rate": 9.785802667381227e-05, "loss": 2.6507, "step": 436 }, { "epoch": 0.13996964128784853, "grad_norm": 0.5868870615959167, "learning_rate": 9.78273220160937e-05, "loss": 2.6476, "step": 438 }, { "epoch": 0.14060877206998482, "grad_norm": 0.5244540572166443, "learning_rate": 9.77964037364518e-05, "loss": 2.6353, "step": 440 }, { "epoch": 0.1412479028521211, "grad_norm": 0.5107213258743286, "learning_rate": 9.776527197298386e-05, "loss": 2.6335, "step": 442 }, { "epoch": 0.14188703363425742, "grad_norm": 0.5410230159759521, "learning_rate": 9.773392686474065e-05, "loss": 2.6248, "step": 444 }, { "epoch": 0.1425261644163937, "grad_norm": 0.5540198683738708, "learning_rate": 9.770236855172587e-05, "loss": 2.6304, "step": 446 }, { "epoch": 0.14316529519853, "grad_norm": 0.6982893347740173, "learning_rate": 9.767059717489557e-05, "loss": 2.6285, "step": 448 }, { "epoch": 0.14380442598066628, "grad_norm": 0.7649112939834595, "learning_rate": 9.763861287615732e-05, "loss": 2.6863, "step": 450 }, { "epoch": 0.1444435567628026, "grad_norm": 0.5209079384803772, "learning_rate": 9.760641579836984e-05, "loss": 2.6262, "step": 452 }, { "epoch": 0.1450826875449389, "grad_norm": 0.5985437631607056, "learning_rate": 9.757400608534215e-05, "loss": 2.5451, "step": 454 }, { "epoch": 0.14572181832707518, "grad_norm": 0.6232045888900757, "learning_rate": 9.754138388183305e-05, "loss": 2.6142, "step": 456 }, { "epoch": 0.14636094910921146, "grad_norm": 0.7111669778823853, "learning_rate": 9.750854933355042e-05, "loss": 2.5868, "step": 458 }, { "epoch": 0.14700007989134778, "grad_norm": 0.6749933362007141, "learning_rate": 9.747550258715059e-05, "loss": 2.6233, "step": 460 }, { "epoch": 0.14763921067348407, "grad_norm": 0.5915788412094116, "learning_rate": 9.744224379023768e-05, "loss": 2.6233, "step": 462 }, { "epoch": 0.14827834145562035, "grad_norm": 0.6704515814781189, "learning_rate": 9.740877309136291e-05, "loss": 2.6432, "step": 464 }, { "epoch": 0.14891747223775664, "grad_norm": 0.6156161427497864, "learning_rate": 9.737509064002402e-05, "loss": 2.6436, "step": 466 }, { "epoch": 0.14955660301989296, "grad_norm": 0.49440738558769226, "learning_rate": 9.734119658666448e-05, "loss": 2.6488, "step": 468 }, { "epoch": 0.15019573380202925, "grad_norm": 0.6561670899391174, "learning_rate": 9.730709108267296e-05, "loss": 2.6191, "step": 470 }, { "epoch": 0.15083486458416553, "grad_norm": 0.6310847997665405, "learning_rate": 9.727277428038253e-05, "loss": 2.6055, "step": 472 }, { "epoch": 0.15147399536630182, "grad_norm": 0.5141007304191589, "learning_rate": 9.723824633307001e-05, "loss": 2.626, "step": 474 }, { "epoch": 0.1521131261484381, "grad_norm": 0.5299694538116455, "learning_rate": 9.720350739495538e-05, "loss": 2.6401, "step": 476 }, { "epoch": 0.15275225693057443, "grad_norm": 0.5702034831047058, "learning_rate": 9.716855762120097e-05, "loss": 2.6392, "step": 478 }, { "epoch": 0.15339138771271071, "grad_norm": 0.5058117508888245, "learning_rate": 9.713339716791076e-05, "loss": 2.5778, "step": 480 }, { "epoch": 0.154030518494847, "grad_norm": 0.6530377864837646, "learning_rate": 9.709802619212987e-05, "loss": 2.6359, "step": 482 }, { "epoch": 0.1546696492769833, "grad_norm": 0.6136478781700134, "learning_rate": 9.706244485184357e-05, "loss": 2.6117, "step": 484 }, { "epoch": 0.1553087800591196, "grad_norm": 0.5947436094284058, "learning_rate": 9.702665330597684e-05, "loss": 2.6148, "step": 486 }, { "epoch": 0.1559479108412559, "grad_norm": 0.6332894563674927, "learning_rate": 9.699065171439349e-05, "loss": 2.6251, "step": 488 }, { "epoch": 0.15658704162339218, "grad_norm": 0.5429502129554749, "learning_rate": 9.695444023789554e-05, "loss": 2.577, "step": 490 }, { "epoch": 0.15722617240552847, "grad_norm": 0.6252620220184326, "learning_rate": 9.691801903822244e-05, "loss": 2.6114, "step": 492 }, { "epoch": 0.15786530318766478, "grad_norm": 0.5587325692176819, "learning_rate": 9.68813882780504e-05, "loss": 2.632, "step": 494 }, { "epoch": 0.15850443396980107, "grad_norm": 0.5149174332618713, "learning_rate": 9.68445481209916e-05, "loss": 2.6394, "step": 496 }, { "epoch": 0.15914356475193736, "grad_norm": 0.5343561172485352, "learning_rate": 9.680749873159354e-05, "loss": 2.572, "step": 498 }, { "epoch": 0.15978269553407365, "grad_norm": 0.5082888603210449, "learning_rate": 9.677024027533821e-05, "loss": 2.5786, "step": 500 }, { "epoch": 0.16042182631620996, "grad_norm": 0.46739038825035095, "learning_rate": 9.673277291864145e-05, "loss": 2.5933, "step": 502 }, { "epoch": 0.16106095709834625, "grad_norm": 0.5262092351913452, "learning_rate": 9.669509682885216e-05, "loss": 2.6295, "step": 504 }, { "epoch": 0.16170008788048254, "grad_norm": 0.5002930760383606, "learning_rate": 9.66572121742515e-05, "loss": 2.6306, "step": 506 }, { "epoch": 0.16233921866261883, "grad_norm": 0.4859941601753235, "learning_rate": 9.661911912405222e-05, "loss": 2.5742, "step": 508 }, { "epoch": 0.16297834944475514, "grad_norm": 0.6142066717147827, "learning_rate": 9.65808178483979e-05, "loss": 2.61, "step": 510 }, { "epoch": 0.16361748022689143, "grad_norm": 0.6018419861793518, "learning_rate": 9.654230851836214e-05, "loss": 2.6158, "step": 512 }, { "epoch": 0.16425661100902772, "grad_norm": 0.5785476565361023, "learning_rate": 9.650359130594779e-05, "loss": 2.629, "step": 514 }, { "epoch": 0.164895741791164, "grad_norm": 0.5036047697067261, "learning_rate": 9.646466638408629e-05, "loss": 2.6087, "step": 516 }, { "epoch": 0.16553487257330032, "grad_norm": 0.5089232325553894, "learning_rate": 9.642553392663672e-05, "loss": 2.6299, "step": 518 }, { "epoch": 0.1661740033554366, "grad_norm": 0.5314218997955322, "learning_rate": 9.63861941083852e-05, "loss": 2.6152, "step": 520 }, { "epoch": 0.1668131341375729, "grad_norm": 0.6545165181159973, "learning_rate": 9.634664710504402e-05, "loss": 2.5711, "step": 522 }, { "epoch": 0.1674522649197092, "grad_norm": 0.7461646199226379, "learning_rate": 9.630689309325082e-05, "loss": 2.627, "step": 524 }, { "epoch": 0.1680913957018455, "grad_norm": 0.6585918068885803, "learning_rate": 9.626693225056794e-05, "loss": 2.6231, "step": 526 }, { "epoch": 0.1687305264839818, "grad_norm": 0.5888398289680481, "learning_rate": 9.62267647554814e-05, "loss": 2.6175, "step": 528 }, { "epoch": 0.16936965726611808, "grad_norm": 0.49957162141799927, "learning_rate": 9.618639078740037e-05, "loss": 2.5771, "step": 530 }, { "epoch": 0.17000878804825437, "grad_norm": 0.4573955535888672, "learning_rate": 9.614581052665616e-05, "loss": 2.5855, "step": 532 }, { "epoch": 0.17064791883039068, "grad_norm": 0.5360051393508911, "learning_rate": 9.610502415450153e-05, "loss": 2.6107, "step": 534 }, { "epoch": 0.17128704961252697, "grad_norm": 0.5413601994514465, "learning_rate": 9.606403185310981e-05, "loss": 2.5971, "step": 536 }, { "epoch": 0.17192618039466326, "grad_norm": 0.5360136032104492, "learning_rate": 9.602283380557416e-05, "loss": 2.5878, "step": 538 }, { "epoch": 0.17256531117679955, "grad_norm": 0.653225839138031, "learning_rate": 9.598143019590664e-05, "loss": 2.6, "step": 540 }, { "epoch": 0.17320444195893583, "grad_norm": 0.5268750786781311, "learning_rate": 9.593982120903754e-05, "loss": 2.5992, "step": 542 }, { "epoch": 0.17384357274107215, "grad_norm": 0.5311806797981262, "learning_rate": 9.589800703081442e-05, "loss": 2.5939, "step": 544 }, { "epoch": 0.17448270352320844, "grad_norm": 0.47583094239234924, "learning_rate": 9.585598784800135e-05, "loss": 2.5863, "step": 546 }, { "epoch": 0.17512183430534473, "grad_norm": 0.44130444526672363, "learning_rate": 9.581376384827804e-05, "loss": 2.5568, "step": 548 }, { "epoch": 0.175760965087481, "grad_norm": 0.45064234733581543, "learning_rate": 9.577133522023906e-05, "loss": 2.5888, "step": 550 }, { "epoch": 0.17640009586961733, "grad_norm": 0.4643968343734741, "learning_rate": 9.572870215339294e-05, "loss": 2.6121, "step": 552 }, { "epoch": 0.17703922665175362, "grad_norm": 0.446347713470459, "learning_rate": 9.568586483816129e-05, "loss": 2.614, "step": 554 }, { "epoch": 0.1776783574338899, "grad_norm": 0.48379895091056824, "learning_rate": 9.564282346587809e-05, "loss": 2.6353, "step": 556 }, { "epoch": 0.1783174882160262, "grad_norm": 0.45891985297203064, "learning_rate": 9.559957822878867e-05, "loss": 2.6111, "step": 558 }, { "epoch": 0.1789566189981625, "grad_norm": 0.49106699228286743, "learning_rate": 9.555612932004896e-05, "loss": 2.5876, "step": 560 }, { "epoch": 0.1795957497802988, "grad_norm": 0.5220739245414734, "learning_rate": 9.55124769337246e-05, "loss": 2.5988, "step": 562 }, { "epoch": 0.18023488056243508, "grad_norm": 0.6365030407905579, "learning_rate": 9.546862126479006e-05, "loss": 2.5763, "step": 564 }, { "epoch": 0.18087401134457137, "grad_norm": 0.706681489944458, "learning_rate": 9.542456250912776e-05, "loss": 2.5965, "step": 566 }, { "epoch": 0.1815131421267077, "grad_norm": 0.4519253373146057, "learning_rate": 9.538030086352725e-05, "loss": 2.568, "step": 568 }, { "epoch": 0.18215227290884398, "grad_norm": 0.6023289561271667, "learning_rate": 9.533583652568426e-05, "loss": 2.6034, "step": 570 }, { "epoch": 0.18279140369098026, "grad_norm": 0.581615686416626, "learning_rate": 9.529116969419986e-05, "loss": 2.5858, "step": 572 }, { "epoch": 0.18343053447311655, "grad_norm": 0.49777430295944214, "learning_rate": 9.524630056857958e-05, "loss": 2.6062, "step": 574 }, { "epoch": 0.18406966525525287, "grad_norm": 0.5936197638511658, "learning_rate": 9.520122934923246e-05, "loss": 2.5976, "step": 576 }, { "epoch": 0.18470879603738916, "grad_norm": 0.5317326784133911, "learning_rate": 9.515595623747022e-05, "loss": 2.6004, "step": 578 }, { "epoch": 0.18534792681952544, "grad_norm": 0.524297297000885, "learning_rate": 9.511048143550637e-05, "loss": 2.583, "step": 580 }, { "epoch": 0.18598705760166173, "grad_norm": 0.5107091665267944, "learning_rate": 9.506480514645523e-05, "loss": 2.5704, "step": 582 }, { "epoch": 0.18662618838379805, "grad_norm": 0.4521612226963043, "learning_rate": 9.501892757433107e-05, "loss": 2.5903, "step": 584 }, { "epoch": 0.18726531916593434, "grad_norm": 0.48701736330986023, "learning_rate": 9.497284892404721e-05, "loss": 2.5758, "step": 586 }, { "epoch": 0.18790444994807062, "grad_norm": 0.613917887210846, "learning_rate": 9.492656940141512e-05, "loss": 2.5749, "step": 588 }, { "epoch": 0.1885435807302069, "grad_norm": 0.5269163846969604, "learning_rate": 9.488008921314338e-05, "loss": 2.6126, "step": 590 }, { "epoch": 0.18918271151234323, "grad_norm": 0.6326431632041931, "learning_rate": 9.483340856683696e-05, "loss": 2.5863, "step": 592 }, { "epoch": 0.18982184229447951, "grad_norm": 0.47863009572029114, "learning_rate": 9.47865276709961e-05, "loss": 2.6201, "step": 594 }, { "epoch": 0.1904609730766158, "grad_norm": 0.5771295428276062, "learning_rate": 9.473944673501549e-05, "loss": 2.5914, "step": 596 }, { "epoch": 0.1911001038587521, "grad_norm": 0.4584767818450928, "learning_rate": 9.469216596918331e-05, "loss": 2.5497, "step": 598 }, { "epoch": 0.19173923464088838, "grad_norm": 0.4598289728164673, "learning_rate": 9.464468558468026e-05, "loss": 2.5841, "step": 600 } ], "logging_steps": 2, "max_steps": 3130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.755100672393216e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }