{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21024046252901757, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00035040077088169594, "grad_norm": 6.5142412185668945, "learning_rate": 0.0, "loss": 5.324, "step": 1 }, { "epoch": 0.0007008015417633919, "grad_norm": 6.758334159851074, "learning_rate": 6.993006993006994e-07, "loss": 5.3405, "step": 2 }, { "epoch": 0.0014016030835267838, "grad_norm": 6.22674036026001, "learning_rate": 2.0979020979020983e-06, "loss": 5.3286, "step": 4 }, { "epoch": 0.0021024046252901755, "grad_norm": 5.438386917114258, "learning_rate": 3.496503496503497e-06, "loss": 5.25, "step": 6 }, { "epoch": 0.0028032061670535675, "grad_norm": 3.365504741668701, "learning_rate": 4.895104895104895e-06, "loss": 5.2821, "step": 8 }, { "epoch": 0.0035040077088169595, "grad_norm": 7.186147212982178, "learning_rate": 6.2937062937062944e-06, "loss": 5.21, "step": 10 }, { "epoch": 0.004204809250580351, "grad_norm": 4.960826396942139, "learning_rate": 7.692307692307694e-06, "loss": 5.0759, "step": 12 }, { "epoch": 0.004905610792343743, "grad_norm": 4.001464366912842, "learning_rate": 9.090909090909091e-06, "loss": 5.1092, "step": 14 }, { "epoch": 0.005606412334107135, "grad_norm": 3.2986342906951904, "learning_rate": 1.048951048951049e-05, "loss": 4.93, "step": 16 }, { "epoch": 0.006307213875870527, "grad_norm": 2.5407276153564453, "learning_rate": 1.188811188811189e-05, "loss": 4.8535, "step": 18 }, { "epoch": 0.007008015417633919, "grad_norm": 2.211754083633423, "learning_rate": 1.3286713286713287e-05, "loss": 4.74, "step": 20 }, { "epoch": 0.007708816959397311, "grad_norm": 1.6710195541381836, "learning_rate": 1.4685314685314686e-05, "loss": 4.609, "step": 22 }, { "epoch": 0.008409618501160702, "grad_norm": 1.280752182006836, "learning_rate": 1.6083916083916083e-05, "loss": 4.4879, "step": 24 }, { "epoch": 0.009110420042924094, "grad_norm": 1.312186598777771, "learning_rate": 1.7482517482517483e-05, "loss": 4.3995, "step": 26 }, { "epoch": 0.009811221584687486, "grad_norm": 1.3315190076828003, "learning_rate": 1.888111888111888e-05, "loss": 4.3005, "step": 28 }, { "epoch": 0.010512023126450878, "grad_norm": 1.3252590894699097, "learning_rate": 2.027972027972028e-05, "loss": 4.1952, "step": 30 }, { "epoch": 0.01121282466821427, "grad_norm": 1.3794758319854736, "learning_rate": 2.1678321678321677e-05, "loss": 4.1459, "step": 32 }, { "epoch": 0.011913626209977662, "grad_norm": 1.1808068752288818, "learning_rate": 2.307692307692308e-05, "loss": 4.034, "step": 34 }, { "epoch": 0.012614427751741054, "grad_norm": 1.31660795211792, "learning_rate": 2.4475524475524478e-05, "loss": 3.926, "step": 36 }, { "epoch": 0.013315229293504446, "grad_norm": 1.0347495079040527, "learning_rate": 2.5874125874125877e-05, "loss": 3.8812, "step": 38 }, { "epoch": 0.014016030835267838, "grad_norm": 1.050775408744812, "learning_rate": 2.7272727272727273e-05, "loss": 3.7787, "step": 40 }, { "epoch": 0.01471683237703123, "grad_norm": 0.9461761713027954, "learning_rate": 2.8671328671328672e-05, "loss": 3.6738, "step": 42 }, { "epoch": 0.015417633918794622, "grad_norm": 1.0460454225540161, "learning_rate": 3.0069930069930068e-05, "loss": 3.6385, "step": 44 }, { "epoch": 0.016118435460558012, "grad_norm": 1.0687191486358643, "learning_rate": 3.146853146853147e-05, "loss": 3.5701, "step": 46 }, { "epoch": 0.016819237002321404, "grad_norm": 1.4722611904144287, "learning_rate": 3.2867132867132866e-05, "loss": 3.5438, "step": 48 }, { "epoch": 0.017520038544084796, "grad_norm": 1.1305724382400513, "learning_rate": 3.4265734265734265e-05, "loss": 3.4694, "step": 50 }, { "epoch": 0.018220840085848188, "grad_norm": 0.9322625994682312, "learning_rate": 3.566433566433567e-05, "loss": 3.4488, "step": 52 }, { "epoch": 0.01892164162761158, "grad_norm": 1.2441555261611938, "learning_rate": 3.7062937062937064e-05, "loss": 3.4289, "step": 54 }, { "epoch": 0.019622443169374972, "grad_norm": 0.9397731423377991, "learning_rate": 3.846153846153846e-05, "loss": 3.4021, "step": 56 }, { "epoch": 0.020323244711138364, "grad_norm": 1.3261164426803589, "learning_rate": 3.986013986013986e-05, "loss": 3.3575, "step": 58 }, { "epoch": 0.021024046252901756, "grad_norm": 1.08541738986969, "learning_rate": 4.125874125874126e-05, "loss": 3.3403, "step": 60 }, { "epoch": 0.021724847794665148, "grad_norm": 0.8626166582107544, "learning_rate": 4.265734265734266e-05, "loss": 3.3306, "step": 62 }, { "epoch": 0.02242564933642854, "grad_norm": 1.0596344470977783, "learning_rate": 4.405594405594406e-05, "loss": 3.2779, "step": 64 }, { "epoch": 0.023126450878191932, "grad_norm": 1.511917233467102, "learning_rate": 4.545454545454546e-05, "loss": 3.2759, "step": 66 }, { "epoch": 0.023827252419955324, "grad_norm": 1.2062046527862549, "learning_rate": 4.685314685314686e-05, "loss": 3.2545, "step": 68 }, { "epoch": 0.024528053961718716, "grad_norm": 1.1399930715560913, "learning_rate": 4.825174825174825e-05, "loss": 3.2235, "step": 70 }, { "epoch": 0.025228855503482108, "grad_norm": 0.8960133790969849, "learning_rate": 4.9650349650349656e-05, "loss": 3.2025, "step": 72 }, { "epoch": 0.0259296570452455, "grad_norm": 1.3042056560516357, "learning_rate": 5.1048951048951055e-05, "loss": 3.1475, "step": 74 }, { "epoch": 0.026630458587008892, "grad_norm": 1.186320424079895, "learning_rate": 5.244755244755245e-05, "loss": 3.1759, "step": 76 }, { "epoch": 0.027331260128772284, "grad_norm": 1.2691158056259155, "learning_rate": 5.384615384615385e-05, "loss": 3.1296, "step": 78 }, { "epoch": 0.028032061670535676, "grad_norm": 0.7816159129142761, "learning_rate": 5.524475524475524e-05, "loss": 3.1017, "step": 80 }, { "epoch": 0.028732863212299068, "grad_norm": 1.1489295959472656, "learning_rate": 5.664335664335665e-05, "loss": 3.1151, "step": 82 }, { "epoch": 0.02943366475406246, "grad_norm": 1.5686062574386597, "learning_rate": 5.8041958041958044e-05, "loss": 3.114, "step": 84 }, { "epoch": 0.030134466295825852, "grad_norm": 1.4421433210372925, "learning_rate": 5.944055944055944e-05, "loss": 3.0946, "step": 86 }, { "epoch": 0.030835267837589244, "grad_norm": 1.335250973701477, "learning_rate": 6.083916083916085e-05, "loss": 3.084, "step": 88 }, { "epoch": 0.03153606937935263, "grad_norm": 0.970507800579071, "learning_rate": 6.223776223776224e-05, "loss": 3.1163, "step": 90 }, { "epoch": 0.032236870921116025, "grad_norm": 1.2849407196044922, "learning_rate": 6.363636363636364e-05, "loss": 3.063, "step": 92 }, { "epoch": 0.032937672462879417, "grad_norm": 1.0378247499465942, "learning_rate": 6.503496503496504e-05, "loss": 3.0223, "step": 94 }, { "epoch": 0.03363847400464281, "grad_norm": 1.3139392137527466, "learning_rate": 6.643356643356644e-05, "loss": 3.0572, "step": 96 }, { "epoch": 0.0343392755464062, "grad_norm": 1.254752278327942, "learning_rate": 6.783216783216784e-05, "loss": 3.0408, "step": 98 }, { "epoch": 0.03504007708816959, "grad_norm": 1.3333168029785156, "learning_rate": 6.923076923076924e-05, "loss": 3.0185, "step": 100 }, { "epoch": 0.035740878629932984, "grad_norm": 1.2795464992523193, "learning_rate": 7.062937062937062e-05, "loss": 3.0328, "step": 102 }, { "epoch": 0.036441680171696376, "grad_norm": 1.2025645971298218, "learning_rate": 7.202797202797204e-05, "loss": 3.0303, "step": 104 }, { "epoch": 0.03714248171345977, "grad_norm": 1.1741266250610352, "learning_rate": 7.342657342657343e-05, "loss": 3.0252, "step": 106 }, { "epoch": 0.03784328325522316, "grad_norm": 1.2022653818130493, "learning_rate": 7.482517482517482e-05, "loss": 3.0183, "step": 108 }, { "epoch": 0.03854408479698655, "grad_norm": 1.1950666904449463, "learning_rate": 7.622377622377622e-05, "loss": 2.9804, "step": 110 }, { "epoch": 0.039244886338749944, "grad_norm": 1.5780822038650513, "learning_rate": 7.762237762237763e-05, "loss": 2.9804, "step": 112 }, { "epoch": 0.039945687880513336, "grad_norm": 1.0478655099868774, "learning_rate": 7.902097902097903e-05, "loss": 2.9894, "step": 114 }, { "epoch": 0.04064648942227673, "grad_norm": 1.1782268285751343, "learning_rate": 8.041958041958042e-05, "loss": 2.9717, "step": 116 }, { "epoch": 0.04134729096404012, "grad_norm": 1.0321820974349976, "learning_rate": 8.181818181818183e-05, "loss": 2.9776, "step": 118 }, { "epoch": 0.04204809250580351, "grad_norm": 0.9697206020355225, "learning_rate": 8.321678321678323e-05, "loss": 2.9804, "step": 120 }, { "epoch": 0.042748894047566904, "grad_norm": 1.1984606981277466, "learning_rate": 8.461538461538461e-05, "loss": 2.9495, "step": 122 }, { "epoch": 0.043449695589330296, "grad_norm": 0.9830178618431091, "learning_rate": 8.601398601398601e-05, "loss": 2.9656, "step": 124 }, { "epoch": 0.04415049713109369, "grad_norm": 1.3105114698410034, "learning_rate": 8.741258741258743e-05, "loss": 2.9306, "step": 126 }, { "epoch": 0.04485129867285708, "grad_norm": 1.3499157428741455, "learning_rate": 8.881118881118881e-05, "loss": 2.9381, "step": 128 }, { "epoch": 0.04555210021462047, "grad_norm": 0.9977575540542603, "learning_rate": 9.020979020979021e-05, "loss": 2.907, "step": 130 }, { "epoch": 0.046252901756383864, "grad_norm": 1.2331498861312866, "learning_rate": 9.160839160839161e-05, "loss": 2.9224, "step": 132 }, { "epoch": 0.046953703298147256, "grad_norm": 1.451253890991211, "learning_rate": 9.300699300699301e-05, "loss": 2.9202, "step": 134 }, { "epoch": 0.04765450483991065, "grad_norm": 1.2146471738815308, "learning_rate": 9.440559440559441e-05, "loss": 2.9098, "step": 136 }, { "epoch": 0.04835530638167404, "grad_norm": 1.0873245000839233, "learning_rate": 9.580419580419581e-05, "loss": 2.9218, "step": 138 }, { "epoch": 0.04905610792343743, "grad_norm": 1.276413083076477, "learning_rate": 9.72027972027972e-05, "loss": 2.8947, "step": 140 }, { "epoch": 0.049756909465200824, "grad_norm": 1.126065731048584, "learning_rate": 9.86013986013986e-05, "loss": 2.8788, "step": 142 }, { "epoch": 0.050457711006964216, "grad_norm": 1.5177017450332642, "learning_rate": 0.0001, "loss": 2.9043, "step": 144 }, { "epoch": 0.05115851254872761, "grad_norm": 1.3744112253189087, "learning_rate": 9.99998657109765e-05, "loss": 2.888, "step": 146 }, { "epoch": 0.051859314090491, "grad_norm": 1.7921055555343628, "learning_rate": 9.999946284462733e-05, "loss": 2.8631, "step": 148 }, { "epoch": 0.05256011563225439, "grad_norm": 1.1755317449569702, "learning_rate": 9.999879140311652e-05, "loss": 2.8735, "step": 150 }, { "epoch": 0.053260917174017784, "grad_norm": 0.846362292766571, "learning_rate": 9.999785139005073e-05, "loss": 2.8768, "step": 152 }, { "epoch": 0.053961718715781176, "grad_norm": 0.9867280721664429, "learning_rate": 9.999664281047933e-05, "loss": 2.8859, "step": 154 }, { "epoch": 0.05466252025754457, "grad_norm": 0.9751666188240051, "learning_rate": 9.999516567089429e-05, "loss": 2.8497, "step": 156 }, { "epoch": 0.05536332179930796, "grad_norm": 1.0603703260421753, "learning_rate": 9.999341997923011e-05, "loss": 2.8404, "step": 158 }, { "epoch": 0.05606412334107135, "grad_norm": 1.0447975397109985, "learning_rate": 9.999140574486392e-05, "loss": 2.9092, "step": 160 }, { "epoch": 0.056764924882834744, "grad_norm": 1.3046443462371826, "learning_rate": 9.998912297861527e-05, "loss": 2.8971, "step": 162 }, { "epoch": 0.057465726424598136, "grad_norm": 1.1029243469238281, "learning_rate": 9.998657169274622e-05, "loss": 2.8834, "step": 164 }, { "epoch": 0.05816652796636153, "grad_norm": 0.8594210743904114, "learning_rate": 9.99837519009611e-05, "loss": 2.8361, "step": 166 }, { "epoch": 0.05886732950812492, "grad_norm": 0.8585363030433655, "learning_rate": 9.998066361840665e-05, "loss": 2.8782, "step": 168 }, { "epoch": 0.05956813104988831, "grad_norm": 0.693467378616333, "learning_rate": 9.997730686167173e-05, "loss": 2.8537, "step": 170 }, { "epoch": 0.060268932591651704, "grad_norm": 0.8418940305709839, "learning_rate": 9.997368164878738e-05, "loss": 2.8294, "step": 172 }, { "epoch": 0.060969734133415096, "grad_norm": 0.9938271045684814, "learning_rate": 9.996978799922665e-05, "loss": 2.8458, "step": 174 }, { "epoch": 0.06167053567517849, "grad_norm": 1.0347217321395874, "learning_rate": 9.99656259339045e-05, "loss": 2.8081, "step": 176 }, { "epoch": 0.06237133721694188, "grad_norm": 0.9216743111610413, "learning_rate": 9.996119547517775e-05, "loss": 2.8655, "step": 178 }, { "epoch": 0.06307213875870527, "grad_norm": 1.0579859018325806, "learning_rate": 9.995649664684486e-05, "loss": 2.823, "step": 180 }, { "epoch": 0.06377294030046866, "grad_norm": 0.9864194393157959, "learning_rate": 9.995152947414586e-05, "loss": 2.8081, "step": 182 }, { "epoch": 0.06447374184223205, "grad_norm": 0.8999143838882446, "learning_rate": 9.994629398376226e-05, "loss": 2.7947, "step": 184 }, { "epoch": 0.06517454338399545, "grad_norm": 0.9121315479278564, "learning_rate": 9.994079020381676e-05, "loss": 2.8253, "step": 186 }, { "epoch": 0.06587534492575883, "grad_norm": 0.8578842282295227, "learning_rate": 9.993501816387329e-05, "loss": 2.7548, "step": 188 }, { "epoch": 0.06657614646752223, "grad_norm": 0.8564820289611816, "learning_rate": 9.992897789493672e-05, "loss": 2.8361, "step": 190 }, { "epoch": 0.06727694800928562, "grad_norm": 0.8013344407081604, "learning_rate": 9.992266942945269e-05, "loss": 2.8606, "step": 192 }, { "epoch": 0.06797774955104902, "grad_norm": 0.7343975901603699, "learning_rate": 9.991609280130752e-05, "loss": 2.7947, "step": 194 }, { "epoch": 0.0686785510928124, "grad_norm": 0.7338536381721497, "learning_rate": 9.990924804582797e-05, "loss": 2.7492, "step": 196 }, { "epoch": 0.0693793526345758, "grad_norm": 0.828781008720398, "learning_rate": 9.990213519978109e-05, "loss": 2.8013, "step": 198 }, { "epoch": 0.07008015417633918, "grad_norm": 0.7156624794006348, "learning_rate": 9.989475430137391e-05, "loss": 2.7943, "step": 200 }, { "epoch": 0.07078095571810258, "grad_norm": 0.6014353632926941, "learning_rate": 9.988710539025341e-05, "loss": 2.8099, "step": 202 }, { "epoch": 0.07148175725986597, "grad_norm": 0.6569661498069763, "learning_rate": 9.987918850750619e-05, "loss": 2.8125, "step": 204 }, { "epoch": 0.07218255880162937, "grad_norm": 0.6558775305747986, "learning_rate": 9.987100369565825e-05, "loss": 2.7487, "step": 206 }, { "epoch": 0.07288336034339275, "grad_norm": 0.6454245448112488, "learning_rate": 9.986255099867481e-05, "loss": 2.7648, "step": 208 }, { "epoch": 0.07358416188515615, "grad_norm": 0.5741921067237854, "learning_rate": 9.985383046196004e-05, "loss": 2.7743, "step": 210 }, { "epoch": 0.07428496342691954, "grad_norm": 0.5875937938690186, "learning_rate": 9.984484213235685e-05, "loss": 2.7728, "step": 212 }, { "epoch": 0.07498576496868294, "grad_norm": 0.6638422012329102, "learning_rate": 9.98355860581466e-05, "loss": 2.7504, "step": 214 }, { "epoch": 0.07568656651044632, "grad_norm": 1.1614341735839844, "learning_rate": 9.982606228904884e-05, "loss": 2.7923, "step": 216 }, { "epoch": 0.07638736805220972, "grad_norm": 1.005254864692688, "learning_rate": 9.981627087622108e-05, "loss": 2.76, "step": 218 }, { "epoch": 0.0770881695939731, "grad_norm": 0.7738555669784546, "learning_rate": 9.980621187225852e-05, "loss": 2.7866, "step": 220 }, { "epoch": 0.0777889711357365, "grad_norm": 0.9469527006149292, "learning_rate": 9.979588533119367e-05, "loss": 2.8012, "step": 222 }, { "epoch": 0.07848977267749989, "grad_norm": 0.9031473398208618, "learning_rate": 9.978529130849619e-05, "loss": 2.7522, "step": 224 }, { "epoch": 0.07919057421926329, "grad_norm": 0.9450514912605286, "learning_rate": 9.977442986107252e-05, "loss": 2.7791, "step": 226 }, { "epoch": 0.07989137576102667, "grad_norm": 0.7259206771850586, "learning_rate": 9.97633010472656e-05, "loss": 2.7237, "step": 228 }, { "epoch": 0.08059217730279007, "grad_norm": 0.6595309972763062, "learning_rate": 9.975190492685451e-05, "loss": 2.7284, "step": 230 }, { "epoch": 0.08129297884455346, "grad_norm": 0.7696382999420166, "learning_rate": 9.974024156105422e-05, "loss": 2.7631, "step": 232 }, { "epoch": 0.08199378038631686, "grad_norm": 0.7305110096931458, "learning_rate": 9.972831101251521e-05, "loss": 2.7793, "step": 234 }, { "epoch": 0.08269458192808024, "grad_norm": 0.6039514541625977, "learning_rate": 9.971611334532314e-05, "loss": 2.7669, "step": 236 }, { "epoch": 0.08339538346984364, "grad_norm": 0.5824711918830872, "learning_rate": 9.970364862499852e-05, "loss": 2.7476, "step": 238 }, { "epoch": 0.08409618501160702, "grad_norm": 0.6831758618354797, "learning_rate": 9.969091691849637e-05, "loss": 2.7098, "step": 240 }, { "epoch": 0.08479698655337042, "grad_norm": 0.6469074487686157, "learning_rate": 9.967791829420581e-05, "loss": 2.7609, "step": 242 }, { "epoch": 0.08549778809513381, "grad_norm": 0.5876832604408264, "learning_rate": 9.966465282194976e-05, "loss": 2.7306, "step": 244 }, { "epoch": 0.08619858963689721, "grad_norm": 0.6310129761695862, "learning_rate": 9.965112057298451e-05, "loss": 2.7283, "step": 246 }, { "epoch": 0.08689939117866059, "grad_norm": 0.6113069653511047, "learning_rate": 9.963732161999935e-05, "loss": 2.7274, "step": 248 }, { "epoch": 0.08760019272042399, "grad_norm": 1.0655111074447632, "learning_rate": 9.96232560371162e-05, "loss": 2.7022, "step": 250 }, { "epoch": 0.08830099426218738, "grad_norm": 0.8412613272666931, "learning_rate": 9.960892389988918e-05, "loss": 2.7213, "step": 252 }, { "epoch": 0.08900179580395078, "grad_norm": 0.7329776883125305, "learning_rate": 9.959432528530428e-05, "loss": 2.7343, "step": 254 }, { "epoch": 0.08970259734571416, "grad_norm": 0.702498197555542, "learning_rate": 9.95794602717788e-05, "loss": 2.7642, "step": 256 }, { "epoch": 0.09040339888747755, "grad_norm": 0.6936408281326294, "learning_rate": 9.95643289391611e-05, "loss": 2.7081, "step": 258 }, { "epoch": 0.09110420042924094, "grad_norm": 0.664743959903717, "learning_rate": 9.954893136873005e-05, "loss": 2.7054, "step": 260 }, { "epoch": 0.09180500197100433, "grad_norm": 0.5716791152954102, "learning_rate": 9.953326764319463e-05, "loss": 2.6751, "step": 262 }, { "epoch": 0.09250580351276773, "grad_norm": 0.6207195520401001, "learning_rate": 9.95173378466935e-05, "loss": 2.6945, "step": 264 }, { "epoch": 0.09320660505453111, "grad_norm": 0.6572092771530151, "learning_rate": 9.950114206479453e-05, "loss": 2.6989, "step": 266 }, { "epoch": 0.09390740659629451, "grad_norm": 0.7676830887794495, "learning_rate": 9.948468038449435e-05, "loss": 2.7613, "step": 268 }, { "epoch": 0.0946082081380579, "grad_norm": 0.5810503959655762, "learning_rate": 9.946795289421787e-05, "loss": 2.7234, "step": 270 }, { "epoch": 0.0953090096798213, "grad_norm": 0.6459682583808899, "learning_rate": 9.945095968381784e-05, "loss": 2.717, "step": 272 }, { "epoch": 0.09600981122158468, "grad_norm": 0.6498464345932007, "learning_rate": 9.94337008445743e-05, "loss": 2.7389, "step": 274 }, { "epoch": 0.09671061276334808, "grad_norm": 0.6287350654602051, "learning_rate": 9.941617646919421e-05, "loss": 2.681, "step": 276 }, { "epoch": 0.09741141430511147, "grad_norm": 0.7516258955001831, "learning_rate": 9.939838665181076e-05, "loss": 2.6696, "step": 278 }, { "epoch": 0.09811221584687486, "grad_norm": 0.6962350606918335, "learning_rate": 9.938033148798307e-05, "loss": 2.6971, "step": 280 }, { "epoch": 0.09881301738863825, "grad_norm": 0.6605144739151001, "learning_rate": 9.936201107469555e-05, "loss": 2.6999, "step": 282 }, { "epoch": 0.09951381893040165, "grad_norm": 0.5991240739822388, "learning_rate": 9.93434255103574e-05, "loss": 2.6936, "step": 284 }, { "epoch": 0.10021462047216503, "grad_norm": 0.5660961866378784, "learning_rate": 9.932457489480213e-05, "loss": 2.686, "step": 286 }, { "epoch": 0.10091542201392843, "grad_norm": 0.690290093421936, "learning_rate": 9.930545932928698e-05, "loss": 2.6809, "step": 288 }, { "epoch": 0.10161622355569182, "grad_norm": 0.7119167447090149, "learning_rate": 9.928607891649234e-05, "loss": 2.7221, "step": 290 }, { "epoch": 0.10231702509745522, "grad_norm": 0.7049365639686584, "learning_rate": 9.926643376052131e-05, "loss": 2.6569, "step": 292 }, { "epoch": 0.1030178266392186, "grad_norm": 0.6691743731498718, "learning_rate": 9.924652396689902e-05, "loss": 2.6751, "step": 294 }, { "epoch": 0.103718628180982, "grad_norm": 0.5533433556556702, "learning_rate": 9.922634964257215e-05, "loss": 2.7064, "step": 296 }, { "epoch": 0.10441942972274539, "grad_norm": 0.6669672727584839, "learning_rate": 9.920591089590831e-05, "loss": 2.687, "step": 298 }, { "epoch": 0.10512023126450878, "grad_norm": 0.8539720773696899, "learning_rate": 9.918520783669549e-05, "loss": 2.6968, "step": 300 }, { "epoch": 0.10582103280627217, "grad_norm": 0.827905535697937, "learning_rate": 9.916424057614142e-05, "loss": 2.7339, "step": 302 }, { "epoch": 0.10652183434803557, "grad_norm": 0.7071542143821716, "learning_rate": 9.9143009226873e-05, "loss": 2.67, "step": 304 }, { "epoch": 0.10722263588979895, "grad_norm": 0.6667853593826294, "learning_rate": 9.912151390293575e-05, "loss": 2.7113, "step": 306 }, { "epoch": 0.10792343743156235, "grad_norm": 0.49210044741630554, "learning_rate": 9.90997547197931e-05, "loss": 2.7034, "step": 308 }, { "epoch": 0.10862423897332574, "grad_norm": 0.5823047757148743, "learning_rate": 9.907773179432581e-05, "loss": 2.6815, "step": 310 }, { "epoch": 0.10932504051508914, "grad_norm": 0.5159279704093933, "learning_rate": 9.905544524483138e-05, "loss": 2.7055, "step": 312 }, { "epoch": 0.11002584205685252, "grad_norm": 0.5294278264045715, "learning_rate": 9.903289519102338e-05, "loss": 2.6821, "step": 314 }, { "epoch": 0.11072664359861592, "grad_norm": 0.5865507125854492, "learning_rate": 9.901008175403078e-05, "loss": 2.698, "step": 316 }, { "epoch": 0.1114274451403793, "grad_norm": 0.7102755904197693, "learning_rate": 9.898700505639735e-05, "loss": 2.693, "step": 318 }, { "epoch": 0.1121282466821427, "grad_norm": 0.8151699900627136, "learning_rate": 9.8963665222081e-05, "loss": 2.6482, "step": 320 }, { "epoch": 0.11282904822390609, "grad_norm": 0.5769193172454834, "learning_rate": 9.894006237645304e-05, "loss": 2.6893, "step": 322 }, { "epoch": 0.11352984976566949, "grad_norm": 0.6606284976005554, "learning_rate": 9.891619664629762e-05, "loss": 2.6859, "step": 324 }, { "epoch": 0.11423065130743287, "grad_norm": 0.5883016586303711, "learning_rate": 9.889206815981094e-05, "loss": 2.6622, "step": 326 }, { "epoch": 0.11493145284919627, "grad_norm": 0.5413339734077454, "learning_rate": 9.886767704660067e-05, "loss": 2.6718, "step": 328 }, { "epoch": 0.11563225439095966, "grad_norm": 0.7391770482063293, "learning_rate": 9.884302343768512e-05, "loss": 2.6695, "step": 330 }, { "epoch": 0.11633305593272306, "grad_norm": 0.7529366612434387, "learning_rate": 9.881810746549267e-05, "loss": 2.7341, "step": 332 }, { "epoch": 0.11703385747448644, "grad_norm": 0.6971571445465088, "learning_rate": 9.8792929263861e-05, "loss": 2.6444, "step": 334 }, { "epoch": 0.11773465901624984, "grad_norm": 0.544129490852356, "learning_rate": 9.876748896803633e-05, "loss": 2.7351, "step": 336 }, { "epoch": 0.11843546055801323, "grad_norm": 0.6561135649681091, "learning_rate": 9.874178671467277e-05, "loss": 2.6896, "step": 338 }, { "epoch": 0.11913626209977662, "grad_norm": 0.6607089042663574, "learning_rate": 9.871582264183155e-05, "loss": 2.6664, "step": 340 }, { "epoch": 0.11983706364154001, "grad_norm": 0.6727411150932312, "learning_rate": 9.868959688898023e-05, "loss": 2.68, "step": 342 }, { "epoch": 0.12053786518330341, "grad_norm": 0.5672718286514282, "learning_rate": 9.86631095969921e-05, "loss": 2.6639, "step": 344 }, { "epoch": 0.1212386667250668, "grad_norm": 0.7188961505889893, "learning_rate": 9.86363609081452e-05, "loss": 2.6604, "step": 346 }, { "epoch": 0.12193946826683019, "grad_norm": 0.9785953760147095, "learning_rate": 9.86093509661218e-05, "loss": 2.6557, "step": 348 }, { "epoch": 0.12264026980859358, "grad_norm": 0.7856999635696411, "learning_rate": 9.85820799160074e-05, "loss": 2.6418, "step": 350 }, { "epoch": 0.12334107135035698, "grad_norm": 0.5956946015357971, "learning_rate": 9.855454790429015e-05, "loss": 2.658, "step": 352 }, { "epoch": 0.12404187289212036, "grad_norm": 0.6523074507713318, "learning_rate": 9.852675507885991e-05, "loss": 2.6743, "step": 354 }, { "epoch": 0.12474267443388376, "grad_norm": 0.71266108751297, "learning_rate": 9.849870158900753e-05, "loss": 2.6805, "step": 356 }, { "epoch": 0.12544347597564715, "grad_norm": 0.5674154162406921, "learning_rate": 9.847038758542404e-05, "loss": 2.6678, "step": 358 }, { "epoch": 0.12614427751741053, "grad_norm": 0.5430511236190796, "learning_rate": 9.844181322019983e-05, "loss": 2.643, "step": 360 }, { "epoch": 0.12684507905917394, "grad_norm": 0.508791983127594, "learning_rate": 9.841297864682388e-05, "loss": 2.6524, "step": 362 }, { "epoch": 0.12754588060093733, "grad_norm": 0.6082713603973389, "learning_rate": 9.838388402018282e-05, "loss": 2.6892, "step": 364 }, { "epoch": 0.1282466821427007, "grad_norm": 0.6065689325332642, "learning_rate": 9.835452949656022e-05, "loss": 2.6083, "step": 366 }, { "epoch": 0.1289474836844641, "grad_norm": 0.5220572352409363, "learning_rate": 9.83249152336357e-05, "loss": 2.6573, "step": 368 }, { "epoch": 0.1296482852262275, "grad_norm": 0.568534791469574, "learning_rate": 9.829504139048406e-05, "loss": 2.6266, "step": 370 }, { "epoch": 0.1303490867679909, "grad_norm": 0.6165401339530945, "learning_rate": 9.826490812757452e-05, "loss": 2.6928, "step": 372 }, { "epoch": 0.13104988830975428, "grad_norm": 0.5951835513114929, "learning_rate": 9.823451560676966e-05, "loss": 2.6468, "step": 374 }, { "epoch": 0.13175068985151767, "grad_norm": 0.4942519962787628, "learning_rate": 9.820386399132482e-05, "loss": 2.6493, "step": 376 }, { "epoch": 0.13245149139328108, "grad_norm": 0.6185161471366882, "learning_rate": 9.8172953445887e-05, "loss": 2.6741, "step": 378 }, { "epoch": 0.13315229293504446, "grad_norm": 0.5588895678520203, "learning_rate": 9.814178413649407e-05, "loss": 2.6393, "step": 380 }, { "epoch": 0.13385309447680785, "grad_norm": 0.6289598941802979, "learning_rate": 9.811035623057387e-05, "loss": 2.6022, "step": 382 }, { "epoch": 0.13455389601857123, "grad_norm": 0.6258370280265808, "learning_rate": 9.807866989694334e-05, "loss": 2.6033, "step": 384 }, { "epoch": 0.13525469756033462, "grad_norm": 0.6390899419784546, "learning_rate": 9.804672530580754e-05, "loss": 2.6413, "step": 386 }, { "epoch": 0.13595549910209803, "grad_norm": 0.6844115257263184, "learning_rate": 9.801452262875877e-05, "loss": 2.6339, "step": 388 }, { "epoch": 0.13665630064386142, "grad_norm": 0.70540452003479, "learning_rate": 9.798206203877569e-05, "loss": 2.6471, "step": 390 }, { "epoch": 0.1373571021856248, "grad_norm": 0.7336652278900146, "learning_rate": 9.794934371022233e-05, "loss": 2.6348, "step": 392 }, { "epoch": 0.1380579037273882, "grad_norm": 0.7155029773712158, "learning_rate": 9.79163678188472e-05, "loss": 2.6128, "step": 394 }, { "epoch": 0.1387587052691516, "grad_norm": 0.6354189515113831, "learning_rate": 9.788313454178228e-05, "loss": 2.6281, "step": 396 }, { "epoch": 0.13945950681091498, "grad_norm": 0.596047043800354, "learning_rate": 9.78496440575422e-05, "loss": 2.6719, "step": 398 }, { "epoch": 0.14016030835267837, "grad_norm": 0.6149719953536987, "learning_rate": 9.781589654602306e-05, "loss": 2.625, "step": 400 }, { "epoch": 0.14086110989444176, "grad_norm": 0.6066911816596985, "learning_rate": 9.778189218850174e-05, "loss": 2.6193, "step": 402 }, { "epoch": 0.14156191143620517, "grad_norm": 0.5690994262695312, "learning_rate": 9.774763116763466e-05, "loss": 2.6239, "step": 404 }, { "epoch": 0.14226271297796855, "grad_norm": 0.532486081123352, "learning_rate": 9.771311366745703e-05, "loss": 2.6264, "step": 406 }, { "epoch": 0.14296351451973194, "grad_norm": 0.5434598326683044, "learning_rate": 9.767833987338171e-05, "loss": 2.6534, "step": 408 }, { "epoch": 0.14366431606149532, "grad_norm": 0.522413432598114, "learning_rate": 9.764330997219822e-05, "loss": 2.6468, "step": 410 }, { "epoch": 0.14436511760325874, "grad_norm": 0.5612457990646362, "learning_rate": 9.760802415207181e-05, "loss": 2.6307, "step": 412 }, { "epoch": 0.14506591914502212, "grad_norm": 0.5850318670272827, "learning_rate": 9.757248260254244e-05, "loss": 2.6324, "step": 414 }, { "epoch": 0.1457667206867855, "grad_norm": 0.688555121421814, "learning_rate": 9.753668551452368e-05, "loss": 2.6066, "step": 416 }, { "epoch": 0.1464675222285489, "grad_norm": 0.6506465077400208, "learning_rate": 9.750063308030179e-05, "loss": 2.5964, "step": 418 }, { "epoch": 0.1471683237703123, "grad_norm": 0.6529019474983215, "learning_rate": 9.746432549353462e-05, "loss": 2.651, "step": 420 }, { "epoch": 0.1478691253120757, "grad_norm": 0.5469995141029358, "learning_rate": 9.742776294925058e-05, "loss": 2.6129, "step": 422 }, { "epoch": 0.14856992685383907, "grad_norm": 0.4992043673992157, "learning_rate": 9.739094564384758e-05, "loss": 2.6074, "step": 424 }, { "epoch": 0.14927072839560246, "grad_norm": 0.5064156651496887, "learning_rate": 9.735387377509206e-05, "loss": 2.6408, "step": 426 }, { "epoch": 0.14997152993736587, "grad_norm": 0.5961376428604126, "learning_rate": 9.731654754211781e-05, "loss": 2.615, "step": 428 }, { "epoch": 0.15067233147912926, "grad_norm": 0.5533669590950012, "learning_rate": 9.727896714542494e-05, "loss": 2.6225, "step": 430 }, { "epoch": 0.15137313302089264, "grad_norm": 0.5527905821800232, "learning_rate": 9.724113278687888e-05, "loss": 2.5836, "step": 432 }, { "epoch": 0.15207393456265603, "grad_norm": 0.4616098701953888, "learning_rate": 9.720304466970916e-05, "loss": 2.6236, "step": 434 }, { "epoch": 0.15277473610441944, "grad_norm": 0.5189539790153503, "learning_rate": 9.716470299850844e-05, "loss": 2.6364, "step": 436 }, { "epoch": 0.15347553764618282, "grad_norm": 0.5303817987442017, "learning_rate": 9.712610797923133e-05, "loss": 2.6097, "step": 438 }, { "epoch": 0.1541763391879462, "grad_norm": 0.5957894921302795, "learning_rate": 9.708725981919333e-05, "loss": 2.5749, "step": 440 }, { "epoch": 0.1548771407297096, "grad_norm": 0.5686895251274109, "learning_rate": 9.704815872706972e-05, "loss": 2.6319, "step": 442 }, { "epoch": 0.155577942271473, "grad_norm": 0.5570897459983826, "learning_rate": 9.700880491289438e-05, "loss": 2.6287, "step": 444 }, { "epoch": 0.1562787438132364, "grad_norm": 0.5330969095230103, "learning_rate": 9.696919858805873e-05, "loss": 2.6014, "step": 446 }, { "epoch": 0.15697954535499978, "grad_norm": 0.4891030192375183, "learning_rate": 9.692933996531053e-05, "loss": 2.6097, "step": 448 }, { "epoch": 0.15768034689676316, "grad_norm": 0.5465073585510254, "learning_rate": 9.688922925875285e-05, "loss": 2.6162, "step": 450 }, { "epoch": 0.15838114843852658, "grad_norm": 0.5483290553092957, "learning_rate": 9.684886668384277e-05, "loss": 2.5999, "step": 452 }, { "epoch": 0.15908194998028996, "grad_norm": 0.6061928868293762, "learning_rate": 9.68082524573903e-05, "loss": 2.6614, "step": 454 }, { "epoch": 0.15978275152205335, "grad_norm": 0.5806353688240051, "learning_rate": 9.676738679755726e-05, "loss": 2.6039, "step": 456 }, { "epoch": 0.16048355306381673, "grad_norm": 0.5722226500511169, "learning_rate": 9.672626992385602e-05, "loss": 2.6529, "step": 458 }, { "epoch": 0.16118435460558014, "grad_norm": 0.5939204096794128, "learning_rate": 9.668490205714839e-05, "loss": 2.6314, "step": 460 }, { "epoch": 0.16188515614734353, "grad_norm": 0.7260386943817139, "learning_rate": 9.664328341964436e-05, "loss": 2.6211, "step": 462 }, { "epoch": 0.1625859576891069, "grad_norm": 0.8503554463386536, "learning_rate": 9.6601414234901e-05, "loss": 2.6134, "step": 464 }, { "epoch": 0.1632867592308703, "grad_norm": 0.5818518996238708, "learning_rate": 9.655929472782116e-05, "loss": 2.5667, "step": 466 }, { "epoch": 0.1639875607726337, "grad_norm": 0.5678598284721375, "learning_rate": 9.651692512465239e-05, "loss": 2.6153, "step": 468 }, { "epoch": 0.1646883623143971, "grad_norm": 0.5939005613327026, "learning_rate": 9.647430565298555e-05, "loss": 2.6098, "step": 470 }, { "epoch": 0.16538916385616048, "grad_norm": 0.5300047993659973, "learning_rate": 9.643143654175373e-05, "loss": 2.6167, "step": 472 }, { "epoch": 0.16608996539792387, "grad_norm": 0.4946250319480896, "learning_rate": 9.638831802123101e-05, "loss": 2.581, "step": 474 }, { "epoch": 0.16679076693968728, "grad_norm": 0.4555206000804901, "learning_rate": 9.634495032303111e-05, "loss": 2.588, "step": 476 }, { "epoch": 0.16749156848145066, "grad_norm": 0.5159677267074585, "learning_rate": 9.630133368010628e-05, "loss": 2.5868, "step": 478 }, { "epoch": 0.16819237002321405, "grad_norm": 0.5565433502197266, "learning_rate": 9.625746832674597e-05, "loss": 2.6185, "step": 480 }, { "epoch": 0.16889317156497743, "grad_norm": 0.4775915741920471, "learning_rate": 9.621335449857562e-05, "loss": 2.5897, "step": 482 }, { "epoch": 0.16959397310674085, "grad_norm": 0.5150102376937866, "learning_rate": 9.616899243255532e-05, "loss": 2.5478, "step": 484 }, { "epoch": 0.17029477464850423, "grad_norm": 0.48455357551574707, "learning_rate": 9.612438236697863e-05, "loss": 2.5639, "step": 486 }, { "epoch": 0.17099557619026762, "grad_norm": 0.5149878859519958, "learning_rate": 9.607952454147121e-05, "loss": 2.599, "step": 488 }, { "epoch": 0.171696377732031, "grad_norm": 0.6969982385635376, "learning_rate": 9.603441919698963e-05, "loss": 2.5733, "step": 490 }, { "epoch": 0.17239717927379442, "grad_norm": 0.57285475730896, "learning_rate": 9.598906657582e-05, "loss": 2.5791, "step": 492 }, { "epoch": 0.1730979808155578, "grad_norm": 0.5704159140586853, "learning_rate": 9.594346692157667e-05, "loss": 2.5692, "step": 494 }, { "epoch": 0.17379878235732119, "grad_norm": 0.681797444820404, "learning_rate": 9.589762047920096e-05, "loss": 2.5759, "step": 496 }, { "epoch": 0.17449958389908457, "grad_norm": 0.49717003107070923, "learning_rate": 9.585152749495984e-05, "loss": 2.5848, "step": 498 }, { "epoch": 0.17520038544084798, "grad_norm": 0.48680582642555237, "learning_rate": 9.580518821644457e-05, "loss": 2.5682, "step": 500 }, { "epoch": 0.17590118698261137, "grad_norm": 0.5525830388069153, "learning_rate": 9.575860289256943e-05, "loss": 2.5894, "step": 502 }, { "epoch": 0.17660198852437475, "grad_norm": 0.5562606453895569, "learning_rate": 9.571177177357032e-05, "loss": 2.5675, "step": 504 }, { "epoch": 0.17730279006613814, "grad_norm": 0.5515877604484558, "learning_rate": 9.566469511100345e-05, "loss": 2.5877, "step": 506 }, { "epoch": 0.17800359160790155, "grad_norm": 0.6816357970237732, "learning_rate": 9.561737315774398e-05, "loss": 2.596, "step": 508 }, { "epoch": 0.17870439314966494, "grad_norm": 0.507437527179718, "learning_rate": 9.556980616798463e-05, "loss": 2.5721, "step": 510 }, { "epoch": 0.17940519469142832, "grad_norm": 0.5275202989578247, "learning_rate": 9.552199439723443e-05, "loss": 2.568, "step": 512 }, { "epoch": 0.1801059962331917, "grad_norm": 0.5467104911804199, "learning_rate": 9.547393810231722e-05, "loss": 2.5842, "step": 514 }, { "epoch": 0.1808067977749551, "grad_norm": 0.5407027006149292, "learning_rate": 9.542563754137031e-05, "loss": 2.5891, "step": 516 }, { "epoch": 0.1815075993167185, "grad_norm": 0.5731847882270813, "learning_rate": 9.537709297384308e-05, "loss": 2.6143, "step": 518 }, { "epoch": 0.1822084008584819, "grad_norm": 0.566457986831665, "learning_rate": 9.532830466049565e-05, "loss": 2.5522, "step": 520 }, { "epoch": 0.18290920240024527, "grad_norm": 0.4899183213710785, "learning_rate": 9.527927286339744e-05, "loss": 2.5961, "step": 522 }, { "epoch": 0.18361000394200866, "grad_norm": 0.4883110523223877, "learning_rate": 9.52299978459257e-05, "loss": 2.5557, "step": 524 }, { "epoch": 0.18431080548377207, "grad_norm": 0.5534235239028931, "learning_rate": 9.518047987276421e-05, "loss": 2.6452, "step": 526 }, { "epoch": 0.18501160702553546, "grad_norm": 0.47292667627334595, "learning_rate": 9.513071920990179e-05, "loss": 2.5848, "step": 528 }, { "epoch": 0.18571240856729884, "grad_norm": 0.5438964366912842, "learning_rate": 9.508071612463086e-05, "loss": 2.5332, "step": 530 }, { "epoch": 0.18641321010906223, "grad_norm": 0.5318060517311096, "learning_rate": 9.503047088554601e-05, "loss": 2.585, "step": 532 }, { "epoch": 0.18711401165082564, "grad_norm": 0.49279502034187317, "learning_rate": 9.497998376254267e-05, "loss": 2.5948, "step": 534 }, { "epoch": 0.18781481319258903, "grad_norm": 0.5161717534065247, "learning_rate": 9.492925502681545e-05, "loss": 2.5644, "step": 536 }, { "epoch": 0.1885156147343524, "grad_norm": 0.4586479663848877, "learning_rate": 9.487828495085684e-05, "loss": 2.5568, "step": 538 }, { "epoch": 0.1892164162761158, "grad_norm": 0.4390322268009186, "learning_rate": 9.482707380845573e-05, "loss": 2.5938, "step": 540 }, { "epoch": 0.1899172178178792, "grad_norm": 0.5253728628158569, "learning_rate": 9.47756218746959e-05, "loss": 2.5996, "step": 542 }, { "epoch": 0.1906180193596426, "grad_norm": 0.4567623436450958, "learning_rate": 9.472392942595454e-05, "loss": 2.5576, "step": 544 }, { "epoch": 0.19131882090140598, "grad_norm": 0.5091727375984192, "learning_rate": 9.467199673990077e-05, "loss": 2.5873, "step": 546 }, { "epoch": 0.19201962244316936, "grad_norm": 0.4959392845630646, "learning_rate": 9.46198240954942e-05, "loss": 2.5291, "step": 548 }, { "epoch": 0.19272042398493278, "grad_norm": 0.5150632262229919, "learning_rate": 9.456741177298336e-05, "loss": 2.5503, "step": 550 }, { "epoch": 0.19342122552669616, "grad_norm": 0.4603368639945984, "learning_rate": 9.451476005390422e-05, "loss": 2.5785, "step": 552 }, { "epoch": 0.19412202706845955, "grad_norm": 0.4441729784011841, "learning_rate": 9.446186922107873e-05, "loss": 2.5512, "step": 554 }, { "epoch": 0.19482282861022293, "grad_norm": 0.5432455539703369, "learning_rate": 9.44087395586132e-05, "loss": 2.5741, "step": 556 }, { "epoch": 0.19552363015198634, "grad_norm": 0.42969366908073425, "learning_rate": 9.435537135189687e-05, "loss": 2.5677, "step": 558 }, { "epoch": 0.19622443169374973, "grad_norm": 0.5706619620323181, "learning_rate": 9.430176488760027e-05, "loss": 2.556, "step": 560 }, { "epoch": 0.19692523323551311, "grad_norm": 0.7202513217926025, "learning_rate": 9.424792045367383e-05, "loss": 2.5435, "step": 562 }, { "epoch": 0.1976260347772765, "grad_norm": 0.5471363663673401, "learning_rate": 9.419383833934621e-05, "loss": 2.572, "step": 564 }, { "epoch": 0.1983268363190399, "grad_norm": 0.654058575630188, "learning_rate": 9.413951883512275e-05, "loss": 2.5432, "step": 566 }, { "epoch": 0.1990276378608033, "grad_norm": 0.6124361157417297, "learning_rate": 9.408496223278403e-05, "loss": 2.5803, "step": 568 }, { "epoch": 0.19972843940256668, "grad_norm": 0.5291132926940918, "learning_rate": 9.403016882538408e-05, "loss": 2.576, "step": 570 }, { "epoch": 0.20042924094433007, "grad_norm": 0.6087374687194824, "learning_rate": 9.397513890724911e-05, "loss": 2.5171, "step": 572 }, { "epoch": 0.20113004248609348, "grad_norm": 0.5776922106742859, "learning_rate": 9.391987277397566e-05, "loss": 2.6054, "step": 574 }, { "epoch": 0.20183084402785686, "grad_norm": 0.544319748878479, "learning_rate": 9.38643707224291e-05, "loss": 2.548, "step": 576 }, { "epoch": 0.20253164556962025, "grad_norm": 0.5210007429122925, "learning_rate": 9.38086330507421e-05, "loss": 2.6019, "step": 578 }, { "epoch": 0.20323244711138364, "grad_norm": 0.5160629153251648, "learning_rate": 9.375266005831297e-05, "loss": 2.6046, "step": 580 }, { "epoch": 0.20393324865314705, "grad_norm": 0.6452796459197998, "learning_rate": 9.369645204580403e-05, "loss": 2.566, "step": 582 }, { "epoch": 0.20463405019491043, "grad_norm": 0.5813329815864563, "learning_rate": 9.364000931514008e-05, "loss": 2.5661, "step": 584 }, { "epoch": 0.20533485173667382, "grad_norm": 0.5450593829154968, "learning_rate": 9.358333216950664e-05, "loss": 2.5769, "step": 586 }, { "epoch": 0.2060356532784372, "grad_norm": 0.5340794324874878, "learning_rate": 9.352642091334849e-05, "loss": 2.5549, "step": 588 }, { "epoch": 0.20673645482020062, "grad_norm": 0.5767348408699036, "learning_rate": 9.34692758523679e-05, "loss": 2.5604, "step": 590 }, { "epoch": 0.207437256361964, "grad_norm": 0.6048093438148499, "learning_rate": 9.341189729352302e-05, "loss": 2.5929, "step": 592 }, { "epoch": 0.20813805790372739, "grad_norm": 0.4430505335330963, "learning_rate": 9.33542855450263e-05, "loss": 2.5563, "step": 594 }, { "epoch": 0.20883885944549077, "grad_norm": 0.49373888969421387, "learning_rate": 9.329644091634278e-05, "loss": 2.5517, "step": 596 }, { "epoch": 0.20953966098725418, "grad_norm": 0.5227393507957458, "learning_rate": 9.323836371818837e-05, "loss": 2.5286, "step": 598 }, { "epoch": 0.21024046252901757, "grad_norm": 0.497405081987381, "learning_rate": 9.318005426252832e-05, "loss": 2.5638, "step": 600 } ], "logging_steps": 2, "max_steps": 2854, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.755100672393216e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }