diff --git "a/q2.5-ga/checkpoint-1500/trainer_state.json" "b/q2.5-ga/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/q2.5-ga/checkpoint-1500/trainer_state.json" @@ -0,0 +1,5291 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5256011563225439, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00035040077088169594, + "grad_norm": 6.5142412185668945, + "learning_rate": 0.0, + "loss": 5.324, + "step": 1 + }, + { + "epoch": 0.0007008015417633919, + "grad_norm": 6.758334159851074, + "learning_rate": 6.993006993006994e-07, + "loss": 5.3405, + "step": 2 + }, + { + "epoch": 0.0014016030835267838, + "grad_norm": 6.22674036026001, + "learning_rate": 2.0979020979020983e-06, + "loss": 5.3286, + "step": 4 + }, + { + "epoch": 0.0021024046252901755, + "grad_norm": 5.438386917114258, + "learning_rate": 3.496503496503497e-06, + "loss": 5.25, + "step": 6 + }, + { + "epoch": 0.0028032061670535675, + "grad_norm": 3.365504741668701, + "learning_rate": 4.895104895104895e-06, + "loss": 5.2821, + "step": 8 + }, + { + "epoch": 0.0035040077088169595, + "grad_norm": 7.186147212982178, + "learning_rate": 6.2937062937062944e-06, + "loss": 5.21, + "step": 10 + }, + { + "epoch": 0.004204809250580351, + "grad_norm": 4.960826396942139, + "learning_rate": 7.692307692307694e-06, + "loss": 5.0759, + "step": 12 + }, + { + "epoch": 0.004905610792343743, + "grad_norm": 4.001464366912842, + "learning_rate": 9.090909090909091e-06, + "loss": 5.1092, + "step": 14 + }, + { + "epoch": 0.005606412334107135, + "grad_norm": 3.2986342906951904, + "learning_rate": 1.048951048951049e-05, + "loss": 4.93, + "step": 16 + }, + { + "epoch": 0.006307213875870527, + "grad_norm": 2.5407276153564453, + "learning_rate": 1.188811188811189e-05, + "loss": 4.8535, + "step": 18 + }, + { + "epoch": 0.007008015417633919, + "grad_norm": 2.211754083633423, + "learning_rate": 1.3286713286713287e-05, + "loss": 4.74, + "step": 20 + }, + { + "epoch": 0.007708816959397311, + "grad_norm": 1.6710195541381836, + "learning_rate": 1.4685314685314686e-05, + "loss": 4.609, + "step": 22 + }, + { + "epoch": 0.008409618501160702, + "grad_norm": 1.280752182006836, + "learning_rate": 1.6083916083916083e-05, + "loss": 4.4879, + "step": 24 + }, + { + "epoch": 0.009110420042924094, + "grad_norm": 1.312186598777771, + "learning_rate": 1.7482517482517483e-05, + "loss": 4.3995, + "step": 26 + }, + { + "epoch": 0.009811221584687486, + "grad_norm": 1.3315190076828003, + "learning_rate": 1.888111888111888e-05, + "loss": 4.3005, + "step": 28 + }, + { + "epoch": 0.010512023126450878, + "grad_norm": 1.3252590894699097, + "learning_rate": 2.027972027972028e-05, + "loss": 4.1952, + "step": 30 + }, + { + "epoch": 0.01121282466821427, + "grad_norm": 1.3794758319854736, + "learning_rate": 2.1678321678321677e-05, + "loss": 4.1459, + "step": 32 + }, + { + "epoch": 0.011913626209977662, + "grad_norm": 1.1808068752288818, + "learning_rate": 2.307692307692308e-05, + "loss": 4.034, + "step": 34 + }, + { + "epoch": 0.012614427751741054, + "grad_norm": 1.31660795211792, + "learning_rate": 2.4475524475524478e-05, + "loss": 3.926, + "step": 36 + }, + { + "epoch": 0.013315229293504446, + "grad_norm": 1.0347495079040527, + "learning_rate": 2.5874125874125877e-05, + "loss": 3.8812, + "step": 38 + }, + { + "epoch": 0.014016030835267838, + "grad_norm": 1.050775408744812, + "learning_rate": 2.7272727272727273e-05, + "loss": 3.7787, + "step": 40 + }, + { + "epoch": 0.01471683237703123, + "grad_norm": 0.9461761713027954, + "learning_rate": 2.8671328671328672e-05, + "loss": 3.6738, + "step": 42 + }, + { + "epoch": 0.015417633918794622, + "grad_norm": 1.0460454225540161, + "learning_rate": 3.0069930069930068e-05, + "loss": 3.6385, + "step": 44 + }, + { + "epoch": 0.016118435460558012, + "grad_norm": 1.0687191486358643, + "learning_rate": 3.146853146853147e-05, + "loss": 3.5701, + "step": 46 + }, + { + "epoch": 0.016819237002321404, + "grad_norm": 1.4722611904144287, + "learning_rate": 3.2867132867132866e-05, + "loss": 3.5438, + "step": 48 + }, + { + "epoch": 0.017520038544084796, + "grad_norm": 1.1305724382400513, + "learning_rate": 3.4265734265734265e-05, + "loss": 3.4694, + "step": 50 + }, + { + "epoch": 0.018220840085848188, + "grad_norm": 0.9322625994682312, + "learning_rate": 3.566433566433567e-05, + "loss": 3.4488, + "step": 52 + }, + { + "epoch": 0.01892164162761158, + "grad_norm": 1.2441555261611938, + "learning_rate": 3.7062937062937064e-05, + "loss": 3.4289, + "step": 54 + }, + { + "epoch": 0.019622443169374972, + "grad_norm": 0.9397731423377991, + "learning_rate": 3.846153846153846e-05, + "loss": 3.4021, + "step": 56 + }, + { + "epoch": 0.020323244711138364, + "grad_norm": 1.3261164426803589, + "learning_rate": 3.986013986013986e-05, + "loss": 3.3575, + "step": 58 + }, + { + "epoch": 0.021024046252901756, + "grad_norm": 1.08541738986969, + "learning_rate": 4.125874125874126e-05, + "loss": 3.3403, + "step": 60 + }, + { + "epoch": 0.021724847794665148, + "grad_norm": 0.8626166582107544, + "learning_rate": 4.265734265734266e-05, + "loss": 3.3306, + "step": 62 + }, + { + "epoch": 0.02242564933642854, + "grad_norm": 1.0596344470977783, + "learning_rate": 4.405594405594406e-05, + "loss": 3.2779, + "step": 64 + }, + { + "epoch": 0.023126450878191932, + "grad_norm": 1.511917233467102, + "learning_rate": 4.545454545454546e-05, + "loss": 3.2759, + "step": 66 + }, + { + "epoch": 0.023827252419955324, + "grad_norm": 1.2062046527862549, + "learning_rate": 4.685314685314686e-05, + "loss": 3.2545, + "step": 68 + }, + { + "epoch": 0.024528053961718716, + "grad_norm": 1.1399930715560913, + "learning_rate": 4.825174825174825e-05, + "loss": 3.2235, + "step": 70 + }, + { + "epoch": 0.025228855503482108, + "grad_norm": 0.8960133790969849, + "learning_rate": 4.9650349650349656e-05, + "loss": 3.2025, + "step": 72 + }, + { + "epoch": 0.0259296570452455, + "grad_norm": 1.3042056560516357, + "learning_rate": 5.1048951048951055e-05, + "loss": 3.1475, + "step": 74 + }, + { + "epoch": 0.026630458587008892, + "grad_norm": 1.186320424079895, + "learning_rate": 5.244755244755245e-05, + "loss": 3.1759, + "step": 76 + }, + { + "epoch": 0.027331260128772284, + "grad_norm": 1.2691158056259155, + "learning_rate": 5.384615384615385e-05, + "loss": 3.1296, + "step": 78 + }, + { + "epoch": 0.028032061670535676, + "grad_norm": 0.7816159129142761, + "learning_rate": 5.524475524475524e-05, + "loss": 3.1017, + "step": 80 + }, + { + "epoch": 0.028732863212299068, + "grad_norm": 1.1489295959472656, + "learning_rate": 5.664335664335665e-05, + "loss": 3.1151, + "step": 82 + }, + { + "epoch": 0.02943366475406246, + "grad_norm": 1.5686062574386597, + "learning_rate": 5.8041958041958044e-05, + "loss": 3.114, + "step": 84 + }, + { + "epoch": 0.030134466295825852, + "grad_norm": 1.4421433210372925, + "learning_rate": 5.944055944055944e-05, + "loss": 3.0946, + "step": 86 + }, + { + "epoch": 0.030835267837589244, + "grad_norm": 1.335250973701477, + "learning_rate": 6.083916083916085e-05, + "loss": 3.084, + "step": 88 + }, + { + "epoch": 0.03153606937935263, + "grad_norm": 0.970507800579071, + "learning_rate": 6.223776223776224e-05, + "loss": 3.1163, + "step": 90 + }, + { + "epoch": 0.032236870921116025, + "grad_norm": 1.2849407196044922, + "learning_rate": 6.363636363636364e-05, + "loss": 3.063, + "step": 92 + }, + { + "epoch": 0.032937672462879417, + "grad_norm": 1.0378247499465942, + "learning_rate": 6.503496503496504e-05, + "loss": 3.0223, + "step": 94 + }, + { + "epoch": 0.03363847400464281, + "grad_norm": 1.3139392137527466, + "learning_rate": 6.643356643356644e-05, + "loss": 3.0572, + "step": 96 + }, + { + "epoch": 0.0343392755464062, + "grad_norm": 1.254752278327942, + "learning_rate": 6.783216783216784e-05, + "loss": 3.0408, + "step": 98 + }, + { + "epoch": 0.03504007708816959, + "grad_norm": 1.3333168029785156, + "learning_rate": 6.923076923076924e-05, + "loss": 3.0185, + "step": 100 + }, + { + "epoch": 0.035740878629932984, + "grad_norm": 1.2795464992523193, + "learning_rate": 7.062937062937062e-05, + "loss": 3.0328, + "step": 102 + }, + { + "epoch": 0.036441680171696376, + "grad_norm": 1.2025645971298218, + "learning_rate": 7.202797202797204e-05, + "loss": 3.0303, + "step": 104 + }, + { + "epoch": 0.03714248171345977, + "grad_norm": 1.1741266250610352, + "learning_rate": 7.342657342657343e-05, + "loss": 3.0252, + "step": 106 + }, + { + "epoch": 0.03784328325522316, + "grad_norm": 1.2022653818130493, + "learning_rate": 7.482517482517482e-05, + "loss": 3.0183, + "step": 108 + }, + { + "epoch": 0.03854408479698655, + "grad_norm": 1.1950666904449463, + "learning_rate": 7.622377622377622e-05, + "loss": 2.9804, + "step": 110 + }, + { + "epoch": 0.039244886338749944, + "grad_norm": 1.5780822038650513, + "learning_rate": 7.762237762237763e-05, + "loss": 2.9804, + "step": 112 + }, + { + "epoch": 0.039945687880513336, + "grad_norm": 1.0478655099868774, + "learning_rate": 7.902097902097903e-05, + "loss": 2.9894, + "step": 114 + }, + { + "epoch": 0.04064648942227673, + "grad_norm": 1.1782268285751343, + "learning_rate": 8.041958041958042e-05, + "loss": 2.9717, + "step": 116 + }, + { + "epoch": 0.04134729096404012, + "grad_norm": 1.0321820974349976, + "learning_rate": 8.181818181818183e-05, + "loss": 2.9776, + "step": 118 + }, + { + "epoch": 0.04204809250580351, + "grad_norm": 0.9697206020355225, + "learning_rate": 8.321678321678323e-05, + "loss": 2.9804, + "step": 120 + }, + { + "epoch": 0.042748894047566904, + "grad_norm": 1.1984606981277466, + "learning_rate": 8.461538461538461e-05, + "loss": 2.9495, + "step": 122 + }, + { + "epoch": 0.043449695589330296, + "grad_norm": 0.9830178618431091, + "learning_rate": 8.601398601398601e-05, + "loss": 2.9656, + "step": 124 + }, + { + "epoch": 0.04415049713109369, + "grad_norm": 1.3105114698410034, + "learning_rate": 8.741258741258743e-05, + "loss": 2.9306, + "step": 126 + }, + { + "epoch": 0.04485129867285708, + "grad_norm": 1.3499157428741455, + "learning_rate": 8.881118881118881e-05, + "loss": 2.9381, + "step": 128 + }, + { + "epoch": 0.04555210021462047, + "grad_norm": 0.9977575540542603, + "learning_rate": 9.020979020979021e-05, + "loss": 2.907, + "step": 130 + }, + { + "epoch": 0.046252901756383864, + "grad_norm": 1.2331498861312866, + "learning_rate": 9.160839160839161e-05, + "loss": 2.9224, + "step": 132 + }, + { + "epoch": 0.046953703298147256, + "grad_norm": 1.451253890991211, + "learning_rate": 9.300699300699301e-05, + "loss": 2.9202, + "step": 134 + }, + { + "epoch": 0.04765450483991065, + "grad_norm": 1.2146471738815308, + "learning_rate": 9.440559440559441e-05, + "loss": 2.9098, + "step": 136 + }, + { + "epoch": 0.04835530638167404, + "grad_norm": 1.0873245000839233, + "learning_rate": 9.580419580419581e-05, + "loss": 2.9218, + "step": 138 + }, + { + "epoch": 0.04905610792343743, + "grad_norm": 1.276413083076477, + "learning_rate": 9.72027972027972e-05, + "loss": 2.8947, + "step": 140 + }, + { + "epoch": 0.049756909465200824, + "grad_norm": 1.126065731048584, + "learning_rate": 9.86013986013986e-05, + "loss": 2.8788, + "step": 142 + }, + { + "epoch": 0.050457711006964216, + "grad_norm": 1.5177017450332642, + "learning_rate": 0.0001, + "loss": 2.9043, + "step": 144 + }, + { + "epoch": 0.05115851254872761, + "grad_norm": 1.3744112253189087, + "learning_rate": 9.99998657109765e-05, + "loss": 2.888, + "step": 146 + }, + { + "epoch": 0.051859314090491, + "grad_norm": 1.7921055555343628, + "learning_rate": 9.999946284462733e-05, + "loss": 2.8631, + "step": 148 + }, + { + "epoch": 0.05256011563225439, + "grad_norm": 1.1755317449569702, + "learning_rate": 9.999879140311652e-05, + "loss": 2.8735, + "step": 150 + }, + { + "epoch": 0.053260917174017784, + "grad_norm": 0.846362292766571, + "learning_rate": 9.999785139005073e-05, + "loss": 2.8768, + "step": 152 + }, + { + "epoch": 0.053961718715781176, + "grad_norm": 0.9867280721664429, + "learning_rate": 9.999664281047933e-05, + "loss": 2.8859, + "step": 154 + }, + { + "epoch": 0.05466252025754457, + "grad_norm": 0.9751666188240051, + "learning_rate": 9.999516567089429e-05, + "loss": 2.8497, + "step": 156 + }, + { + "epoch": 0.05536332179930796, + "grad_norm": 1.0603703260421753, + "learning_rate": 9.999341997923011e-05, + "loss": 2.8404, + "step": 158 + }, + { + "epoch": 0.05606412334107135, + "grad_norm": 1.0447975397109985, + "learning_rate": 9.999140574486392e-05, + "loss": 2.9092, + "step": 160 + }, + { + "epoch": 0.056764924882834744, + "grad_norm": 1.3046443462371826, + "learning_rate": 9.998912297861527e-05, + "loss": 2.8971, + "step": 162 + }, + { + "epoch": 0.057465726424598136, + "grad_norm": 1.1029243469238281, + "learning_rate": 9.998657169274622e-05, + "loss": 2.8834, + "step": 164 + }, + { + "epoch": 0.05816652796636153, + "grad_norm": 0.8594210743904114, + "learning_rate": 9.99837519009611e-05, + "loss": 2.8361, + "step": 166 + }, + { + "epoch": 0.05886732950812492, + "grad_norm": 0.8585363030433655, + "learning_rate": 9.998066361840665e-05, + "loss": 2.8782, + "step": 168 + }, + { + "epoch": 0.05956813104988831, + "grad_norm": 0.693467378616333, + "learning_rate": 9.997730686167173e-05, + "loss": 2.8537, + "step": 170 + }, + { + "epoch": 0.060268932591651704, + "grad_norm": 0.8418940305709839, + "learning_rate": 9.997368164878738e-05, + "loss": 2.8294, + "step": 172 + }, + { + "epoch": 0.060969734133415096, + "grad_norm": 0.9938271045684814, + "learning_rate": 9.996978799922665e-05, + "loss": 2.8458, + "step": 174 + }, + { + "epoch": 0.06167053567517849, + "grad_norm": 1.0347217321395874, + "learning_rate": 9.99656259339045e-05, + "loss": 2.8081, + "step": 176 + }, + { + "epoch": 0.06237133721694188, + "grad_norm": 0.9216743111610413, + "learning_rate": 9.996119547517775e-05, + "loss": 2.8655, + "step": 178 + }, + { + "epoch": 0.06307213875870527, + "grad_norm": 1.0579859018325806, + "learning_rate": 9.995649664684486e-05, + "loss": 2.823, + "step": 180 + }, + { + "epoch": 0.06377294030046866, + "grad_norm": 0.9864194393157959, + "learning_rate": 9.995152947414586e-05, + "loss": 2.8081, + "step": 182 + }, + { + "epoch": 0.06447374184223205, + "grad_norm": 0.8999143838882446, + "learning_rate": 9.994629398376226e-05, + "loss": 2.7947, + "step": 184 + }, + { + "epoch": 0.06517454338399545, + "grad_norm": 0.9121315479278564, + "learning_rate": 9.994079020381676e-05, + "loss": 2.8253, + "step": 186 + }, + { + "epoch": 0.06587534492575883, + "grad_norm": 0.8578842282295227, + "learning_rate": 9.993501816387329e-05, + "loss": 2.7548, + "step": 188 + }, + { + "epoch": 0.06657614646752223, + "grad_norm": 0.8564820289611816, + "learning_rate": 9.992897789493672e-05, + "loss": 2.8361, + "step": 190 + }, + { + "epoch": 0.06727694800928562, + "grad_norm": 0.8013344407081604, + "learning_rate": 9.992266942945269e-05, + "loss": 2.8606, + "step": 192 + }, + { + "epoch": 0.06797774955104902, + "grad_norm": 0.7343975901603699, + "learning_rate": 9.991609280130752e-05, + "loss": 2.7947, + "step": 194 + }, + { + "epoch": 0.0686785510928124, + "grad_norm": 0.7338536381721497, + "learning_rate": 9.990924804582797e-05, + "loss": 2.7492, + "step": 196 + }, + { + "epoch": 0.0693793526345758, + "grad_norm": 0.828781008720398, + "learning_rate": 9.990213519978109e-05, + "loss": 2.8013, + "step": 198 + }, + { + "epoch": 0.07008015417633918, + "grad_norm": 0.7156624794006348, + "learning_rate": 9.989475430137391e-05, + "loss": 2.7943, + "step": 200 + }, + { + "epoch": 0.07078095571810258, + "grad_norm": 0.6014353632926941, + "learning_rate": 9.988710539025341e-05, + "loss": 2.8099, + "step": 202 + }, + { + "epoch": 0.07148175725986597, + "grad_norm": 0.6569661498069763, + "learning_rate": 9.987918850750619e-05, + "loss": 2.8125, + "step": 204 + }, + { + "epoch": 0.07218255880162937, + "grad_norm": 0.6558775305747986, + "learning_rate": 9.987100369565825e-05, + "loss": 2.7487, + "step": 206 + }, + { + "epoch": 0.07288336034339275, + "grad_norm": 0.6454245448112488, + "learning_rate": 9.986255099867481e-05, + "loss": 2.7648, + "step": 208 + }, + { + "epoch": 0.07358416188515615, + "grad_norm": 0.5741921067237854, + "learning_rate": 9.985383046196004e-05, + "loss": 2.7743, + "step": 210 + }, + { + "epoch": 0.07428496342691954, + "grad_norm": 0.5875937938690186, + "learning_rate": 9.984484213235685e-05, + "loss": 2.7728, + "step": 212 + }, + { + "epoch": 0.07498576496868294, + "grad_norm": 0.6638422012329102, + "learning_rate": 9.98355860581466e-05, + "loss": 2.7504, + "step": 214 + }, + { + "epoch": 0.07568656651044632, + "grad_norm": 1.1614341735839844, + "learning_rate": 9.982606228904884e-05, + "loss": 2.7923, + "step": 216 + }, + { + "epoch": 0.07638736805220972, + "grad_norm": 1.005254864692688, + "learning_rate": 9.981627087622108e-05, + "loss": 2.76, + "step": 218 + }, + { + "epoch": 0.0770881695939731, + "grad_norm": 0.7738555669784546, + "learning_rate": 9.980621187225852e-05, + "loss": 2.7866, + "step": 220 + }, + { + "epoch": 0.0777889711357365, + "grad_norm": 0.9469527006149292, + "learning_rate": 9.979588533119367e-05, + "loss": 2.8012, + "step": 222 + }, + { + "epoch": 0.07848977267749989, + "grad_norm": 0.9031473398208618, + "learning_rate": 9.978529130849619e-05, + "loss": 2.7522, + "step": 224 + }, + { + "epoch": 0.07919057421926329, + "grad_norm": 0.9450514912605286, + "learning_rate": 9.977442986107252e-05, + "loss": 2.7791, + "step": 226 + }, + { + "epoch": 0.07989137576102667, + "grad_norm": 0.7259206771850586, + "learning_rate": 9.97633010472656e-05, + "loss": 2.7237, + "step": 228 + }, + { + "epoch": 0.08059217730279007, + "grad_norm": 0.6595309972763062, + "learning_rate": 9.975190492685451e-05, + "loss": 2.7284, + "step": 230 + }, + { + "epoch": 0.08129297884455346, + "grad_norm": 0.7696382999420166, + "learning_rate": 9.974024156105422e-05, + "loss": 2.7631, + "step": 232 + }, + { + "epoch": 0.08199378038631686, + "grad_norm": 0.7305110096931458, + "learning_rate": 9.972831101251521e-05, + "loss": 2.7793, + "step": 234 + }, + { + "epoch": 0.08269458192808024, + "grad_norm": 0.6039514541625977, + "learning_rate": 9.971611334532314e-05, + "loss": 2.7669, + "step": 236 + }, + { + "epoch": 0.08339538346984364, + "grad_norm": 0.5824711918830872, + "learning_rate": 9.970364862499852e-05, + "loss": 2.7476, + "step": 238 + }, + { + "epoch": 0.08409618501160702, + "grad_norm": 0.6831758618354797, + "learning_rate": 9.969091691849637e-05, + "loss": 2.7098, + "step": 240 + }, + { + "epoch": 0.08479698655337042, + "grad_norm": 0.6469074487686157, + "learning_rate": 9.967791829420581e-05, + "loss": 2.7609, + "step": 242 + }, + { + "epoch": 0.08549778809513381, + "grad_norm": 0.5876832604408264, + "learning_rate": 9.966465282194976e-05, + "loss": 2.7306, + "step": 244 + }, + { + "epoch": 0.08619858963689721, + "grad_norm": 0.6310129761695862, + "learning_rate": 9.965112057298451e-05, + "loss": 2.7283, + "step": 246 + }, + { + "epoch": 0.08689939117866059, + "grad_norm": 0.6113069653511047, + "learning_rate": 9.963732161999935e-05, + "loss": 2.7274, + "step": 248 + }, + { + "epoch": 0.08760019272042399, + "grad_norm": 1.0655111074447632, + "learning_rate": 9.96232560371162e-05, + "loss": 2.7022, + "step": 250 + }, + { + "epoch": 0.08830099426218738, + "grad_norm": 0.8412613272666931, + "learning_rate": 9.960892389988918e-05, + "loss": 2.7213, + "step": 252 + }, + { + "epoch": 0.08900179580395078, + "grad_norm": 0.7329776883125305, + "learning_rate": 9.959432528530428e-05, + "loss": 2.7343, + "step": 254 + }, + { + "epoch": 0.08970259734571416, + "grad_norm": 0.702498197555542, + "learning_rate": 9.95794602717788e-05, + "loss": 2.7642, + "step": 256 + }, + { + "epoch": 0.09040339888747755, + "grad_norm": 0.6936408281326294, + "learning_rate": 9.95643289391611e-05, + "loss": 2.7081, + "step": 258 + }, + { + "epoch": 0.09110420042924094, + "grad_norm": 0.664743959903717, + "learning_rate": 9.954893136873005e-05, + "loss": 2.7054, + "step": 260 + }, + { + "epoch": 0.09180500197100433, + "grad_norm": 0.5716791152954102, + "learning_rate": 9.953326764319463e-05, + "loss": 2.6751, + "step": 262 + }, + { + "epoch": 0.09250580351276773, + "grad_norm": 0.6207195520401001, + "learning_rate": 9.95173378466935e-05, + "loss": 2.6945, + "step": 264 + }, + { + "epoch": 0.09320660505453111, + "grad_norm": 0.6572092771530151, + "learning_rate": 9.950114206479453e-05, + "loss": 2.6989, + "step": 266 + }, + { + "epoch": 0.09390740659629451, + "grad_norm": 0.7676830887794495, + "learning_rate": 9.948468038449435e-05, + "loss": 2.7613, + "step": 268 + }, + { + "epoch": 0.0946082081380579, + "grad_norm": 0.5810503959655762, + "learning_rate": 9.946795289421787e-05, + "loss": 2.7234, + "step": 270 + }, + { + "epoch": 0.0953090096798213, + "grad_norm": 0.6459682583808899, + "learning_rate": 9.945095968381784e-05, + "loss": 2.717, + "step": 272 + }, + { + "epoch": 0.09600981122158468, + "grad_norm": 0.6498464345932007, + "learning_rate": 9.94337008445743e-05, + "loss": 2.7389, + "step": 274 + }, + { + "epoch": 0.09671061276334808, + "grad_norm": 0.6287350654602051, + "learning_rate": 9.941617646919421e-05, + "loss": 2.681, + "step": 276 + }, + { + "epoch": 0.09741141430511147, + "grad_norm": 0.7516258955001831, + "learning_rate": 9.939838665181076e-05, + "loss": 2.6696, + "step": 278 + }, + { + "epoch": 0.09811221584687486, + "grad_norm": 0.6962350606918335, + "learning_rate": 9.938033148798307e-05, + "loss": 2.6971, + "step": 280 + }, + { + "epoch": 0.09881301738863825, + "grad_norm": 0.6605144739151001, + "learning_rate": 9.936201107469555e-05, + "loss": 2.6999, + "step": 282 + }, + { + "epoch": 0.09951381893040165, + "grad_norm": 0.5991240739822388, + "learning_rate": 9.93434255103574e-05, + "loss": 2.6936, + "step": 284 + }, + { + "epoch": 0.10021462047216503, + "grad_norm": 0.5660961866378784, + "learning_rate": 9.932457489480213e-05, + "loss": 2.686, + "step": 286 + }, + { + "epoch": 0.10091542201392843, + "grad_norm": 0.690290093421936, + "learning_rate": 9.930545932928698e-05, + "loss": 2.6809, + "step": 288 + }, + { + "epoch": 0.10161622355569182, + "grad_norm": 0.7119167447090149, + "learning_rate": 9.928607891649234e-05, + "loss": 2.7221, + "step": 290 + }, + { + "epoch": 0.10231702509745522, + "grad_norm": 0.7049365639686584, + "learning_rate": 9.926643376052131e-05, + "loss": 2.6569, + "step": 292 + }, + { + "epoch": 0.1030178266392186, + "grad_norm": 0.6691743731498718, + "learning_rate": 9.924652396689902e-05, + "loss": 2.6751, + "step": 294 + }, + { + "epoch": 0.103718628180982, + "grad_norm": 0.5533433556556702, + "learning_rate": 9.922634964257215e-05, + "loss": 2.7064, + "step": 296 + }, + { + "epoch": 0.10441942972274539, + "grad_norm": 0.6669672727584839, + "learning_rate": 9.920591089590831e-05, + "loss": 2.687, + "step": 298 + }, + { + "epoch": 0.10512023126450878, + "grad_norm": 0.8539720773696899, + "learning_rate": 9.918520783669549e-05, + "loss": 2.6968, + "step": 300 + }, + { + "epoch": 0.10582103280627217, + "grad_norm": 0.827905535697937, + "learning_rate": 9.916424057614142e-05, + "loss": 2.7339, + "step": 302 + }, + { + "epoch": 0.10652183434803557, + "grad_norm": 0.7071542143821716, + "learning_rate": 9.9143009226873e-05, + "loss": 2.67, + "step": 304 + }, + { + "epoch": 0.10722263588979895, + "grad_norm": 0.6667853593826294, + "learning_rate": 9.912151390293575e-05, + "loss": 2.7113, + "step": 306 + }, + { + "epoch": 0.10792343743156235, + "grad_norm": 0.49210044741630554, + "learning_rate": 9.90997547197931e-05, + "loss": 2.7034, + "step": 308 + }, + { + "epoch": 0.10862423897332574, + "grad_norm": 0.5823047757148743, + "learning_rate": 9.907773179432581e-05, + "loss": 2.6815, + "step": 310 + }, + { + "epoch": 0.10932504051508914, + "grad_norm": 0.5159279704093933, + "learning_rate": 9.905544524483138e-05, + "loss": 2.7055, + "step": 312 + }, + { + "epoch": 0.11002584205685252, + "grad_norm": 0.5294278264045715, + "learning_rate": 9.903289519102338e-05, + "loss": 2.6821, + "step": 314 + }, + { + "epoch": 0.11072664359861592, + "grad_norm": 0.5865507125854492, + "learning_rate": 9.901008175403078e-05, + "loss": 2.698, + "step": 316 + }, + { + "epoch": 0.1114274451403793, + "grad_norm": 0.7102755904197693, + "learning_rate": 9.898700505639735e-05, + "loss": 2.693, + "step": 318 + }, + { + "epoch": 0.1121282466821427, + "grad_norm": 0.8151699900627136, + "learning_rate": 9.8963665222081e-05, + "loss": 2.6482, + "step": 320 + }, + { + "epoch": 0.11282904822390609, + "grad_norm": 0.5769193172454834, + "learning_rate": 9.894006237645304e-05, + "loss": 2.6893, + "step": 322 + }, + { + "epoch": 0.11352984976566949, + "grad_norm": 0.6606284976005554, + "learning_rate": 9.891619664629762e-05, + "loss": 2.6859, + "step": 324 + }, + { + "epoch": 0.11423065130743287, + "grad_norm": 0.5883016586303711, + "learning_rate": 9.889206815981094e-05, + "loss": 2.6622, + "step": 326 + }, + { + "epoch": 0.11493145284919627, + "grad_norm": 0.5413339734077454, + "learning_rate": 9.886767704660067e-05, + "loss": 2.6718, + "step": 328 + }, + { + "epoch": 0.11563225439095966, + "grad_norm": 0.7391770482063293, + "learning_rate": 9.884302343768512e-05, + "loss": 2.6695, + "step": 330 + }, + { + "epoch": 0.11633305593272306, + "grad_norm": 0.7529366612434387, + "learning_rate": 9.881810746549267e-05, + "loss": 2.7341, + "step": 332 + }, + { + "epoch": 0.11703385747448644, + "grad_norm": 0.6971571445465088, + "learning_rate": 9.8792929263861e-05, + "loss": 2.6444, + "step": 334 + }, + { + "epoch": 0.11773465901624984, + "grad_norm": 0.544129490852356, + "learning_rate": 9.876748896803633e-05, + "loss": 2.7351, + "step": 336 + }, + { + "epoch": 0.11843546055801323, + "grad_norm": 0.6561135649681091, + "learning_rate": 9.874178671467277e-05, + "loss": 2.6896, + "step": 338 + }, + { + "epoch": 0.11913626209977662, + "grad_norm": 0.6607089042663574, + "learning_rate": 9.871582264183155e-05, + "loss": 2.6664, + "step": 340 + }, + { + "epoch": 0.11983706364154001, + "grad_norm": 0.6727411150932312, + "learning_rate": 9.868959688898023e-05, + "loss": 2.68, + "step": 342 + }, + { + "epoch": 0.12053786518330341, + "grad_norm": 0.5672718286514282, + "learning_rate": 9.86631095969921e-05, + "loss": 2.6639, + "step": 344 + }, + { + "epoch": 0.1212386667250668, + "grad_norm": 0.7188961505889893, + "learning_rate": 9.86363609081452e-05, + "loss": 2.6604, + "step": 346 + }, + { + "epoch": 0.12193946826683019, + "grad_norm": 0.9785953760147095, + "learning_rate": 9.86093509661218e-05, + "loss": 2.6557, + "step": 348 + }, + { + "epoch": 0.12264026980859358, + "grad_norm": 0.7856999635696411, + "learning_rate": 9.85820799160074e-05, + "loss": 2.6418, + "step": 350 + }, + { + "epoch": 0.12334107135035698, + "grad_norm": 0.5956946015357971, + "learning_rate": 9.855454790429015e-05, + "loss": 2.658, + "step": 352 + }, + { + "epoch": 0.12404187289212036, + "grad_norm": 0.6523074507713318, + "learning_rate": 9.852675507885991e-05, + "loss": 2.6743, + "step": 354 + }, + { + "epoch": 0.12474267443388376, + "grad_norm": 0.71266108751297, + "learning_rate": 9.849870158900753e-05, + "loss": 2.6805, + "step": 356 + }, + { + "epoch": 0.12544347597564715, + "grad_norm": 0.5674154162406921, + "learning_rate": 9.847038758542404e-05, + "loss": 2.6678, + "step": 358 + }, + { + "epoch": 0.12614427751741053, + "grad_norm": 0.5430511236190796, + "learning_rate": 9.844181322019983e-05, + "loss": 2.643, + "step": 360 + }, + { + "epoch": 0.12684507905917394, + "grad_norm": 0.508791983127594, + "learning_rate": 9.841297864682388e-05, + "loss": 2.6524, + "step": 362 + }, + { + "epoch": 0.12754588060093733, + "grad_norm": 0.6082713603973389, + "learning_rate": 9.838388402018282e-05, + "loss": 2.6892, + "step": 364 + }, + { + "epoch": 0.1282466821427007, + "grad_norm": 0.6065689325332642, + "learning_rate": 9.835452949656022e-05, + "loss": 2.6083, + "step": 366 + }, + { + "epoch": 0.1289474836844641, + "grad_norm": 0.5220572352409363, + "learning_rate": 9.83249152336357e-05, + "loss": 2.6573, + "step": 368 + }, + { + "epoch": 0.1296482852262275, + "grad_norm": 0.568534791469574, + "learning_rate": 9.829504139048406e-05, + "loss": 2.6266, + "step": 370 + }, + { + "epoch": 0.1303490867679909, + "grad_norm": 0.6165401339530945, + "learning_rate": 9.826490812757452e-05, + "loss": 2.6928, + "step": 372 + }, + { + "epoch": 0.13104988830975428, + "grad_norm": 0.5951835513114929, + "learning_rate": 9.823451560676966e-05, + "loss": 2.6468, + "step": 374 + }, + { + "epoch": 0.13175068985151767, + "grad_norm": 0.4942519962787628, + "learning_rate": 9.820386399132482e-05, + "loss": 2.6493, + "step": 376 + }, + { + "epoch": 0.13245149139328108, + "grad_norm": 0.6185161471366882, + "learning_rate": 9.8172953445887e-05, + "loss": 2.6741, + "step": 378 + }, + { + "epoch": 0.13315229293504446, + "grad_norm": 0.5588895678520203, + "learning_rate": 9.814178413649407e-05, + "loss": 2.6393, + "step": 380 + }, + { + "epoch": 0.13385309447680785, + "grad_norm": 0.6289598941802979, + "learning_rate": 9.811035623057387e-05, + "loss": 2.6022, + "step": 382 + }, + { + "epoch": 0.13455389601857123, + "grad_norm": 0.6258370280265808, + "learning_rate": 9.807866989694334e-05, + "loss": 2.6033, + "step": 384 + }, + { + "epoch": 0.13525469756033462, + "grad_norm": 0.6390899419784546, + "learning_rate": 9.804672530580754e-05, + "loss": 2.6413, + "step": 386 + }, + { + "epoch": 0.13595549910209803, + "grad_norm": 0.6844115257263184, + "learning_rate": 9.801452262875877e-05, + "loss": 2.6339, + "step": 388 + }, + { + "epoch": 0.13665630064386142, + "grad_norm": 0.70540452003479, + "learning_rate": 9.798206203877569e-05, + "loss": 2.6471, + "step": 390 + }, + { + "epoch": 0.1373571021856248, + "grad_norm": 0.7336652278900146, + "learning_rate": 9.794934371022233e-05, + "loss": 2.6348, + "step": 392 + }, + { + "epoch": 0.1380579037273882, + "grad_norm": 0.7155029773712158, + "learning_rate": 9.79163678188472e-05, + "loss": 2.6128, + "step": 394 + }, + { + "epoch": 0.1387587052691516, + "grad_norm": 0.6354189515113831, + "learning_rate": 9.788313454178228e-05, + "loss": 2.6281, + "step": 396 + }, + { + "epoch": 0.13945950681091498, + "grad_norm": 0.596047043800354, + "learning_rate": 9.78496440575422e-05, + "loss": 2.6719, + "step": 398 + }, + { + "epoch": 0.14016030835267837, + "grad_norm": 0.6149719953536987, + "learning_rate": 9.781589654602306e-05, + "loss": 2.625, + "step": 400 + }, + { + "epoch": 0.14086110989444176, + "grad_norm": 0.6066911816596985, + "learning_rate": 9.778189218850174e-05, + "loss": 2.6193, + "step": 402 + }, + { + "epoch": 0.14156191143620517, + "grad_norm": 0.5690994262695312, + "learning_rate": 9.774763116763466e-05, + "loss": 2.6239, + "step": 404 + }, + { + "epoch": 0.14226271297796855, + "grad_norm": 0.532486081123352, + "learning_rate": 9.771311366745703e-05, + "loss": 2.6264, + "step": 406 + }, + { + "epoch": 0.14296351451973194, + "grad_norm": 0.5434598326683044, + "learning_rate": 9.767833987338171e-05, + "loss": 2.6534, + "step": 408 + }, + { + "epoch": 0.14366431606149532, + "grad_norm": 0.522413432598114, + "learning_rate": 9.764330997219822e-05, + "loss": 2.6468, + "step": 410 + }, + { + "epoch": 0.14436511760325874, + "grad_norm": 0.5612457990646362, + "learning_rate": 9.760802415207181e-05, + "loss": 2.6307, + "step": 412 + }, + { + "epoch": 0.14506591914502212, + "grad_norm": 0.5850318670272827, + "learning_rate": 9.757248260254244e-05, + "loss": 2.6324, + "step": 414 + }, + { + "epoch": 0.1457667206867855, + "grad_norm": 0.688555121421814, + "learning_rate": 9.753668551452368e-05, + "loss": 2.6066, + "step": 416 + }, + { + "epoch": 0.1464675222285489, + "grad_norm": 0.6506465077400208, + "learning_rate": 9.750063308030179e-05, + "loss": 2.5964, + "step": 418 + }, + { + "epoch": 0.1471683237703123, + "grad_norm": 0.6529019474983215, + "learning_rate": 9.746432549353462e-05, + "loss": 2.651, + "step": 420 + }, + { + "epoch": 0.1478691253120757, + "grad_norm": 0.5469995141029358, + "learning_rate": 9.742776294925058e-05, + "loss": 2.6129, + "step": 422 + }, + { + "epoch": 0.14856992685383907, + "grad_norm": 0.4992043673992157, + "learning_rate": 9.739094564384758e-05, + "loss": 2.6074, + "step": 424 + }, + { + "epoch": 0.14927072839560246, + "grad_norm": 0.5064156651496887, + "learning_rate": 9.735387377509206e-05, + "loss": 2.6408, + "step": 426 + }, + { + "epoch": 0.14997152993736587, + "grad_norm": 0.5961376428604126, + "learning_rate": 9.731654754211781e-05, + "loss": 2.615, + "step": 428 + }, + { + "epoch": 0.15067233147912926, + "grad_norm": 0.5533669590950012, + "learning_rate": 9.727896714542494e-05, + "loss": 2.6225, + "step": 430 + }, + { + "epoch": 0.15137313302089264, + "grad_norm": 0.5527905821800232, + "learning_rate": 9.724113278687888e-05, + "loss": 2.5836, + "step": 432 + }, + { + "epoch": 0.15207393456265603, + "grad_norm": 0.4616098701953888, + "learning_rate": 9.720304466970916e-05, + "loss": 2.6236, + "step": 434 + }, + { + "epoch": 0.15277473610441944, + "grad_norm": 0.5189539790153503, + "learning_rate": 9.716470299850844e-05, + "loss": 2.6364, + "step": 436 + }, + { + "epoch": 0.15347553764618282, + "grad_norm": 0.5303817987442017, + "learning_rate": 9.712610797923133e-05, + "loss": 2.6097, + "step": 438 + }, + { + "epoch": 0.1541763391879462, + "grad_norm": 0.5957894921302795, + "learning_rate": 9.708725981919333e-05, + "loss": 2.5749, + "step": 440 + }, + { + "epoch": 0.1548771407297096, + "grad_norm": 0.5686895251274109, + "learning_rate": 9.704815872706972e-05, + "loss": 2.6319, + "step": 442 + }, + { + "epoch": 0.155577942271473, + "grad_norm": 0.5570897459983826, + "learning_rate": 9.700880491289438e-05, + "loss": 2.6287, + "step": 444 + }, + { + "epoch": 0.1562787438132364, + "grad_norm": 0.5330969095230103, + "learning_rate": 9.696919858805873e-05, + "loss": 2.6014, + "step": 446 + }, + { + "epoch": 0.15697954535499978, + "grad_norm": 0.4891030192375183, + "learning_rate": 9.692933996531053e-05, + "loss": 2.6097, + "step": 448 + }, + { + "epoch": 0.15768034689676316, + "grad_norm": 0.5465073585510254, + "learning_rate": 9.688922925875285e-05, + "loss": 2.6162, + "step": 450 + }, + { + "epoch": 0.15838114843852658, + "grad_norm": 0.5483290553092957, + "learning_rate": 9.684886668384277e-05, + "loss": 2.5999, + "step": 452 + }, + { + "epoch": 0.15908194998028996, + "grad_norm": 0.6061928868293762, + "learning_rate": 9.68082524573903e-05, + "loss": 2.6614, + "step": 454 + }, + { + "epoch": 0.15978275152205335, + "grad_norm": 0.5806353688240051, + "learning_rate": 9.676738679755726e-05, + "loss": 2.6039, + "step": 456 + }, + { + "epoch": 0.16048355306381673, + "grad_norm": 0.5722226500511169, + "learning_rate": 9.672626992385602e-05, + "loss": 2.6529, + "step": 458 + }, + { + "epoch": 0.16118435460558014, + "grad_norm": 0.5939204096794128, + "learning_rate": 9.668490205714839e-05, + "loss": 2.6314, + "step": 460 + }, + { + "epoch": 0.16188515614734353, + "grad_norm": 0.7260386943817139, + "learning_rate": 9.664328341964436e-05, + "loss": 2.6211, + "step": 462 + }, + { + "epoch": 0.1625859576891069, + "grad_norm": 0.8503554463386536, + "learning_rate": 9.6601414234901e-05, + "loss": 2.6134, + "step": 464 + }, + { + "epoch": 0.1632867592308703, + "grad_norm": 0.5818518996238708, + "learning_rate": 9.655929472782116e-05, + "loss": 2.5667, + "step": 466 + }, + { + "epoch": 0.1639875607726337, + "grad_norm": 0.5678598284721375, + "learning_rate": 9.651692512465239e-05, + "loss": 2.6153, + "step": 468 + }, + { + "epoch": 0.1646883623143971, + "grad_norm": 0.5939005613327026, + "learning_rate": 9.647430565298555e-05, + "loss": 2.6098, + "step": 470 + }, + { + "epoch": 0.16538916385616048, + "grad_norm": 0.5300047993659973, + "learning_rate": 9.643143654175373e-05, + "loss": 2.6167, + "step": 472 + }, + { + "epoch": 0.16608996539792387, + "grad_norm": 0.4946250319480896, + "learning_rate": 9.638831802123101e-05, + "loss": 2.581, + "step": 474 + }, + { + "epoch": 0.16679076693968728, + "grad_norm": 0.4555206000804901, + "learning_rate": 9.634495032303111e-05, + "loss": 2.588, + "step": 476 + }, + { + "epoch": 0.16749156848145066, + "grad_norm": 0.5159677267074585, + "learning_rate": 9.630133368010628e-05, + "loss": 2.5868, + "step": 478 + }, + { + "epoch": 0.16819237002321405, + "grad_norm": 0.5565433502197266, + "learning_rate": 9.625746832674597e-05, + "loss": 2.6185, + "step": 480 + }, + { + "epoch": 0.16889317156497743, + "grad_norm": 0.4775915741920471, + "learning_rate": 9.621335449857562e-05, + "loss": 2.5897, + "step": 482 + }, + { + "epoch": 0.16959397310674085, + "grad_norm": 0.5150102376937866, + "learning_rate": 9.616899243255532e-05, + "loss": 2.5478, + "step": 484 + }, + { + "epoch": 0.17029477464850423, + "grad_norm": 0.48455357551574707, + "learning_rate": 9.612438236697863e-05, + "loss": 2.5639, + "step": 486 + }, + { + "epoch": 0.17099557619026762, + "grad_norm": 0.5149878859519958, + "learning_rate": 9.607952454147121e-05, + "loss": 2.599, + "step": 488 + }, + { + "epoch": 0.171696377732031, + "grad_norm": 0.6969982385635376, + "learning_rate": 9.603441919698963e-05, + "loss": 2.5733, + "step": 490 + }, + { + "epoch": 0.17239717927379442, + "grad_norm": 0.57285475730896, + "learning_rate": 9.598906657582e-05, + "loss": 2.5791, + "step": 492 + }, + { + "epoch": 0.1730979808155578, + "grad_norm": 0.5704159140586853, + "learning_rate": 9.594346692157667e-05, + "loss": 2.5692, + "step": 494 + }, + { + "epoch": 0.17379878235732119, + "grad_norm": 0.681797444820404, + "learning_rate": 9.589762047920096e-05, + "loss": 2.5759, + "step": 496 + }, + { + "epoch": 0.17449958389908457, + "grad_norm": 0.49717003107070923, + "learning_rate": 9.585152749495984e-05, + "loss": 2.5848, + "step": 498 + }, + { + "epoch": 0.17520038544084798, + "grad_norm": 0.48680582642555237, + "learning_rate": 9.580518821644457e-05, + "loss": 2.5682, + "step": 500 + }, + { + "epoch": 0.17590118698261137, + "grad_norm": 0.5525830388069153, + "learning_rate": 9.575860289256943e-05, + "loss": 2.5894, + "step": 502 + }, + { + "epoch": 0.17660198852437475, + "grad_norm": 0.5562606453895569, + "learning_rate": 9.571177177357032e-05, + "loss": 2.5675, + "step": 504 + }, + { + "epoch": 0.17730279006613814, + "grad_norm": 0.5515877604484558, + "learning_rate": 9.566469511100345e-05, + "loss": 2.5877, + "step": 506 + }, + { + "epoch": 0.17800359160790155, + "grad_norm": 0.6816357970237732, + "learning_rate": 9.561737315774398e-05, + "loss": 2.596, + "step": 508 + }, + { + "epoch": 0.17870439314966494, + "grad_norm": 0.507437527179718, + "learning_rate": 9.556980616798463e-05, + "loss": 2.5721, + "step": 510 + }, + { + "epoch": 0.17940519469142832, + "grad_norm": 0.5275202989578247, + "learning_rate": 9.552199439723443e-05, + "loss": 2.568, + "step": 512 + }, + { + "epoch": 0.1801059962331917, + "grad_norm": 0.5467104911804199, + "learning_rate": 9.547393810231722e-05, + "loss": 2.5842, + "step": 514 + }, + { + "epoch": 0.1808067977749551, + "grad_norm": 0.5407027006149292, + "learning_rate": 9.542563754137031e-05, + "loss": 2.5891, + "step": 516 + }, + { + "epoch": 0.1815075993167185, + "grad_norm": 0.5731847882270813, + "learning_rate": 9.537709297384308e-05, + "loss": 2.6143, + "step": 518 + }, + { + "epoch": 0.1822084008584819, + "grad_norm": 0.566457986831665, + "learning_rate": 9.532830466049565e-05, + "loss": 2.5522, + "step": 520 + }, + { + "epoch": 0.18290920240024527, + "grad_norm": 0.4899183213710785, + "learning_rate": 9.527927286339744e-05, + "loss": 2.5961, + "step": 522 + }, + { + "epoch": 0.18361000394200866, + "grad_norm": 0.4883110523223877, + "learning_rate": 9.52299978459257e-05, + "loss": 2.5557, + "step": 524 + }, + { + "epoch": 0.18431080548377207, + "grad_norm": 0.5534235239028931, + "learning_rate": 9.518047987276421e-05, + "loss": 2.6452, + "step": 526 + }, + { + "epoch": 0.18501160702553546, + "grad_norm": 0.47292667627334595, + "learning_rate": 9.513071920990179e-05, + "loss": 2.5848, + "step": 528 + }, + { + "epoch": 0.18571240856729884, + "grad_norm": 0.5438964366912842, + "learning_rate": 9.508071612463086e-05, + "loss": 2.5332, + "step": 530 + }, + { + "epoch": 0.18641321010906223, + "grad_norm": 0.5318060517311096, + "learning_rate": 9.503047088554601e-05, + "loss": 2.585, + "step": 532 + }, + { + "epoch": 0.18711401165082564, + "grad_norm": 0.49279502034187317, + "learning_rate": 9.497998376254267e-05, + "loss": 2.5948, + "step": 534 + }, + { + "epoch": 0.18781481319258903, + "grad_norm": 0.5161717534065247, + "learning_rate": 9.492925502681545e-05, + "loss": 2.5644, + "step": 536 + }, + { + "epoch": 0.1885156147343524, + "grad_norm": 0.4586479663848877, + "learning_rate": 9.487828495085684e-05, + "loss": 2.5568, + "step": 538 + }, + { + "epoch": 0.1892164162761158, + "grad_norm": 0.4390322268009186, + "learning_rate": 9.482707380845573e-05, + "loss": 2.5938, + "step": 540 + }, + { + "epoch": 0.1899172178178792, + "grad_norm": 0.5253728628158569, + "learning_rate": 9.47756218746959e-05, + "loss": 2.5996, + "step": 542 + }, + { + "epoch": 0.1906180193596426, + "grad_norm": 0.4567623436450958, + "learning_rate": 9.472392942595454e-05, + "loss": 2.5576, + "step": 544 + }, + { + "epoch": 0.19131882090140598, + "grad_norm": 0.5091727375984192, + "learning_rate": 9.467199673990077e-05, + "loss": 2.5873, + "step": 546 + }, + { + "epoch": 0.19201962244316936, + "grad_norm": 0.4959392845630646, + "learning_rate": 9.46198240954942e-05, + "loss": 2.5291, + "step": 548 + }, + { + "epoch": 0.19272042398493278, + "grad_norm": 0.5150632262229919, + "learning_rate": 9.456741177298336e-05, + "loss": 2.5503, + "step": 550 + }, + { + "epoch": 0.19342122552669616, + "grad_norm": 0.4603368639945984, + "learning_rate": 9.451476005390422e-05, + "loss": 2.5785, + "step": 552 + }, + { + "epoch": 0.19412202706845955, + "grad_norm": 0.4441729784011841, + "learning_rate": 9.446186922107873e-05, + "loss": 2.5512, + "step": 554 + }, + { + "epoch": 0.19482282861022293, + "grad_norm": 0.5432455539703369, + "learning_rate": 9.44087395586132e-05, + "loss": 2.5741, + "step": 556 + }, + { + "epoch": 0.19552363015198634, + "grad_norm": 0.42969366908073425, + "learning_rate": 9.435537135189687e-05, + "loss": 2.5677, + "step": 558 + }, + { + "epoch": 0.19622443169374973, + "grad_norm": 0.5706619620323181, + "learning_rate": 9.430176488760027e-05, + "loss": 2.556, + "step": 560 + }, + { + "epoch": 0.19692523323551311, + "grad_norm": 0.7202513217926025, + "learning_rate": 9.424792045367383e-05, + "loss": 2.5435, + "step": 562 + }, + { + "epoch": 0.1976260347772765, + "grad_norm": 0.5471363663673401, + "learning_rate": 9.419383833934621e-05, + "loss": 2.572, + "step": 564 + }, + { + "epoch": 0.1983268363190399, + "grad_norm": 0.654058575630188, + "learning_rate": 9.413951883512275e-05, + "loss": 2.5432, + "step": 566 + }, + { + "epoch": 0.1990276378608033, + "grad_norm": 0.6124361157417297, + "learning_rate": 9.408496223278403e-05, + "loss": 2.5803, + "step": 568 + }, + { + "epoch": 0.19972843940256668, + "grad_norm": 0.5291132926940918, + "learning_rate": 9.403016882538408e-05, + "loss": 2.576, + "step": 570 + }, + { + "epoch": 0.20042924094433007, + "grad_norm": 0.6087374687194824, + "learning_rate": 9.397513890724911e-05, + "loss": 2.5171, + "step": 572 + }, + { + "epoch": 0.20113004248609348, + "grad_norm": 0.5776922106742859, + "learning_rate": 9.391987277397566e-05, + "loss": 2.6054, + "step": 574 + }, + { + "epoch": 0.20183084402785686, + "grad_norm": 0.544319748878479, + "learning_rate": 9.38643707224291e-05, + "loss": 2.548, + "step": 576 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 0.5210007429122925, + "learning_rate": 9.38086330507421e-05, + "loss": 2.6019, + "step": 578 + }, + { + "epoch": 0.20323244711138364, + "grad_norm": 0.5160629153251648, + "learning_rate": 9.375266005831297e-05, + "loss": 2.6046, + "step": 580 + }, + { + "epoch": 0.20393324865314705, + "grad_norm": 0.6452796459197998, + "learning_rate": 9.369645204580403e-05, + "loss": 2.566, + "step": 582 + }, + { + "epoch": 0.20463405019491043, + "grad_norm": 0.5813329815864563, + "learning_rate": 9.364000931514008e-05, + "loss": 2.5661, + "step": 584 + }, + { + "epoch": 0.20533485173667382, + "grad_norm": 0.5450593829154968, + "learning_rate": 9.358333216950664e-05, + "loss": 2.5769, + "step": 586 + }, + { + "epoch": 0.2060356532784372, + "grad_norm": 0.5340794324874878, + "learning_rate": 9.352642091334849e-05, + "loss": 2.5549, + "step": 588 + }, + { + "epoch": 0.20673645482020062, + "grad_norm": 0.5767348408699036, + "learning_rate": 9.34692758523679e-05, + "loss": 2.5604, + "step": 590 + }, + { + "epoch": 0.207437256361964, + "grad_norm": 0.6048093438148499, + "learning_rate": 9.341189729352302e-05, + "loss": 2.5929, + "step": 592 + }, + { + "epoch": 0.20813805790372739, + "grad_norm": 0.4430505335330963, + "learning_rate": 9.33542855450263e-05, + "loss": 2.5563, + "step": 594 + }, + { + "epoch": 0.20883885944549077, + "grad_norm": 0.49373888969421387, + "learning_rate": 9.329644091634278e-05, + "loss": 2.5517, + "step": 596 + }, + { + "epoch": 0.20953966098725418, + "grad_norm": 0.5227393507957458, + "learning_rate": 9.323836371818837e-05, + "loss": 2.5286, + "step": 598 + }, + { + "epoch": 0.21024046252901757, + "grad_norm": 0.497405081987381, + "learning_rate": 9.318005426252832e-05, + "loss": 2.5638, + "step": 600 + }, + { + "epoch": 0.21094126407078095, + "grad_norm": 0.48721396923065186, + "learning_rate": 9.312151286257537e-05, + "loss": 2.5751, + "step": 602 + }, + { + "epoch": 0.21164206561254434, + "grad_norm": 0.4621741771697998, + "learning_rate": 9.306273983278825e-05, + "loss": 2.5654, + "step": 604 + }, + { + "epoch": 0.21234286715430775, + "grad_norm": 0.4756307005882263, + "learning_rate": 9.300373548886987e-05, + "loss": 2.5989, + "step": 606 + }, + { + "epoch": 0.21304366869607114, + "grad_norm": 0.42497771978378296, + "learning_rate": 9.294450014776566e-05, + "loss": 2.564, + "step": 608 + }, + { + "epoch": 0.21374447023783452, + "grad_norm": 0.5173219442367554, + "learning_rate": 9.288503412766185e-05, + "loss": 2.5296, + "step": 610 + }, + { + "epoch": 0.2144452717795979, + "grad_norm": 0.4622451066970825, + "learning_rate": 9.28253377479838e-05, + "loss": 2.5829, + "step": 612 + }, + { + "epoch": 0.21514607332136132, + "grad_norm": 0.5879294276237488, + "learning_rate": 9.276541132939428e-05, + "loss": 2.5462, + "step": 614 + }, + { + "epoch": 0.2158468748631247, + "grad_norm": 0.6237635612487793, + "learning_rate": 9.270525519379165e-05, + "loss": 2.6143, + "step": 616 + }, + { + "epoch": 0.2165476764048881, + "grad_norm": 0.5845280289649963, + "learning_rate": 9.264486966430829e-05, + "loss": 2.5272, + "step": 618 + }, + { + "epoch": 0.21724847794665147, + "grad_norm": 0.5140432715415955, + "learning_rate": 9.258425506530872e-05, + "loss": 2.5716, + "step": 620 + }, + { + "epoch": 0.2179492794884149, + "grad_norm": 0.5868300199508667, + "learning_rate": 9.2523411722388e-05, + "loss": 2.5699, + "step": 622 + }, + { + "epoch": 0.21865008103017827, + "grad_norm": 0.587374210357666, + "learning_rate": 9.246233996236983e-05, + "loss": 2.5335, + "step": 624 + }, + { + "epoch": 0.21935088257194166, + "grad_norm": 0.5000743865966797, + "learning_rate": 9.240104011330489e-05, + "loss": 2.5367, + "step": 626 + }, + { + "epoch": 0.22005168411370504, + "grad_norm": 0.5124289393424988, + "learning_rate": 9.233951250446902e-05, + "loss": 2.5598, + "step": 628 + }, + { + "epoch": 0.22075248565546846, + "grad_norm": 0.4815032482147217, + "learning_rate": 9.227775746636158e-05, + "loss": 2.5468, + "step": 630 + }, + { + "epoch": 0.22145328719723184, + "grad_norm": 0.5089353919029236, + "learning_rate": 9.22157753307035e-05, + "loss": 2.5482, + "step": 632 + }, + { + "epoch": 0.22215408873899523, + "grad_norm": 0.468841552734375, + "learning_rate": 9.215356643043559e-05, + "loss": 2.5138, + "step": 634 + }, + { + "epoch": 0.2228548902807586, + "grad_norm": 0.511968731880188, + "learning_rate": 9.209113109971676e-05, + "loss": 2.5481, + "step": 636 + }, + { + "epoch": 0.223555691822522, + "grad_norm": 0.6082082390785217, + "learning_rate": 9.202846967392217e-05, + "loss": 2.5459, + "step": 638 + }, + { + "epoch": 0.2242564933642854, + "grad_norm": 0.4931623637676239, + "learning_rate": 9.196558248964151e-05, + "loss": 2.5785, + "step": 640 + }, + { + "epoch": 0.2249572949060488, + "grad_norm": 0.5754916071891785, + "learning_rate": 9.190246988467712e-05, + "loss": 2.5166, + "step": 642 + }, + { + "epoch": 0.22565809644781218, + "grad_norm": 0.5335285067558289, + "learning_rate": 9.183913219804221e-05, + "loss": 2.4976, + "step": 644 + }, + { + "epoch": 0.22635889798957556, + "grad_norm": 0.4676333963871002, + "learning_rate": 9.1775569769959e-05, + "loss": 2.5361, + "step": 646 + }, + { + "epoch": 0.22705969953133898, + "grad_norm": 0.48826783895492554, + "learning_rate": 9.171178294185697e-05, + "loss": 2.5347, + "step": 648 + }, + { + "epoch": 0.22776050107310236, + "grad_norm": 0.509066104888916, + "learning_rate": 9.164777205637094e-05, + "loss": 2.5326, + "step": 650 + }, + { + "epoch": 0.22846130261486575, + "grad_norm": 0.5001896619796753, + "learning_rate": 9.158353745733927e-05, + "loss": 2.5605, + "step": 652 + }, + { + "epoch": 0.22916210415662913, + "grad_norm": 0.5497420430183411, + "learning_rate": 9.151907948980206e-05, + "loss": 2.5295, + "step": 654 + }, + { + "epoch": 0.22986290569839254, + "grad_norm": 0.43462875485420227, + "learning_rate": 9.145439849999919e-05, + "loss": 2.5358, + "step": 656 + }, + { + "epoch": 0.23056370724015593, + "grad_norm": 0.5398270487785339, + "learning_rate": 9.138949483536852e-05, + "loss": 2.5464, + "step": 658 + }, + { + "epoch": 0.23126450878191931, + "grad_norm": 0.5165109038352966, + "learning_rate": 9.132436884454408e-05, + "loss": 2.5043, + "step": 660 + }, + { + "epoch": 0.2319653103236827, + "grad_norm": 0.6717212200164795, + "learning_rate": 9.125902087735407e-05, + "loss": 2.547, + "step": 662 + }, + { + "epoch": 0.2326661118654461, + "grad_norm": 0.4584912061691284, + "learning_rate": 9.119345128481909e-05, + "loss": 2.5106, + "step": 664 + }, + { + "epoch": 0.2333669134072095, + "grad_norm": 0.5452204942703247, + "learning_rate": 9.112766041915019e-05, + "loss": 2.5189, + "step": 666 + }, + { + "epoch": 0.23406771494897288, + "grad_norm": 0.5055968761444092, + "learning_rate": 9.106164863374702e-05, + "loss": 2.4957, + "step": 668 + }, + { + "epoch": 0.23476851649073627, + "grad_norm": 0.4905461072921753, + "learning_rate": 9.099541628319592e-05, + "loss": 2.5523, + "step": 670 + }, + { + "epoch": 0.23546931803249968, + "grad_norm": 0.44840848445892334, + "learning_rate": 9.092896372326798e-05, + "loss": 2.4713, + "step": 672 + }, + { + "epoch": 0.23617011957426307, + "grad_norm": 0.46489134430885315, + "learning_rate": 9.086229131091717e-05, + "loss": 2.5071, + "step": 674 + }, + { + "epoch": 0.23687092111602645, + "grad_norm": 0.4460737705230713, + "learning_rate": 9.079539940427845e-05, + "loss": 2.5799, + "step": 676 + }, + { + "epoch": 0.23757172265778984, + "grad_norm": 0.5268511176109314, + "learning_rate": 9.072828836266574e-05, + "loss": 2.5574, + "step": 678 + }, + { + "epoch": 0.23827252419955325, + "grad_norm": 0.5001477003097534, + "learning_rate": 9.066095854657011e-05, + "loss": 2.5117, + "step": 680 + }, + { + "epoch": 0.23897332574131663, + "grad_norm": 0.5136899352073669, + "learning_rate": 9.059341031765773e-05, + "loss": 2.4855, + "step": 682 + }, + { + "epoch": 0.23967412728308002, + "grad_norm": 0.5532418489456177, + "learning_rate": 9.052564403876808e-05, + "loss": 2.5623, + "step": 684 + }, + { + "epoch": 0.2403749288248434, + "grad_norm": 0.4908037483692169, + "learning_rate": 9.045766007391185e-05, + "loss": 2.5248, + "step": 686 + }, + { + "epoch": 0.24107573036660682, + "grad_norm": 0.45994317531585693, + "learning_rate": 9.038945878826903e-05, + "loss": 2.5007, + "step": 688 + }, + { + "epoch": 0.2417765319083702, + "grad_norm": 0.5593565702438354, + "learning_rate": 9.032104054818698e-05, + "loss": 2.5759, + "step": 690 + }, + { + "epoch": 0.2424773334501336, + "grad_norm": 0.5076695084571838, + "learning_rate": 9.025240572117846e-05, + "loss": 2.5272, + "step": 692 + }, + { + "epoch": 0.24317813499189697, + "grad_norm": 0.3996141850948334, + "learning_rate": 9.018355467591962e-05, + "loss": 2.5317, + "step": 694 + }, + { + "epoch": 0.24387893653366038, + "grad_norm": 0.49347859621047974, + "learning_rate": 9.011448778224802e-05, + "loss": 2.5186, + "step": 696 + }, + { + "epoch": 0.24457973807542377, + "grad_norm": 0.5040503144264221, + "learning_rate": 9.004520541116075e-05, + "loss": 2.5015, + "step": 698 + }, + { + "epoch": 0.24528053961718715, + "grad_norm": 0.4658913016319275, + "learning_rate": 8.997570793481223e-05, + "loss": 2.5481, + "step": 700 + }, + { + "epoch": 0.24598134115895054, + "grad_norm": 0.47850051522254944, + "learning_rate": 8.990599572651242e-05, + "loss": 2.5505, + "step": 702 + }, + { + "epoch": 0.24668214270071395, + "grad_norm": 0.48090964555740356, + "learning_rate": 8.983606916072469e-05, + "loss": 2.5669, + "step": 704 + }, + { + "epoch": 0.24738294424247734, + "grad_norm": 0.5716775059700012, + "learning_rate": 8.976592861306384e-05, + "loss": 2.523, + "step": 706 + }, + { + "epoch": 0.24808374578424072, + "grad_norm": 0.49985334277153015, + "learning_rate": 8.969557446029409e-05, + "loss": 2.5439, + "step": 708 + }, + { + "epoch": 0.2487845473260041, + "grad_norm": 0.6331408023834229, + "learning_rate": 8.962500708032708e-05, + "loss": 2.5601, + "step": 710 + }, + { + "epoch": 0.24948534886776752, + "grad_norm": 0.5418590307235718, + "learning_rate": 8.955422685221979e-05, + "loss": 2.5495, + "step": 712 + }, + { + "epoch": 0.2501861504095309, + "grad_norm": 0.5396260619163513, + "learning_rate": 8.948323415617253e-05, + "loss": 2.5151, + "step": 714 + }, + { + "epoch": 0.2508869519512943, + "grad_norm": 0.5641499161720276, + "learning_rate": 8.941202937352686e-05, + "loss": 2.4895, + "step": 716 + }, + { + "epoch": 0.2515877534930577, + "grad_norm": 0.47651517391204834, + "learning_rate": 8.934061288676365e-05, + "loss": 2.5634, + "step": 718 + }, + { + "epoch": 0.25228855503482106, + "grad_norm": 0.5351449251174927, + "learning_rate": 8.92689850795009e-05, + "loss": 2.4804, + "step": 720 + }, + { + "epoch": 0.25298935657658445, + "grad_norm": 0.5856335759162903, + "learning_rate": 8.919714633649172e-05, + "loss": 2.5304, + "step": 722 + }, + { + "epoch": 0.2536901581183479, + "grad_norm": 0.4513723850250244, + "learning_rate": 8.912509704362232e-05, + "loss": 2.5369, + "step": 724 + }, + { + "epoch": 0.25439095966011127, + "grad_norm": 0.4676707983016968, + "learning_rate": 8.905283758790985e-05, + "loss": 2.5589, + "step": 726 + }, + { + "epoch": 0.25509176120187466, + "grad_norm": 0.5069173574447632, + "learning_rate": 8.89803683575004e-05, + "loss": 2.4958, + "step": 728 + }, + { + "epoch": 0.25579256274363804, + "grad_norm": 0.4774676263332367, + "learning_rate": 8.890768974166685e-05, + "loss": 2.5229, + "step": 730 + }, + { + "epoch": 0.2564933642854014, + "grad_norm": 0.548409104347229, + "learning_rate": 8.883480213080681e-05, + "loss": 2.4815, + "step": 732 + }, + { + "epoch": 0.2571941658271648, + "grad_norm": 0.4854792356491089, + "learning_rate": 8.876170591644054e-05, + "loss": 2.5118, + "step": 734 + }, + { + "epoch": 0.2578949673689282, + "grad_norm": 0.4988788664340973, + "learning_rate": 8.868840149120876e-05, + "loss": 2.5073, + "step": 736 + }, + { + "epoch": 0.2585957689106916, + "grad_norm": 0.4614211618900299, + "learning_rate": 8.861488924887071e-05, + "loss": 2.4866, + "step": 738 + }, + { + "epoch": 0.259296570452455, + "grad_norm": 0.4878149926662445, + "learning_rate": 8.854116958430185e-05, + "loss": 2.5315, + "step": 740 + }, + { + "epoch": 0.2599973719942184, + "grad_norm": 0.47185149788856506, + "learning_rate": 8.846724289349189e-05, + "loss": 2.4766, + "step": 742 + }, + { + "epoch": 0.2606981735359818, + "grad_norm": 0.446411669254303, + "learning_rate": 8.839310957354249e-05, + "loss": 2.5278, + "step": 744 + }, + { + "epoch": 0.2613989750777452, + "grad_norm": 0.45869573950767517, + "learning_rate": 8.831877002266536e-05, + "loss": 2.5051, + "step": 746 + }, + { + "epoch": 0.26209977661950856, + "grad_norm": 0.4578917920589447, + "learning_rate": 8.82442246401799e-05, + "loss": 2.4903, + "step": 748 + }, + { + "epoch": 0.26280057816127195, + "grad_norm": 0.4389136731624603, + "learning_rate": 8.816947382651116e-05, + "loss": 2.519, + "step": 750 + }, + { + "epoch": 0.26350137970303533, + "grad_norm": 0.4686265289783478, + "learning_rate": 8.80945179831877e-05, + "loss": 2.5537, + "step": 752 + }, + { + "epoch": 0.2642021812447987, + "grad_norm": 0.49357905983924866, + "learning_rate": 8.801935751283944e-05, + "loss": 2.4971, + "step": 754 + }, + { + "epoch": 0.26490298278656216, + "grad_norm": 0.5659007430076599, + "learning_rate": 8.794399281919537e-05, + "loss": 2.5291, + "step": 756 + }, + { + "epoch": 0.26560378432832554, + "grad_norm": 0.5637578964233398, + "learning_rate": 8.786842430708157e-05, + "loss": 2.5335, + "step": 758 + }, + { + "epoch": 0.26630458587008893, + "grad_norm": 0.47859886288642883, + "learning_rate": 8.779265238241888e-05, + "loss": 2.5104, + "step": 760 + }, + { + "epoch": 0.2670053874118523, + "grad_norm": 0.5444939732551575, + "learning_rate": 8.771667745222082e-05, + "loss": 2.4823, + "step": 762 + }, + { + "epoch": 0.2677061889536157, + "grad_norm": 0.5456621050834656, + "learning_rate": 8.76404999245914e-05, + "loss": 2.5027, + "step": 764 + }, + { + "epoch": 0.2684069904953791, + "grad_norm": 0.5168180465698242, + "learning_rate": 8.75641202087228e-05, + "loss": 2.5562, + "step": 766 + }, + { + "epoch": 0.26910779203714247, + "grad_norm": 0.5675712823867798, + "learning_rate": 8.748753871489333e-05, + "loss": 2.5195, + "step": 768 + }, + { + "epoch": 0.26980859357890585, + "grad_norm": 0.4084811806678772, + "learning_rate": 8.741075585446514e-05, + "loss": 2.4853, + "step": 770 + }, + { + "epoch": 0.27050939512066924, + "grad_norm": 0.4109669327735901, + "learning_rate": 8.733377203988208e-05, + "loss": 2.5186, + "step": 772 + }, + { + "epoch": 0.2712101966624327, + "grad_norm": 0.5689636468887329, + "learning_rate": 8.725658768466738e-05, + "loss": 2.5106, + "step": 774 + }, + { + "epoch": 0.27191099820419606, + "grad_norm": 0.4750414192676544, + "learning_rate": 8.71792032034215e-05, + "loss": 2.4927, + "step": 776 + }, + { + "epoch": 0.27261179974595945, + "grad_norm": 0.4577466547489166, + "learning_rate": 8.710161901181993e-05, + "loss": 2.5005, + "step": 778 + }, + { + "epoch": 0.27331260128772283, + "grad_norm": 0.4786745011806488, + "learning_rate": 8.702383552661081e-05, + "loss": 2.5099, + "step": 780 + }, + { + "epoch": 0.2740134028294862, + "grad_norm": 0.508456289768219, + "learning_rate": 8.694585316561296e-05, + "loss": 2.5377, + "step": 782 + }, + { + "epoch": 0.2747142043712496, + "grad_norm": 0.49584171175956726, + "learning_rate": 8.686767234771333e-05, + "loss": 2.5208, + "step": 784 + }, + { + "epoch": 0.275415005913013, + "grad_norm": 0.4523308575153351, + "learning_rate": 8.678929349286498e-05, + "loss": 2.5663, + "step": 786 + }, + { + "epoch": 0.2761158074547764, + "grad_norm": 0.411276638507843, + "learning_rate": 8.671071702208467e-05, + "loss": 2.5076, + "step": 788 + }, + { + "epoch": 0.2768166089965398, + "grad_norm": 0.47366130352020264, + "learning_rate": 8.663194335745071e-05, + "loss": 2.4725, + "step": 790 + }, + { + "epoch": 0.2775174105383032, + "grad_norm": 0.44845113158226013, + "learning_rate": 8.655297292210067e-05, + "loss": 2.5204, + "step": 792 + }, + { + "epoch": 0.2782182120800666, + "grad_norm": 0.4630947709083557, + "learning_rate": 8.647380614022902e-05, + "loss": 2.4848, + "step": 794 + }, + { + "epoch": 0.27891901362182997, + "grad_norm": 0.4739050567150116, + "learning_rate": 8.639444343708496e-05, + "loss": 2.4975, + "step": 796 + }, + { + "epoch": 0.27961981516359335, + "grad_norm": 0.41872844099998474, + "learning_rate": 8.631488523897011e-05, + "loss": 2.5105, + "step": 798 + }, + { + "epoch": 0.28032061670535674, + "grad_norm": 0.5174891948699951, + "learning_rate": 8.623513197323615e-05, + "loss": 2.4428, + "step": 800 + }, + { + "epoch": 0.2810214182471201, + "grad_norm": 0.4543634057044983, + "learning_rate": 8.615518406828262e-05, + "loss": 2.5248, + "step": 802 + }, + { + "epoch": 0.2817222197888835, + "grad_norm": 0.433250367641449, + "learning_rate": 8.607504195355458e-05, + "loss": 2.4887, + "step": 804 + }, + { + "epoch": 0.28242302133064695, + "grad_norm": 0.47642698884010315, + "learning_rate": 8.599470605954025e-05, + "loss": 2.5391, + "step": 806 + }, + { + "epoch": 0.28312382287241034, + "grad_norm": 0.45496654510498047, + "learning_rate": 8.59141768177688e-05, + "loss": 2.5444, + "step": 808 + }, + { + "epoch": 0.2838246244141737, + "grad_norm": 0.4619695544242859, + "learning_rate": 8.583345466080796e-05, + "loss": 2.504, + "step": 810 + }, + { + "epoch": 0.2845254259559371, + "grad_norm": 0.4610481262207031, + "learning_rate": 8.575254002226173e-05, + "loss": 2.4904, + "step": 812 + }, + { + "epoch": 0.2852262274977005, + "grad_norm": 0.4597660005092621, + "learning_rate": 8.5671433336768e-05, + "loss": 2.4923, + "step": 814 + }, + { + "epoch": 0.2859270290394639, + "grad_norm": 0.5440905094146729, + "learning_rate": 8.559013503999626e-05, + "loss": 2.4806, + "step": 816 + }, + { + "epoch": 0.28662783058122726, + "grad_norm": 0.4667718708515167, + "learning_rate": 8.550864556864529e-05, + "loss": 2.5595, + "step": 818 + }, + { + "epoch": 0.28732863212299065, + "grad_norm": 0.47145599126815796, + "learning_rate": 8.542696536044075e-05, + "loss": 2.4813, + "step": 820 + }, + { + "epoch": 0.2880294336647541, + "grad_norm": 0.4581964313983917, + "learning_rate": 8.534509485413284e-05, + "loss": 2.5467, + "step": 822 + }, + { + "epoch": 0.28873023520651747, + "grad_norm": 0.5127134919166565, + "learning_rate": 8.5263034489494e-05, + "loss": 2.5067, + "step": 824 + }, + { + "epoch": 0.28943103674828086, + "grad_norm": 0.5416949391365051, + "learning_rate": 8.518078470731644e-05, + "loss": 2.4669, + "step": 826 + }, + { + "epoch": 0.29013183829004424, + "grad_norm": 0.442828506231308, + "learning_rate": 8.509834594940991e-05, + "loss": 2.4708, + "step": 828 + }, + { + "epoch": 0.2908326398318076, + "grad_norm": 0.4708557426929474, + "learning_rate": 8.501571865859924e-05, + "loss": 2.5192, + "step": 830 + }, + { + "epoch": 0.291533441373571, + "grad_norm": 0.4371870458126068, + "learning_rate": 8.49329032787219e-05, + "loss": 2.4778, + "step": 832 + }, + { + "epoch": 0.2922342429153344, + "grad_norm": 0.48408806324005127, + "learning_rate": 8.48499002546258e-05, + "loss": 2.4868, + "step": 834 + }, + { + "epoch": 0.2929350444570978, + "grad_norm": 0.45126622915267944, + "learning_rate": 8.47667100321667e-05, + "loss": 2.4999, + "step": 836 + }, + { + "epoch": 0.2936358459988612, + "grad_norm": 0.4448654353618622, + "learning_rate": 8.468333305820599e-05, + "loss": 2.4848, + "step": 838 + }, + { + "epoch": 0.2943366475406246, + "grad_norm": 0.47776126861572266, + "learning_rate": 8.459976978060815e-05, + "loss": 2.5515, + "step": 840 + }, + { + "epoch": 0.295037449082388, + "grad_norm": 0.4572128653526306, + "learning_rate": 8.45160206482384e-05, + "loss": 2.5172, + "step": 842 + }, + { + "epoch": 0.2957382506241514, + "grad_norm": 0.4419424831867218, + "learning_rate": 8.443208611096036e-05, + "loss": 2.5035, + "step": 844 + }, + { + "epoch": 0.29643905216591476, + "grad_norm": 0.42213693261146545, + "learning_rate": 8.434796661963344e-05, + "loss": 2.542, + "step": 846 + }, + { + "epoch": 0.29713985370767815, + "grad_norm": 0.446344792842865, + "learning_rate": 8.426366262611067e-05, + "loss": 2.5119, + "step": 848 + }, + { + "epoch": 0.29784065524944153, + "grad_norm": 0.44233253598213196, + "learning_rate": 8.417917458323607e-05, + "loss": 2.4985, + "step": 850 + }, + { + "epoch": 0.2985414567912049, + "grad_norm": 0.492471843957901, + "learning_rate": 8.40945029448423e-05, + "loss": 2.4553, + "step": 852 + }, + { + "epoch": 0.29924225833296836, + "grad_norm": 0.4490063488483429, + "learning_rate": 8.400964816574826e-05, + "loss": 2.5389, + "step": 854 + }, + { + "epoch": 0.29994305987473174, + "grad_norm": 0.5494585633277893, + "learning_rate": 8.392461070175652e-05, + "loss": 2.5163, + "step": 856 + }, + { + "epoch": 0.30064386141649513, + "grad_norm": 0.4822872281074524, + "learning_rate": 8.383939100965103e-05, + "loss": 2.504, + "step": 858 + }, + { + "epoch": 0.3013446629582585, + "grad_norm": 0.5434439778327942, + "learning_rate": 8.375398954719456e-05, + "loss": 2.4841, + "step": 860 + }, + { + "epoch": 0.3020454645000219, + "grad_norm": 0.5055859088897705, + "learning_rate": 8.366840677312626e-05, + "loss": 2.4985, + "step": 862 + }, + { + "epoch": 0.3027462660417853, + "grad_norm": 0.44319674372673035, + "learning_rate": 8.358264314715923e-05, + "loss": 2.4661, + "step": 864 + }, + { + "epoch": 0.30344706758354867, + "grad_norm": 0.5121539235115051, + "learning_rate": 8.349669912997799e-05, + "loss": 2.4797, + "step": 866 + }, + { + "epoch": 0.30414786912531205, + "grad_norm": 0.4748767912387848, + "learning_rate": 8.341057518323607e-05, + "loss": 2.5009, + "step": 868 + }, + { + "epoch": 0.3048486706670755, + "grad_norm": 0.4823194742202759, + "learning_rate": 8.332427176955353e-05, + "loss": 2.4798, + "step": 870 + }, + { + "epoch": 0.3055494722088389, + "grad_norm": 0.4242302477359772, + "learning_rate": 8.323778935251437e-05, + "loss": 2.4764, + "step": 872 + }, + { + "epoch": 0.30625027375060226, + "grad_norm": 0.46324998140335083, + "learning_rate": 8.31511283966642e-05, + "loss": 2.509, + "step": 874 + }, + { + "epoch": 0.30695107529236565, + "grad_norm": 0.4894976317882538, + "learning_rate": 8.30642893675076e-05, + "loss": 2.498, + "step": 876 + }, + { + "epoch": 0.30765187683412903, + "grad_norm": 0.4574197232723236, + "learning_rate": 8.297727273150573e-05, + "loss": 2.48, + "step": 878 + }, + { + "epoch": 0.3083526783758924, + "grad_norm": 0.44225645065307617, + "learning_rate": 8.289007895607375e-05, + "loss": 2.502, + "step": 880 + }, + { + "epoch": 0.3090534799176558, + "grad_norm": 0.47749781608581543, + "learning_rate": 8.28027085095783e-05, + "loss": 2.5043, + "step": 882 + }, + { + "epoch": 0.3097542814594192, + "grad_norm": 0.4569682478904724, + "learning_rate": 8.271516186133511e-05, + "loss": 2.4454, + "step": 884 + }, + { + "epoch": 0.31045508300118263, + "grad_norm": 0.4561903178691864, + "learning_rate": 8.262743948160632e-05, + "loss": 2.4826, + "step": 886 + }, + { + "epoch": 0.311155884542946, + "grad_norm": 0.4749627411365509, + "learning_rate": 8.253954184159803e-05, + "loss": 2.4707, + "step": 888 + }, + { + "epoch": 0.3118566860847094, + "grad_norm": 0.4455653131008148, + "learning_rate": 8.245146941345774e-05, + "loss": 2.4647, + "step": 890 + }, + { + "epoch": 0.3125574876264728, + "grad_norm": 0.4758734405040741, + "learning_rate": 8.236322267027193e-05, + "loss": 2.4885, + "step": 892 + }, + { + "epoch": 0.31325828916823617, + "grad_norm": 0.45016252994537354, + "learning_rate": 8.227480208606333e-05, + "loss": 2.4993, + "step": 894 + }, + { + "epoch": 0.31395909070999956, + "grad_norm": 0.48177486658096313, + "learning_rate": 8.218620813578847e-05, + "loss": 2.4838, + "step": 896 + }, + { + "epoch": 0.31465989225176294, + "grad_norm": 0.4863053858280182, + "learning_rate": 8.209744129533519e-05, + "loss": 2.5381, + "step": 898 + }, + { + "epoch": 0.3153606937935263, + "grad_norm": 0.49010857939720154, + "learning_rate": 8.200850204151995e-05, + "loss": 2.5721, + "step": 900 + }, + { + "epoch": 0.3160614953352897, + "grad_norm": 0.43315884470939636, + "learning_rate": 8.191939085208542e-05, + "loss": 2.4976, + "step": 902 + }, + { + "epoch": 0.31676229687705315, + "grad_norm": 0.4580542743206024, + "learning_rate": 8.183010820569775e-05, + "loss": 2.4885, + "step": 904 + }, + { + "epoch": 0.31746309841881654, + "grad_norm": 0.40409061312675476, + "learning_rate": 8.17406545819441e-05, + "loss": 2.4872, + "step": 906 + }, + { + "epoch": 0.3181638999605799, + "grad_norm": 0.5763331055641174, + "learning_rate": 8.16510304613301e-05, + "loss": 2.4991, + "step": 908 + }, + { + "epoch": 0.3188647015023433, + "grad_norm": 0.4705376625061035, + "learning_rate": 8.156123632527714e-05, + "loss": 2.5071, + "step": 910 + }, + { + "epoch": 0.3195655030441067, + "grad_norm": 0.42091286182403564, + "learning_rate": 8.147127265611991e-05, + "loss": 2.4639, + "step": 912 + }, + { + "epoch": 0.3202663045858701, + "grad_norm": 0.4637336730957031, + "learning_rate": 8.138113993710377e-05, + "loss": 2.5368, + "step": 914 + }, + { + "epoch": 0.32096710612763346, + "grad_norm": 0.48139557242393494, + "learning_rate": 8.129083865238207e-05, + "loss": 2.4677, + "step": 916 + }, + { + "epoch": 0.32166790766939685, + "grad_norm": 0.4305325448513031, + "learning_rate": 8.120036928701367e-05, + "loss": 2.5125, + "step": 918 + }, + { + "epoch": 0.3223687092111603, + "grad_norm": 0.4576675295829773, + "learning_rate": 8.110973232696029e-05, + "loss": 2.4559, + "step": 920 + }, + { + "epoch": 0.32306951075292367, + "grad_norm": 0.4506300687789917, + "learning_rate": 8.10189282590839e-05, + "loss": 2.46, + "step": 922 + }, + { + "epoch": 0.32377031229468706, + "grad_norm": 0.4221961200237274, + "learning_rate": 8.092795757114405e-05, + "loss": 2.5269, + "step": 924 + }, + { + "epoch": 0.32447111383645044, + "grad_norm": 0.4183507561683655, + "learning_rate": 8.083682075179535e-05, + "loss": 2.4963, + "step": 926 + }, + { + "epoch": 0.3251719153782138, + "grad_norm": 0.40157100558280945, + "learning_rate": 8.07455182905848e-05, + "loss": 2.497, + "step": 928 + }, + { + "epoch": 0.3258727169199772, + "grad_norm": 0.41130340099334717, + "learning_rate": 8.06540506779491e-05, + "loss": 2.4894, + "step": 930 + }, + { + "epoch": 0.3265735184617406, + "grad_norm": 0.45491817593574524, + "learning_rate": 8.056241840521212e-05, + "loss": 2.4991, + "step": 932 + }, + { + "epoch": 0.327274320003504, + "grad_norm": 0.5285101532936096, + "learning_rate": 8.047062196458222e-05, + "loss": 2.4956, + "step": 934 + }, + { + "epoch": 0.3279751215452674, + "grad_norm": 0.4162616729736328, + "learning_rate": 8.037866184914952e-05, + "loss": 2.4349, + "step": 936 + }, + { + "epoch": 0.3286759230870308, + "grad_norm": 0.47431331872940063, + "learning_rate": 8.028653855288342e-05, + "loss": 2.4502, + "step": 938 + }, + { + "epoch": 0.3293767246287942, + "grad_norm": 0.516228437423706, + "learning_rate": 8.019425257062983e-05, + "loss": 2.463, + "step": 940 + }, + { + "epoch": 0.3300775261705576, + "grad_norm": 0.4729955196380615, + "learning_rate": 8.010180439810852e-05, + "loss": 2.4764, + "step": 942 + }, + { + "epoch": 0.33077832771232096, + "grad_norm": 0.4590371549129486, + "learning_rate": 8.000919453191046e-05, + "loss": 2.4677, + "step": 944 + }, + { + "epoch": 0.33147912925408435, + "grad_norm": 0.4180818498134613, + "learning_rate": 7.99164234694952e-05, + "loss": 2.4478, + "step": 946 + }, + { + "epoch": 0.33217993079584773, + "grad_norm": 0.459830641746521, + "learning_rate": 7.982349170918819e-05, + "loss": 2.5018, + "step": 948 + }, + { + "epoch": 0.3328807323376111, + "grad_norm": 0.42037877440452576, + "learning_rate": 7.9730399750178e-05, + "loss": 2.5018, + "step": 950 + }, + { + "epoch": 0.33358153387937456, + "grad_norm": 0.48284855484962463, + "learning_rate": 7.963714809251375e-05, + "loss": 2.4853, + "step": 952 + }, + { + "epoch": 0.33428233542113794, + "grad_norm": 0.4275258779525757, + "learning_rate": 7.954373723710247e-05, + "loss": 2.4716, + "step": 954 + }, + { + "epoch": 0.33498313696290133, + "grad_norm": 0.4475047290325165, + "learning_rate": 7.945016768570619e-05, + "loss": 2.5075, + "step": 956 + }, + { + "epoch": 0.3356839385046647, + "grad_norm": 0.4026126265525818, + "learning_rate": 7.93564399409395e-05, + "loss": 2.4381, + "step": 958 + }, + { + "epoch": 0.3363847400464281, + "grad_norm": 0.4063897132873535, + "learning_rate": 7.926255450626668e-05, + "loss": 2.4607, + "step": 960 + }, + { + "epoch": 0.3370855415881915, + "grad_norm": 0.4034193158149719, + "learning_rate": 7.916851188599908e-05, + "loss": 2.4615, + "step": 962 + }, + { + "epoch": 0.33778634312995487, + "grad_norm": 0.449735552072525, + "learning_rate": 7.907431258529232e-05, + "loss": 2.447, + "step": 964 + }, + { + "epoch": 0.33848714467171825, + "grad_norm": 0.46238863468170166, + "learning_rate": 7.897995711014373e-05, + "loss": 2.5097, + "step": 966 + }, + { + "epoch": 0.3391879462134817, + "grad_norm": 0.4304943084716797, + "learning_rate": 7.88854459673895e-05, + "loss": 2.48, + "step": 968 + }, + { + "epoch": 0.3398887477552451, + "grad_norm": 0.4926084578037262, + "learning_rate": 7.879077966470194e-05, + "loss": 2.4716, + "step": 970 + }, + { + "epoch": 0.34058954929700846, + "grad_norm": 0.4950031638145447, + "learning_rate": 7.86959587105869e-05, + "loss": 2.4648, + "step": 972 + }, + { + "epoch": 0.34129035083877185, + "grad_norm": 0.4255363345146179, + "learning_rate": 7.860098361438092e-05, + "loss": 2.4388, + "step": 974 + }, + { + "epoch": 0.34199115238053523, + "grad_norm": 0.48036453127861023, + "learning_rate": 7.85058548862485e-05, + "loss": 2.4286, + "step": 976 + }, + { + "epoch": 0.3426919539222986, + "grad_norm": 0.43235254287719727, + "learning_rate": 7.84105730371794e-05, + "loss": 2.5079, + "step": 978 + }, + { + "epoch": 0.343392755464062, + "grad_norm": 0.4415871500968933, + "learning_rate": 7.831513857898589e-05, + "loss": 2.4995, + "step": 980 + }, + { + "epoch": 0.3440935570058254, + "grad_norm": 0.4261731207370758, + "learning_rate": 7.821955202429997e-05, + "loss": 2.4732, + "step": 982 + }, + { + "epoch": 0.34479435854758883, + "grad_norm": 0.4182513654232025, + "learning_rate": 7.812381388657066e-05, + "loss": 2.4424, + "step": 984 + }, + { + "epoch": 0.3454951600893522, + "grad_norm": 0.38975790143013, + "learning_rate": 7.802792468006119e-05, + "loss": 2.4672, + "step": 986 + }, + { + "epoch": 0.3461959616311156, + "grad_norm": 0.4010710120201111, + "learning_rate": 7.793188491984626e-05, + "loss": 2.4458, + "step": 988 + }, + { + "epoch": 0.346896763172879, + "grad_norm": 0.4334274232387543, + "learning_rate": 7.783569512180933e-05, + "loss": 2.4296, + "step": 990 + }, + { + "epoch": 0.34759756471464237, + "grad_norm": 0.42379283905029297, + "learning_rate": 7.77393558026397e-05, + "loss": 2.4737, + "step": 992 + }, + { + "epoch": 0.34829836625640576, + "grad_norm": 0.3837527632713318, + "learning_rate": 7.764286747982998e-05, + "loss": 2.5038, + "step": 994 + }, + { + "epoch": 0.34899916779816914, + "grad_norm": 0.3951718211174011, + "learning_rate": 7.754623067167301e-05, + "loss": 2.4743, + "step": 996 + }, + { + "epoch": 0.3496999693399325, + "grad_norm": 0.42385029792785645, + "learning_rate": 7.744944589725931e-05, + "loss": 2.4839, + "step": 998 + }, + { + "epoch": 0.35040077088169597, + "grad_norm": 0.35979923605918884, + "learning_rate": 7.73525136764742e-05, + "loss": 2.466, + "step": 1000 + }, + { + "epoch": 0.35110157242345935, + "grad_norm": 0.3813255727291107, + "learning_rate": 7.725543452999501e-05, + "loss": 2.4657, + "step": 1002 + }, + { + "epoch": 0.35180237396522274, + "grad_norm": 0.44004762172698975, + "learning_rate": 7.715820897928831e-05, + "loss": 2.4969, + "step": 1004 + }, + { + "epoch": 0.3525031755069861, + "grad_norm": 0.41002413630485535, + "learning_rate": 7.706083754660704e-05, + "loss": 2.4762, + "step": 1006 + }, + { + "epoch": 0.3532039770487495, + "grad_norm": 0.39008840918540955, + "learning_rate": 7.696332075498778e-05, + "loss": 2.4857, + "step": 1008 + }, + { + "epoch": 0.3539047785905129, + "grad_norm": 0.38840317726135254, + "learning_rate": 7.686565912824797e-05, + "loss": 2.4517, + "step": 1010 + }, + { + "epoch": 0.3546055801322763, + "grad_norm": 0.4124884605407715, + "learning_rate": 7.676785319098292e-05, + "loss": 2.5003, + "step": 1012 + }, + { + "epoch": 0.35530638167403966, + "grad_norm": 0.39664867520332336, + "learning_rate": 7.666990346856323e-05, + "loss": 2.4567, + "step": 1014 + }, + { + "epoch": 0.3560071832158031, + "grad_norm": 0.41662028431892395, + "learning_rate": 7.65718104871318e-05, + "loss": 2.4603, + "step": 1016 + }, + { + "epoch": 0.3567079847575665, + "grad_norm": 0.41218042373657227, + "learning_rate": 7.647357477360103e-05, + "loss": 2.4775, + "step": 1018 + }, + { + "epoch": 0.3574087862993299, + "grad_norm": 0.4241933226585388, + "learning_rate": 7.637519685565007e-05, + "loss": 2.4581, + "step": 1020 + }, + { + "epoch": 0.35810958784109326, + "grad_norm": 0.4417102336883545, + "learning_rate": 7.627667726172188e-05, + "loss": 2.4707, + "step": 1022 + }, + { + "epoch": 0.35881038938285664, + "grad_norm": 0.4153529405593872, + "learning_rate": 7.617801652102047e-05, + "loss": 2.4899, + "step": 1024 + }, + { + "epoch": 0.35951119092462, + "grad_norm": 0.42329278588294983, + "learning_rate": 7.607921516350805e-05, + "loss": 2.4753, + "step": 1026 + }, + { + "epoch": 0.3602119924663834, + "grad_norm": 0.4246659576892853, + "learning_rate": 7.598027371990209e-05, + "loss": 2.4887, + "step": 1028 + }, + { + "epoch": 0.3609127940081468, + "grad_norm": 0.4006962180137634, + "learning_rate": 7.58811927216726e-05, + "loss": 2.4446, + "step": 1030 + }, + { + "epoch": 0.3616135955499102, + "grad_norm": 0.4364110231399536, + "learning_rate": 7.578197270103922e-05, + "loss": 2.4661, + "step": 1032 + }, + { + "epoch": 0.3623143970916736, + "grad_norm": 0.41410303115844727, + "learning_rate": 7.568261419096829e-05, + "loss": 2.5071, + "step": 1034 + }, + { + "epoch": 0.363015198633437, + "grad_norm": 0.4187104403972626, + "learning_rate": 7.558311772517015e-05, + "loss": 2.4325, + "step": 1036 + }, + { + "epoch": 0.3637160001752004, + "grad_norm": 0.3853437304496765, + "learning_rate": 7.548348383809609e-05, + "loss": 2.5366, + "step": 1038 + }, + { + "epoch": 0.3644168017169638, + "grad_norm": 0.4048047363758087, + "learning_rate": 7.538371306493565e-05, + "loss": 2.5017, + "step": 1040 + }, + { + "epoch": 0.36511760325872716, + "grad_norm": 0.5083643198013306, + "learning_rate": 7.528380594161357e-05, + "loss": 2.4921, + "step": 1042 + }, + { + "epoch": 0.36581840480049055, + "grad_norm": 0.48616695404052734, + "learning_rate": 7.51837630047871e-05, + "loss": 2.4939, + "step": 1044 + }, + { + "epoch": 0.36651920634225393, + "grad_norm": 0.4780706763267517, + "learning_rate": 7.508358479184292e-05, + "loss": 2.4355, + "step": 1046 + }, + { + "epoch": 0.3672200078840173, + "grad_norm": 0.48671412467956543, + "learning_rate": 7.498327184089444e-05, + "loss": 2.4921, + "step": 1048 + }, + { + "epoch": 0.36792080942578076, + "grad_norm": 0.4375866651535034, + "learning_rate": 7.488282469077878e-05, + "loss": 2.47, + "step": 1050 + }, + { + "epoch": 0.36862161096754414, + "grad_norm": 0.40635204315185547, + "learning_rate": 7.478224388105395e-05, + "loss": 2.4903, + "step": 1052 + }, + { + "epoch": 0.36932241250930753, + "grad_norm": 0.4381403923034668, + "learning_rate": 7.468152995199586e-05, + "loss": 2.4512, + "step": 1054 + }, + { + "epoch": 0.3700232140510709, + "grad_norm": 0.4581202566623688, + "learning_rate": 7.458068344459556e-05, + "loss": 2.461, + "step": 1056 + }, + { + "epoch": 0.3707240155928343, + "grad_norm": 0.41336745023727417, + "learning_rate": 7.447970490055615e-05, + "loss": 2.5116, + "step": 1058 + }, + { + "epoch": 0.3714248171345977, + "grad_norm": 0.44782060384750366, + "learning_rate": 7.437859486229008e-05, + "loss": 2.463, + "step": 1060 + }, + { + "epoch": 0.37212561867636107, + "grad_norm": 0.39710888266563416, + "learning_rate": 7.42773538729161e-05, + "loss": 2.4415, + "step": 1062 + }, + { + "epoch": 0.37282642021812445, + "grad_norm": 0.4727705717086792, + "learning_rate": 7.41759824762563e-05, + "loss": 2.5044, + "step": 1064 + }, + { + "epoch": 0.3735272217598879, + "grad_norm": 0.4554997384548187, + "learning_rate": 7.407448121683334e-05, + "loss": 2.458, + "step": 1066 + }, + { + "epoch": 0.3742280233016513, + "grad_norm": 0.3889988958835602, + "learning_rate": 7.397285063986743e-05, + "loss": 2.3949, + "step": 1068 + }, + { + "epoch": 0.37492882484341467, + "grad_norm": 0.4311942160129547, + "learning_rate": 7.387109129127338e-05, + "loss": 2.49, + "step": 1070 + }, + { + "epoch": 0.37562962638517805, + "grad_norm": 0.5033543705940247, + "learning_rate": 7.376920371765778e-05, + "loss": 2.5222, + "step": 1072 + }, + { + "epoch": 0.37633042792694144, + "grad_norm": 0.44039902091026306, + "learning_rate": 7.366718846631589e-05, + "loss": 2.4676, + "step": 1074 + }, + { + "epoch": 0.3770312294687048, + "grad_norm": 0.4856110215187073, + "learning_rate": 7.356504608522886e-05, + "loss": 2.475, + "step": 1076 + }, + { + "epoch": 0.3777320310104682, + "grad_norm": 0.4543730914592743, + "learning_rate": 7.346277712306074e-05, + "loss": 2.4278, + "step": 1078 + }, + { + "epoch": 0.3784328325522316, + "grad_norm": 0.44085338711738586, + "learning_rate": 7.336038212915547e-05, + "loss": 2.4556, + "step": 1080 + }, + { + "epoch": 0.37913363409399503, + "grad_norm": 0.46556994318962097, + "learning_rate": 7.325786165353403e-05, + "loss": 2.5022, + "step": 1082 + }, + { + "epoch": 0.3798344356357584, + "grad_norm": 0.49558380246162415, + "learning_rate": 7.315521624689135e-05, + "loss": 2.441, + "step": 1084 + }, + { + "epoch": 0.3805352371775218, + "grad_norm": 0.41412246227264404, + "learning_rate": 7.305244646059353e-05, + "loss": 2.4467, + "step": 1086 + }, + { + "epoch": 0.3812360387192852, + "grad_norm": 0.40206030011177063, + "learning_rate": 7.294955284667473e-05, + "loss": 2.4546, + "step": 1088 + }, + { + "epoch": 0.38193684026104857, + "grad_norm": 0.4202801585197449, + "learning_rate": 7.284653595783427e-05, + "loss": 2.4638, + "step": 1090 + }, + { + "epoch": 0.38263764180281196, + "grad_norm": 0.41482287645339966, + "learning_rate": 7.274339634743364e-05, + "loss": 2.4295, + "step": 1092 + }, + { + "epoch": 0.38333844334457534, + "grad_norm": 0.4058382511138916, + "learning_rate": 7.264013456949352e-05, + "loss": 2.4885, + "step": 1094 + }, + { + "epoch": 0.3840392448863387, + "grad_norm": 0.3752575218677521, + "learning_rate": 7.253675117869088e-05, + "loss": 2.4247, + "step": 1096 + }, + { + "epoch": 0.38474004642810217, + "grad_norm": 0.4093259572982788, + "learning_rate": 7.24332467303559e-05, + "loss": 2.4403, + "step": 1098 + }, + { + "epoch": 0.38544084796986555, + "grad_norm": 0.4372352063655853, + "learning_rate": 7.232962178046901e-05, + "loss": 2.4606, + "step": 1100 + }, + { + "epoch": 0.38614164951162894, + "grad_norm": 0.4315595328807831, + "learning_rate": 7.222587688565796e-05, + "loss": 2.4756, + "step": 1102 + }, + { + "epoch": 0.3868424510533923, + "grad_norm": 0.4413852095603943, + "learning_rate": 7.212201260319477e-05, + "loss": 2.4678, + "step": 1104 + }, + { + "epoch": 0.3875432525951557, + "grad_norm": 0.41132184863090515, + "learning_rate": 7.201802949099275e-05, + "loss": 2.4659, + "step": 1106 + }, + { + "epoch": 0.3882440541369191, + "grad_norm": 0.3732544183731079, + "learning_rate": 7.191392810760356e-05, + "loss": 2.4209, + "step": 1108 + }, + { + "epoch": 0.3889448556786825, + "grad_norm": 0.4892706871032715, + "learning_rate": 7.180970901221408e-05, + "loss": 2.4762, + "step": 1110 + }, + { + "epoch": 0.38964565722044586, + "grad_norm": 0.42454981803894043, + "learning_rate": 7.170537276464355e-05, + "loss": 2.4781, + "step": 1112 + }, + { + "epoch": 0.3903464587622093, + "grad_norm": 0.4017874598503113, + "learning_rate": 7.160091992534051e-05, + "loss": 2.4563, + "step": 1114 + }, + { + "epoch": 0.3910472603039727, + "grad_norm": 0.4679020643234253, + "learning_rate": 7.14963510553797e-05, + "loss": 2.4337, + "step": 1116 + }, + { + "epoch": 0.3917480618457361, + "grad_norm": 0.4594592750072479, + "learning_rate": 7.139166671645923e-05, + "loss": 2.4431, + "step": 1118 + }, + { + "epoch": 0.39244886338749946, + "grad_norm": 0.4678356349468231, + "learning_rate": 7.128686747089737e-05, + "loss": 2.4513, + "step": 1120 + }, + { + "epoch": 0.39314966492926284, + "grad_norm": 0.462960422039032, + "learning_rate": 7.118195388162966e-05, + "loss": 2.4636, + "step": 1122 + }, + { + "epoch": 0.39385046647102623, + "grad_norm": 0.4102267920970917, + "learning_rate": 7.107692651220585e-05, + "loss": 2.4249, + "step": 1124 + }, + { + "epoch": 0.3945512680127896, + "grad_norm": 0.4165082573890686, + "learning_rate": 7.097178592678683e-05, + "loss": 2.4793, + "step": 1126 + }, + { + "epoch": 0.395252069554553, + "grad_norm": 0.438484787940979, + "learning_rate": 7.086653269014171e-05, + "loss": 2.4437, + "step": 1128 + }, + { + "epoch": 0.39595287109631644, + "grad_norm": 0.4235878884792328, + "learning_rate": 7.076116736764461e-05, + "loss": 2.4276, + "step": 1130 + }, + { + "epoch": 0.3966536726380798, + "grad_norm": 0.4382783770561218, + "learning_rate": 7.065569052527182e-05, + "loss": 2.4572, + "step": 1132 + }, + { + "epoch": 0.3973544741798432, + "grad_norm": 0.4001896381378174, + "learning_rate": 7.055010272959861e-05, + "loss": 2.4796, + "step": 1134 + }, + { + "epoch": 0.3980552757216066, + "grad_norm": 0.5236755609512329, + "learning_rate": 7.044440454779625e-05, + "loss": 2.395, + "step": 1136 + }, + { + "epoch": 0.39875607726337, + "grad_norm": 0.4379066824913025, + "learning_rate": 7.033859654762898e-05, + "loss": 2.4206, + "step": 1138 + }, + { + "epoch": 0.39945687880513336, + "grad_norm": 0.44132131338119507, + "learning_rate": 7.023267929745092e-05, + "loss": 2.4723, + "step": 1140 + }, + { + "epoch": 0.40015768034689675, + "grad_norm": 0.4132694602012634, + "learning_rate": 7.012665336620303e-05, + "loss": 2.4371, + "step": 1142 + }, + { + "epoch": 0.40085848188866013, + "grad_norm": 0.4186249077320099, + "learning_rate": 7.002051932341008e-05, + "loss": 2.4136, + "step": 1144 + }, + { + "epoch": 0.4015592834304235, + "grad_norm": 0.43945756554603577, + "learning_rate": 6.99142777391775e-05, + "loss": 2.3931, + "step": 1146 + }, + { + "epoch": 0.40226008497218696, + "grad_norm": 0.402310311794281, + "learning_rate": 6.980792918418849e-05, + "loss": 2.4078, + "step": 1148 + }, + { + "epoch": 0.40296088651395034, + "grad_norm": 0.4048164486885071, + "learning_rate": 6.970147422970074e-05, + "loss": 2.44, + "step": 1150 + }, + { + "epoch": 0.40366168805571373, + "grad_norm": 0.4304601550102234, + "learning_rate": 6.959491344754357e-05, + "loss": 2.481, + "step": 1152 + }, + { + "epoch": 0.4043624895974771, + "grad_norm": 0.41222673654556274, + "learning_rate": 6.94882474101147e-05, + "loss": 2.4581, + "step": 1154 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 0.4438692033290863, + "learning_rate": 6.938147669037724e-05, + "loss": 2.4474, + "step": 1156 + }, + { + "epoch": 0.4057640926810039, + "grad_norm": 0.41356995701789856, + "learning_rate": 6.927460186185663e-05, + "loss": 2.4394, + "step": 1158 + }, + { + "epoch": 0.40646489422276727, + "grad_norm": 0.43938153982162476, + "learning_rate": 6.916762349863755e-05, + "loss": 2.4529, + "step": 1160 + }, + { + "epoch": 0.40716569576453066, + "grad_norm": 0.4264746904373169, + "learning_rate": 6.906054217536074e-05, + "loss": 2.4103, + "step": 1162 + }, + { + "epoch": 0.4078664973062941, + "grad_norm": 0.40205949544906616, + "learning_rate": 6.895335846722007e-05, + "loss": 2.4533, + "step": 1164 + }, + { + "epoch": 0.4085672988480575, + "grad_norm": 0.3951698839664459, + "learning_rate": 6.88460729499594e-05, + "loss": 2.4059, + "step": 1166 + }, + { + "epoch": 0.40926810038982087, + "grad_norm": 0.40137380361557007, + "learning_rate": 6.873868619986938e-05, + "loss": 2.4282, + "step": 1168 + }, + { + "epoch": 0.40996890193158425, + "grad_norm": 0.39662086963653564, + "learning_rate": 6.863119879378451e-05, + "loss": 2.404, + "step": 1170 + }, + { + "epoch": 0.41066970347334764, + "grad_norm": 0.40667644143104553, + "learning_rate": 6.852361130907992e-05, + "loss": 2.4205, + "step": 1172 + }, + { + "epoch": 0.411370505015111, + "grad_norm": 0.4023439288139343, + "learning_rate": 6.841592432366835e-05, + "loss": 2.4271, + "step": 1174 + }, + { + "epoch": 0.4120713065568744, + "grad_norm": 0.4581812620162964, + "learning_rate": 6.830813841599702e-05, + "loss": 2.4589, + "step": 1176 + }, + { + "epoch": 0.4127721080986378, + "grad_norm": 0.40035149455070496, + "learning_rate": 6.820025416504447e-05, + "loss": 2.4299, + "step": 1178 + }, + { + "epoch": 0.41347290964040123, + "grad_norm": 0.40674343705177307, + "learning_rate": 6.809227215031757e-05, + "loss": 2.4479, + "step": 1180 + }, + { + "epoch": 0.4141737111821646, + "grad_norm": 0.3893290162086487, + "learning_rate": 6.798419295184823e-05, + "loss": 2.4555, + "step": 1182 + }, + { + "epoch": 0.414874512723928, + "grad_norm": 0.4277731776237488, + "learning_rate": 6.787601715019051e-05, + "loss": 2.426, + "step": 1184 + }, + { + "epoch": 0.4155753142656914, + "grad_norm": 0.3891987204551697, + "learning_rate": 6.776774532641731e-05, + "loss": 2.455, + "step": 1186 + }, + { + "epoch": 0.41627611580745477, + "grad_norm": 0.42866095900535583, + "learning_rate": 6.765937806211731e-05, + "loss": 2.4278, + "step": 1188 + }, + { + "epoch": 0.41697691734921816, + "grad_norm": 0.4561346769332886, + "learning_rate": 6.75509159393919e-05, + "loss": 2.4655, + "step": 1190 + }, + { + "epoch": 0.41767771889098154, + "grad_norm": 0.4195440709590912, + "learning_rate": 6.744235954085193e-05, + "loss": 2.4429, + "step": 1192 + }, + { + "epoch": 0.4183785204327449, + "grad_norm": 0.43625494837760925, + "learning_rate": 6.733370944961476e-05, + "loss": 2.4733, + "step": 1194 + }, + { + "epoch": 0.41907932197450837, + "grad_norm": 0.45825034379959106, + "learning_rate": 6.7224966249301e-05, + "loss": 2.4267, + "step": 1196 + }, + { + "epoch": 0.41978012351627175, + "grad_norm": 0.4522963762283325, + "learning_rate": 6.711613052403129e-05, + "loss": 2.4268, + "step": 1198 + }, + { + "epoch": 0.42048092505803514, + "grad_norm": 0.43499064445495605, + "learning_rate": 6.700720285842344e-05, + "loss": 2.4429, + "step": 1200 + }, + { + "epoch": 0.4211817265997985, + "grad_norm": 0.39839762449264526, + "learning_rate": 6.689818383758899e-05, + "loss": 2.4226, + "step": 1202 + }, + { + "epoch": 0.4218825281415619, + "grad_norm": 0.42643973231315613, + "learning_rate": 6.678907404713028e-05, + "loss": 2.4154, + "step": 1204 + }, + { + "epoch": 0.4225833296833253, + "grad_norm": 0.38087818026542664, + "learning_rate": 6.667987407313721e-05, + "loss": 2.4337, + "step": 1206 + }, + { + "epoch": 0.4232841312250887, + "grad_norm": 0.3658069968223572, + "learning_rate": 6.657058450218407e-05, + "loss": 2.4608, + "step": 1208 + }, + { + "epoch": 0.42398493276685206, + "grad_norm": 0.35932332277297974, + "learning_rate": 6.646120592132647e-05, + "loss": 2.3846, + "step": 1210 + }, + { + "epoch": 0.4246857343086155, + "grad_norm": 0.3892819583415985, + "learning_rate": 6.635173891809811e-05, + "loss": 2.4385, + "step": 1212 + }, + { + "epoch": 0.4253865358503789, + "grad_norm": 0.4014433026313782, + "learning_rate": 6.624218408050768e-05, + "loss": 2.4243, + "step": 1214 + }, + { + "epoch": 0.4260873373921423, + "grad_norm": 0.38692885637283325, + "learning_rate": 6.613254199703567e-05, + "loss": 2.484, + "step": 1216 + }, + { + "epoch": 0.42678813893390566, + "grad_norm": 0.3929356336593628, + "learning_rate": 6.602281325663119e-05, + "loss": 2.4391, + "step": 1218 + }, + { + "epoch": 0.42748894047566904, + "grad_norm": 0.37778228521347046, + "learning_rate": 6.591299844870886e-05, + "loss": 2.4475, + "step": 1220 + }, + { + "epoch": 0.42818974201743243, + "grad_norm": 0.3806099593639374, + "learning_rate": 6.580309816314566e-05, + "loss": 2.465, + "step": 1222 + }, + { + "epoch": 0.4288905435591958, + "grad_norm": 0.3700454831123352, + "learning_rate": 6.569311299027758e-05, + "loss": 2.4834, + "step": 1224 + }, + { + "epoch": 0.4295913451009592, + "grad_norm": 0.4086442291736603, + "learning_rate": 6.558304352089676e-05, + "loss": 2.4605, + "step": 1226 + }, + { + "epoch": 0.43029214664272264, + "grad_norm": 0.4239005744457245, + "learning_rate": 6.547289034624803e-05, + "loss": 2.4578, + "step": 1228 + }, + { + "epoch": 0.430992948184486, + "grad_norm": 0.4295092523097992, + "learning_rate": 6.536265405802588e-05, + "loss": 2.4021, + "step": 1230 + }, + { + "epoch": 0.4316937497262494, + "grad_norm": 0.3939379155635834, + "learning_rate": 6.525233524837126e-05, + "loss": 2.4258, + "step": 1232 + }, + { + "epoch": 0.4323945512680128, + "grad_norm": 0.40129542350769043, + "learning_rate": 6.514193450986838e-05, + "loss": 2.4434, + "step": 1234 + }, + { + "epoch": 0.4330953528097762, + "grad_norm": 0.41189342737197876, + "learning_rate": 6.503145243554151e-05, + "loss": 2.4488, + "step": 1236 + }, + { + "epoch": 0.43379615435153956, + "grad_norm": 0.39507558941841125, + "learning_rate": 6.492088961885189e-05, + "loss": 2.437, + "step": 1238 + }, + { + "epoch": 0.43449695589330295, + "grad_norm": 0.4083710014820099, + "learning_rate": 6.481024665369437e-05, + "loss": 2.4487, + "step": 1240 + }, + { + "epoch": 0.43519775743506633, + "grad_norm": 0.39629602432250977, + "learning_rate": 6.469952413439444e-05, + "loss": 2.4023, + "step": 1242 + }, + { + "epoch": 0.4358985589768298, + "grad_norm": 0.38575878739356995, + "learning_rate": 6.458872265570482e-05, + "loss": 2.4191, + "step": 1244 + }, + { + "epoch": 0.43659936051859316, + "grad_norm": 0.4187893271446228, + "learning_rate": 6.44778428128024e-05, + "loss": 2.424, + "step": 1246 + }, + { + "epoch": 0.43730016206035655, + "grad_norm": 0.41006091237068176, + "learning_rate": 6.436688520128505e-05, + "loss": 2.4236, + "step": 1248 + }, + { + "epoch": 0.43800096360211993, + "grad_norm": 0.3775830566883087, + "learning_rate": 6.42558504171683e-05, + "loss": 2.4487, + "step": 1250 + }, + { + "epoch": 0.4387017651438833, + "grad_norm": 0.40173962712287903, + "learning_rate": 6.41447390568823e-05, + "loss": 2.4602, + "step": 1252 + }, + { + "epoch": 0.4394025666856467, + "grad_norm": 0.4107741713523865, + "learning_rate": 6.403355171726844e-05, + "loss": 2.4658, + "step": 1254 + }, + { + "epoch": 0.4401033682274101, + "grad_norm": 0.37253475189208984, + "learning_rate": 6.392228899557635e-05, + "loss": 2.444, + "step": 1256 + }, + { + "epoch": 0.44080416976917347, + "grad_norm": 0.41569939255714417, + "learning_rate": 6.38109514894605e-05, + "loss": 2.413, + "step": 1258 + }, + { + "epoch": 0.4415049713109369, + "grad_norm": 0.3902023732662201, + "learning_rate": 6.369953979697707e-05, + "loss": 2.4157, + "step": 1260 + }, + { + "epoch": 0.4422057728527003, + "grad_norm": 0.3787744641304016, + "learning_rate": 6.358805451658079e-05, + "loss": 2.4063, + "step": 1262 + }, + { + "epoch": 0.4429065743944637, + "grad_norm": 0.38019701838493347, + "learning_rate": 6.347649624712159e-05, + "loss": 2.4503, + "step": 1264 + }, + { + "epoch": 0.44360737593622707, + "grad_norm": 0.37601906061172485, + "learning_rate": 6.336486558784154e-05, + "loss": 2.4674, + "step": 1266 + }, + { + "epoch": 0.44430817747799045, + "grad_norm": 0.3607248067855835, + "learning_rate": 6.325316313837153e-05, + "loss": 2.4162, + "step": 1268 + }, + { + "epoch": 0.44500897901975384, + "grad_norm": 0.3930775821208954, + "learning_rate": 6.314138949872808e-05, + "loss": 2.4097, + "step": 1270 + }, + { + "epoch": 0.4457097805615172, + "grad_norm": 0.37233275175094604, + "learning_rate": 6.302954526931009e-05, + "loss": 2.4397, + "step": 1272 + }, + { + "epoch": 0.4464105821032806, + "grad_norm": 0.36300432682037354, + "learning_rate": 6.291763105089567e-05, + "loss": 2.429, + "step": 1274 + }, + { + "epoch": 0.447111383645044, + "grad_norm": 0.3873036205768585, + "learning_rate": 6.280564744463886e-05, + "loss": 2.449, + "step": 1276 + }, + { + "epoch": 0.44781218518680743, + "grad_norm": 0.3959653973579407, + "learning_rate": 6.269359505206641e-05, + "loss": 2.4212, + "step": 1278 + }, + { + "epoch": 0.4485129867285708, + "grad_norm": 0.38037773966789246, + "learning_rate": 6.25814744750746e-05, + "loss": 2.409, + "step": 1280 + }, + { + "epoch": 0.4492137882703342, + "grad_norm": 0.42133620381355286, + "learning_rate": 6.246928631592593e-05, + "loss": 2.472, + "step": 1282 + }, + { + "epoch": 0.4499145898120976, + "grad_norm": 0.3604891002178192, + "learning_rate": 6.235703117724591e-05, + "loss": 2.4107, + "step": 1284 + }, + { + "epoch": 0.450615391353861, + "grad_norm": 0.40617790818214417, + "learning_rate": 6.224470966201991e-05, + "loss": 2.4255, + "step": 1286 + }, + { + "epoch": 0.45131619289562436, + "grad_norm": 0.424081414937973, + "learning_rate": 6.213232237358977e-05, + "loss": 2.4598, + "step": 1288 + }, + { + "epoch": 0.45201699443738774, + "grad_norm": 0.4017449617385864, + "learning_rate": 6.201986991565063e-05, + "loss": 2.4215, + "step": 1290 + }, + { + "epoch": 0.4527177959791511, + "grad_norm": 0.37244996428489685, + "learning_rate": 6.190735289224775e-05, + "loss": 2.4077, + "step": 1292 + }, + { + "epoch": 0.45341859752091457, + "grad_norm": 0.3613777458667755, + "learning_rate": 6.179477190777317e-05, + "loss": 2.4602, + "step": 1294 + }, + { + "epoch": 0.45411939906267795, + "grad_norm": 0.38270074129104614, + "learning_rate": 6.168212756696252e-05, + "loss": 2.398, + "step": 1296 + }, + { + "epoch": 0.45482020060444134, + "grad_norm": 0.35369983315467834, + "learning_rate": 6.156942047489174e-05, + "loss": 2.4614, + "step": 1298 + }, + { + "epoch": 0.4555210021462047, + "grad_norm": 0.3624161183834076, + "learning_rate": 6.145665123697383e-05, + "loss": 2.3981, + "step": 1300 + }, + { + "epoch": 0.4562218036879681, + "grad_norm": 0.35648027062416077, + "learning_rate": 6.134382045895563e-05, + "loss": 2.4268, + "step": 1302 + }, + { + "epoch": 0.4569226052297315, + "grad_norm": 0.3938058018684387, + "learning_rate": 6.123092874691453e-05, + "loss": 2.4214, + "step": 1304 + }, + { + "epoch": 0.4576234067714949, + "grad_norm": 0.4438329041004181, + "learning_rate": 6.111797670725527e-05, + "loss": 2.4096, + "step": 1306 + }, + { + "epoch": 0.45832420831325826, + "grad_norm": 0.47116366028785706, + "learning_rate": 6.100496494670658e-05, + "loss": 2.4272, + "step": 1308 + }, + { + "epoch": 0.4590250098550217, + "grad_norm": 0.42287012934684753, + "learning_rate": 6.0891894072318056e-05, + "loss": 2.3914, + "step": 1310 + }, + { + "epoch": 0.4597258113967851, + "grad_norm": 0.4389081299304962, + "learning_rate": 6.077876469145675e-05, + "loss": 2.4542, + "step": 1312 + }, + { + "epoch": 0.4604266129385485, + "grad_norm": 0.406688392162323, + "learning_rate": 6.0665577411804056e-05, + "loss": 2.4614, + "step": 1314 + }, + { + "epoch": 0.46112741448031186, + "grad_norm": 0.42373642325401306, + "learning_rate": 6.055233284135231e-05, + "loss": 2.4162, + "step": 1316 + }, + { + "epoch": 0.46182821602207524, + "grad_norm": 0.42829835414886475, + "learning_rate": 6.043903158840166e-05, + "loss": 2.4324, + "step": 1318 + }, + { + "epoch": 0.46252901756383863, + "grad_norm": 0.4124324917793274, + "learning_rate": 6.0325674261556686e-05, + "loss": 2.4084, + "step": 1320 + }, + { + "epoch": 0.463229819105602, + "grad_norm": 0.38950327038764954, + "learning_rate": 6.021226146972315e-05, + "loss": 2.5012, + "step": 1322 + }, + { + "epoch": 0.4639306206473654, + "grad_norm": 0.36520397663116455, + "learning_rate": 6.0098793822104804e-05, + "loss": 2.3733, + "step": 1324 + }, + { + "epoch": 0.46463142218912884, + "grad_norm": 0.38024482131004333, + "learning_rate": 5.998527192820001e-05, + "loss": 2.398, + "step": 1326 + }, + { + "epoch": 0.4653322237308922, + "grad_norm": 0.39080825448036194, + "learning_rate": 5.987169639779856e-05, + "loss": 2.4214, + "step": 1328 + }, + { + "epoch": 0.4660330252726556, + "grad_norm": 0.35209205746650696, + "learning_rate": 5.9758067840978325e-05, + "loss": 2.4532, + "step": 1330 + }, + { + "epoch": 0.466733826814419, + "grad_norm": 0.36672139167785645, + "learning_rate": 5.964438686810202e-05, + "loss": 2.4343, + "step": 1332 + }, + { + "epoch": 0.4674346283561824, + "grad_norm": 0.37624260783195496, + "learning_rate": 5.953065408981392e-05, + "loss": 2.4449, + "step": 1334 + }, + { + "epoch": 0.46813542989794577, + "grad_norm": 0.3691907525062561, + "learning_rate": 5.941687011703657e-05, + "loss": 2.4148, + "step": 1336 + }, + { + "epoch": 0.46883623143970915, + "grad_norm": 0.37521377205848694, + "learning_rate": 5.9303035560967546e-05, + "loss": 2.4287, + "step": 1338 + }, + { + "epoch": 0.46953703298147254, + "grad_norm": 0.4108455777168274, + "learning_rate": 5.918915103307605e-05, + "loss": 2.4007, + "step": 1340 + }, + { + "epoch": 0.470237834523236, + "grad_norm": 0.4070550799369812, + "learning_rate": 5.9075217145099806e-05, + "loss": 2.4627, + "step": 1342 + }, + { + "epoch": 0.47093863606499936, + "grad_norm": 0.38162675499916077, + "learning_rate": 5.896123450904162e-05, + "loss": 2.4326, + "step": 1344 + }, + { + "epoch": 0.47163943760676275, + "grad_norm": 0.4042952358722687, + "learning_rate": 5.884720373716617e-05, + "loss": 2.4683, + "step": 1346 + }, + { + "epoch": 0.47234023914852613, + "grad_norm": 0.3937741219997406, + "learning_rate": 5.8733125441996696e-05, + "loss": 2.4243, + "step": 1348 + }, + { + "epoch": 0.4730410406902895, + "grad_norm": 0.394265353679657, + "learning_rate": 5.861900023631172e-05, + "loss": 2.4323, + "step": 1350 + }, + { + "epoch": 0.4737418422320529, + "grad_norm": 0.44313526153564453, + "learning_rate": 5.8504828733141716e-05, + "loss": 2.4085, + "step": 1352 + }, + { + "epoch": 0.4744426437738163, + "grad_norm": 0.4558910131454468, + "learning_rate": 5.8390611545765886e-05, + "loss": 2.3757, + "step": 1354 + }, + { + "epoch": 0.47514344531557967, + "grad_norm": 0.41724249720573425, + "learning_rate": 5.827634928770882e-05, + "loss": 2.4224, + "step": 1356 + }, + { + "epoch": 0.4758442468573431, + "grad_norm": 0.40503472089767456, + "learning_rate": 5.816204257273719e-05, + "loss": 2.4405, + "step": 1358 + }, + { + "epoch": 0.4765450483991065, + "grad_norm": 0.36606565117836, + "learning_rate": 5.804769201485648e-05, + "loss": 2.4352, + "step": 1360 + }, + { + "epoch": 0.4772458499408699, + "grad_norm": 0.3968331515789032, + "learning_rate": 5.79332982283077e-05, + "loss": 2.4547, + "step": 1362 + }, + { + "epoch": 0.47794665148263327, + "grad_norm": 0.3790927231311798, + "learning_rate": 5.7818861827564006e-05, + "loss": 2.3899, + "step": 1364 + }, + { + "epoch": 0.47864745302439665, + "grad_norm": 0.3793366253376007, + "learning_rate": 5.770438342732755e-05, + "loss": 2.4267, + "step": 1366 + }, + { + "epoch": 0.47934825456616004, + "grad_norm": 0.43721577525138855, + "learning_rate": 5.7589863642525984e-05, + "loss": 2.3998, + "step": 1368 + }, + { + "epoch": 0.4800490561079234, + "grad_norm": 0.4096902012825012, + "learning_rate": 5.7475303088309355e-05, + "loss": 2.4216, + "step": 1370 + }, + { + "epoch": 0.4807498576496868, + "grad_norm": 0.37640735507011414, + "learning_rate": 5.736070238004663e-05, + "loss": 2.4119, + "step": 1372 + }, + { + "epoch": 0.48145065919145025, + "grad_norm": 0.37925073504447937, + "learning_rate": 5.724606213332251e-05, + "loss": 2.4375, + "step": 1374 + }, + { + "epoch": 0.48215146073321363, + "grad_norm": 0.40147697925567627, + "learning_rate": 5.713138296393407e-05, + "loss": 2.3849, + "step": 1376 + }, + { + "epoch": 0.482852262274977, + "grad_norm": 0.39544767141342163, + "learning_rate": 5.701666548788743e-05, + "loss": 2.3931, + "step": 1378 + }, + { + "epoch": 0.4835530638167404, + "grad_norm": 0.3495292365550995, + "learning_rate": 5.6901910321394535e-05, + "loss": 2.4072, + "step": 1380 + }, + { + "epoch": 0.4842538653585038, + "grad_norm": 0.3946402966976166, + "learning_rate": 5.678711808086975e-05, + "loss": 2.4355, + "step": 1382 + }, + { + "epoch": 0.4849546669002672, + "grad_norm": 0.4135989248752594, + "learning_rate": 5.667228938292658e-05, + "loss": 2.446, + "step": 1384 + }, + { + "epoch": 0.48565546844203056, + "grad_norm": 0.4330977201461792, + "learning_rate": 5.655742484437438e-05, + "loss": 2.3511, + "step": 1386 + }, + { + "epoch": 0.48635626998379394, + "grad_norm": 0.4151771664619446, + "learning_rate": 5.6442525082215026e-05, + "loss": 2.4394, + "step": 1388 + }, + { + "epoch": 0.4870570715255574, + "grad_norm": 0.39435574412345886, + "learning_rate": 5.6327590713639575e-05, + "loss": 2.4012, + "step": 1390 + }, + { + "epoch": 0.48775787306732077, + "grad_norm": 0.37605562806129456, + "learning_rate": 5.6212622356025015e-05, + "loss": 2.4322, + "step": 1392 + }, + { + "epoch": 0.48845867460908415, + "grad_norm": 0.39531436562538147, + "learning_rate": 5.609762062693086e-05, + "loss": 2.3881, + "step": 1394 + }, + { + "epoch": 0.48915947615084754, + "grad_norm": 0.3732660114765167, + "learning_rate": 5.5982586144095913e-05, + "loss": 2.4337, + "step": 1396 + }, + { + "epoch": 0.4898602776926109, + "grad_norm": 0.3481023907661438, + "learning_rate": 5.586751952543493e-05, + "loss": 2.3736, + "step": 1398 + }, + { + "epoch": 0.4905610792343743, + "grad_norm": 0.39541634917259216, + "learning_rate": 5.5752421389035235e-05, + "loss": 2.3951, + "step": 1400 + }, + { + "epoch": 0.4912618807761377, + "grad_norm": 0.41655370593070984, + "learning_rate": 5.56372923531535e-05, + "loss": 2.4328, + "step": 1402 + }, + { + "epoch": 0.4919626823179011, + "grad_norm": 0.37396180629730225, + "learning_rate": 5.552213303621235e-05, + "loss": 2.4274, + "step": 1404 + }, + { + "epoch": 0.49266348385966446, + "grad_norm": 0.36494237184524536, + "learning_rate": 5.540694405679707e-05, + "loss": 2.3988, + "step": 1406 + }, + { + "epoch": 0.4933642854014279, + "grad_norm": 0.4091382920742035, + "learning_rate": 5.52917260336523e-05, + "loss": 2.4052, + "step": 1408 + }, + { + "epoch": 0.4940650869431913, + "grad_norm": 0.5013120174407959, + "learning_rate": 5.517647958567867e-05, + "loss": 2.414, + "step": 1410 + }, + { + "epoch": 0.4947658884849547, + "grad_norm": 0.3942740261554718, + "learning_rate": 5.506120533192948e-05, + "loss": 2.3965, + "step": 1412 + }, + { + "epoch": 0.49546669002671806, + "grad_norm": 0.3768194019794464, + "learning_rate": 5.4945903891607406e-05, + "loss": 2.3793, + "step": 1414 + }, + { + "epoch": 0.49616749156848144, + "grad_norm": 0.39486873149871826, + "learning_rate": 5.4830575884061184e-05, + "loss": 2.4039, + "step": 1416 + }, + { + "epoch": 0.49686829311024483, + "grad_norm": 0.3837323784828186, + "learning_rate": 5.471522192878222e-05, + "loss": 2.4353, + "step": 1418 + }, + { + "epoch": 0.4975690946520082, + "grad_norm": 0.37630417943000793, + "learning_rate": 5.4599842645401335e-05, + "loss": 2.4056, + "step": 1420 + }, + { + "epoch": 0.4982698961937716, + "grad_norm": 0.39257028698921204, + "learning_rate": 5.4484438653685345e-05, + "loss": 2.4018, + "step": 1422 + }, + { + "epoch": 0.49897069773553504, + "grad_norm": 0.4382604658603668, + "learning_rate": 5.436901057353385e-05, + "loss": 2.4285, + "step": 1424 + }, + { + "epoch": 0.4996714992772984, + "grad_norm": 0.43138858675956726, + "learning_rate": 5.4253559024975816e-05, + "loss": 2.3996, + "step": 1426 + }, + { + "epoch": 0.5003723008190618, + "grad_norm": 0.37432757019996643, + "learning_rate": 5.4138084628166266e-05, + "loss": 2.4102, + "step": 1428 + }, + { + "epoch": 0.5010731023608251, + "grad_norm": 0.3795533776283264, + "learning_rate": 5.4022588003382955e-05, + "loss": 2.4231, + "step": 1430 + }, + { + "epoch": 0.5017739039025886, + "grad_norm": 0.3747682571411133, + "learning_rate": 5.390706977102304e-05, + "loss": 2.3939, + "step": 1432 + }, + { + "epoch": 0.502474705444352, + "grad_norm": 0.3895587921142578, + "learning_rate": 5.379153055159978e-05, + "loss": 2.3694, + "step": 1434 + }, + { + "epoch": 0.5031755069861154, + "grad_norm": 0.3492620289325714, + "learning_rate": 5.3675970965739076e-05, + "loss": 2.3752, + "step": 1436 + }, + { + "epoch": 0.5038763085278788, + "grad_norm": 0.3917596638202667, + "learning_rate": 5.356039163417633e-05, + "loss": 2.4205, + "step": 1438 + }, + { + "epoch": 0.5045771100696421, + "grad_norm": 0.3660494089126587, + "learning_rate": 5.344479317775295e-05, + "loss": 2.3649, + "step": 1440 + }, + { + "epoch": 0.5052779116114056, + "grad_norm": 0.3819909393787384, + "learning_rate": 5.332917621741308e-05, + "loss": 2.4108, + "step": 1442 + }, + { + "epoch": 0.5059787131531689, + "grad_norm": 0.34344562888145447, + "learning_rate": 5.321354137420029e-05, + "loss": 2.4052, + "step": 1444 + }, + { + "epoch": 0.5066795146949323, + "grad_norm": 0.3695683479309082, + "learning_rate": 5.309788926925418e-05, + "loss": 2.4155, + "step": 1446 + }, + { + "epoch": 0.5073803162366958, + "grad_norm": 0.35396599769592285, + "learning_rate": 5.2982220523807055e-05, + "loss": 2.4107, + "step": 1448 + }, + { + "epoch": 0.5080811177784591, + "grad_norm": 0.3450440466403961, + "learning_rate": 5.286653575918066e-05, + "loss": 2.4528, + "step": 1450 + }, + { + "epoch": 0.5087819193202225, + "grad_norm": 0.3628271520137787, + "learning_rate": 5.275083559678275e-05, + "loss": 2.4078, + "step": 1452 + }, + { + "epoch": 0.5094827208619859, + "grad_norm": 0.40400680899620056, + "learning_rate": 5.263512065810379e-05, + "loss": 2.3694, + "step": 1454 + }, + { + "epoch": 0.5101835224037493, + "grad_norm": 0.3815267086029053, + "learning_rate": 5.2519391564713626e-05, + "loss": 2.4096, + "step": 1456 + }, + { + "epoch": 0.5108843239455126, + "grad_norm": 0.35269948840141296, + "learning_rate": 5.2403648938258144e-05, + "loss": 2.4205, + "step": 1458 + }, + { + "epoch": 0.5115851254872761, + "grad_norm": 0.3467412292957306, + "learning_rate": 5.228789340045591e-05, + "loss": 2.4357, + "step": 1460 + }, + { + "epoch": 0.5122859270290394, + "grad_norm": 0.38257092237472534, + "learning_rate": 5.217212557309485e-05, + "loss": 2.3958, + "step": 1462 + }, + { + "epoch": 0.5129867285708029, + "grad_norm": 0.42102304100990295, + "learning_rate": 5.20563460780289e-05, + "loss": 2.4122, + "step": 1464 + }, + { + "epoch": 0.5136875301125663, + "grad_norm": 0.410961389541626, + "learning_rate": 5.194055553717471e-05, + "loss": 2.4256, + "step": 1466 + }, + { + "epoch": 0.5143883316543296, + "grad_norm": 0.3856711685657501, + "learning_rate": 5.182475457250816e-05, + "loss": 2.3983, + "step": 1468 + }, + { + "epoch": 0.5150891331960931, + "grad_norm": 0.4079275131225586, + "learning_rate": 5.1708943806061225e-05, + "loss": 2.377, + "step": 1470 + }, + { + "epoch": 0.5157899347378564, + "grad_norm": 0.39763742685317993, + "learning_rate": 5.15931238599185e-05, + "loss": 2.4304, + "step": 1472 + }, + { + "epoch": 0.5164907362796198, + "grad_norm": 0.44861507415771484, + "learning_rate": 5.147729535621388e-05, + "loss": 2.3918, + "step": 1474 + }, + { + "epoch": 0.5171915378213832, + "grad_norm": 0.44281211495399475, + "learning_rate": 5.136145891712721e-05, + "loss": 2.4509, + "step": 1476 + }, + { + "epoch": 0.5178923393631466, + "grad_norm": 0.3945384919643402, + "learning_rate": 5.1245615164881025e-05, + "loss": 2.3719, + "step": 1478 + }, + { + "epoch": 0.51859314090491, + "grad_norm": 0.4102880358695984, + "learning_rate": 5.112976472173706e-05, + "loss": 2.4198, + "step": 1480 + }, + { + "epoch": 0.5192939424466734, + "grad_norm": 0.3924848735332489, + "learning_rate": 5.1013908209993045e-05, + "loss": 2.3773, + "step": 1482 + }, + { + "epoch": 0.5199947439884368, + "grad_norm": 0.38885822892189026, + "learning_rate": 5.089804625197929e-05, + "loss": 2.3925, + "step": 1484 + }, + { + "epoch": 0.5206955455302001, + "grad_norm": 0.38488003611564636, + "learning_rate": 5.078217947005537e-05, + "loss": 2.403, + "step": 1486 + }, + { + "epoch": 0.5213963470719636, + "grad_norm": 0.43979960680007935, + "learning_rate": 5.066630848660676e-05, + "loss": 2.424, + "step": 1488 + }, + { + "epoch": 0.5220971486137269, + "grad_norm": 0.40183478593826294, + "learning_rate": 5.055043392404151e-05, + "loss": 2.4164, + "step": 1490 + }, + { + "epoch": 0.5227979501554904, + "grad_norm": 0.36646613478660583, + "learning_rate": 5.0434556404786894e-05, + "loss": 2.4044, + "step": 1492 + }, + { + "epoch": 0.5234987516972537, + "grad_norm": 0.3749536871910095, + "learning_rate": 5.031867655128606e-05, + "loss": 2.3571, + "step": 1494 + }, + { + "epoch": 0.5241995532390171, + "grad_norm": 0.36778751015663147, + "learning_rate": 5.0202794985994716e-05, + "loss": 2.4313, + "step": 1496 + }, + { + "epoch": 0.5249003547807806, + "grad_norm": 0.3651956021785736, + "learning_rate": 5.0086912331377743e-05, + "loss": 2.3786, + "step": 1498 + }, + { + "epoch": 0.5256011563225439, + "grad_norm": 0.3685001730918884, + "learning_rate": 4.997102920990589e-05, + "loss": 2.3805, + "step": 1500 + } + ], + "logging_steps": 2, + "max_steps": 2854, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.688775168098304e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}