{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24596464258262873, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.198821419420958e-05, "grad_norm": 3.1852145195007324, "learning_rate": 0.0, "loss": 12.119, "step": 1 }, { "epoch": 0.00016397642838841917, "grad_norm": 3.045940399169922, "learning_rate": 1.639344262295082e-07, "loss": 12.1143, "step": 2 }, { "epoch": 0.00032795285677683834, "grad_norm": 3.0180699825286865, "learning_rate": 4.918032786885246e-07, "loss": 12.1146, "step": 4 }, { "epoch": 0.0004919292851652575, "grad_norm": 2.994316816329956, "learning_rate": 8.19672131147541e-07, "loss": 12.1185, "step": 6 }, { "epoch": 0.0006559057135536767, "grad_norm": 3.1152169704437256, "learning_rate": 1.1475409836065575e-06, "loss": 12.1106, "step": 8 }, { "epoch": 0.0008198821419420958, "grad_norm": 3.100684881210327, "learning_rate": 1.4754098360655739e-06, "loss": 12.1086, "step": 10 }, { "epoch": 0.000983858570330515, "grad_norm": 2.950772762298584, "learning_rate": 1.8032786885245903e-06, "loss": 12.102, "step": 12 }, { "epoch": 0.001147834998718934, "grad_norm": 2.897174835205078, "learning_rate": 2.1311475409836067e-06, "loss": 12.0903, "step": 14 }, { "epoch": 0.0013118114271073534, "grad_norm": 3.0111382007598877, "learning_rate": 2.459016393442623e-06, "loss": 12.0613, "step": 16 }, { "epoch": 0.0014757878554957724, "grad_norm": 3.11842942237854, "learning_rate": 2.7868852459016396e-06, "loss": 12.0268, "step": 18 }, { "epoch": 0.0016397642838841917, "grad_norm": 3.0497119426727295, "learning_rate": 3.114754098360656e-06, "loss": 12.0057, "step": 20 }, { "epoch": 0.0018037407122726108, "grad_norm": 3.685875177383423, "learning_rate": 3.4426229508196724e-06, "loss": 11.905, "step": 22 }, { "epoch": 0.00196771714066103, "grad_norm": 3.960944175720215, "learning_rate": 3.770491803278689e-06, "loss": 11.8163, "step": 24 }, { "epoch": 0.002131693569049449, "grad_norm": 4.291749000549316, "learning_rate": 4.098360655737704e-06, "loss": 11.7568, "step": 26 }, { "epoch": 0.002295669997437868, "grad_norm": 4.350489616394043, "learning_rate": 4.426229508196722e-06, "loss": 11.6989, "step": 28 }, { "epoch": 0.0024596464258262877, "grad_norm": 4.357988357543945, "learning_rate": 4.754098360655738e-06, "loss": 11.4718, "step": 30 }, { "epoch": 0.0026236228542147067, "grad_norm": 3.87373423576355, "learning_rate": 5.0819672131147545e-06, "loss": 11.2832, "step": 32 }, { "epoch": 0.0027875992826031258, "grad_norm": 3.546539306640625, "learning_rate": 5.409836065573771e-06, "loss": 11.1534, "step": 34 }, { "epoch": 0.002951575710991545, "grad_norm": 3.165611743927002, "learning_rate": 5.737704918032787e-06, "loss": 11.0611, "step": 36 }, { "epoch": 0.0031155521393799643, "grad_norm": 2.898167848587036, "learning_rate": 6.065573770491804e-06, "loss": 10.9593, "step": 38 }, { "epoch": 0.0032795285677683834, "grad_norm": 2.756183385848999, "learning_rate": 6.393442622950819e-06, "loss": 10.8399, "step": 40 }, { "epoch": 0.0034435049961568025, "grad_norm": 2.5953481197357178, "learning_rate": 6.721311475409836e-06, "loss": 10.7619, "step": 42 }, { "epoch": 0.0036074814245452215, "grad_norm": 2.4934237003326416, "learning_rate": 7.049180327868852e-06, "loss": 10.6836, "step": 44 }, { "epoch": 0.0037714578529336406, "grad_norm": 2.4235870838165283, "learning_rate": 7.3770491803278695e-06, "loss": 10.6374, "step": 46 }, { "epoch": 0.00393543428132206, "grad_norm": 2.4326910972595215, "learning_rate": 7.704918032786886e-06, "loss": 10.5798, "step": 48 }, { "epoch": 0.004099410709710479, "grad_norm": 2.3508763313293457, "learning_rate": 8.032786885245902e-06, "loss": 10.5475, "step": 50 }, { "epoch": 0.004263387138098898, "grad_norm": 2.411895513534546, "learning_rate": 8.360655737704919e-06, "loss": 10.4924, "step": 52 }, { "epoch": 0.004427363566487317, "grad_norm": 2.3643436431884766, "learning_rate": 8.688524590163935e-06, "loss": 10.4639, "step": 54 }, { "epoch": 0.004591339994875736, "grad_norm": 2.3757126331329346, "learning_rate": 9.016393442622952e-06, "loss": 10.4338, "step": 56 }, { "epoch": 0.004755316423264155, "grad_norm": 2.3393900394439697, "learning_rate": 9.344262295081968e-06, "loss": 10.3862, "step": 58 }, { "epoch": 0.004919292851652575, "grad_norm": 2.3030688762664795, "learning_rate": 9.672131147540984e-06, "loss": 10.3828, "step": 60 }, { "epoch": 0.005083269280040994, "grad_norm": 2.290266275405884, "learning_rate": 1e-05, "loss": 10.3545, "step": 62 }, { "epoch": 0.005247245708429413, "grad_norm": 2.3040578365325928, "learning_rate": 1.0327868852459017e-05, "loss": 10.3075, "step": 64 }, { "epoch": 0.0054112221368178325, "grad_norm": 2.2848293781280518, "learning_rate": 1.0655737704918032e-05, "loss": 10.2883, "step": 66 }, { "epoch": 0.0055751985652062516, "grad_norm": 2.261911630630493, "learning_rate": 1.0983606557377048e-05, "loss": 10.2328, "step": 68 }, { "epoch": 0.005739174993594671, "grad_norm": 2.2474453449249268, "learning_rate": 1.1311475409836065e-05, "loss": 10.2065, "step": 70 }, { "epoch": 0.00590315142198309, "grad_norm": 2.2612874507904053, "learning_rate": 1.1639344262295083e-05, "loss": 10.1581, "step": 72 }, { "epoch": 0.006067127850371509, "grad_norm": 2.293903112411499, "learning_rate": 1.19672131147541e-05, "loss": 10.0998, "step": 74 }, { "epoch": 0.006231104278759929, "grad_norm": 2.2207071781158447, "learning_rate": 1.2295081967213116e-05, "loss": 10.087, "step": 76 }, { "epoch": 0.006395080707148348, "grad_norm": 2.2724802494049072, "learning_rate": 1.2622950819672132e-05, "loss": 10.006, "step": 78 }, { "epoch": 0.006559057135536767, "grad_norm": 2.18399977684021, "learning_rate": 1.2950819672131149e-05, "loss": 9.9685, "step": 80 }, { "epoch": 0.006723033563925186, "grad_norm": 2.2227847576141357, "learning_rate": 1.3278688524590163e-05, "loss": 9.8784, "step": 82 }, { "epoch": 0.006887009992313605, "grad_norm": 2.275341033935547, "learning_rate": 1.3606557377049181e-05, "loss": 9.7873, "step": 84 }, { "epoch": 0.007050986420702024, "grad_norm": 2.170790195465088, "learning_rate": 1.3934426229508196e-05, "loss": 9.758, "step": 86 }, { "epoch": 0.007214962849090443, "grad_norm": 2.3812711238861084, "learning_rate": 1.4262295081967214e-05, "loss": 9.6872, "step": 88 }, { "epoch": 0.007378939277478862, "grad_norm": 2.2956268787384033, "learning_rate": 1.4590163934426229e-05, "loss": 9.6342, "step": 90 }, { "epoch": 0.007542915705867281, "grad_norm": 4.3424859046936035, "learning_rate": 1.4918032786885247e-05, "loss": 9.599, "step": 92 }, { "epoch": 0.007706892134255701, "grad_norm": 2.303790807723999, "learning_rate": 1.5245901639344262e-05, "loss": 9.4832, "step": 94 }, { "epoch": 0.00787086856264412, "grad_norm": 2.4979960918426514, "learning_rate": 1.557377049180328e-05, "loss": 9.4503, "step": 96 }, { "epoch": 0.008034844991032538, "grad_norm": 1.9802457094192505, "learning_rate": 1.5901639344262295e-05, "loss": 9.3706, "step": 98 }, { "epoch": 0.008198821419420958, "grad_norm": 2.263692855834961, "learning_rate": 1.6229508196721314e-05, "loss": 9.3359, "step": 100 }, { "epoch": 0.008362797847809378, "grad_norm": 2.014167547225952, "learning_rate": 1.6557377049180328e-05, "loss": 9.2529, "step": 102 }, { "epoch": 0.008526774276197796, "grad_norm": 2.3031680583953857, "learning_rate": 1.6885245901639347e-05, "loss": 9.1999, "step": 104 }, { "epoch": 0.008690750704586216, "grad_norm": 1.8865060806274414, "learning_rate": 1.721311475409836e-05, "loss": 9.2385, "step": 106 }, { "epoch": 0.008854727132974635, "grad_norm": 1.783657431602478, "learning_rate": 1.754098360655738e-05, "loss": 9.1613, "step": 108 }, { "epoch": 0.009018703561363054, "grad_norm": 1.7754466533660889, "learning_rate": 1.7868852459016393e-05, "loss": 9.0875, "step": 110 }, { "epoch": 0.009182679989751473, "grad_norm": 2.1157960891723633, "learning_rate": 1.8196721311475413e-05, "loss": 9.0315, "step": 112 }, { "epoch": 0.009346656418139893, "grad_norm": 2.1570520401000977, "learning_rate": 1.8524590163934426e-05, "loss": 8.988, "step": 114 }, { "epoch": 0.00951063284652831, "grad_norm": 2.070383071899414, "learning_rate": 1.8852459016393442e-05, "loss": 8.9084, "step": 116 }, { "epoch": 0.00967460927491673, "grad_norm": 2.004547119140625, "learning_rate": 1.918032786885246e-05, "loss": 8.8827, "step": 118 }, { "epoch": 0.00983858570330515, "grad_norm": 2.282608985900879, "learning_rate": 1.9508196721311475e-05, "loss": 8.8536, "step": 120 }, { "epoch": 0.010002562131693569, "grad_norm": 1.872442364692688, "learning_rate": 1.9836065573770492e-05, "loss": 8.8046, "step": 122 }, { "epoch": 0.010166538560081989, "grad_norm": 2.8452837467193604, "learning_rate": 2.0163934426229508e-05, "loss": 8.7791, "step": 124 }, { "epoch": 0.010330514988470407, "grad_norm": 2.155548095703125, "learning_rate": 2.0491803278688525e-05, "loss": 8.7161, "step": 126 }, { "epoch": 0.010494491416858827, "grad_norm": 2.887465238571167, "learning_rate": 2.081967213114754e-05, "loss": 8.646, "step": 128 }, { "epoch": 0.010658467845247245, "grad_norm": 1.7281243801116943, "learning_rate": 2.114754098360656e-05, "loss": 8.6342, "step": 130 }, { "epoch": 0.010822444273635665, "grad_norm": 2.309556484222412, "learning_rate": 2.1475409836065574e-05, "loss": 8.6055, "step": 132 }, { "epoch": 0.010986420702024083, "grad_norm": 2.6733663082122803, "learning_rate": 2.1803278688524594e-05, "loss": 8.5734, "step": 134 }, { "epoch": 0.011150397130412503, "grad_norm": 1.8595812320709229, "learning_rate": 2.2131147540983607e-05, "loss": 8.5053, "step": 136 }, { "epoch": 0.011314373558800923, "grad_norm": 2.128081798553467, "learning_rate": 2.2459016393442626e-05, "loss": 8.5065, "step": 138 }, { "epoch": 0.011478349987189341, "grad_norm": 2.7606353759765625, "learning_rate": 2.278688524590164e-05, "loss": 8.4438, "step": 140 }, { "epoch": 0.011642326415577761, "grad_norm": 4.279053688049316, "learning_rate": 2.311475409836066e-05, "loss": 8.3871, "step": 142 }, { "epoch": 0.01180630284396618, "grad_norm": 2.8464019298553467, "learning_rate": 2.3442622950819672e-05, "loss": 8.3569, "step": 144 }, { "epoch": 0.0119702792723546, "grad_norm": 1.880401611328125, "learning_rate": 2.377049180327869e-05, "loss": 8.3394, "step": 146 }, { "epoch": 0.012134255700743017, "grad_norm": 1.8325446844100952, "learning_rate": 2.4098360655737705e-05, "loss": 8.2505, "step": 148 }, { "epoch": 0.012298232129131437, "grad_norm": 2.5180068016052246, "learning_rate": 2.442622950819672e-05, "loss": 8.2408, "step": 150 }, { "epoch": 0.012462208557519857, "grad_norm": 1.706740379333496, "learning_rate": 2.4754098360655738e-05, "loss": 8.2324, "step": 152 }, { "epoch": 0.012626184985908276, "grad_norm": 1.7287302017211914, "learning_rate": 2.5081967213114754e-05, "loss": 8.2029, "step": 154 }, { "epoch": 0.012790161414296695, "grad_norm": 2.2318031787872314, "learning_rate": 2.540983606557377e-05, "loss": 8.1472, "step": 156 }, { "epoch": 0.012954137842685114, "grad_norm": 2.371234655380249, "learning_rate": 2.573770491803279e-05, "loss": 8.0911, "step": 158 }, { "epoch": 0.013118114271073534, "grad_norm": 2.0684690475463867, "learning_rate": 2.6065573770491804e-05, "loss": 8.0566, "step": 160 }, { "epoch": 0.013282090699461952, "grad_norm": 2.811659097671509, "learning_rate": 2.639344262295082e-05, "loss": 8.0289, "step": 162 }, { "epoch": 0.013446067127850372, "grad_norm": 1.9651939868927002, "learning_rate": 2.6721311475409837e-05, "loss": 7.9698, "step": 164 }, { "epoch": 0.01361004355623879, "grad_norm": 2.164975166320801, "learning_rate": 2.7049180327868856e-05, "loss": 7.9336, "step": 166 }, { "epoch": 0.01377401998462721, "grad_norm": 1.353088140487671, "learning_rate": 2.737704918032787e-05, "loss": 7.9496, "step": 168 }, { "epoch": 0.01393799641301563, "grad_norm": 1.452012538909912, "learning_rate": 2.7704918032786886e-05, "loss": 7.8609, "step": 170 }, { "epoch": 0.014101972841404048, "grad_norm": 2.3023464679718018, "learning_rate": 2.8032786885245906e-05, "loss": 7.8505, "step": 172 }, { "epoch": 0.014265949269792468, "grad_norm": 4.203457832336426, "learning_rate": 2.8360655737704922e-05, "loss": 7.8213, "step": 174 }, { "epoch": 0.014429925698180886, "grad_norm": 3.246518135070801, "learning_rate": 2.8688524590163935e-05, "loss": 7.7589, "step": 176 }, { "epoch": 0.014593902126569306, "grad_norm": 2.139510154724121, "learning_rate": 2.901639344262295e-05, "loss": 7.8026, "step": 178 }, { "epoch": 0.014757878554957724, "grad_norm": 1.7982274293899536, "learning_rate": 2.934426229508197e-05, "loss": 7.7061, "step": 180 }, { "epoch": 0.014921854983346144, "grad_norm": 2.1394877433776855, "learning_rate": 2.967213114754098e-05, "loss": 7.6849, "step": 182 }, { "epoch": 0.015085831411734562, "grad_norm": 1.9051532745361328, "learning_rate": 3e-05, "loss": 7.6823, "step": 184 }, { "epoch": 0.015249807840122982, "grad_norm": 2.402742862701416, "learning_rate": 3.0327868852459017e-05, "loss": 7.6698, "step": 186 }, { "epoch": 0.015413784268511402, "grad_norm": 2.6151533126831055, "learning_rate": 3.065573770491804e-05, "loss": 7.5872, "step": 188 }, { "epoch": 0.01557776069689982, "grad_norm": 1.7072643041610718, "learning_rate": 3.098360655737705e-05, "loss": 7.5527, "step": 190 }, { "epoch": 0.01574173712528824, "grad_norm": 1.5905566215515137, "learning_rate": 3.131147540983606e-05, "loss": 7.5412, "step": 192 }, { "epoch": 0.01590571355367666, "grad_norm": 2.3824453353881836, "learning_rate": 3.163934426229508e-05, "loss": 7.5453, "step": 194 }, { "epoch": 0.016069689982065077, "grad_norm": 1.4948384761810303, "learning_rate": 3.19672131147541e-05, "loss": 7.4671, "step": 196 }, { "epoch": 0.0162336664104535, "grad_norm": 2.2609403133392334, "learning_rate": 3.2295081967213116e-05, "loss": 7.4387, "step": 198 }, { "epoch": 0.016397642838841917, "grad_norm": 1.8224328756332397, "learning_rate": 3.2622950819672136e-05, "loss": 7.3725, "step": 200 }, { "epoch": 0.016561619267230335, "grad_norm": 1.9965217113494873, "learning_rate": 3.295081967213115e-05, "loss": 7.3654, "step": 202 }, { "epoch": 0.016725595695618756, "grad_norm": 1.1850310564041138, "learning_rate": 3.327868852459017e-05, "loss": 7.3385, "step": 204 }, { "epoch": 0.016889572124007175, "grad_norm": 1.642038106918335, "learning_rate": 3.360655737704918e-05, "loss": 7.3, "step": 206 }, { "epoch": 0.017053548552395593, "grad_norm": 1.9536902904510498, "learning_rate": 3.39344262295082e-05, "loss": 7.273, "step": 208 }, { "epoch": 0.01721752498078401, "grad_norm": 1.829714059829712, "learning_rate": 3.4262295081967214e-05, "loss": 7.2287, "step": 210 }, { "epoch": 0.017381501409172433, "grad_norm": 2.4997904300689697, "learning_rate": 3.459016393442623e-05, "loss": 7.2496, "step": 212 }, { "epoch": 0.01754547783756085, "grad_norm": 3.472687244415283, "learning_rate": 3.491803278688525e-05, "loss": 7.1957, "step": 214 }, { "epoch": 0.01770945426594927, "grad_norm": 3.043635845184326, "learning_rate": 3.524590163934427e-05, "loss": 7.1526, "step": 216 }, { "epoch": 0.01787343069433769, "grad_norm": 3.5498316287994385, "learning_rate": 3.557377049180328e-05, "loss": 7.1433, "step": 218 }, { "epoch": 0.01803740712272611, "grad_norm": 2.9172403812408447, "learning_rate": 3.590163934426229e-05, "loss": 7.1423, "step": 220 }, { "epoch": 0.018201383551114527, "grad_norm": 2.57663893699646, "learning_rate": 3.622950819672131e-05, "loss": 7.0919, "step": 222 }, { "epoch": 0.018365359979502945, "grad_norm": 1.6703250408172607, "learning_rate": 3.655737704918033e-05, "loss": 7.061, "step": 224 }, { "epoch": 0.018529336407891367, "grad_norm": 1.2947953939437866, "learning_rate": 3.6885245901639346e-05, "loss": 7.0452, "step": 226 }, { "epoch": 0.018693312836279785, "grad_norm": 1.5165050029754639, "learning_rate": 3.721311475409836e-05, "loss": 7.0161, "step": 228 }, { "epoch": 0.018857289264668203, "grad_norm": 2.0093023777008057, "learning_rate": 3.754098360655738e-05, "loss": 7.0072, "step": 230 }, { "epoch": 0.01902126569305662, "grad_norm": 2.3759045600891113, "learning_rate": 3.78688524590164e-05, "loss": 6.9464, "step": 232 }, { "epoch": 0.019185242121445043, "grad_norm": 2.2470510005950928, "learning_rate": 3.819672131147541e-05, "loss": 6.9827, "step": 234 }, { "epoch": 0.01934921854983346, "grad_norm": 1.9213268756866455, "learning_rate": 3.8524590163934424e-05, "loss": 6.938, "step": 236 }, { "epoch": 0.01951319497822188, "grad_norm": 1.64090895652771, "learning_rate": 3.8852459016393444e-05, "loss": 6.8792, "step": 238 }, { "epoch": 0.0196771714066103, "grad_norm": 1.7019102573394775, "learning_rate": 3.9180327868852464e-05, "loss": 6.9123, "step": 240 }, { "epoch": 0.01984114783499872, "grad_norm": 1.2651671171188354, "learning_rate": 3.950819672131148e-05, "loss": 6.8878, "step": 242 }, { "epoch": 0.020005124263387138, "grad_norm": 1.6623637676239014, "learning_rate": 3.983606557377049e-05, "loss": 6.8734, "step": 244 }, { "epoch": 0.020169100691775556, "grad_norm": 2.097914695739746, "learning_rate": 4.016393442622951e-05, "loss": 6.8284, "step": 246 }, { "epoch": 0.020333077120163977, "grad_norm": 1.9533648490905762, "learning_rate": 4.049180327868853e-05, "loss": 6.8234, "step": 248 }, { "epoch": 0.020497053548552396, "grad_norm": 3.1760356426239014, "learning_rate": 4.081967213114754e-05, "loss": 6.8269, "step": 250 }, { "epoch": 0.020661029976940814, "grad_norm": 3.1443092823028564, "learning_rate": 4.1147540983606556e-05, "loss": 6.8064, "step": 252 }, { "epoch": 0.020825006405329236, "grad_norm": 2.328242301940918, "learning_rate": 4.1475409836065575e-05, "loss": 6.7787, "step": 254 }, { "epoch": 0.020988982833717654, "grad_norm": 1.6525546312332153, "learning_rate": 4.1803278688524595e-05, "loss": 6.7703, "step": 256 }, { "epoch": 0.021152959262106072, "grad_norm": 1.2344805002212524, "learning_rate": 4.213114754098361e-05, "loss": 6.7074, "step": 258 }, { "epoch": 0.02131693569049449, "grad_norm": 1.625827670097351, "learning_rate": 4.245901639344262e-05, "loss": 6.7281, "step": 260 }, { "epoch": 0.021480912118882912, "grad_norm": 1.2819187641143799, "learning_rate": 4.278688524590164e-05, "loss": 6.7493, "step": 262 }, { "epoch": 0.02164488854727133, "grad_norm": 1.8072625398635864, "learning_rate": 4.311475409836066e-05, "loss": 6.6968, "step": 264 }, { "epoch": 0.021808864975659748, "grad_norm": 1.3799159526824951, "learning_rate": 4.3442622950819674e-05, "loss": 6.688, "step": 266 }, { "epoch": 0.021972841404048166, "grad_norm": 2.2259645462036133, "learning_rate": 4.377049180327869e-05, "loss": 6.6849, "step": 268 }, { "epoch": 0.022136817832436588, "grad_norm": 2.3046491146087646, "learning_rate": 4.409836065573771e-05, "loss": 6.6438, "step": 270 }, { "epoch": 0.022300794260825006, "grad_norm": 2.3048136234283447, "learning_rate": 4.442622950819673e-05, "loss": 6.5669, "step": 272 }, { "epoch": 0.022464770689213424, "grad_norm": 1.427413821220398, "learning_rate": 4.475409836065574e-05, "loss": 6.6016, "step": 274 }, { "epoch": 0.022628747117601846, "grad_norm": 1.9164036512374878, "learning_rate": 4.508196721311476e-05, "loss": 6.5754, "step": 276 }, { "epoch": 0.022792723545990264, "grad_norm": 2.1851377487182617, "learning_rate": 4.540983606557377e-05, "loss": 6.5257, "step": 278 }, { "epoch": 0.022956699974378682, "grad_norm": 1.622302770614624, "learning_rate": 4.5737704918032786e-05, "loss": 6.5406, "step": 280 }, { "epoch": 0.0231206764027671, "grad_norm": 1.7634437084197998, "learning_rate": 4.6065573770491805e-05, "loss": 6.5259, "step": 282 }, { "epoch": 0.023284652831155522, "grad_norm": 2.2173545360565186, "learning_rate": 4.6393442622950825e-05, "loss": 6.5383, "step": 284 }, { "epoch": 0.02344862925954394, "grad_norm": 1.620126724243164, "learning_rate": 4.672131147540984e-05, "loss": 6.5118, "step": 286 }, { "epoch": 0.02361260568793236, "grad_norm": 2.0221054553985596, "learning_rate": 4.704918032786885e-05, "loss": 6.4726, "step": 288 }, { "epoch": 0.02377658211632078, "grad_norm": 1.2866939306259155, "learning_rate": 4.737704918032787e-05, "loss": 6.4646, "step": 290 }, { "epoch": 0.0239405585447092, "grad_norm": 1.9667149782180786, "learning_rate": 4.770491803278689e-05, "loss": 6.4897, "step": 292 }, { "epoch": 0.024104534973097617, "grad_norm": 2.3963539600372314, "learning_rate": 4.8032786885245904e-05, "loss": 6.4017, "step": 294 }, { "epoch": 0.024268511401486035, "grad_norm": 2.0867695808410645, "learning_rate": 4.836065573770492e-05, "loss": 6.4881, "step": 296 }, { "epoch": 0.024432487829874457, "grad_norm": 2.1272687911987305, "learning_rate": 4.868852459016394e-05, "loss": 6.3954, "step": 298 }, { "epoch": 0.024596464258262875, "grad_norm": 1.3982235193252563, "learning_rate": 4.9016393442622957e-05, "loss": 6.4431, "step": 300 }, { "epoch": 0.024760440686651293, "grad_norm": 2.191251039505005, "learning_rate": 4.934426229508197e-05, "loss": 6.4309, "step": 302 }, { "epoch": 0.024924417115039715, "grad_norm": 1.7500178813934326, "learning_rate": 4.967213114754098e-05, "loss": 6.3676, "step": 304 }, { "epoch": 0.025088393543428133, "grad_norm": 1.4814640283584595, "learning_rate": 5e-05, "loss": 6.4043, "step": 306 }, { "epoch": 0.02525236997181655, "grad_norm": 1.549419641494751, "learning_rate": 5.0327868852459015e-05, "loss": 6.3276, "step": 308 }, { "epoch": 0.02541634640020497, "grad_norm": 1.4538007974624634, "learning_rate": 5.0655737704918035e-05, "loss": 6.2641, "step": 310 }, { "epoch": 0.02558032282859339, "grad_norm": 1.4905295372009277, "learning_rate": 5.098360655737705e-05, "loss": 6.3035, "step": 312 }, { "epoch": 0.02574429925698181, "grad_norm": 1.6055713891983032, "learning_rate": 5.131147540983606e-05, "loss": 6.3119, "step": 314 }, { "epoch": 0.025908275685370227, "grad_norm": 1.8923226594924927, "learning_rate": 5.163934426229509e-05, "loss": 6.2762, "step": 316 }, { "epoch": 0.026072252113758645, "grad_norm": 1.6730965375900269, "learning_rate": 5.19672131147541e-05, "loss": 6.262, "step": 318 }, { "epoch": 0.026236228542147067, "grad_norm": 1.7695355415344238, "learning_rate": 5.229508196721312e-05, "loss": 6.2443, "step": 320 }, { "epoch": 0.026400204970535485, "grad_norm": 1.5890907049179077, "learning_rate": 5.2622950819672134e-05, "loss": 6.2874, "step": 322 }, { "epoch": 0.026564181398923904, "grad_norm": 1.351945161819458, "learning_rate": 5.295081967213115e-05, "loss": 6.1962, "step": 324 }, { "epoch": 0.026728157827312325, "grad_norm": 1.8272804021835327, "learning_rate": 5.327868852459017e-05, "loss": 6.2117, "step": 326 }, { "epoch": 0.026892134255700743, "grad_norm": 1.517553448677063, "learning_rate": 5.360655737704918e-05, "loss": 6.226, "step": 328 }, { "epoch": 0.02705611068408916, "grad_norm": 1.2578155994415283, "learning_rate": 5.393442622950819e-05, "loss": 6.1564, "step": 330 }, { "epoch": 0.02722008711247758, "grad_norm": 1.3466731309890747, "learning_rate": 5.426229508196722e-05, "loss": 6.2005, "step": 332 }, { "epoch": 0.027384063540866, "grad_norm": 1.8164156675338745, "learning_rate": 5.459016393442623e-05, "loss": 6.2395, "step": 334 }, { "epoch": 0.02754803996925442, "grad_norm": 1.9037235975265503, "learning_rate": 5.491803278688525e-05, "loss": 6.1536, "step": 336 }, { "epoch": 0.027712016397642838, "grad_norm": 2.0508320331573486, "learning_rate": 5.5245901639344265e-05, "loss": 6.1402, "step": 338 }, { "epoch": 0.02787599282603126, "grad_norm": 2.135850667953491, "learning_rate": 5.557377049180328e-05, "loss": 6.1144, "step": 340 }, { "epoch": 0.028039969254419678, "grad_norm": 1.4268845319747925, "learning_rate": 5.5901639344262305e-05, "loss": 6.0949, "step": 342 }, { "epoch": 0.028203945682808096, "grad_norm": 1.7478644847869873, "learning_rate": 5.622950819672132e-05, "loss": 6.0932, "step": 344 }, { "epoch": 0.028367922111196514, "grad_norm": 1.3711293935775757, "learning_rate": 5.6557377049180324e-05, "loss": 6.0383, "step": 346 }, { "epoch": 0.028531898539584936, "grad_norm": 2.102510929107666, "learning_rate": 5.688524590163935e-05, "loss": 6.151, "step": 348 }, { "epoch": 0.028695874967973354, "grad_norm": 2.1685709953308105, "learning_rate": 5.7213114754098364e-05, "loss": 6.0951, "step": 350 }, { "epoch": 0.028859851396361772, "grad_norm": 1.92462158203125, "learning_rate": 5.754098360655738e-05, "loss": 6.1199, "step": 352 }, { "epoch": 0.029023827824750194, "grad_norm": 2.6841022968292236, "learning_rate": 5.7868852459016396e-05, "loss": 6.0132, "step": 354 }, { "epoch": 0.029187804253138612, "grad_norm": 2.1786410808563232, "learning_rate": 5.819672131147541e-05, "loss": 6.0524, "step": 356 }, { "epoch": 0.02935178068152703, "grad_norm": 2.2823619842529297, "learning_rate": 5.8524590163934436e-05, "loss": 6.0581, "step": 358 }, { "epoch": 0.02951575710991545, "grad_norm": 1.539980173110962, "learning_rate": 5.885245901639345e-05, "loss": 6.0375, "step": 360 }, { "epoch": 0.02967973353830387, "grad_norm": 2.02248215675354, "learning_rate": 5.9180327868852455e-05, "loss": 6.024, "step": 362 }, { "epoch": 0.029843709966692288, "grad_norm": 2.0817408561706543, "learning_rate": 5.950819672131148e-05, "loss": 5.9895, "step": 364 }, { "epoch": 0.030007686395080706, "grad_norm": 2.433933973312378, "learning_rate": 5.9836065573770495e-05, "loss": 5.9542, "step": 366 }, { "epoch": 0.030171662823469125, "grad_norm": 1.9378852844238281, "learning_rate": 6.016393442622951e-05, "loss": 6.0731, "step": 368 }, { "epoch": 0.030335639251857546, "grad_norm": 1.5799286365509033, "learning_rate": 6.049180327868853e-05, "loss": 5.941, "step": 370 }, { "epoch": 0.030499615680245964, "grad_norm": 1.8533333539962769, "learning_rate": 6.081967213114754e-05, "loss": 5.9119, "step": 372 }, { "epoch": 0.030663592108634383, "grad_norm": 1.8558950424194336, "learning_rate": 6.114754098360656e-05, "loss": 5.9879, "step": 374 }, { "epoch": 0.030827568537022804, "grad_norm": 1.987197756767273, "learning_rate": 6.147540983606557e-05, "loss": 5.9686, "step": 376 }, { "epoch": 0.030991544965411223, "grad_norm": 2.317286491394043, "learning_rate": 6.180327868852459e-05, "loss": 5.9557, "step": 378 }, { "epoch": 0.03115552139379964, "grad_norm": 2.142669916152954, "learning_rate": 6.213114754098361e-05, "loss": 5.8988, "step": 380 }, { "epoch": 0.03131949782218806, "grad_norm": 2.495762825012207, "learning_rate": 6.245901639344263e-05, "loss": 5.9131, "step": 382 }, { "epoch": 0.03148347425057648, "grad_norm": 1.6954542398452759, "learning_rate": 6.278688524590164e-05, "loss": 5.9113, "step": 384 }, { "epoch": 0.0316474506789649, "grad_norm": 1.8803491592407227, "learning_rate": 6.311475409836067e-05, "loss": 5.976, "step": 386 }, { "epoch": 0.03181142710735332, "grad_norm": 1.8088006973266602, "learning_rate": 6.344262295081968e-05, "loss": 5.9088, "step": 388 }, { "epoch": 0.031975403535741735, "grad_norm": 1.9314664602279663, "learning_rate": 6.377049180327869e-05, "loss": 5.8998, "step": 390 }, { "epoch": 0.03213937996413015, "grad_norm": 2.3078742027282715, "learning_rate": 6.40983606557377e-05, "loss": 5.9263, "step": 392 }, { "epoch": 0.03230335639251858, "grad_norm": 2.3376059532165527, "learning_rate": 6.442622950819672e-05, "loss": 5.8472, "step": 394 }, { "epoch": 0.032467332820907, "grad_norm": 2.106436252593994, "learning_rate": 6.475409836065574e-05, "loss": 5.8755, "step": 396 }, { "epoch": 0.032631309249295415, "grad_norm": 1.564982533454895, "learning_rate": 6.508196721311476e-05, "loss": 5.8482, "step": 398 }, { "epoch": 0.03279528567768383, "grad_norm": 1.3622092008590698, "learning_rate": 6.540983606557377e-05, "loss": 5.8217, "step": 400 }, { "epoch": 0.03295926210607225, "grad_norm": 2.422950029373169, "learning_rate": 6.57377049180328e-05, "loss": 5.8151, "step": 402 }, { "epoch": 0.03312323853446067, "grad_norm": 2.1505627632141113, "learning_rate": 6.606557377049181e-05, "loss": 5.8411, "step": 404 }, { "epoch": 0.03328721496284909, "grad_norm": 2.70080304145813, "learning_rate": 6.639344262295082e-05, "loss": 5.8123, "step": 406 }, { "epoch": 0.03345119139123751, "grad_norm": 2.5012848377227783, "learning_rate": 6.672131147540984e-05, "loss": 5.8724, "step": 408 }, { "epoch": 0.03361516781962593, "grad_norm": 1.424917459487915, "learning_rate": 6.704918032786885e-05, "loss": 5.7876, "step": 410 }, { "epoch": 0.03377914424801435, "grad_norm": 2.5553343296051025, "learning_rate": 6.737704918032786e-05, "loss": 5.7938, "step": 412 }, { "epoch": 0.03394312067640277, "grad_norm": 1.8178255558013916, "learning_rate": 6.770491803278689e-05, "loss": 5.8053, "step": 414 }, { "epoch": 0.034107097104791186, "grad_norm": 1.838123083114624, "learning_rate": 6.80327868852459e-05, "loss": 5.7617, "step": 416 }, { "epoch": 0.034271073533179604, "grad_norm": 2.1789233684539795, "learning_rate": 6.836065573770493e-05, "loss": 5.7346, "step": 418 }, { "epoch": 0.03443504996156802, "grad_norm": 1.6680387258529663, "learning_rate": 6.868852459016394e-05, "loss": 5.7538, "step": 420 }, { "epoch": 0.03459902638995645, "grad_norm": 1.8677077293395996, "learning_rate": 6.901639344262295e-05, "loss": 5.7391, "step": 422 }, { "epoch": 0.034763002818344865, "grad_norm": 1.6001735925674438, "learning_rate": 6.934426229508197e-05, "loss": 5.7602, "step": 424 }, { "epoch": 0.03492697924673328, "grad_norm": 1.8411906957626343, "learning_rate": 6.967213114754098e-05, "loss": 5.7353, "step": 426 }, { "epoch": 0.0350909556751217, "grad_norm": 1.8191148042678833, "learning_rate": 7e-05, "loss": 5.6915, "step": 428 }, { "epoch": 0.03525493210351012, "grad_norm": 1.5732213258743286, "learning_rate": 7.032786885245902e-05, "loss": 5.7435, "step": 430 }, { "epoch": 0.03541890853189854, "grad_norm": 1.490302562713623, "learning_rate": 7.065573770491803e-05, "loss": 5.7039, "step": 432 }, { "epoch": 0.035582884960286956, "grad_norm": 2.1343276500701904, "learning_rate": 7.098360655737706e-05, "loss": 5.6742, "step": 434 }, { "epoch": 0.03574686138867538, "grad_norm": 2.2556490898132324, "learning_rate": 7.131147540983607e-05, "loss": 5.68, "step": 436 }, { "epoch": 0.0359108378170638, "grad_norm": 1.9686386585235596, "learning_rate": 7.163934426229509e-05, "loss": 5.7184, "step": 438 }, { "epoch": 0.03607481424545222, "grad_norm": 1.8209033012390137, "learning_rate": 7.196721311475411e-05, "loss": 5.6744, "step": 440 }, { "epoch": 0.036238790673840636, "grad_norm": 1.4810411930084229, "learning_rate": 7.229508196721311e-05, "loss": 5.6897, "step": 442 }, { "epoch": 0.036402767102229054, "grad_norm": 1.438828706741333, "learning_rate": 7.262295081967213e-05, "loss": 5.7104, "step": 444 }, { "epoch": 0.03656674353061747, "grad_norm": 1.9264169931411743, "learning_rate": 7.295081967213115e-05, "loss": 5.6648, "step": 446 }, { "epoch": 0.03673071995900589, "grad_norm": 1.5463581085205078, "learning_rate": 7.327868852459016e-05, "loss": 5.6325, "step": 448 }, { "epoch": 0.03689469638739431, "grad_norm": 2.308321237564087, "learning_rate": 7.360655737704918e-05, "loss": 5.6288, "step": 450 }, { "epoch": 0.037058672815782734, "grad_norm": 1.6210638284683228, "learning_rate": 7.39344262295082e-05, "loss": 5.5833, "step": 452 }, { "epoch": 0.03722264924417115, "grad_norm": 1.9419602155685425, "learning_rate": 7.426229508196722e-05, "loss": 5.6552, "step": 454 }, { "epoch": 0.03738662567255957, "grad_norm": 1.5004584789276123, "learning_rate": 7.459016393442624e-05, "loss": 5.5908, "step": 456 }, { "epoch": 0.03755060210094799, "grad_norm": 1.7404072284698486, "learning_rate": 7.491803278688526e-05, "loss": 5.5853, "step": 458 }, { "epoch": 0.03771457852933641, "grad_norm": 2.1047239303588867, "learning_rate": 7.524590163934426e-05, "loss": 5.6346, "step": 460 }, { "epoch": 0.037878554957724825, "grad_norm": 2.2261962890625, "learning_rate": 7.557377049180328e-05, "loss": 5.6325, "step": 462 }, { "epoch": 0.03804253138611324, "grad_norm": 1.9876081943511963, "learning_rate": 7.59016393442623e-05, "loss": 5.5733, "step": 464 }, { "epoch": 0.03820650781450167, "grad_norm": 1.9988337755203247, "learning_rate": 7.622950819672131e-05, "loss": 5.5402, "step": 466 }, { "epoch": 0.038370484242890086, "grad_norm": 1.896393060684204, "learning_rate": 7.655737704918034e-05, "loss": 5.5694, "step": 468 }, { "epoch": 0.038534460671278505, "grad_norm": 1.8517329692840576, "learning_rate": 7.688524590163935e-05, "loss": 5.5517, "step": 470 }, { "epoch": 0.03869843709966692, "grad_norm": 2.0797197818756104, "learning_rate": 7.721311475409836e-05, "loss": 5.5111, "step": 472 }, { "epoch": 0.03886241352805534, "grad_norm": 1.4706847667694092, "learning_rate": 7.754098360655739e-05, "loss": 5.5535, "step": 474 }, { "epoch": 0.03902638995644376, "grad_norm": 1.4342091083526611, "learning_rate": 7.78688524590164e-05, "loss": 5.5338, "step": 476 }, { "epoch": 0.03919036638483218, "grad_norm": 1.520163893699646, "learning_rate": 7.819672131147541e-05, "loss": 5.4999, "step": 478 }, { "epoch": 0.0393543428132206, "grad_norm": 1.479134202003479, "learning_rate": 7.852459016393443e-05, "loss": 5.5316, "step": 480 }, { "epoch": 0.03951831924160902, "grad_norm": 1.9230724573135376, "learning_rate": 7.885245901639344e-05, "loss": 5.4864, "step": 482 }, { "epoch": 0.03968229566999744, "grad_norm": 1.561827301979065, "learning_rate": 7.918032786885247e-05, "loss": 5.5004, "step": 484 }, { "epoch": 0.03984627209838586, "grad_norm": 1.9881266355514526, "learning_rate": 7.950819672131148e-05, "loss": 5.5241, "step": 486 }, { "epoch": 0.040010248526774275, "grad_norm": 2.184860944747925, "learning_rate": 7.98360655737705e-05, "loss": 5.4425, "step": 488 }, { "epoch": 0.04017422495516269, "grad_norm": 1.7443156242370605, "learning_rate": 8.016393442622952e-05, "loss": 5.4779, "step": 490 }, { "epoch": 0.04033820138355111, "grad_norm": 1.4682708978652954, "learning_rate": 8.049180327868853e-05, "loss": 5.5347, "step": 492 }, { "epoch": 0.04050217781193954, "grad_norm": 1.3537533283233643, "learning_rate": 8.081967213114755e-05, "loss": 5.4484, "step": 494 }, { "epoch": 0.040666154240327955, "grad_norm": 2.4647626876831055, "learning_rate": 8.114754098360656e-05, "loss": 5.428, "step": 496 }, { "epoch": 0.04083013066871637, "grad_norm": 2.0095391273498535, "learning_rate": 8.147540983606557e-05, "loss": 5.4425, "step": 498 }, { "epoch": 0.04099410709710479, "grad_norm": 2.309438705444336, "learning_rate": 8.18032786885246e-05, "loss": 5.4524, "step": 500 }, { "epoch": 0.04115808352549321, "grad_norm": 1.7151856422424316, "learning_rate": 8.213114754098361e-05, "loss": 5.4337, "step": 502 }, { "epoch": 0.04132205995388163, "grad_norm": 1.7919552326202393, "learning_rate": 8.245901639344262e-05, "loss": 5.4316, "step": 504 }, { "epoch": 0.041486036382270046, "grad_norm": 2.1312031745910645, "learning_rate": 8.278688524590165e-05, "loss": 5.445, "step": 506 }, { "epoch": 0.04165001281065847, "grad_norm": 1.994307518005371, "learning_rate": 8.311475409836066e-05, "loss": 5.3947, "step": 508 }, { "epoch": 0.04181398923904689, "grad_norm": 1.912011981010437, "learning_rate": 8.344262295081968e-05, "loss": 5.358, "step": 510 }, { "epoch": 0.04197796566743531, "grad_norm": 2.522435188293457, "learning_rate": 8.377049180327869e-05, "loss": 5.4445, "step": 512 }, { "epoch": 0.042141942095823726, "grad_norm": 1.8543167114257812, "learning_rate": 8.40983606557377e-05, "loss": 5.4107, "step": 514 }, { "epoch": 0.042305918524212144, "grad_norm": 2.0634872913360596, "learning_rate": 8.442622950819673e-05, "loss": 5.3435, "step": 516 }, { "epoch": 0.04246989495260056, "grad_norm": 1.563451886177063, "learning_rate": 8.475409836065574e-05, "loss": 5.3955, "step": 518 }, { "epoch": 0.04263387138098898, "grad_norm": 1.305403709411621, "learning_rate": 8.508196721311476e-05, "loss": 5.3148, "step": 520 }, { "epoch": 0.042797847809377405, "grad_norm": 1.9041563272476196, "learning_rate": 8.540983606557378e-05, "loss": 5.3892, "step": 522 }, { "epoch": 0.042961824237765824, "grad_norm": 1.7804834842681885, "learning_rate": 8.57377049180328e-05, "loss": 5.3762, "step": 524 }, { "epoch": 0.04312580066615424, "grad_norm": 1.959104299545288, "learning_rate": 8.606557377049181e-05, "loss": 5.3638, "step": 526 }, { "epoch": 0.04328977709454266, "grad_norm": 2.4005024433135986, "learning_rate": 8.639344262295082e-05, "loss": 5.3913, "step": 528 }, { "epoch": 0.04345375352293108, "grad_norm": 1.8313933610916138, "learning_rate": 8.672131147540983e-05, "loss": 5.3175, "step": 530 }, { "epoch": 0.043617729951319496, "grad_norm": 1.708949327468872, "learning_rate": 8.704918032786885e-05, "loss": 5.3506, "step": 532 }, { "epoch": 0.043781706379707915, "grad_norm": 1.9135140180587769, "learning_rate": 8.737704918032787e-05, "loss": 5.3197, "step": 534 }, { "epoch": 0.04394568280809633, "grad_norm": 2.3220021724700928, "learning_rate": 8.770491803278689e-05, "loss": 5.3148, "step": 536 }, { "epoch": 0.04410965923648476, "grad_norm": 1.6934478282928467, "learning_rate": 8.803278688524591e-05, "loss": 5.3818, "step": 538 }, { "epoch": 0.044273635664873176, "grad_norm": 1.633090615272522, "learning_rate": 8.836065573770493e-05, "loss": 5.2976, "step": 540 }, { "epoch": 0.044437612093261594, "grad_norm": 2.194674253463745, "learning_rate": 8.868852459016394e-05, "loss": 5.2909, "step": 542 }, { "epoch": 0.04460158852165001, "grad_norm": 1.8494622707366943, "learning_rate": 8.901639344262295e-05, "loss": 5.3178, "step": 544 }, { "epoch": 0.04476556495003843, "grad_norm": 1.523157000541687, "learning_rate": 8.934426229508197e-05, "loss": 5.2934, "step": 546 }, { "epoch": 0.04492954137842685, "grad_norm": 1.7002984285354614, "learning_rate": 8.967213114754098e-05, "loss": 5.3107, "step": 548 }, { "epoch": 0.04509351780681527, "grad_norm": 1.9046440124511719, "learning_rate": 9e-05, "loss": 5.3225, "step": 550 }, { "epoch": 0.04525749423520369, "grad_norm": 1.8026628494262695, "learning_rate": 9.032786885245902e-05, "loss": 5.3129, "step": 552 }, { "epoch": 0.04542147066359211, "grad_norm": 1.6067262887954712, "learning_rate": 9.065573770491805e-05, "loss": 5.2214, "step": 554 }, { "epoch": 0.04558544709198053, "grad_norm": 1.669403314590454, "learning_rate": 9.098360655737706e-05, "loss": 5.274, "step": 556 }, { "epoch": 0.04574942352036895, "grad_norm": 1.7327196598052979, "learning_rate": 9.131147540983607e-05, "loss": 5.3007, "step": 558 }, { "epoch": 0.045913399948757365, "grad_norm": 1.4521604776382446, "learning_rate": 9.163934426229508e-05, "loss": 5.2284, "step": 560 }, { "epoch": 0.04607737637714578, "grad_norm": 1.5612112283706665, "learning_rate": 9.19672131147541e-05, "loss": 5.2094, "step": 562 }, { "epoch": 0.0462413528055342, "grad_norm": 2.2565908432006836, "learning_rate": 9.229508196721311e-05, "loss": 5.25, "step": 564 }, { "epoch": 0.046405329233922626, "grad_norm": 2.040969133377075, "learning_rate": 9.262295081967214e-05, "loss": 5.2399, "step": 566 }, { "epoch": 0.046569305662311045, "grad_norm": 1.8394721746444702, "learning_rate": 9.295081967213115e-05, "loss": 5.2433, "step": 568 }, { "epoch": 0.04673328209069946, "grad_norm": 2.2264137268066406, "learning_rate": 9.327868852459016e-05, "loss": 5.1826, "step": 570 }, { "epoch": 0.04689725851908788, "grad_norm": 1.537869930267334, "learning_rate": 9.360655737704919e-05, "loss": 5.2544, "step": 572 }, { "epoch": 0.0470612349474763, "grad_norm": 1.6794589757919312, "learning_rate": 9.39344262295082e-05, "loss": 5.2355, "step": 574 }, { "epoch": 0.04722521137586472, "grad_norm": 2.1024844646453857, "learning_rate": 9.426229508196722e-05, "loss": 5.2308, "step": 576 }, { "epoch": 0.047389187804253136, "grad_norm": 1.6713175773620605, "learning_rate": 9.459016393442623e-05, "loss": 5.1545, "step": 578 }, { "epoch": 0.04755316423264156, "grad_norm": 1.6628456115722656, "learning_rate": 9.491803278688524e-05, "loss": 5.1741, "step": 580 }, { "epoch": 0.04771714066102998, "grad_norm": 1.4492676258087158, "learning_rate": 9.524590163934427e-05, "loss": 5.1566, "step": 582 }, { "epoch": 0.0478811170894184, "grad_norm": 1.794235110282898, "learning_rate": 9.557377049180328e-05, "loss": 5.1699, "step": 584 }, { "epoch": 0.048045093517806815, "grad_norm": 1.934901475906372, "learning_rate": 9.59016393442623e-05, "loss": 5.1533, "step": 586 }, { "epoch": 0.048209069946195234, "grad_norm": 1.2630641460418701, "learning_rate": 9.622950819672132e-05, "loss": 5.1782, "step": 588 }, { "epoch": 0.04837304637458365, "grad_norm": 1.4576668739318848, "learning_rate": 9.655737704918033e-05, "loss": 5.1815, "step": 590 }, { "epoch": 0.04853702280297207, "grad_norm": 1.842677354812622, "learning_rate": 9.688524590163936e-05, "loss": 5.1813, "step": 592 }, { "epoch": 0.048700999231360495, "grad_norm": 1.393120288848877, "learning_rate": 9.721311475409836e-05, "loss": 5.2008, "step": 594 }, { "epoch": 0.04886497565974891, "grad_norm": 1.789939522743225, "learning_rate": 9.754098360655737e-05, "loss": 5.113, "step": 596 }, { "epoch": 0.04902895208813733, "grad_norm": 1.8867571353912354, "learning_rate": 9.78688524590164e-05, "loss": 5.1684, "step": 598 }, { "epoch": 0.04919292851652575, "grad_norm": 1.278130292892456, "learning_rate": 9.819672131147541e-05, "loss": 5.0933, "step": 600 }, { "epoch": 0.04935690494491417, "grad_norm": 1.636001467704773, "learning_rate": 9.852459016393443e-05, "loss": 5.1324, "step": 602 }, { "epoch": 0.049520881373302586, "grad_norm": 1.7511135339736938, "learning_rate": 9.885245901639345e-05, "loss": 5.1855, "step": 604 }, { "epoch": 0.049684857801691004, "grad_norm": 1.5389798879623413, "learning_rate": 9.918032786885247e-05, "loss": 5.0774, "step": 606 }, { "epoch": 0.04984883423007943, "grad_norm": 1.466962218284607, "learning_rate": 9.950819672131148e-05, "loss": 5.0469, "step": 608 }, { "epoch": 0.05001281065846785, "grad_norm": 1.6687493324279785, "learning_rate": 9.98360655737705e-05, "loss": 5.1153, "step": 610 }, { "epoch": 0.050176787086856266, "grad_norm": 2.197819232940674, "learning_rate": 9.999999816220216e-05, "loss": 5.1082, "step": 612 }, { "epoch": 0.050340763515244684, "grad_norm": 1.3717740774154663, "learning_rate": 9.999998345982023e-05, "loss": 5.0569, "step": 614 }, { "epoch": 0.0505047399436331, "grad_norm": 1.4402227401733398, "learning_rate": 9.999995405506069e-05, "loss": 5.047, "step": 616 }, { "epoch": 0.05066871637202152, "grad_norm": 1.6121858358383179, "learning_rate": 9.999990994793218e-05, "loss": 5.1063, "step": 618 }, { "epoch": 0.05083269280040994, "grad_norm": 1.4336620569229126, "learning_rate": 9.999985113844767e-05, "loss": 5.095, "step": 620 }, { "epoch": 0.050996669228798364, "grad_norm": 1.3124680519104004, "learning_rate": 9.999977762662447e-05, "loss": 5.0633, "step": 622 }, { "epoch": 0.05116064565718678, "grad_norm": 1.4217371940612793, "learning_rate": 9.999968941248419e-05, "loss": 5.0547, "step": 624 }, { "epoch": 0.0513246220855752, "grad_norm": 1.182154655456543, "learning_rate": 9.999958649605275e-05, "loss": 5.1194, "step": 626 }, { "epoch": 0.05148859851396362, "grad_norm": 1.8198816776275635, "learning_rate": 9.999946887736043e-05, "loss": 5.0367, "step": 628 }, { "epoch": 0.051652574942352036, "grad_norm": 1.8673418760299683, "learning_rate": 9.99993365564418e-05, "loss": 5.0187, "step": 630 }, { "epoch": 0.051816551370740455, "grad_norm": 1.7428374290466309, "learning_rate": 9.99991895333358e-05, "loss": 5.0675, "step": 632 }, { "epoch": 0.05198052779912887, "grad_norm": 1.8670332431793213, "learning_rate": 9.999902780808563e-05, "loss": 5.0166, "step": 634 }, { "epoch": 0.05214450422751729, "grad_norm": 1.2812050580978394, "learning_rate": 9.999885138073886e-05, "loss": 5.059, "step": 636 }, { "epoch": 0.052308480655905716, "grad_norm": 1.7798051834106445, "learning_rate": 9.999866025134737e-05, "loss": 5.0736, "step": 638 }, { "epoch": 0.052472457084294134, "grad_norm": 1.9266560077667236, "learning_rate": 9.999845441996734e-05, "loss": 4.9463, "step": 640 }, { "epoch": 0.05263643351268255, "grad_norm": 1.7603977918624878, "learning_rate": 9.999823388665932e-05, "loss": 4.9446, "step": 642 }, { "epoch": 0.05280040994107097, "grad_norm": 1.383430004119873, "learning_rate": 9.999799865148816e-05, "loss": 5.0142, "step": 644 }, { "epoch": 0.05296438636945939, "grad_norm": 1.325101613998413, "learning_rate": 9.9997748714523e-05, "loss": 4.9583, "step": 646 }, { "epoch": 0.05312836279784781, "grad_norm": 1.3705639839172363, "learning_rate": 9.999748407583736e-05, "loss": 4.9762, "step": 648 }, { "epoch": 0.053292339226236225, "grad_norm": 1.5312895774841309, "learning_rate": 9.999720473550905e-05, "loss": 4.9743, "step": 650 }, { "epoch": 0.05345631565462465, "grad_norm": 1.5449235439300537, "learning_rate": 9.999691069362019e-05, "loss": 5.0046, "step": 652 }, { "epoch": 0.05362029208301307, "grad_norm": 1.4353389739990234, "learning_rate": 9.999660195025727e-05, "loss": 4.9794, "step": 654 }, { "epoch": 0.05378426851140149, "grad_norm": 1.7781524658203125, "learning_rate": 9.999627850551108e-05, "loss": 5.0089, "step": 656 }, { "epoch": 0.053948244939789905, "grad_norm": 1.5605298280715942, "learning_rate": 9.999594035947668e-05, "loss": 4.9213, "step": 658 }, { "epoch": 0.05411222136817832, "grad_norm": 1.1541820764541626, "learning_rate": 9.999558751225355e-05, "loss": 4.9311, "step": 660 }, { "epoch": 0.05427619779656674, "grad_norm": 1.2745718955993652, "learning_rate": 9.999521996394544e-05, "loss": 4.9893, "step": 662 }, { "epoch": 0.05444017422495516, "grad_norm": 1.3617721796035767, "learning_rate": 9.999483771466041e-05, "loss": 4.8475, "step": 664 }, { "epoch": 0.054604150653343585, "grad_norm": 1.6262531280517578, "learning_rate": 9.999444076451086e-05, "loss": 4.944, "step": 666 }, { "epoch": 0.054768127081732, "grad_norm": 1.830307960510254, "learning_rate": 9.99940291136135e-05, "loss": 4.9142, "step": 668 }, { "epoch": 0.05493210351012042, "grad_norm": 1.6637526750564575, "learning_rate": 9.999360276208942e-05, "loss": 4.9335, "step": 670 }, { "epoch": 0.05509607993850884, "grad_norm": 1.5176993608474731, "learning_rate": 9.999316171006395e-05, "loss": 4.8994, "step": 672 }, { "epoch": 0.05526005636689726, "grad_norm": 1.4975998401641846, "learning_rate": 9.999270595766677e-05, "loss": 4.9188, "step": 674 }, { "epoch": 0.055424032795285676, "grad_norm": 1.371583342552185, "learning_rate": 9.999223550503191e-05, "loss": 4.9602, "step": 676 }, { "epoch": 0.055588009223674094, "grad_norm": 1.2749911546707153, "learning_rate": 9.999175035229774e-05, "loss": 4.8472, "step": 678 }, { "epoch": 0.05575198565206252, "grad_norm": 1.6738046407699585, "learning_rate": 9.999125049960687e-05, "loss": 4.8499, "step": 680 }, { "epoch": 0.05591596208045094, "grad_norm": 1.3173338174819946, "learning_rate": 9.999073594710629e-05, "loss": 4.8861, "step": 682 }, { "epoch": 0.056079938508839355, "grad_norm": 1.332027792930603, "learning_rate": 9.999020669494731e-05, "loss": 4.9713, "step": 684 }, { "epoch": 0.056243914937227774, "grad_norm": 1.3671154975891113, "learning_rate": 9.998966274328557e-05, "loss": 4.888, "step": 686 }, { "epoch": 0.05640789136561619, "grad_norm": 1.5266999006271362, "learning_rate": 9.998910409228097e-05, "loss": 4.891, "step": 688 }, { "epoch": 0.05657186779400461, "grad_norm": 1.5487200021743774, "learning_rate": 9.998853074209785e-05, "loss": 4.8246, "step": 690 }, { "epoch": 0.05673584422239303, "grad_norm": 1.2039135694503784, "learning_rate": 9.998794269290474e-05, "loss": 4.837, "step": 692 }, { "epoch": 0.05689982065078145, "grad_norm": 1.6249526739120483, "learning_rate": 9.998733994487458e-05, "loss": 4.8498, "step": 694 }, { "epoch": 0.05706379707916987, "grad_norm": 1.849284291267395, "learning_rate": 9.998672249818461e-05, "loss": 4.8735, "step": 696 }, { "epoch": 0.05722777350755829, "grad_norm": 1.22111976146698, "learning_rate": 9.998609035301638e-05, "loss": 4.8487, "step": 698 }, { "epoch": 0.05739174993594671, "grad_norm": 1.4855297803878784, "learning_rate": 9.998544350955578e-05, "loss": 4.8222, "step": 700 }, { "epoch": 0.057555726364335126, "grad_norm": 1.0973803997039795, "learning_rate": 9.998478196799301e-05, "loss": 4.8494, "step": 702 }, { "epoch": 0.057719702792723544, "grad_norm": 1.3077099323272705, "learning_rate": 9.998410572852259e-05, "loss": 4.9111, "step": 704 }, { "epoch": 0.05788367922111196, "grad_norm": 1.3853782415390015, "learning_rate": 9.998341479134337e-05, "loss": 4.8096, "step": 706 }, { "epoch": 0.05804765564950039, "grad_norm": 1.0545806884765625, "learning_rate": 9.998270915665852e-05, "loss": 4.8357, "step": 708 }, { "epoch": 0.058211632077888806, "grad_norm": 1.1127121448516846, "learning_rate": 9.998198882467552e-05, "loss": 4.7969, "step": 710 }, { "epoch": 0.058375608506277224, "grad_norm": 1.3986823558807373, "learning_rate": 9.998125379560618e-05, "loss": 4.851, "step": 712 }, { "epoch": 0.05853958493466564, "grad_norm": 1.2785799503326416, "learning_rate": 9.998050406966668e-05, "loss": 4.7953, "step": 714 }, { "epoch": 0.05870356136305406, "grad_norm": 1.2151364088058472, "learning_rate": 9.99797396470774e-05, "loss": 4.7614, "step": 716 }, { "epoch": 0.05886753779144248, "grad_norm": 1.221731424331665, "learning_rate": 9.997896052806319e-05, "loss": 4.7832, "step": 718 }, { "epoch": 0.0590315142198309, "grad_norm": 1.2028709650039673, "learning_rate": 9.99781667128531e-05, "loss": 4.7496, "step": 720 }, { "epoch": 0.05919549064821932, "grad_norm": 1.2175623178482056, "learning_rate": 9.997735820168055e-05, "loss": 4.764, "step": 722 }, { "epoch": 0.05935946707660774, "grad_norm": 1.2626240253448486, "learning_rate": 9.99765349947833e-05, "loss": 4.7384, "step": 724 }, { "epoch": 0.05952344350499616, "grad_norm": 1.1986509561538696, "learning_rate": 9.997569709240339e-05, "loss": 4.771, "step": 726 }, { "epoch": 0.059687419933384576, "grad_norm": 1.3522735834121704, "learning_rate": 9.997484449478724e-05, "loss": 4.8, "step": 728 }, { "epoch": 0.059851396361772995, "grad_norm": 1.4882395267486572, "learning_rate": 9.997397720218553e-05, "loss": 4.7719, "step": 730 }, { "epoch": 0.06001537279016141, "grad_norm": 1.54912531375885, "learning_rate": 9.99730952148533e-05, "loss": 4.796, "step": 732 }, { "epoch": 0.06017934921854983, "grad_norm": 1.442063570022583, "learning_rate": 9.997219853304986e-05, "loss": 4.7315, "step": 734 }, { "epoch": 0.06034332564693825, "grad_norm": 1.193917155265808, "learning_rate": 9.997128715703892e-05, "loss": 4.7235, "step": 736 }, { "epoch": 0.060507302075326674, "grad_norm": 1.3164023160934448, "learning_rate": 9.997036108708843e-05, "loss": 4.7216, "step": 738 }, { "epoch": 0.06067127850371509, "grad_norm": 1.4200711250305176, "learning_rate": 9.996942032347074e-05, "loss": 4.7607, "step": 740 }, { "epoch": 0.06083525493210351, "grad_norm": 1.6272212266921997, "learning_rate": 9.996846486646245e-05, "loss": 4.7334, "step": 742 }, { "epoch": 0.06099923136049193, "grad_norm": 1.371127963066101, "learning_rate": 9.996749471634452e-05, "loss": 4.7856, "step": 744 }, { "epoch": 0.06116320778888035, "grad_norm": 1.3041075468063354, "learning_rate": 9.996650987340222e-05, "loss": 4.7614, "step": 746 }, { "epoch": 0.061327184217268765, "grad_norm": 1.3496429920196533, "learning_rate": 9.996551033792514e-05, "loss": 4.7778, "step": 748 }, { "epoch": 0.061491160645657184, "grad_norm": 1.433722734451294, "learning_rate": 9.996449611020719e-05, "loss": 4.7239, "step": 750 }, { "epoch": 0.06165513707404561, "grad_norm": 1.6855007410049438, "learning_rate": 9.996346719054659e-05, "loss": 4.8127, "step": 752 }, { "epoch": 0.06181911350243403, "grad_norm": 1.6095404624938965, "learning_rate": 9.996242357924591e-05, "loss": 4.718, "step": 754 }, { "epoch": 0.061983089930822445, "grad_norm": 1.3969067335128784, "learning_rate": 9.996136527661202e-05, "loss": 4.6984, "step": 756 }, { "epoch": 0.06214706635921086, "grad_norm": 1.154539704322815, "learning_rate": 9.99602922829561e-05, "loss": 4.7393, "step": 758 }, { "epoch": 0.06231104278759928, "grad_norm": 1.1856812238693237, "learning_rate": 9.995920459859367e-05, "loss": 4.6979, "step": 760 }, { "epoch": 0.0624750192159877, "grad_norm": 1.17214834690094, "learning_rate": 9.995810222384454e-05, "loss": 4.6376, "step": 762 }, { "epoch": 0.06263899564437612, "grad_norm": 1.2906792163848877, "learning_rate": 9.995698515903289e-05, "loss": 4.7684, "step": 764 }, { "epoch": 0.06280297207276454, "grad_norm": 1.3379504680633545, "learning_rate": 9.995585340448719e-05, "loss": 4.6261, "step": 766 }, { "epoch": 0.06296694850115296, "grad_norm": 1.4633430242538452, "learning_rate": 9.995470696054021e-05, "loss": 4.6889, "step": 768 }, { "epoch": 0.06313092492954138, "grad_norm": 1.4456816911697388, "learning_rate": 9.995354582752907e-05, "loss": 4.644, "step": 770 }, { "epoch": 0.0632949013579298, "grad_norm": 1.2467187643051147, "learning_rate": 9.995237000579519e-05, "loss": 4.613, "step": 772 }, { "epoch": 0.06345887778631822, "grad_norm": 1.2840498685836792, "learning_rate": 9.995117949568433e-05, "loss": 4.6492, "step": 774 }, { "epoch": 0.06362285421470663, "grad_norm": 0.9967436194419861, "learning_rate": 9.994997429754656e-05, "loss": 4.6299, "step": 776 }, { "epoch": 0.06378683064309505, "grad_norm": 1.2300001382827759, "learning_rate": 9.994875441173623e-05, "loss": 4.6396, "step": 778 }, { "epoch": 0.06395080707148347, "grad_norm": 1.225391149520874, "learning_rate": 9.99475198386121e-05, "loss": 4.6316, "step": 780 }, { "epoch": 0.06411478349987189, "grad_norm": 1.148902177810669, "learning_rate": 9.994627057853714e-05, "loss": 4.6019, "step": 782 }, { "epoch": 0.0642787599282603, "grad_norm": 1.3424532413482666, "learning_rate": 9.994500663187874e-05, "loss": 4.6052, "step": 784 }, { "epoch": 0.06444273635664872, "grad_norm": 1.1050846576690674, "learning_rate": 9.99437279990085e-05, "loss": 4.5606, "step": 786 }, { "epoch": 0.06460671278503716, "grad_norm": 1.2558073997497559, "learning_rate": 9.994243468030247e-05, "loss": 4.5245, "step": 788 }, { "epoch": 0.06477068921342558, "grad_norm": 1.366250991821289, "learning_rate": 9.99411266761409e-05, "loss": 4.6662, "step": 790 }, { "epoch": 0.064934665641814, "grad_norm": 1.0933619737625122, "learning_rate": 9.993980398690843e-05, "loss": 4.5972, "step": 792 }, { "epoch": 0.06509864207020241, "grad_norm": 1.0256333351135254, "learning_rate": 9.993846661299396e-05, "loss": 4.5935, "step": 794 }, { "epoch": 0.06526261849859083, "grad_norm": 0.9090489149093628, "learning_rate": 9.993711455479077e-05, "loss": 4.5371, "step": 796 }, { "epoch": 0.06542659492697925, "grad_norm": 1.3676148653030396, "learning_rate": 9.993574781269644e-05, "loss": 4.5959, "step": 798 }, { "epoch": 0.06559057135536767, "grad_norm": 1.1888647079467773, "learning_rate": 9.993436638711284e-05, "loss": 4.6145, "step": 800 }, { "epoch": 0.06575454778375608, "grad_norm": 0.883764386177063, "learning_rate": 9.993297027844616e-05, "loss": 4.5562, "step": 802 }, { "epoch": 0.0659185242121445, "grad_norm": 0.969134509563446, "learning_rate": 9.993155948710694e-05, "loss": 4.6248, "step": 804 }, { "epoch": 0.06608250064053292, "grad_norm": 0.9472710490226746, "learning_rate": 9.993013401351002e-05, "loss": 4.5769, "step": 806 }, { "epoch": 0.06624647706892134, "grad_norm": 1.162370204925537, "learning_rate": 9.992869385807455e-05, "loss": 4.5947, "step": 808 }, { "epoch": 0.06641045349730976, "grad_norm": 1.0858770608901978, "learning_rate": 9.992723902122403e-05, "loss": 4.5491, "step": 810 }, { "epoch": 0.06657442992569818, "grad_norm": 1.0923309326171875, "learning_rate": 9.992576950338621e-05, "loss": 4.5982, "step": 812 }, { "epoch": 0.0667384063540866, "grad_norm": 1.1544495820999146, "learning_rate": 9.992428530499323e-05, "loss": 4.5675, "step": 814 }, { "epoch": 0.06690238278247503, "grad_norm": 1.1099858283996582, "learning_rate": 9.99227864264815e-05, "loss": 4.5893, "step": 816 }, { "epoch": 0.06706635921086344, "grad_norm": 1.3164221048355103, "learning_rate": 9.992127286829176e-05, "loss": 4.5993, "step": 818 }, { "epoch": 0.06723033563925186, "grad_norm": 1.2815008163452148, "learning_rate": 9.991974463086908e-05, "loss": 4.5687, "step": 820 }, { "epoch": 0.06739431206764028, "grad_norm": 1.104801058769226, "learning_rate": 9.991820171466284e-05, "loss": 4.5231, "step": 822 }, { "epoch": 0.0675582884960287, "grad_norm": 1.2623943090438843, "learning_rate": 9.99166441201267e-05, "loss": 4.5498, "step": 824 }, { "epoch": 0.06772226492441712, "grad_norm": 1.3679825067520142, "learning_rate": 9.991507184771869e-05, "loss": 4.5317, "step": 826 }, { "epoch": 0.06788624135280553, "grad_norm": 1.1458314657211304, "learning_rate": 9.991348489790113e-05, "loss": 4.4599, "step": 828 }, { "epoch": 0.06805021778119395, "grad_norm": 1.1556310653686523, "learning_rate": 9.991188327114068e-05, "loss": 4.5466, "step": 830 }, { "epoch": 0.06821419420958237, "grad_norm": 1.2080873250961304, "learning_rate": 9.991026696790825e-05, "loss": 4.5734, "step": 832 }, { "epoch": 0.06837817063797079, "grad_norm": 1.3832921981811523, "learning_rate": 9.990863598867914e-05, "loss": 4.5367, "step": 834 }, { "epoch": 0.06854214706635921, "grad_norm": 0.9393659234046936, "learning_rate": 9.990699033393293e-05, "loss": 4.5072, "step": 836 }, { "epoch": 0.06870612349474763, "grad_norm": 0.9582691788673401, "learning_rate": 9.990533000415352e-05, "loss": 4.5046, "step": 838 }, { "epoch": 0.06887009992313604, "grad_norm": 1.2170627117156982, "learning_rate": 9.990365499982912e-05, "loss": 4.4628, "step": 840 }, { "epoch": 0.06903407635152446, "grad_norm": 1.037985920906067, "learning_rate": 9.990196532145227e-05, "loss": 4.5521, "step": 842 }, { "epoch": 0.0691980527799129, "grad_norm": 0.9628452658653259, "learning_rate": 9.990026096951981e-05, "loss": 4.5028, "step": 844 }, { "epoch": 0.06936202920830131, "grad_norm": 1.0910757780075073, "learning_rate": 9.98985419445329e-05, "loss": 4.5354, "step": 846 }, { "epoch": 0.06952600563668973, "grad_norm": 1.5108650922775269, "learning_rate": 9.989680824699703e-05, "loss": 4.5267, "step": 848 }, { "epoch": 0.06968998206507815, "grad_norm": 1.214145541191101, "learning_rate": 9.989505987742198e-05, "loss": 4.5271, "step": 850 }, { "epoch": 0.06985395849346657, "grad_norm": 1.2133456468582153, "learning_rate": 9.989329683632185e-05, "loss": 4.5195, "step": 852 }, { "epoch": 0.07001793492185499, "grad_norm": 1.4688955545425415, "learning_rate": 9.989151912421503e-05, "loss": 4.449, "step": 854 }, { "epoch": 0.0701819113502434, "grad_norm": 0.9931148290634155, "learning_rate": 9.988972674162432e-05, "loss": 4.4952, "step": 856 }, { "epoch": 0.07034588777863182, "grad_norm": 1.1149705648422241, "learning_rate": 9.988791968907671e-05, "loss": 4.4773, "step": 858 }, { "epoch": 0.07050986420702024, "grad_norm": 1.125609278678894, "learning_rate": 9.98860979671036e-05, "loss": 4.4371, "step": 860 }, { "epoch": 0.07067384063540866, "grad_norm": 0.985559344291687, "learning_rate": 9.988426157624063e-05, "loss": 4.4348, "step": 862 }, { "epoch": 0.07083781706379708, "grad_norm": 0.937849223613739, "learning_rate": 9.988241051702778e-05, "loss": 4.4481, "step": 864 }, { "epoch": 0.0710017934921855, "grad_norm": 1.4474624395370483, "learning_rate": 9.98805447900094e-05, "loss": 4.5007, "step": 866 }, { "epoch": 0.07116576992057391, "grad_norm": 0.9758108854293823, "learning_rate": 9.987866439573403e-05, "loss": 4.448, "step": 868 }, { "epoch": 0.07132974634896233, "grad_norm": 1.147123098373413, "learning_rate": 9.987676933475467e-05, "loss": 4.3825, "step": 870 }, { "epoch": 0.07149372277735076, "grad_norm": 1.2334039211273193, "learning_rate": 9.98748596076285e-05, "loss": 4.4577, "step": 872 }, { "epoch": 0.07165769920573918, "grad_norm": 1.1643340587615967, "learning_rate": 9.987293521491711e-05, "loss": 4.4511, "step": 874 }, { "epoch": 0.0718216756341276, "grad_norm": 0.9636064767837524, "learning_rate": 9.987099615718634e-05, "loss": 4.4068, "step": 876 }, { "epoch": 0.07198565206251602, "grad_norm": 1.0015913248062134, "learning_rate": 9.986904243500637e-05, "loss": 4.4082, "step": 878 }, { "epoch": 0.07214962849090444, "grad_norm": 1.003480076789856, "learning_rate": 9.98670740489517e-05, "loss": 4.368, "step": 880 }, { "epoch": 0.07231360491929285, "grad_norm": 1.0843322277069092, "learning_rate": 9.98650909996011e-05, "loss": 4.4389, "step": 882 }, { "epoch": 0.07247758134768127, "grad_norm": 1.0302671194076538, "learning_rate": 9.986309328753772e-05, "loss": 4.5192, "step": 884 }, { "epoch": 0.07264155777606969, "grad_norm": 1.0695348978042603, "learning_rate": 9.986108091334896e-05, "loss": 4.4382, "step": 886 }, { "epoch": 0.07280553420445811, "grad_norm": 0.8926975131034851, "learning_rate": 9.985905387762656e-05, "loss": 4.3326, "step": 888 }, { "epoch": 0.07296951063284653, "grad_norm": 0.8886080384254456, "learning_rate": 9.985701218096655e-05, "loss": 4.3415, "step": 890 }, { "epoch": 0.07313348706123494, "grad_norm": 1.0607457160949707, "learning_rate": 9.985495582396931e-05, "loss": 4.3892, "step": 892 }, { "epoch": 0.07329746348962336, "grad_norm": 0.9411015510559082, "learning_rate": 9.985288480723949e-05, "loss": 4.4194, "step": 894 }, { "epoch": 0.07346143991801178, "grad_norm": 1.1800034046173096, "learning_rate": 9.985079913138607e-05, "loss": 4.2992, "step": 896 }, { "epoch": 0.0736254163464002, "grad_norm": 0.9723591804504395, "learning_rate": 9.984869879702235e-05, "loss": 4.3078, "step": 898 }, { "epoch": 0.07378939277478862, "grad_norm": 1.0988435745239258, "learning_rate": 9.98465838047659e-05, "loss": 4.3855, "step": 900 }, { "epoch": 0.07395336920317705, "grad_norm": 1.3330668210983276, "learning_rate": 9.984445415523866e-05, "loss": 4.326, "step": 902 }, { "epoch": 0.07411734563156547, "grad_norm": 1.271883249282837, "learning_rate": 9.984230984906684e-05, "loss": 4.414, "step": 904 }, { "epoch": 0.07428132205995389, "grad_norm": 1.2705031633377075, "learning_rate": 9.984015088688094e-05, "loss": 4.3481, "step": 906 }, { "epoch": 0.0744452984883423, "grad_norm": 1.1692169904708862, "learning_rate": 9.983797726931585e-05, "loss": 4.3656, "step": 908 }, { "epoch": 0.07460927491673072, "grad_norm": 1.557630181312561, "learning_rate": 9.983578899701068e-05, "loss": 4.3926, "step": 910 }, { "epoch": 0.07477325134511914, "grad_norm": 1.252167820930481, "learning_rate": 9.98335860706089e-05, "loss": 4.3261, "step": 912 }, { "epoch": 0.07493722777350756, "grad_norm": 1.0178437232971191, "learning_rate": 9.983136849075827e-05, "loss": 4.3705, "step": 914 }, { "epoch": 0.07510120420189598, "grad_norm": 1.0884320735931396, "learning_rate": 9.982913625811086e-05, "loss": 4.3286, "step": 916 }, { "epoch": 0.0752651806302844, "grad_norm": 1.018403172492981, "learning_rate": 9.982688937332305e-05, "loss": 4.3491, "step": 918 }, { "epoch": 0.07542915705867281, "grad_norm": 1.3070770502090454, "learning_rate": 9.982462783705555e-05, "loss": 4.3412, "step": 920 }, { "epoch": 0.07559313348706123, "grad_norm": 0.8433274030685425, "learning_rate": 9.982235164997336e-05, "loss": 4.3234, "step": 922 }, { "epoch": 0.07575710991544965, "grad_norm": 1.3230133056640625, "learning_rate": 9.982006081274575e-05, "loss": 4.275, "step": 924 }, { "epoch": 0.07592108634383807, "grad_norm": 1.1820060014724731, "learning_rate": 9.981775532604637e-05, "loss": 4.2512, "step": 926 }, { "epoch": 0.07608506277222649, "grad_norm": 1.17715322971344, "learning_rate": 9.981543519055314e-05, "loss": 4.2675, "step": 928 }, { "epoch": 0.07624903920061492, "grad_norm": 0.9106295704841614, "learning_rate": 9.981310040694829e-05, "loss": 4.2771, "step": 930 }, { "epoch": 0.07641301562900334, "grad_norm": 1.2079112529754639, "learning_rate": 9.981075097591834e-05, "loss": 4.3381, "step": 932 }, { "epoch": 0.07657699205739175, "grad_norm": 1.199350118637085, "learning_rate": 9.980838689815414e-05, "loss": 4.3208, "step": 934 }, { "epoch": 0.07674096848578017, "grad_norm": 1.088350534439087, "learning_rate": 9.980600817435086e-05, "loss": 4.2439, "step": 936 }, { "epoch": 0.07690494491416859, "grad_norm": 1.0347201824188232, "learning_rate": 9.980361480520794e-05, "loss": 4.3169, "step": 938 }, { "epoch": 0.07706892134255701, "grad_norm": 1.3007529973983765, "learning_rate": 9.980120679142917e-05, "loss": 4.2441, "step": 940 }, { "epoch": 0.07723289777094543, "grad_norm": 0.9512838125228882, "learning_rate": 9.979878413372259e-05, "loss": 4.2474, "step": 942 }, { "epoch": 0.07739687419933385, "grad_norm": 1.170279622077942, "learning_rate": 9.979634683280059e-05, "loss": 4.2459, "step": 944 }, { "epoch": 0.07756085062772226, "grad_norm": 0.9993996620178223, "learning_rate": 9.979389488937984e-05, "loss": 4.2862, "step": 946 }, { "epoch": 0.07772482705611068, "grad_norm": 0.8914362788200378, "learning_rate": 9.979142830418134e-05, "loss": 4.2872, "step": 948 }, { "epoch": 0.0778888034844991, "grad_norm": 1.153712511062622, "learning_rate": 9.978894707793039e-05, "loss": 4.2023, "step": 950 }, { "epoch": 0.07805277991288752, "grad_norm": 1.2968518733978271, "learning_rate": 9.978645121135659e-05, "loss": 4.1831, "step": 952 }, { "epoch": 0.07821675634127594, "grad_norm": 1.2519747018814087, "learning_rate": 9.978394070519383e-05, "loss": 4.2492, "step": 954 }, { "epoch": 0.07838073276966435, "grad_norm": 1.1906182765960693, "learning_rate": 9.978141556018031e-05, "loss": 4.2596, "step": 956 }, { "epoch": 0.07854470919805279, "grad_norm": 1.073822021484375, "learning_rate": 9.977887577705857e-05, "loss": 4.2341, "step": 958 }, { "epoch": 0.0787086856264412, "grad_norm": 1.1360152959823608, "learning_rate": 9.977632135657543e-05, "loss": 4.2389, "step": 960 }, { "epoch": 0.07887266205482962, "grad_norm": 1.0772216320037842, "learning_rate": 9.977375229948195e-05, "loss": 4.1266, "step": 962 }, { "epoch": 0.07903663848321804, "grad_norm": 1.0052435398101807, "learning_rate": 9.977116860653363e-05, "loss": 4.2679, "step": 964 }, { "epoch": 0.07920061491160646, "grad_norm": 0.8987570405006409, "learning_rate": 9.976857027849019e-05, "loss": 4.2019, "step": 966 }, { "epoch": 0.07936459133999488, "grad_norm": 1.3094909191131592, "learning_rate": 9.97659573161156e-05, "loss": 4.152, "step": 968 }, { "epoch": 0.0795285677683833, "grad_norm": 1.0730571746826172, "learning_rate": 9.976332972017826e-05, "loss": 4.1829, "step": 970 }, { "epoch": 0.07969254419677171, "grad_norm": 1.0100387334823608, "learning_rate": 9.976068749145078e-05, "loss": 4.1619, "step": 972 }, { "epoch": 0.07985652062516013, "grad_norm": 0.9166683554649353, "learning_rate": 9.97580306307101e-05, "loss": 4.2055, "step": 974 }, { "epoch": 0.08002049705354855, "grad_norm": 0.9950259327888489, "learning_rate": 9.975535913873748e-05, "loss": 4.2696, "step": 976 }, { "epoch": 0.08018447348193697, "grad_norm": 1.174676775932312, "learning_rate": 9.975267301631846e-05, "loss": 4.257, "step": 978 }, { "epoch": 0.08034844991032539, "grad_norm": 0.9548665881156921, "learning_rate": 9.974997226424288e-05, "loss": 4.1695, "step": 980 }, { "epoch": 0.0805124263387138, "grad_norm": 1.1128441095352173, "learning_rate": 9.974725688330489e-05, "loss": 4.158, "step": 982 }, { "epoch": 0.08067640276710222, "grad_norm": 1.0748997926712036, "learning_rate": 9.974452687430293e-05, "loss": 4.1652, "step": 984 }, { "epoch": 0.08084037919549064, "grad_norm": 1.1463944911956787, "learning_rate": 9.974178223803981e-05, "loss": 4.1962, "step": 986 }, { "epoch": 0.08100435562387907, "grad_norm": 1.18082857131958, "learning_rate": 9.97390229753225e-05, "loss": 4.2016, "step": 988 }, { "epoch": 0.08116833205226749, "grad_norm": 1.0245002508163452, "learning_rate": 9.973624908696242e-05, "loss": 4.1244, "step": 990 }, { "epoch": 0.08133230848065591, "grad_norm": 1.0269415378570557, "learning_rate": 9.973346057377519e-05, "loss": 4.1954, "step": 992 }, { "epoch": 0.08149628490904433, "grad_norm": 1.3380223512649536, "learning_rate": 9.973065743658078e-05, "loss": 4.1392, "step": 994 }, { "epoch": 0.08166026133743275, "grad_norm": 1.0681672096252441, "learning_rate": 9.972783967620345e-05, "loss": 4.0451, "step": 996 }, { "epoch": 0.08182423776582116, "grad_norm": 1.1791712045669556, "learning_rate": 9.972500729347176e-05, "loss": 4.1513, "step": 998 }, { "epoch": 0.08198821419420958, "grad_norm": 0.9805436134338379, "learning_rate": 9.972216028921854e-05, "loss": 4.0942, "step": 1000 }, { "epoch": 0.082152190622598, "grad_norm": 1.2421460151672363, "learning_rate": 9.971929866428095e-05, "loss": 4.1216, "step": 1002 }, { "epoch": 0.08231616705098642, "grad_norm": 1.085977554321289, "learning_rate": 9.971642241950048e-05, "loss": 4.0897, "step": 1004 }, { "epoch": 0.08248014347937484, "grad_norm": 0.987576425075531, "learning_rate": 9.971353155572284e-05, "loss": 4.1011, "step": 1006 }, { "epoch": 0.08264411990776326, "grad_norm": 1.1075130701065063, "learning_rate": 9.97106260737981e-05, "loss": 4.1546, "step": 1008 }, { "epoch": 0.08280809633615167, "grad_norm": 1.0441356897354126, "learning_rate": 9.97077059745806e-05, "loss": 4.1124, "step": 1010 }, { "epoch": 0.08297207276454009, "grad_norm": 1.0685722827911377, "learning_rate": 9.970477125892902e-05, "loss": 4.1241, "step": 1012 }, { "epoch": 0.08313604919292851, "grad_norm": 0.9862858057022095, "learning_rate": 9.970182192770627e-05, "loss": 4.1194, "step": 1014 }, { "epoch": 0.08330002562131694, "grad_norm": 1.0223233699798584, "learning_rate": 9.96988579817796e-05, "loss": 4.1471, "step": 1016 }, { "epoch": 0.08346400204970536, "grad_norm": 1.0092227458953857, "learning_rate": 9.969587942202057e-05, "loss": 4.141, "step": 1018 }, { "epoch": 0.08362797847809378, "grad_norm": 0.9383386969566345, "learning_rate": 9.9692886249305e-05, "loss": 4.0195, "step": 1020 }, { "epoch": 0.0837919549064822, "grad_norm": 0.7819382548332214, "learning_rate": 9.968987846451305e-05, "loss": 4.0688, "step": 1022 }, { "epoch": 0.08395593133487061, "grad_norm": 0.9499008655548096, "learning_rate": 9.968685606852913e-05, "loss": 4.1559, "step": 1024 }, { "epoch": 0.08411990776325903, "grad_norm": 0.9793714880943298, "learning_rate": 9.968381906224195e-05, "loss": 4.1147, "step": 1026 }, { "epoch": 0.08428388419164745, "grad_norm": 0.8692449927330017, "learning_rate": 9.968076744654458e-05, "loss": 4.0808, "step": 1028 }, { "epoch": 0.08444786062003587, "grad_norm": 1.0884157419204712, "learning_rate": 9.967770122233431e-05, "loss": 4.0656, "step": 1030 }, { "epoch": 0.08461183704842429, "grad_norm": 0.8661439418792725, "learning_rate": 9.967462039051275e-05, "loss": 4.0854, "step": 1032 }, { "epoch": 0.0847758134768127, "grad_norm": 0.8530150651931763, "learning_rate": 9.967152495198584e-05, "loss": 4.0791, "step": 1034 }, { "epoch": 0.08493978990520112, "grad_norm": 1.156949520111084, "learning_rate": 9.966841490766378e-05, "loss": 4.0719, "step": 1036 }, { "epoch": 0.08510376633358954, "grad_norm": 0.9264504313468933, "learning_rate": 9.966529025846105e-05, "loss": 4.0668, "step": 1038 }, { "epoch": 0.08526774276197796, "grad_norm": 1.0428452491760254, "learning_rate": 9.966215100529645e-05, "loss": 4.0053, "step": 1040 }, { "epoch": 0.08543171919036638, "grad_norm": 0.9271348118782043, "learning_rate": 9.96589971490931e-05, "loss": 4.105, "step": 1042 }, { "epoch": 0.08559569561875481, "grad_norm": 1.1432150602340698, "learning_rate": 9.965582869077836e-05, "loss": 4.0669, "step": 1044 }, { "epoch": 0.08575967204714323, "grad_norm": 0.9777700901031494, "learning_rate": 9.965264563128391e-05, "loss": 4.01, "step": 1046 }, { "epoch": 0.08592364847553165, "grad_norm": 0.8779552578926086, "learning_rate": 9.96494479715457e-05, "loss": 3.9549, "step": 1048 }, { "epoch": 0.08608762490392007, "grad_norm": 0.915309488773346, "learning_rate": 9.964623571250404e-05, "loss": 4.0066, "step": 1050 }, { "epoch": 0.08625160133230848, "grad_norm": 0.753326416015625, "learning_rate": 9.964300885510345e-05, "loss": 4.0328, "step": 1052 }, { "epoch": 0.0864155777606969, "grad_norm": 0.8076086044311523, "learning_rate": 9.96397674002928e-05, "loss": 4.0298, "step": 1054 }, { "epoch": 0.08657955418908532, "grad_norm": 1.0535773038864136, "learning_rate": 9.963651134902524e-05, "loss": 4.0164, "step": 1056 }, { "epoch": 0.08674353061747374, "grad_norm": 0.8676068186759949, "learning_rate": 9.963324070225817e-05, "loss": 4.0412, "step": 1058 }, { "epoch": 0.08690750704586216, "grad_norm": 0.9461095333099365, "learning_rate": 9.962995546095333e-05, "loss": 4.0265, "step": 1060 }, { "epoch": 0.08707148347425057, "grad_norm": 0.9060032367706299, "learning_rate": 9.962665562607676e-05, "loss": 4.0104, "step": 1062 }, { "epoch": 0.08723545990263899, "grad_norm": 0.8914903998374939, "learning_rate": 9.962334119859873e-05, "loss": 3.9873, "step": 1064 }, { "epoch": 0.08739943633102741, "grad_norm": 1.057827353477478, "learning_rate": 9.962001217949389e-05, "loss": 4.0135, "step": 1066 }, { "epoch": 0.08756341275941583, "grad_norm": 0.9309613108634949, "learning_rate": 9.961666856974108e-05, "loss": 4.0184, "step": 1068 }, { "epoch": 0.08772738918780425, "grad_norm": 1.0464098453521729, "learning_rate": 9.961331037032351e-05, "loss": 4.0043, "step": 1070 }, { "epoch": 0.08789136561619267, "grad_norm": 0.7158762812614441, "learning_rate": 9.960993758222863e-05, "loss": 4.0443, "step": 1072 }, { "epoch": 0.0880553420445811, "grad_norm": 0.7665286660194397, "learning_rate": 9.960655020644823e-05, "loss": 3.9872, "step": 1074 }, { "epoch": 0.08821931847296952, "grad_norm": 0.8457959890365601, "learning_rate": 9.960314824397833e-05, "loss": 3.9417, "step": 1076 }, { "epoch": 0.08838329490135793, "grad_norm": 1.152944564819336, "learning_rate": 9.959973169581928e-05, "loss": 4.0407, "step": 1078 }, { "epoch": 0.08854727132974635, "grad_norm": 0.9561640620231628, "learning_rate": 9.959630056297573e-05, "loss": 4.0229, "step": 1080 }, { "epoch": 0.08871124775813477, "grad_norm": 0.9881964921951294, "learning_rate": 9.959285484645658e-05, "loss": 3.9769, "step": 1082 }, { "epoch": 0.08887522418652319, "grad_norm": 0.891594409942627, "learning_rate": 9.9589394547275e-05, "loss": 3.9446, "step": 1084 }, { "epoch": 0.0890392006149116, "grad_norm": 0.7694927453994751, "learning_rate": 9.958591966644853e-05, "loss": 3.968, "step": 1086 }, { "epoch": 0.08920317704330002, "grad_norm": 0.9506424069404602, "learning_rate": 9.958243020499893e-05, "loss": 3.976, "step": 1088 }, { "epoch": 0.08936715347168844, "grad_norm": 0.9964757561683655, "learning_rate": 9.95789261639523e-05, "loss": 4.0114, "step": 1090 }, { "epoch": 0.08953112990007686, "grad_norm": 1.0715919733047485, "learning_rate": 9.957540754433894e-05, "loss": 3.9759, "step": 1092 }, { "epoch": 0.08969510632846528, "grad_norm": 0.9044798612594604, "learning_rate": 9.957187434719352e-05, "loss": 3.9858, "step": 1094 }, { "epoch": 0.0898590827568537, "grad_norm": 0.8711757659912109, "learning_rate": 9.956832657355497e-05, "loss": 3.9678, "step": 1096 }, { "epoch": 0.09002305918524212, "grad_norm": 1.049402117729187, "learning_rate": 9.956476422446652e-05, "loss": 3.8973, "step": 1098 }, { "epoch": 0.09018703561363053, "grad_norm": 0.9575179219245911, "learning_rate": 9.956118730097564e-05, "loss": 3.964, "step": 1100 }, { "epoch": 0.09035101204201897, "grad_norm": 1.0238109827041626, "learning_rate": 9.955759580413412e-05, "loss": 3.9312, "step": 1102 }, { "epoch": 0.09051498847040738, "grad_norm": 0.9079989790916443, "learning_rate": 9.955398973499805e-05, "loss": 3.9918, "step": 1104 }, { "epoch": 0.0906789648987958, "grad_norm": 0.9520390033721924, "learning_rate": 9.955036909462777e-05, "loss": 3.9605, "step": 1106 }, { "epoch": 0.09084294132718422, "grad_norm": 0.9960986971855164, "learning_rate": 9.954673388408793e-05, "loss": 3.9898, "step": 1108 }, { "epoch": 0.09100691775557264, "grad_norm": 0.9239450097084045, "learning_rate": 9.954308410444747e-05, "loss": 3.9124, "step": 1110 }, { "epoch": 0.09117089418396106, "grad_norm": 0.8150608539581299, "learning_rate": 9.953941975677954e-05, "loss": 4.0019, "step": 1112 }, { "epoch": 0.09133487061234948, "grad_norm": 0.8617908358573914, "learning_rate": 9.953574084216171e-05, "loss": 3.9295, "step": 1114 }, { "epoch": 0.0914988470407379, "grad_norm": 0.9470566511154175, "learning_rate": 9.953204736167569e-05, "loss": 3.9361, "step": 1116 }, { "epoch": 0.09166282346912631, "grad_norm": 0.8053050637245178, "learning_rate": 9.95283393164076e-05, "loss": 3.952, "step": 1118 }, { "epoch": 0.09182679989751473, "grad_norm": 0.8299336433410645, "learning_rate": 9.952461670744774e-05, "loss": 3.9024, "step": 1120 }, { "epoch": 0.09199077632590315, "grad_norm": 0.8287034630775452, "learning_rate": 9.952087953589073e-05, "loss": 3.8938, "step": 1122 }, { "epoch": 0.09215475275429157, "grad_norm": 0.8874202370643616, "learning_rate": 9.951712780283552e-05, "loss": 3.9419, "step": 1124 }, { "epoch": 0.09231872918267998, "grad_norm": 0.855707585811615, "learning_rate": 9.951336150938526e-05, "loss": 3.876, "step": 1126 }, { "epoch": 0.0924827056110684, "grad_norm": 0.7967925667762756, "learning_rate": 9.950958065664741e-05, "loss": 3.9378, "step": 1128 }, { "epoch": 0.09264668203945683, "grad_norm": 0.7915927171707153, "learning_rate": 9.950578524573377e-05, "loss": 3.8823, "step": 1130 }, { "epoch": 0.09281065846784525, "grad_norm": 0.8065016865730286, "learning_rate": 9.950197527776033e-05, "loss": 3.9223, "step": 1132 }, { "epoch": 0.09297463489623367, "grad_norm": 0.7818952202796936, "learning_rate": 9.949815075384742e-05, "loss": 3.9015, "step": 1134 }, { "epoch": 0.09313861132462209, "grad_norm": 0.9576020240783691, "learning_rate": 9.949431167511963e-05, "loss": 3.9206, "step": 1136 }, { "epoch": 0.09330258775301051, "grad_norm": 0.8579282760620117, "learning_rate": 9.949045804270581e-05, "loss": 3.9195, "step": 1138 }, { "epoch": 0.09346656418139893, "grad_norm": 0.7089054584503174, "learning_rate": 9.948658985773915e-05, "loss": 3.8824, "step": 1140 }, { "epoch": 0.09363054060978734, "grad_norm": 0.7162330150604248, "learning_rate": 9.948270712135705e-05, "loss": 3.8758, "step": 1142 }, { "epoch": 0.09379451703817576, "grad_norm": 1.0738468170166016, "learning_rate": 9.947880983470124e-05, "loss": 3.8408, "step": 1144 }, { "epoch": 0.09395849346656418, "grad_norm": 0.8277477025985718, "learning_rate": 9.947489799891769e-05, "loss": 3.87, "step": 1146 }, { "epoch": 0.0941224698949526, "grad_norm": 0.7950448989868164, "learning_rate": 9.947097161515668e-05, "loss": 3.9011, "step": 1148 }, { "epoch": 0.09428644632334102, "grad_norm": 0.9803164601325989, "learning_rate": 9.946703068457275e-05, "loss": 3.8423, "step": 1150 }, { "epoch": 0.09445042275172943, "grad_norm": 0.9193939566612244, "learning_rate": 9.946307520832472e-05, "loss": 3.892, "step": 1152 }, { "epoch": 0.09461439918011785, "grad_norm": 0.8781881928443909, "learning_rate": 9.94591051875757e-05, "loss": 3.8793, "step": 1154 }, { "epoch": 0.09477837560850627, "grad_norm": 0.7899143695831299, "learning_rate": 9.945512062349304e-05, "loss": 3.8543, "step": 1156 }, { "epoch": 0.0949423520368947, "grad_norm": 0.9870477914810181, "learning_rate": 9.94511215172484e-05, "loss": 3.8322, "step": 1158 }, { "epoch": 0.09510632846528312, "grad_norm": 1.0156104564666748, "learning_rate": 9.944710787001773e-05, "loss": 3.8877, "step": 1160 }, { "epoch": 0.09527030489367154, "grad_norm": 0.9456477165222168, "learning_rate": 9.94430796829812e-05, "loss": 3.873, "step": 1162 }, { "epoch": 0.09543428132205996, "grad_norm": 0.808631956577301, "learning_rate": 9.943903695732333e-05, "loss": 3.8156, "step": 1164 }, { "epoch": 0.09559825775044838, "grad_norm": 0.9766041040420532, "learning_rate": 9.943497969423283e-05, "loss": 3.8912, "step": 1166 }, { "epoch": 0.0957622341788368, "grad_norm": 1.068718671798706, "learning_rate": 9.943090789490276e-05, "loss": 3.8365, "step": 1168 }, { "epoch": 0.09592621060722521, "grad_norm": 0.8382964134216309, "learning_rate": 9.94268215605304e-05, "loss": 3.8484, "step": 1170 }, { "epoch": 0.09609018703561363, "grad_norm": 0.9153487086296082, "learning_rate": 9.942272069231735e-05, "loss": 3.8154, "step": 1172 }, { "epoch": 0.09625416346400205, "grad_norm": 0.8782140016555786, "learning_rate": 9.941860529146944e-05, "loss": 3.8068, "step": 1174 }, { "epoch": 0.09641813989239047, "grad_norm": 1.0472065210342407, "learning_rate": 9.941447535919681e-05, "loss": 3.887, "step": 1176 }, { "epoch": 0.09658211632077889, "grad_norm": 0.8168578743934631, "learning_rate": 9.941033089671385e-05, "loss": 3.8221, "step": 1178 }, { "epoch": 0.0967460927491673, "grad_norm": 0.8000882863998413, "learning_rate": 9.940617190523923e-05, "loss": 3.8425, "step": 1180 }, { "epoch": 0.09691006917755572, "grad_norm": 0.9577187895774841, "learning_rate": 9.940199838599588e-05, "loss": 3.8679, "step": 1182 }, { "epoch": 0.09707404560594414, "grad_norm": 0.8853087425231934, "learning_rate": 9.939781034021105e-05, "loss": 3.8584, "step": 1184 }, { "epoch": 0.09723802203433256, "grad_norm": 0.811404824256897, "learning_rate": 9.939360776911619e-05, "loss": 3.8383, "step": 1186 }, { "epoch": 0.09740199846272099, "grad_norm": 0.811406672000885, "learning_rate": 9.938939067394706e-05, "loss": 3.7581, "step": 1188 }, { "epoch": 0.09756597489110941, "grad_norm": 0.8467538356781006, "learning_rate": 9.93851590559437e-05, "loss": 3.8692, "step": 1190 }, { "epoch": 0.09772995131949783, "grad_norm": 0.8470588326454163, "learning_rate": 9.938091291635039e-05, "loss": 3.8054, "step": 1192 }, { "epoch": 0.09789392774788624, "grad_norm": 0.8332253694534302, "learning_rate": 9.93766522564157e-05, "loss": 3.8541, "step": 1194 }, { "epoch": 0.09805790417627466, "grad_norm": 0.7565471529960632, "learning_rate": 9.93723770773925e-05, "loss": 3.7615, "step": 1196 }, { "epoch": 0.09822188060466308, "grad_norm": 0.9599220156669617, "learning_rate": 9.936808738053785e-05, "loss": 3.8253, "step": 1198 }, { "epoch": 0.0983858570330515, "grad_norm": 0.7333558201789856, "learning_rate": 9.936378316711317e-05, "loss": 3.8042, "step": 1200 }, { "epoch": 0.09854983346143992, "grad_norm": 0.7844712138175964, "learning_rate": 9.935946443838407e-05, "loss": 3.8675, "step": 1202 }, { "epoch": 0.09871380988982834, "grad_norm": 0.9951752424240112, "learning_rate": 9.935513119562045e-05, "loss": 3.8046, "step": 1204 }, { "epoch": 0.09887778631821675, "grad_norm": 0.8403246998786926, "learning_rate": 9.935078344009654e-05, "loss": 3.8651, "step": 1206 }, { "epoch": 0.09904176274660517, "grad_norm": 0.9809087514877319, "learning_rate": 9.934642117309074e-05, "loss": 3.7967, "step": 1208 }, { "epoch": 0.09920573917499359, "grad_norm": 1.024038553237915, "learning_rate": 9.93420443958858e-05, "loss": 3.798, "step": 1210 }, { "epoch": 0.09936971560338201, "grad_norm": 0.8824047446250916, "learning_rate": 9.933765310976867e-05, "loss": 3.7725, "step": 1212 }, { "epoch": 0.09953369203177043, "grad_norm": 0.7645026445388794, "learning_rate": 9.933324731603063e-05, "loss": 3.7802, "step": 1214 }, { "epoch": 0.09969766846015886, "grad_norm": 0.7119176387786865, "learning_rate": 9.932882701596716e-05, "loss": 3.7905, "step": 1216 }, { "epoch": 0.09986164488854728, "grad_norm": 0.6710290908813477, "learning_rate": 9.932439221087806e-05, "loss": 3.7898, "step": 1218 }, { "epoch": 0.1000256213169357, "grad_norm": 0.8256493210792542, "learning_rate": 9.931994290206738e-05, "loss": 3.7857, "step": 1220 }, { "epoch": 0.10018959774532411, "grad_norm": 0.8101679086685181, "learning_rate": 9.931547909084339e-05, "loss": 3.806, "step": 1222 }, { "epoch": 0.10035357417371253, "grad_norm": 0.7876362204551697, "learning_rate": 9.931100077851871e-05, "loss": 3.7395, "step": 1224 }, { "epoch": 0.10051755060210095, "grad_norm": 0.7746016979217529, "learning_rate": 9.930650796641017e-05, "loss": 3.7961, "step": 1226 }, { "epoch": 0.10068152703048937, "grad_norm": 0.6673750877380371, "learning_rate": 9.930200065583883e-05, "loss": 3.7708, "step": 1228 }, { "epoch": 0.10084550345887779, "grad_norm": 0.796775221824646, "learning_rate": 9.92974788481301e-05, "loss": 3.8279, "step": 1230 }, { "epoch": 0.1010094798872662, "grad_norm": 0.725659191608429, "learning_rate": 9.929294254461359e-05, "loss": 3.7343, "step": 1232 }, { "epoch": 0.10117345631565462, "grad_norm": 0.7385995984077454, "learning_rate": 9.928839174662317e-05, "loss": 3.8351, "step": 1234 }, { "epoch": 0.10133743274404304, "grad_norm": 0.7573429346084595, "learning_rate": 9.928382645549703e-05, "loss": 3.8307, "step": 1236 }, { "epoch": 0.10150140917243146, "grad_norm": 0.7082958221435547, "learning_rate": 9.927924667257756e-05, "loss": 3.812, "step": 1238 }, { "epoch": 0.10166538560081988, "grad_norm": 0.819148063659668, "learning_rate": 9.927465239921143e-05, "loss": 3.8176, "step": 1240 }, { "epoch": 0.1018293620292083, "grad_norm": 0.8235107660293579, "learning_rate": 9.927004363674959e-05, "loss": 3.766, "step": 1242 }, { "epoch": 0.10199333845759673, "grad_norm": 0.8283859491348267, "learning_rate": 9.926542038654722e-05, "loss": 3.7771, "step": 1244 }, { "epoch": 0.10215731488598515, "grad_norm": 0.739612340927124, "learning_rate": 9.92607826499638e-05, "loss": 3.7786, "step": 1246 }, { "epoch": 0.10232129131437356, "grad_norm": 0.6946161985397339, "learning_rate": 9.925613042836302e-05, "loss": 3.7143, "step": 1248 }, { "epoch": 0.10248526774276198, "grad_norm": 0.8461303114891052, "learning_rate": 9.925146372311288e-05, "loss": 3.7804, "step": 1250 }, { "epoch": 0.1026492441711504, "grad_norm": 0.8450109958648682, "learning_rate": 9.924678253558557e-05, "loss": 3.7209, "step": 1252 }, { "epoch": 0.10281322059953882, "grad_norm": 0.7622053623199463, "learning_rate": 9.924208686715763e-05, "loss": 3.7491, "step": 1254 }, { "epoch": 0.10297719702792724, "grad_norm": 0.8452515602111816, "learning_rate": 9.923737671920978e-05, "loss": 3.7461, "step": 1256 }, { "epoch": 0.10314117345631565, "grad_norm": 0.7765418887138367, "learning_rate": 9.923265209312704e-05, "loss": 3.7491, "step": 1258 }, { "epoch": 0.10330514988470407, "grad_norm": 0.8474555015563965, "learning_rate": 9.922791299029868e-05, "loss": 3.7277, "step": 1260 }, { "epoch": 0.10346912631309249, "grad_norm": 0.9031925201416016, "learning_rate": 9.922315941211823e-05, "loss": 3.7966, "step": 1262 }, { "epoch": 0.10363310274148091, "grad_norm": 0.7896429300308228, "learning_rate": 9.921839135998343e-05, "loss": 3.7119, "step": 1264 }, { "epoch": 0.10379707916986933, "grad_norm": 0.8127464056015015, "learning_rate": 9.921360883529636e-05, "loss": 3.7126, "step": 1266 }, { "epoch": 0.10396105559825775, "grad_norm": 0.9778748750686646, "learning_rate": 9.920881183946328e-05, "loss": 3.786, "step": 1268 }, { "epoch": 0.10412503202664616, "grad_norm": 0.9540830254554749, "learning_rate": 9.920400037389474e-05, "loss": 3.7243, "step": 1270 }, { "epoch": 0.10428900845503458, "grad_norm": 1.0872488021850586, "learning_rate": 9.919917444000555e-05, "loss": 3.7761, "step": 1272 }, { "epoch": 0.10445298488342301, "grad_norm": 0.8405986428260803, "learning_rate": 9.919433403921476e-05, "loss": 3.7128, "step": 1274 }, { "epoch": 0.10461696131181143, "grad_norm": 0.9105572700500488, "learning_rate": 9.918947917294568e-05, "loss": 3.7153, "step": 1276 }, { "epoch": 0.10478093774019985, "grad_norm": 0.7782844305038452, "learning_rate": 9.918460984262588e-05, "loss": 3.6612, "step": 1278 }, { "epoch": 0.10494491416858827, "grad_norm": 0.746457576751709, "learning_rate": 9.917972604968715e-05, "loss": 3.7196, "step": 1280 }, { "epoch": 0.10510889059697669, "grad_norm": 0.856855034828186, "learning_rate": 9.917482779556557e-05, "loss": 3.6806, "step": 1282 }, { "epoch": 0.1052728670253651, "grad_norm": 0.6793504953384399, "learning_rate": 9.916991508170148e-05, "loss": 3.706, "step": 1284 }, { "epoch": 0.10543684345375352, "grad_norm": 0.9537250995635986, "learning_rate": 9.916498790953943e-05, "loss": 3.7844, "step": 1286 }, { "epoch": 0.10560081988214194, "grad_norm": 0.6688050627708435, "learning_rate": 9.916004628052824e-05, "loss": 3.7161, "step": 1288 }, { "epoch": 0.10576479631053036, "grad_norm": 0.7840797305107117, "learning_rate": 9.9155090196121e-05, "loss": 3.7309, "step": 1290 }, { "epoch": 0.10592877273891878, "grad_norm": 0.7196126580238342, "learning_rate": 9.9150119657775e-05, "loss": 3.7571, "step": 1292 }, { "epoch": 0.1060927491673072, "grad_norm": 0.8072746396064758, "learning_rate": 9.914513466695188e-05, "loss": 3.713, "step": 1294 }, { "epoch": 0.10625672559569561, "grad_norm": 0.7342846393585205, "learning_rate": 9.914013522511743e-05, "loss": 3.7195, "step": 1296 }, { "epoch": 0.10642070202408403, "grad_norm": 0.7047367691993713, "learning_rate": 9.91351213337417e-05, "loss": 3.7306, "step": 1298 }, { "epoch": 0.10658467845247245, "grad_norm": 0.6987332105636597, "learning_rate": 9.913009299429904e-05, "loss": 3.7034, "step": 1300 }, { "epoch": 0.10674865488086088, "grad_norm": 0.6787108778953552, "learning_rate": 9.912505020826801e-05, "loss": 3.7059, "step": 1302 }, { "epoch": 0.1069126313092493, "grad_norm": 0.6666189432144165, "learning_rate": 9.911999297713145e-05, "loss": 3.7006, "step": 1304 }, { "epoch": 0.10707660773763772, "grad_norm": 0.6904592514038086, "learning_rate": 9.91149213023764e-05, "loss": 3.762, "step": 1306 }, { "epoch": 0.10724058416602614, "grad_norm": 0.8941283822059631, "learning_rate": 9.91098351854942e-05, "loss": 3.6793, "step": 1308 }, { "epoch": 0.10740456059441456, "grad_norm": 0.7099062204360962, "learning_rate": 9.910473462798039e-05, "loss": 3.6232, "step": 1310 }, { "epoch": 0.10756853702280297, "grad_norm": 0.8660025596618652, "learning_rate": 9.909961963133479e-05, "loss": 3.7272, "step": 1312 }, { "epoch": 0.10773251345119139, "grad_norm": 0.7533067464828491, "learning_rate": 9.909449019706145e-05, "loss": 3.7422, "step": 1314 }, { "epoch": 0.10789648987957981, "grad_norm": 0.7809666991233826, "learning_rate": 9.908934632666864e-05, "loss": 3.6608, "step": 1316 }, { "epoch": 0.10806046630796823, "grad_norm": 0.7331179976463318, "learning_rate": 9.908418802166894e-05, "loss": 3.6718, "step": 1318 }, { "epoch": 0.10822444273635665, "grad_norm": 0.7965632081031799, "learning_rate": 9.907901528357915e-05, "loss": 3.7616, "step": 1320 }, { "epoch": 0.10838841916474506, "grad_norm": 0.7728394269943237, "learning_rate": 9.907382811392026e-05, "loss": 3.6811, "step": 1322 }, { "epoch": 0.10855239559313348, "grad_norm": 0.7595298290252686, "learning_rate": 9.906862651421756e-05, "loss": 3.7385, "step": 1324 }, { "epoch": 0.1087163720215219, "grad_norm": 0.8519642353057861, "learning_rate": 9.906341048600056e-05, "loss": 3.7245, "step": 1326 }, { "epoch": 0.10888034844991032, "grad_norm": 0.7890890836715698, "learning_rate": 9.905818003080305e-05, "loss": 3.7362, "step": 1328 }, { "epoch": 0.10904432487829875, "grad_norm": 0.784578800201416, "learning_rate": 9.9052935150163e-05, "loss": 3.6611, "step": 1330 }, { "epoch": 0.10920830130668717, "grad_norm": 0.8048536777496338, "learning_rate": 9.904767584562267e-05, "loss": 3.7034, "step": 1332 }, { "epoch": 0.10937227773507559, "grad_norm": 0.7695967555046082, "learning_rate": 9.904240211872855e-05, "loss": 3.6495, "step": 1334 }, { "epoch": 0.109536254163464, "grad_norm": 0.6730368733406067, "learning_rate": 9.903711397103136e-05, "loss": 3.6522, "step": 1336 }, { "epoch": 0.10970023059185242, "grad_norm": 0.7607198357582092, "learning_rate": 9.903181140408609e-05, "loss": 3.6837, "step": 1338 }, { "epoch": 0.10986420702024084, "grad_norm": 0.7482820749282837, "learning_rate": 9.902649441945188e-05, "loss": 3.6851, "step": 1340 }, { "epoch": 0.11002818344862926, "grad_norm": 0.7840356230735779, "learning_rate": 9.902116301869227e-05, "loss": 3.6291, "step": 1342 }, { "epoch": 0.11019215987701768, "grad_norm": 0.6231241822242737, "learning_rate": 9.901581720337488e-05, "loss": 3.6361, "step": 1344 }, { "epoch": 0.1103561363054061, "grad_norm": 0.7832990884780884, "learning_rate": 9.901045697507165e-05, "loss": 3.6948, "step": 1346 }, { "epoch": 0.11052011273379451, "grad_norm": 0.7512111067771912, "learning_rate": 9.900508233535875e-05, "loss": 3.6697, "step": 1348 }, { "epoch": 0.11068408916218293, "grad_norm": 0.7433375716209412, "learning_rate": 9.899969328581659e-05, "loss": 3.7029, "step": 1350 }, { "epoch": 0.11084806559057135, "grad_norm": 0.7757459878921509, "learning_rate": 9.899428982802979e-05, "loss": 3.6965, "step": 1352 }, { "epoch": 0.11101204201895977, "grad_norm": 0.7528740763664246, "learning_rate": 9.898887196358721e-05, "loss": 3.6376, "step": 1354 }, { "epoch": 0.11117601844734819, "grad_norm": 0.7715753316879272, "learning_rate": 9.898343969408199e-05, "loss": 3.6403, "step": 1356 }, { "epoch": 0.11133999487573662, "grad_norm": 0.8425229787826538, "learning_rate": 9.897799302111146e-05, "loss": 3.6655, "step": 1358 }, { "epoch": 0.11150397130412504, "grad_norm": 0.8818288445472717, "learning_rate": 9.897253194627722e-05, "loss": 3.6635, "step": 1360 }, { "epoch": 0.11166794773251346, "grad_norm": 0.9619779586791992, "learning_rate": 9.896705647118504e-05, "loss": 3.6766, "step": 1362 }, { "epoch": 0.11183192416090187, "grad_norm": 0.9253937005996704, "learning_rate": 9.896156659744504e-05, "loss": 3.6859, "step": 1364 }, { "epoch": 0.11199590058929029, "grad_norm": 0.9797042608261108, "learning_rate": 9.895606232667144e-05, "loss": 3.649, "step": 1366 }, { "epoch": 0.11215987701767871, "grad_norm": 0.7821505665779114, "learning_rate": 9.895054366048281e-05, "loss": 3.7164, "step": 1368 }, { "epoch": 0.11232385344606713, "grad_norm": 0.8402411937713623, "learning_rate": 9.894501060050186e-05, "loss": 3.7369, "step": 1370 }, { "epoch": 0.11248782987445555, "grad_norm": 0.9361245036125183, "learning_rate": 9.893946314835559e-05, "loss": 3.6806, "step": 1372 }, { "epoch": 0.11265180630284397, "grad_norm": 0.9412410855293274, "learning_rate": 9.893390130567523e-05, "loss": 3.676, "step": 1374 }, { "epoch": 0.11281578273123238, "grad_norm": 0.8595388531684875, "learning_rate": 9.89283250740962e-05, "loss": 3.6322, "step": 1376 }, { "epoch": 0.1129797591596208, "grad_norm": 0.7236664295196533, "learning_rate": 9.892273445525817e-05, "loss": 3.6686, "step": 1378 }, { "epoch": 0.11314373558800922, "grad_norm": 0.659474790096283, "learning_rate": 9.891712945080508e-05, "loss": 3.6125, "step": 1380 }, { "epoch": 0.11330771201639764, "grad_norm": 0.7330523133277893, "learning_rate": 9.891151006238507e-05, "loss": 3.6654, "step": 1382 }, { "epoch": 0.11347168844478606, "grad_norm": 0.8555669784545898, "learning_rate": 9.890587629165049e-05, "loss": 3.5857, "step": 1384 }, { "epoch": 0.11363566487317447, "grad_norm": 0.9295856952667236, "learning_rate": 9.890022814025792e-05, "loss": 3.6874, "step": 1386 }, { "epoch": 0.1137996413015629, "grad_norm": 0.63069087266922, "learning_rate": 9.889456560986823e-05, "loss": 3.6358, "step": 1388 }, { "epoch": 0.11396361772995132, "grad_norm": 0.7602105140686035, "learning_rate": 9.888888870214643e-05, "loss": 3.6113, "step": 1390 }, { "epoch": 0.11412759415833974, "grad_norm": 0.7809365391731262, "learning_rate": 9.888319741876185e-05, "loss": 3.6585, "step": 1392 }, { "epoch": 0.11429157058672816, "grad_norm": 0.6976439952850342, "learning_rate": 9.887749176138794e-05, "loss": 3.695, "step": 1394 }, { "epoch": 0.11445554701511658, "grad_norm": 0.775341272354126, "learning_rate": 9.887177173170248e-05, "loss": 3.6783, "step": 1396 }, { "epoch": 0.114619523443505, "grad_norm": 0.7208604216575623, "learning_rate": 9.886603733138742e-05, "loss": 3.692, "step": 1398 }, { "epoch": 0.11478349987189342, "grad_norm": 0.7146006226539612, "learning_rate": 9.886028856212893e-05, "loss": 3.6103, "step": 1400 }, { "epoch": 0.11494747630028183, "grad_norm": 0.6759282946586609, "learning_rate": 9.885452542561744e-05, "loss": 3.7273, "step": 1402 }, { "epoch": 0.11511145272867025, "grad_norm": 0.680182158946991, "learning_rate": 9.884874792354758e-05, "loss": 3.6314, "step": 1404 }, { "epoch": 0.11527542915705867, "grad_norm": 0.8232313394546509, "learning_rate": 9.884295605761822e-05, "loss": 3.6098, "step": 1406 }, { "epoch": 0.11543940558544709, "grad_norm": 0.6593087911605835, "learning_rate": 9.883714982953244e-05, "loss": 3.5716, "step": 1408 }, { "epoch": 0.1156033820138355, "grad_norm": 0.6459859013557434, "learning_rate": 9.883132924099753e-05, "loss": 3.6418, "step": 1410 }, { "epoch": 0.11576735844222392, "grad_norm": 0.7395800352096558, "learning_rate": 9.882549429372505e-05, "loss": 3.6148, "step": 1412 }, { "epoch": 0.11593133487061234, "grad_norm": 0.6539946794509888, "learning_rate": 9.881964498943074e-05, "loss": 3.6126, "step": 1414 }, { "epoch": 0.11609531129900078, "grad_norm": 0.7250804901123047, "learning_rate": 9.881378132983456e-05, "loss": 3.5968, "step": 1416 }, { "epoch": 0.1162592877273892, "grad_norm": 0.6665759086608887, "learning_rate": 9.880790331666073e-05, "loss": 3.5718, "step": 1418 }, { "epoch": 0.11642326415577761, "grad_norm": 0.6589260697364807, "learning_rate": 9.880201095163765e-05, "loss": 3.6868, "step": 1420 }, { "epoch": 0.11658724058416603, "grad_norm": 0.7453758716583252, "learning_rate": 9.879610423649795e-05, "loss": 3.5785, "step": 1422 }, { "epoch": 0.11675121701255445, "grad_norm": 0.760637640953064, "learning_rate": 9.879018317297852e-05, "loss": 3.6157, "step": 1424 }, { "epoch": 0.11691519344094287, "grad_norm": 0.7891120910644531, "learning_rate": 9.878424776282039e-05, "loss": 3.6347, "step": 1426 }, { "epoch": 0.11707916986933128, "grad_norm": 0.8159520626068115, "learning_rate": 9.877829800776887e-05, "loss": 3.5827, "step": 1428 }, { "epoch": 0.1172431462977197, "grad_norm": 0.7945658564567566, "learning_rate": 9.877233390957348e-05, "loss": 3.6225, "step": 1430 }, { "epoch": 0.11740712272610812, "grad_norm": 0.6557226777076721, "learning_rate": 9.876635546998795e-05, "loss": 3.5789, "step": 1432 }, { "epoch": 0.11757109915449654, "grad_norm": 0.7214797735214233, "learning_rate": 9.876036269077021e-05, "loss": 3.6572, "step": 1434 }, { "epoch": 0.11773507558288496, "grad_norm": 0.6005678772926331, "learning_rate": 9.875435557368245e-05, "loss": 3.5788, "step": 1436 }, { "epoch": 0.11789905201127338, "grad_norm": 0.7711865901947021, "learning_rate": 9.874833412049102e-05, "loss": 3.6336, "step": 1438 }, { "epoch": 0.1180630284396618, "grad_norm": 0.6728807091712952, "learning_rate": 9.874229833296654e-05, "loss": 3.5508, "step": 1440 }, { "epoch": 0.11822700486805021, "grad_norm": 0.7253196239471436, "learning_rate": 9.873624821288378e-05, "loss": 3.5958, "step": 1442 }, { "epoch": 0.11839098129643864, "grad_norm": 0.7579479813575745, "learning_rate": 9.87301837620218e-05, "loss": 3.5491, "step": 1444 }, { "epoch": 0.11855495772482706, "grad_norm": 0.8953156471252441, "learning_rate": 9.872410498216382e-05, "loss": 3.5745, "step": 1446 }, { "epoch": 0.11871893415321548, "grad_norm": 0.7993676662445068, "learning_rate": 9.87180118750973e-05, "loss": 3.6284, "step": 1448 }, { "epoch": 0.1188829105816039, "grad_norm": 0.7717795968055725, "learning_rate": 9.871190444261391e-05, "loss": 3.6401, "step": 1450 }, { "epoch": 0.11904688700999232, "grad_norm": 0.7042213082313538, "learning_rate": 9.870578268650951e-05, "loss": 3.5621, "step": 1452 }, { "epoch": 0.11921086343838073, "grad_norm": 0.7422952651977539, "learning_rate": 9.86996466085842e-05, "loss": 3.6372, "step": 1454 }, { "epoch": 0.11937483986676915, "grad_norm": 0.6974375247955322, "learning_rate": 9.869349621064228e-05, "loss": 3.6112, "step": 1456 }, { "epoch": 0.11953881629515757, "grad_norm": 0.6447159051895142, "learning_rate": 9.868733149449224e-05, "loss": 3.6014, "step": 1458 }, { "epoch": 0.11970279272354599, "grad_norm": 0.7838313579559326, "learning_rate": 9.868115246194682e-05, "loss": 3.6063, "step": 1460 }, { "epoch": 0.11986676915193441, "grad_norm": 0.7431493997573853, "learning_rate": 9.867495911482295e-05, "loss": 3.6058, "step": 1462 }, { "epoch": 0.12003074558032283, "grad_norm": 0.7936450242996216, "learning_rate": 9.866875145494175e-05, "loss": 3.5997, "step": 1464 }, { "epoch": 0.12019472200871124, "grad_norm": 0.7873966097831726, "learning_rate": 9.866252948412859e-05, "loss": 3.5496, "step": 1466 }, { "epoch": 0.12035869843709966, "grad_norm": 0.6305302381515503, "learning_rate": 9.865629320421301e-05, "loss": 3.5844, "step": 1468 }, { "epoch": 0.12052267486548808, "grad_norm": 0.8464959263801575, "learning_rate": 9.865004261702879e-05, "loss": 3.5736, "step": 1470 }, { "epoch": 0.1206866512938765, "grad_norm": 0.5869840383529663, "learning_rate": 9.86437777244139e-05, "loss": 3.5445, "step": 1472 }, { "epoch": 0.12085062772226493, "grad_norm": 0.680647611618042, "learning_rate": 9.863749852821049e-05, "loss": 3.5203, "step": 1474 }, { "epoch": 0.12101460415065335, "grad_norm": 0.61550372838974, "learning_rate": 9.863120503026497e-05, "loss": 3.5638, "step": 1476 }, { "epoch": 0.12117858057904177, "grad_norm": 0.7552183270454407, "learning_rate": 9.862489723242792e-05, "loss": 3.5792, "step": 1478 }, { "epoch": 0.12134255700743019, "grad_norm": 0.6832086443901062, "learning_rate": 9.861857513655413e-05, "loss": 3.5966, "step": 1480 }, { "epoch": 0.1215065334358186, "grad_norm": 0.7540295124053955, "learning_rate": 9.86122387445026e-05, "loss": 3.6294, "step": 1482 }, { "epoch": 0.12167050986420702, "grad_norm": 0.8079466223716736, "learning_rate": 9.860588805813653e-05, "loss": 3.5425, "step": 1484 }, { "epoch": 0.12183448629259544, "grad_norm": 0.6720893979072571, "learning_rate": 9.859952307932334e-05, "loss": 3.5946, "step": 1486 }, { "epoch": 0.12199846272098386, "grad_norm": 0.7064858078956604, "learning_rate": 9.85931438099346e-05, "loss": 3.5714, "step": 1488 }, { "epoch": 0.12216243914937228, "grad_norm": 0.7637129426002502, "learning_rate": 9.858675025184616e-05, "loss": 3.5547, "step": 1490 }, { "epoch": 0.1223264155777607, "grad_norm": 0.7691531181335449, "learning_rate": 9.8580342406938e-05, "loss": 3.5592, "step": 1492 }, { "epoch": 0.12249039200614911, "grad_norm": 0.8551957607269287, "learning_rate": 9.857392027709435e-05, "loss": 3.5714, "step": 1494 }, { "epoch": 0.12265436843453753, "grad_norm": 0.792309582233429, "learning_rate": 9.856748386420362e-05, "loss": 3.6033, "step": 1496 }, { "epoch": 0.12281834486292595, "grad_norm": 0.6698387861251831, "learning_rate": 9.856103317015841e-05, "loss": 3.5304, "step": 1498 }, { "epoch": 0.12298232129131437, "grad_norm": 0.7881389260292053, "learning_rate": 9.855456819685555e-05, "loss": 3.551, "step": 1500 }, { "epoch": 0.1231462977197028, "grad_norm": 0.7923277616500854, "learning_rate": 9.854808894619602e-05, "loss": 3.5718, "step": 1502 }, { "epoch": 0.12331027414809122, "grad_norm": 0.8148519992828369, "learning_rate": 9.854159542008508e-05, "loss": 3.5866, "step": 1504 }, { "epoch": 0.12347425057647964, "grad_norm": 0.7714492082595825, "learning_rate": 9.853508762043209e-05, "loss": 3.6145, "step": 1506 }, { "epoch": 0.12363822700486805, "grad_norm": 0.8480067253112793, "learning_rate": 9.852856554915066e-05, "loss": 3.5683, "step": 1508 }, { "epoch": 0.12380220343325647, "grad_norm": 0.6881988644599915, "learning_rate": 9.85220292081586e-05, "loss": 3.5086, "step": 1510 }, { "epoch": 0.12396617986164489, "grad_norm": 0.7517293095588684, "learning_rate": 9.85154785993779e-05, "loss": 3.582, "step": 1512 }, { "epoch": 0.12413015629003331, "grad_norm": 1.0973633527755737, "learning_rate": 9.850891372473478e-05, "loss": 3.5674, "step": 1514 }, { "epoch": 0.12429413271842173, "grad_norm": 0.837307870388031, "learning_rate": 9.850233458615957e-05, "loss": 3.6073, "step": 1516 }, { "epoch": 0.12445810914681014, "grad_norm": 0.9713445901870728, "learning_rate": 9.84957411855869e-05, "loss": 3.5698, "step": 1518 }, { "epoch": 0.12462208557519856, "grad_norm": 0.9163139462471008, "learning_rate": 9.848913352495551e-05, "loss": 3.6004, "step": 1520 }, { "epoch": 0.12478606200358698, "grad_norm": 0.7779731154441833, "learning_rate": 9.848251160620839e-05, "loss": 3.6013, "step": 1522 }, { "epoch": 0.1249500384319754, "grad_norm": 0.7217362523078918, "learning_rate": 9.847587543129269e-05, "loss": 3.5181, "step": 1524 }, { "epoch": 0.12511401486036383, "grad_norm": 0.7261420488357544, "learning_rate": 9.846922500215976e-05, "loss": 3.5826, "step": 1526 }, { "epoch": 0.12527799128875225, "grad_norm": 0.6862401962280273, "learning_rate": 9.846256032076515e-05, "loss": 3.4789, "step": 1528 }, { "epoch": 0.12544196771714067, "grad_norm": 0.7996855974197388, "learning_rate": 9.845588138906859e-05, "loss": 3.5581, "step": 1530 }, { "epoch": 0.12560594414552909, "grad_norm": 0.6853513717651367, "learning_rate": 9.8449188209034e-05, "loss": 3.5594, "step": 1532 }, { "epoch": 0.1257699205739175, "grad_norm": 0.7655189037322998, "learning_rate": 9.84424807826295e-05, "loss": 3.5514, "step": 1534 }, { "epoch": 0.12593389700230592, "grad_norm": 0.6501437425613403, "learning_rate": 9.84357591118274e-05, "loss": 3.5318, "step": 1536 }, { "epoch": 0.12609787343069434, "grad_norm": 0.7742712497711182, "learning_rate": 9.842902319860417e-05, "loss": 3.505, "step": 1538 }, { "epoch": 0.12626184985908276, "grad_norm": 0.632228672504425, "learning_rate": 9.842227304494051e-05, "loss": 3.6209, "step": 1540 }, { "epoch": 0.12642582628747118, "grad_norm": 0.5981665253639221, "learning_rate": 9.841550865282128e-05, "loss": 3.5373, "step": 1542 }, { "epoch": 0.1265898027158596, "grad_norm": 0.6225053071975708, "learning_rate": 9.840873002423552e-05, "loss": 3.5161, "step": 1544 }, { "epoch": 0.126753779144248, "grad_norm": 0.6428139209747314, "learning_rate": 9.84019371611765e-05, "loss": 3.5194, "step": 1546 }, { "epoch": 0.12691775557263643, "grad_norm": 0.6379141807556152, "learning_rate": 9.83951300656416e-05, "loss": 3.5363, "step": 1548 }, { "epoch": 0.12708173200102485, "grad_norm": 0.6773483753204346, "learning_rate": 9.838830873963249e-05, "loss": 3.5162, "step": 1550 }, { "epoch": 0.12724570842941327, "grad_norm": 0.6917803883552551, "learning_rate": 9.838147318515491e-05, "loss": 3.5388, "step": 1552 }, { "epoch": 0.12740968485780169, "grad_norm": 0.6757781505584717, "learning_rate": 9.837462340421886e-05, "loss": 3.5399, "step": 1554 }, { "epoch": 0.1275736612861901, "grad_norm": 0.6116536855697632, "learning_rate": 9.836775939883852e-05, "loss": 3.5487, "step": 1556 }, { "epoch": 0.12773763771457852, "grad_norm": 0.6963580250740051, "learning_rate": 9.836088117103222e-05, "loss": 3.5252, "step": 1558 }, { "epoch": 0.12790161414296694, "grad_norm": 0.7524001002311707, "learning_rate": 9.835398872282247e-05, "loss": 3.507, "step": 1560 }, { "epoch": 0.12806559057135536, "grad_norm": 0.6589372754096985, "learning_rate": 9.834708205623599e-05, "loss": 3.5236, "step": 1562 }, { "epoch": 0.12822956699974378, "grad_norm": 0.6432667970657349, "learning_rate": 9.834016117330369e-05, "loss": 3.5065, "step": 1564 }, { "epoch": 0.1283935434281322, "grad_norm": 0.6807281970977783, "learning_rate": 9.833322607606062e-05, "loss": 3.494, "step": 1566 }, { "epoch": 0.1285575198565206, "grad_norm": 0.6932308077812195, "learning_rate": 9.832627676654601e-05, "loss": 3.5196, "step": 1568 }, { "epoch": 0.12872149628490903, "grad_norm": 0.6904752254486084, "learning_rate": 9.831931324680333e-05, "loss": 3.5883, "step": 1570 }, { "epoch": 0.12888547271329745, "grad_norm": 0.7266760468482971, "learning_rate": 9.831233551888015e-05, "loss": 3.5637, "step": 1572 }, { "epoch": 0.1290494491416859, "grad_norm": 0.6184994578361511, "learning_rate": 9.830534358482827e-05, "loss": 3.512, "step": 1574 }, { "epoch": 0.12921342557007431, "grad_norm": 0.6875196695327759, "learning_rate": 9.829833744670366e-05, "loss": 3.5822, "step": 1576 }, { "epoch": 0.12937740199846273, "grad_norm": 0.6475251317024231, "learning_rate": 9.829131710656645e-05, "loss": 3.4807, "step": 1578 }, { "epoch": 0.12954137842685115, "grad_norm": 0.6316696405410767, "learning_rate": 9.828428256648095e-05, "loss": 3.5333, "step": 1580 }, { "epoch": 0.12970535485523957, "grad_norm": 0.6235971450805664, "learning_rate": 9.827723382851565e-05, "loss": 3.4228, "step": 1582 }, { "epoch": 0.129869331283628, "grad_norm": 0.7109240293502808, "learning_rate": 9.827017089474324e-05, "loss": 3.4983, "step": 1584 }, { "epoch": 0.1300333077120164, "grad_norm": 0.7131465673446655, "learning_rate": 9.826309376724052e-05, "loss": 3.5094, "step": 1586 }, { "epoch": 0.13019728414040482, "grad_norm": 0.6761036515235901, "learning_rate": 9.825600244808853e-05, "loss": 3.5461, "step": 1588 }, { "epoch": 0.13036126056879324, "grad_norm": 0.6749283671379089, "learning_rate": 9.824889693937245e-05, "loss": 3.5214, "step": 1590 }, { "epoch": 0.13052523699718166, "grad_norm": 0.952836811542511, "learning_rate": 9.824177724318162e-05, "loss": 3.5135, "step": 1592 }, { "epoch": 0.13068921342557008, "grad_norm": 0.7100101113319397, "learning_rate": 9.823464336160959e-05, "loss": 3.5523, "step": 1594 }, { "epoch": 0.1308531898539585, "grad_norm": 0.7093891501426697, "learning_rate": 9.822749529675406e-05, "loss": 3.5338, "step": 1596 }, { "epoch": 0.13101716628234691, "grad_norm": 0.716699481010437, "learning_rate": 9.822033305071689e-05, "loss": 3.5206, "step": 1598 }, { "epoch": 0.13118114271073533, "grad_norm": 0.6527066230773926, "learning_rate": 9.821315662560415e-05, "loss": 3.5264, "step": 1600 }, { "epoch": 0.13134511913912375, "grad_norm": 0.6964712738990784, "learning_rate": 9.820596602352601e-05, "loss": 3.4957, "step": 1602 }, { "epoch": 0.13150909556751217, "grad_norm": 0.7309548854827881, "learning_rate": 9.819876124659687e-05, "loss": 3.5073, "step": 1604 }, { "epoch": 0.1316730719959006, "grad_norm": 0.7031247019767761, "learning_rate": 9.819154229693529e-05, "loss": 3.4814, "step": 1606 }, { "epoch": 0.131837048424289, "grad_norm": 0.6443150639533997, "learning_rate": 9.818430917666397e-05, "loss": 3.4822, "step": 1608 }, { "epoch": 0.13200102485267742, "grad_norm": 0.6941884160041809, "learning_rate": 9.817706188790979e-05, "loss": 3.4766, "step": 1610 }, { "epoch": 0.13216500128106584, "grad_norm": 0.6277130246162415, "learning_rate": 9.81698004328038e-05, "loss": 3.4984, "step": 1612 }, { "epoch": 0.13232897770945426, "grad_norm": 0.7393566966056824, "learning_rate": 9.816252481348122e-05, "loss": 3.4467, "step": 1614 }, { "epoch": 0.13249295413784268, "grad_norm": 0.8626520037651062, "learning_rate": 9.815523503208141e-05, "loss": 3.5066, "step": 1616 }, { "epoch": 0.1326569305662311, "grad_norm": 0.7005507946014404, "learning_rate": 9.814793109074795e-05, "loss": 3.5249, "step": 1618 }, { "epoch": 0.13282090699461951, "grad_norm": 0.695397675037384, "learning_rate": 9.814061299162853e-05, "loss": 3.5428, "step": 1620 }, { "epoch": 0.13298488342300793, "grad_norm": 0.7181118130683899, "learning_rate": 9.8133280736875e-05, "loss": 3.4943, "step": 1622 }, { "epoch": 0.13314885985139635, "grad_norm": 0.7019610404968262, "learning_rate": 9.812593432864343e-05, "loss": 3.4751, "step": 1624 }, { "epoch": 0.13331283627978477, "grad_norm": 0.629170298576355, "learning_rate": 9.811857376909398e-05, "loss": 3.4791, "step": 1626 }, { "epoch": 0.1334768127081732, "grad_norm": 0.7574000358581543, "learning_rate": 9.8111199060391e-05, "loss": 3.5068, "step": 1628 }, { "epoch": 0.13364078913656163, "grad_norm": 0.6433237195014954, "learning_rate": 9.810381020470303e-05, "loss": 3.5356, "step": 1630 }, { "epoch": 0.13380476556495005, "grad_norm": 0.8434267044067383, "learning_rate": 9.809640720420275e-05, "loss": 3.5549, "step": 1632 }, { "epoch": 0.13396874199333847, "grad_norm": 0.7503165602684021, "learning_rate": 9.808899006106697e-05, "loss": 3.4256, "step": 1634 }, { "epoch": 0.1341327184217269, "grad_norm": 0.6501772403717041, "learning_rate": 9.808155877747671e-05, "loss": 3.4676, "step": 1636 }, { "epoch": 0.1342966948501153, "grad_norm": 0.6620165109634399, "learning_rate": 9.80741133556171e-05, "loss": 3.4372, "step": 1638 }, { "epoch": 0.13446067127850372, "grad_norm": 0.7528221607208252, "learning_rate": 9.806665379767746e-05, "loss": 3.4309, "step": 1640 }, { "epoch": 0.13462464770689214, "grad_norm": 0.6489667892456055, "learning_rate": 9.805918010585124e-05, "loss": 3.4438, "step": 1642 }, { "epoch": 0.13478862413528056, "grad_norm": 0.5966881513595581, "learning_rate": 9.805169228233608e-05, "loss": 3.4465, "step": 1644 }, { "epoch": 0.13495260056366898, "grad_norm": 0.6900391578674316, "learning_rate": 9.804419032933377e-05, "loss": 3.5272, "step": 1646 }, { "epoch": 0.1351165769920574, "grad_norm": 0.7607645988464355, "learning_rate": 9.80366742490502e-05, "loss": 3.4956, "step": 1648 }, { "epoch": 0.13528055342044581, "grad_norm": 0.6069225668907166, "learning_rate": 9.802914404369548e-05, "loss": 3.4641, "step": 1650 }, { "epoch": 0.13544452984883423, "grad_norm": 0.6781127452850342, "learning_rate": 9.802159971548386e-05, "loss": 3.5116, "step": 1652 }, { "epoch": 0.13560850627722265, "grad_norm": 0.5579132437705994, "learning_rate": 9.801404126663372e-05, "loss": 3.5021, "step": 1654 }, { "epoch": 0.13577248270561107, "grad_norm": 0.8519318103790283, "learning_rate": 9.800646869936758e-05, "loss": 3.4645, "step": 1656 }, { "epoch": 0.1359364591339995, "grad_norm": 0.8295395374298096, "learning_rate": 9.799888201591219e-05, "loss": 3.4875, "step": 1658 }, { "epoch": 0.1361004355623879, "grad_norm": 0.7860473990440369, "learning_rate": 9.799128121849835e-05, "loss": 3.5143, "step": 1660 }, { "epoch": 0.13626441199077632, "grad_norm": 0.676199197769165, "learning_rate": 9.798366630936107e-05, "loss": 3.4924, "step": 1662 }, { "epoch": 0.13642838841916474, "grad_norm": 0.7471193075180054, "learning_rate": 9.797603729073949e-05, "loss": 3.4606, "step": 1664 }, { "epoch": 0.13659236484755316, "grad_norm": 0.7911469340324402, "learning_rate": 9.796839416487693e-05, "loss": 3.487, "step": 1666 }, { "epoch": 0.13675634127594158, "grad_norm": 0.7229553461074829, "learning_rate": 9.796073693402081e-05, "loss": 3.5058, "step": 1668 }, { "epoch": 0.13692031770433, "grad_norm": 0.7046807408332825, "learning_rate": 9.795306560042272e-05, "loss": 3.4739, "step": 1670 }, { "epoch": 0.13708429413271842, "grad_norm": 0.7285602688789368, "learning_rate": 9.794538016633842e-05, "loss": 3.4592, "step": 1672 }, { "epoch": 0.13724827056110683, "grad_norm": 0.7747913002967834, "learning_rate": 9.793768063402777e-05, "loss": 3.4593, "step": 1674 }, { "epoch": 0.13741224698949525, "grad_norm": 0.7013533711433411, "learning_rate": 9.792996700575481e-05, "loss": 3.5569, "step": 1676 }, { "epoch": 0.13757622341788367, "grad_norm": 0.6556512117385864, "learning_rate": 9.792223928378772e-05, "loss": 3.4342, "step": 1678 }, { "epoch": 0.1377401998462721, "grad_norm": 0.7427647709846497, "learning_rate": 9.79144974703988e-05, "loss": 3.45, "step": 1680 }, { "epoch": 0.1379041762746605, "grad_norm": 0.7319619059562683, "learning_rate": 9.790674156786452e-05, "loss": 3.5378, "step": 1682 }, { "epoch": 0.13806815270304892, "grad_norm": 0.6408218741416931, "learning_rate": 9.78989715784655e-05, "loss": 3.5022, "step": 1684 }, { "epoch": 0.13823212913143734, "grad_norm": 0.5858979821205139, "learning_rate": 9.789118750448647e-05, "loss": 3.461, "step": 1686 }, { "epoch": 0.1383961055598258, "grad_norm": 0.6623833775520325, "learning_rate": 9.788338934821632e-05, "loss": 3.5288, "step": 1688 }, { "epoch": 0.1385600819882142, "grad_norm": 0.6391535997390747, "learning_rate": 9.787557711194808e-05, "loss": 3.5074, "step": 1690 }, { "epoch": 0.13872405841660262, "grad_norm": 0.5775202512741089, "learning_rate": 9.786775079797893e-05, "loss": 3.4467, "step": 1692 }, { "epoch": 0.13888803484499104, "grad_norm": 0.6308355927467346, "learning_rate": 9.785991040861017e-05, "loss": 3.4568, "step": 1694 }, { "epoch": 0.13905201127337946, "grad_norm": 0.7259300351142883, "learning_rate": 9.785205594614725e-05, "loss": 3.5018, "step": 1696 }, { "epoch": 0.13921598770176788, "grad_norm": 0.7123456001281738, "learning_rate": 9.784418741289975e-05, "loss": 3.4491, "step": 1698 }, { "epoch": 0.1393799641301563, "grad_norm": 0.7426223754882812, "learning_rate": 9.783630481118141e-05, "loss": 3.5045, "step": 1700 }, { "epoch": 0.13954394055854472, "grad_norm": 0.8505781292915344, "learning_rate": 9.782840814331007e-05, "loss": 3.5193, "step": 1702 }, { "epoch": 0.13970791698693313, "grad_norm": 0.7968536615371704, "learning_rate": 9.782049741160775e-05, "loss": 3.499, "step": 1704 }, { "epoch": 0.13987189341532155, "grad_norm": 0.7513880729675293, "learning_rate": 9.781257261840055e-05, "loss": 3.4711, "step": 1706 }, { "epoch": 0.14003586984370997, "grad_norm": 0.8544629812240601, "learning_rate": 9.780463376601878e-05, "loss": 3.4969, "step": 1708 }, { "epoch": 0.1401998462720984, "grad_norm": 0.5832239985466003, "learning_rate": 9.77966808567968e-05, "loss": 3.3526, "step": 1710 }, { "epoch": 0.1403638227004868, "grad_norm": 0.7072123885154724, "learning_rate": 9.778871389307318e-05, "loss": 3.4907, "step": 1712 }, { "epoch": 0.14052779912887522, "grad_norm": 0.8044034242630005, "learning_rate": 9.778073287719054e-05, "loss": 3.5346, "step": 1714 }, { "epoch": 0.14069177555726364, "grad_norm": 0.6524981260299683, "learning_rate": 9.777273781149574e-05, "loss": 3.429, "step": 1716 }, { "epoch": 0.14085575198565206, "grad_norm": 0.5953693389892578, "learning_rate": 9.776472869833965e-05, "loss": 3.4326, "step": 1718 }, { "epoch": 0.14101972841404048, "grad_norm": 0.586199164390564, "learning_rate": 9.775670554007736e-05, "loss": 3.382, "step": 1720 }, { "epoch": 0.1411837048424289, "grad_norm": 0.6465304493904114, "learning_rate": 9.774866833906808e-05, "loss": 3.4143, "step": 1722 }, { "epoch": 0.14134768127081732, "grad_norm": 0.8190158009529114, "learning_rate": 9.774061709767508e-05, "loss": 3.4286, "step": 1724 }, { "epoch": 0.14151165769920573, "grad_norm": 0.778614342212677, "learning_rate": 9.773255181826586e-05, "loss": 3.4458, "step": 1726 }, { "epoch": 0.14167563412759415, "grad_norm": 0.8797032237052917, "learning_rate": 9.772447250321197e-05, "loss": 3.4719, "step": 1728 }, { "epoch": 0.14183961055598257, "grad_norm": 0.6563115119934082, "learning_rate": 9.771637915488911e-05, "loss": 3.3792, "step": 1730 }, { "epoch": 0.142003586984371, "grad_norm": 0.823006272315979, "learning_rate": 9.770827177567712e-05, "loss": 3.4733, "step": 1732 }, { "epoch": 0.1421675634127594, "grad_norm": 0.7860798239707947, "learning_rate": 9.770015036795996e-05, "loss": 3.5151, "step": 1734 }, { "epoch": 0.14233153984114782, "grad_norm": 0.8051521182060242, "learning_rate": 9.76920149341257e-05, "loss": 3.4156, "step": 1736 }, { "epoch": 0.14249551626953624, "grad_norm": 0.6009500026702881, "learning_rate": 9.768386547656655e-05, "loss": 3.436, "step": 1738 }, { "epoch": 0.14265949269792466, "grad_norm": 0.68117356300354, "learning_rate": 9.767570199767883e-05, "loss": 3.4671, "step": 1740 }, { "epoch": 0.14282346912631308, "grad_norm": 0.6417118906974792, "learning_rate": 9.766752449986301e-05, "loss": 3.4416, "step": 1742 }, { "epoch": 0.14298744555470153, "grad_norm": 0.6248669624328613, "learning_rate": 9.765933298552366e-05, "loss": 3.3769, "step": 1744 }, { "epoch": 0.14315142198308994, "grad_norm": 0.5746626853942871, "learning_rate": 9.765112745706945e-05, "loss": 3.408, "step": 1746 }, { "epoch": 0.14331539841147836, "grad_norm": 0.6235172748565674, "learning_rate": 9.764290791691324e-05, "loss": 3.4454, "step": 1748 }, { "epoch": 0.14347937483986678, "grad_norm": 0.63954758644104, "learning_rate": 9.763467436747193e-05, "loss": 3.4275, "step": 1750 }, { "epoch": 0.1436433512682552, "grad_norm": 0.6917594075202942, "learning_rate": 9.76264268111666e-05, "loss": 3.3796, "step": 1752 }, { "epoch": 0.14380732769664362, "grad_norm": 0.6280871629714966, "learning_rate": 9.76181652504224e-05, "loss": 3.4056, "step": 1754 }, { "epoch": 0.14397130412503203, "grad_norm": 0.5994766354560852, "learning_rate": 9.760988968766864e-05, "loss": 3.458, "step": 1756 }, { "epoch": 0.14413528055342045, "grad_norm": 0.7142448425292969, "learning_rate": 9.760160012533872e-05, "loss": 3.4442, "step": 1758 }, { "epoch": 0.14429925698180887, "grad_norm": 0.632342517375946, "learning_rate": 9.759329656587017e-05, "loss": 3.4396, "step": 1760 }, { "epoch": 0.1444632334101973, "grad_norm": 0.6980354189872742, "learning_rate": 9.758497901170465e-05, "loss": 3.4684, "step": 1762 }, { "epoch": 0.1446272098385857, "grad_norm": 0.6333186626434326, "learning_rate": 9.75766474652879e-05, "loss": 3.4394, "step": 1764 }, { "epoch": 0.14479118626697413, "grad_norm": 0.5935460329055786, "learning_rate": 9.756830192906978e-05, "loss": 3.3757, "step": 1766 }, { "epoch": 0.14495516269536254, "grad_norm": 0.6060703992843628, "learning_rate": 9.75599424055043e-05, "loss": 3.3985, "step": 1768 }, { "epoch": 0.14511913912375096, "grad_norm": 0.7013797760009766, "learning_rate": 9.755156889704953e-05, "loss": 3.4613, "step": 1770 }, { "epoch": 0.14528311555213938, "grad_norm": 0.6497318744659424, "learning_rate": 9.75431814061677e-05, "loss": 3.3849, "step": 1772 }, { "epoch": 0.1454470919805278, "grad_norm": 0.6609060764312744, "learning_rate": 9.753477993532514e-05, "loss": 3.3863, "step": 1774 }, { "epoch": 0.14561106840891622, "grad_norm": 0.6392355561256409, "learning_rate": 9.752636448699227e-05, "loss": 3.4147, "step": 1776 }, { "epoch": 0.14577504483730463, "grad_norm": 0.729839563369751, "learning_rate": 9.751793506364362e-05, "loss": 3.3952, "step": 1778 }, { "epoch": 0.14593902126569305, "grad_norm": 0.6394525170326233, "learning_rate": 9.750949166775786e-05, "loss": 3.4272, "step": 1780 }, { "epoch": 0.14610299769408147, "grad_norm": 0.7037297487258911, "learning_rate": 9.750103430181776e-05, "loss": 3.3667, "step": 1782 }, { "epoch": 0.1462669741224699, "grad_norm": 0.6344433426856995, "learning_rate": 9.749256296831017e-05, "loss": 3.3558, "step": 1784 }, { "epoch": 0.1464309505508583, "grad_norm": 0.6455307006835938, "learning_rate": 9.748407766972607e-05, "loss": 3.3936, "step": 1786 }, { "epoch": 0.14659492697924673, "grad_norm": 0.7534605860710144, "learning_rate": 9.747557840856055e-05, "loss": 3.3973, "step": 1788 }, { "epoch": 0.14675890340763514, "grad_norm": 0.6741543412208557, "learning_rate": 9.746706518731278e-05, "loss": 3.4123, "step": 1790 }, { "epoch": 0.14692287983602356, "grad_norm": 0.7014438509941101, "learning_rate": 9.745853800848606e-05, "loss": 3.4118, "step": 1792 }, { "epoch": 0.14708685626441198, "grad_norm": 0.6591073870658875, "learning_rate": 9.74499968745878e-05, "loss": 3.4319, "step": 1794 }, { "epoch": 0.1472508326928004, "grad_norm": 0.6363744735717773, "learning_rate": 9.74414417881295e-05, "loss": 3.384, "step": 1796 }, { "epoch": 0.14741480912118882, "grad_norm": 0.6241241097450256, "learning_rate": 9.743287275162673e-05, "loss": 3.4458, "step": 1798 }, { "epoch": 0.14757878554957723, "grad_norm": 0.7173709869384766, "learning_rate": 9.742428976759925e-05, "loss": 3.4145, "step": 1800 }, { "epoch": 0.14774276197796568, "grad_norm": 0.6002538800239563, "learning_rate": 9.741569283857082e-05, "loss": 3.3948, "step": 1802 }, { "epoch": 0.1479067384063541, "grad_norm": 0.7210296988487244, "learning_rate": 9.740708196706936e-05, "loss": 3.3912, "step": 1804 }, { "epoch": 0.14807071483474252, "grad_norm": 0.5635441541671753, "learning_rate": 9.739845715562688e-05, "loss": 3.4489, "step": 1806 }, { "epoch": 0.14823469126313094, "grad_norm": 0.6474645733833313, "learning_rate": 9.738981840677948e-05, "loss": 3.429, "step": 1808 }, { "epoch": 0.14839866769151935, "grad_norm": 0.6213793754577637, "learning_rate": 9.738116572306737e-05, "loss": 3.4043, "step": 1810 }, { "epoch": 0.14856264411990777, "grad_norm": 0.5918754935264587, "learning_rate": 9.737249910703485e-05, "loss": 3.371, "step": 1812 }, { "epoch": 0.1487266205482962, "grad_norm": 0.5595375299453735, "learning_rate": 9.736381856123034e-05, "loss": 3.3467, "step": 1814 }, { "epoch": 0.1488905969766846, "grad_norm": 0.5739578604698181, "learning_rate": 9.735512408820628e-05, "loss": 3.4073, "step": 1816 }, { "epoch": 0.14905457340507303, "grad_norm": 0.5913086533546448, "learning_rate": 9.73464156905193e-05, "loss": 3.37, "step": 1818 }, { "epoch": 0.14921854983346144, "grad_norm": 0.5342605710029602, "learning_rate": 9.733769337073009e-05, "loss": 3.3615, "step": 1820 }, { "epoch": 0.14938252626184986, "grad_norm": 0.6710630655288696, "learning_rate": 9.73289571314034e-05, "loss": 3.3903, "step": 1822 }, { "epoch": 0.14954650269023828, "grad_norm": 0.6280022263526917, "learning_rate": 9.732020697510811e-05, "loss": 3.366, "step": 1824 }, { "epoch": 0.1497104791186267, "grad_norm": 0.6352916955947876, "learning_rate": 9.731144290441718e-05, "loss": 3.4006, "step": 1826 }, { "epoch": 0.14987445554701512, "grad_norm": 0.6925874948501587, "learning_rate": 9.730266492190769e-05, "loss": 3.4513, "step": 1828 }, { "epoch": 0.15003843197540354, "grad_norm": 0.7678630948066711, "learning_rate": 9.729387303016076e-05, "loss": 3.3515, "step": 1830 }, { "epoch": 0.15020240840379195, "grad_norm": 0.7654356956481934, "learning_rate": 9.728506723176162e-05, "loss": 3.357, "step": 1832 }, { "epoch": 0.15036638483218037, "grad_norm": 0.6860572695732117, "learning_rate": 9.727624752929962e-05, "loss": 3.4024, "step": 1834 }, { "epoch": 0.1505303612605688, "grad_norm": 0.6397068500518799, "learning_rate": 9.726741392536815e-05, "loss": 3.4281, "step": 1836 }, { "epoch": 0.1506943376889572, "grad_norm": 0.6445949673652649, "learning_rate": 9.725856642256472e-05, "loss": 3.3339, "step": 1838 }, { "epoch": 0.15085831411734563, "grad_norm": 0.6427408456802368, "learning_rate": 9.724970502349091e-05, "loss": 3.3984, "step": 1840 }, { "epoch": 0.15102229054573404, "grad_norm": 0.6301809549331665, "learning_rate": 9.72408297307524e-05, "loss": 3.3884, "step": 1842 }, { "epoch": 0.15118626697412246, "grad_norm": 0.561808705329895, "learning_rate": 9.723194054695894e-05, "loss": 3.39, "step": 1844 }, { "epoch": 0.15135024340251088, "grad_norm": 0.5919866561889648, "learning_rate": 9.722303747472441e-05, "loss": 3.3677, "step": 1846 }, { "epoch": 0.1515142198308993, "grad_norm": 0.6336367726325989, "learning_rate": 9.721412051666668e-05, "loss": 3.3986, "step": 1848 }, { "epoch": 0.15167819625928772, "grad_norm": 0.687470018863678, "learning_rate": 9.720518967540781e-05, "loss": 3.3543, "step": 1850 }, { "epoch": 0.15184217268767614, "grad_norm": 0.7600200176239014, "learning_rate": 9.719624495357387e-05, "loss": 3.4157, "step": 1852 }, { "epoch": 0.15200614911606455, "grad_norm": 0.6732688546180725, "learning_rate": 9.718728635379502e-05, "loss": 3.4003, "step": 1854 }, { "epoch": 0.15217012554445297, "grad_norm": 0.7202364206314087, "learning_rate": 9.717831387870555e-05, "loss": 3.3777, "step": 1856 }, { "epoch": 0.1523341019728414, "grad_norm": 0.6364483833312988, "learning_rate": 9.716932753094376e-05, "loss": 3.3864, "step": 1858 }, { "epoch": 0.15249807840122984, "grad_norm": 0.5882256627082825, "learning_rate": 9.71603273131521e-05, "loss": 3.3319, "step": 1860 }, { "epoch": 0.15266205482961825, "grad_norm": 0.716076672077179, "learning_rate": 9.715131322797704e-05, "loss": 3.4332, "step": 1862 }, { "epoch": 0.15282603125800667, "grad_norm": 0.6526336073875427, "learning_rate": 9.714228527806915e-05, "loss": 3.3591, "step": 1864 }, { "epoch": 0.1529900076863951, "grad_norm": 0.588830292224884, "learning_rate": 9.71332434660831e-05, "loss": 3.4014, "step": 1866 }, { "epoch": 0.1531539841147835, "grad_norm": 0.5935143232345581, "learning_rate": 9.712418779467758e-05, "loss": 3.3398, "step": 1868 }, { "epoch": 0.15331796054317193, "grad_norm": 0.6331619620323181, "learning_rate": 9.71151182665154e-05, "loss": 3.3374, "step": 1870 }, { "epoch": 0.15348193697156035, "grad_norm": 0.5878372192382812, "learning_rate": 9.710603488426345e-05, "loss": 3.3286, "step": 1872 }, { "epoch": 0.15364591339994876, "grad_norm": 0.5949060320854187, "learning_rate": 9.709693765059266e-05, "loss": 3.3715, "step": 1874 }, { "epoch": 0.15380988982833718, "grad_norm": 0.665457546710968, "learning_rate": 9.708782656817807e-05, "loss": 3.355, "step": 1876 }, { "epoch": 0.1539738662567256, "grad_norm": 0.6434701681137085, "learning_rate": 9.707870163969874e-05, "loss": 3.3317, "step": 1878 }, { "epoch": 0.15413784268511402, "grad_norm": 0.6445391774177551, "learning_rate": 9.706956286783786e-05, "loss": 3.3943, "step": 1880 }, { "epoch": 0.15430181911350244, "grad_norm": 0.6335451602935791, "learning_rate": 9.706041025528266e-05, "loss": 3.3645, "step": 1882 }, { "epoch": 0.15446579554189085, "grad_norm": 0.6084844470024109, "learning_rate": 9.705124380472443e-05, "loss": 3.3914, "step": 1884 }, { "epoch": 0.15462977197027927, "grad_norm": 0.6261113286018372, "learning_rate": 9.704206351885857e-05, "loss": 3.3381, "step": 1886 }, { "epoch": 0.1547937483986677, "grad_norm": 0.6548987030982971, "learning_rate": 9.703286940038449e-05, "loss": 3.3277, "step": 1888 }, { "epoch": 0.1549577248270561, "grad_norm": 0.6208562254905701, "learning_rate": 9.702366145200573e-05, "loss": 3.3789, "step": 1890 }, { "epoch": 0.15512170125544453, "grad_norm": 0.6488550901412964, "learning_rate": 9.701443967642984e-05, "loss": 3.3861, "step": 1892 }, { "epoch": 0.15528567768383295, "grad_norm": 0.6071347594261169, "learning_rate": 9.700520407636849e-05, "loss": 3.4027, "step": 1894 }, { "epoch": 0.15544965411222136, "grad_norm": 0.6656597852706909, "learning_rate": 9.699595465453734e-05, "loss": 3.3003, "step": 1896 }, { "epoch": 0.15561363054060978, "grad_norm": 0.6349019408226013, "learning_rate": 9.69866914136562e-05, "loss": 3.398, "step": 1898 }, { "epoch": 0.1557776069689982, "grad_norm": 0.6088286638259888, "learning_rate": 9.69774143564489e-05, "loss": 3.3114, "step": 1900 }, { "epoch": 0.15594158339738662, "grad_norm": 0.8147541284561157, "learning_rate": 9.696812348564331e-05, "loss": 3.3856, "step": 1902 }, { "epoch": 0.15610555982577504, "grad_norm": 0.6513493657112122, "learning_rate": 9.695881880397143e-05, "loss": 3.4239, "step": 1904 }, { "epoch": 0.15626953625416345, "grad_norm": 0.6540910601615906, "learning_rate": 9.694950031416925e-05, "loss": 3.3575, "step": 1906 }, { "epoch": 0.15643351268255187, "grad_norm": 0.6017822027206421, "learning_rate": 9.694016801897685e-05, "loss": 3.3468, "step": 1908 }, { "epoch": 0.1565974891109403, "grad_norm": 0.7597635984420776, "learning_rate": 9.693082192113839e-05, "loss": 3.3359, "step": 1910 }, { "epoch": 0.1567614655393287, "grad_norm": 0.8275761008262634, "learning_rate": 9.692146202340206e-05, "loss": 3.3465, "step": 1912 }, { "epoch": 0.15692544196771713, "grad_norm": 0.8236324787139893, "learning_rate": 9.69120883285201e-05, "loss": 3.3518, "step": 1914 }, { "epoch": 0.15708941839610557, "grad_norm": 0.7957652807235718, "learning_rate": 9.690270083924883e-05, "loss": 3.3424, "step": 1916 }, { "epoch": 0.157253394824494, "grad_norm": 0.7953089475631714, "learning_rate": 9.689329955834865e-05, "loss": 3.353, "step": 1918 }, { "epoch": 0.1574173712528824, "grad_norm": 0.7492114305496216, "learning_rate": 9.688388448858394e-05, "loss": 3.3389, "step": 1920 }, { "epoch": 0.15758134768127083, "grad_norm": 0.612477719783783, "learning_rate": 9.68744556327232e-05, "loss": 3.4137, "step": 1922 }, { "epoch": 0.15774532410965925, "grad_norm": 0.6381865739822388, "learning_rate": 9.686501299353895e-05, "loss": 3.3281, "step": 1924 }, { "epoch": 0.15790930053804766, "grad_norm": 0.6546152234077454, "learning_rate": 9.68555565738078e-05, "loss": 3.3501, "step": 1926 }, { "epoch": 0.15807327696643608, "grad_norm": 0.6780794262886047, "learning_rate": 9.684608637631036e-05, "loss": 3.3393, "step": 1928 }, { "epoch": 0.1582372533948245, "grad_norm": 0.637367308139801, "learning_rate": 9.683660240383135e-05, "loss": 3.3353, "step": 1930 }, { "epoch": 0.15840122982321292, "grad_norm": 0.647280216217041, "learning_rate": 9.68271046591595e-05, "loss": 3.4004, "step": 1932 }, { "epoch": 0.15856520625160134, "grad_norm": 0.6508013010025024, "learning_rate": 9.681759314508758e-05, "loss": 3.3203, "step": 1934 }, { "epoch": 0.15872918267998976, "grad_norm": 0.6182466745376587, "learning_rate": 9.680806786441244e-05, "loss": 3.3234, "step": 1936 }, { "epoch": 0.15889315910837817, "grad_norm": 0.6148533821105957, "learning_rate": 9.679852881993496e-05, "loss": 3.4097, "step": 1938 }, { "epoch": 0.1590571355367666, "grad_norm": 0.7295002937316895, "learning_rate": 9.678897601446008e-05, "loss": 3.3832, "step": 1940 }, { "epoch": 0.159221111965155, "grad_norm": 0.611003041267395, "learning_rate": 9.67794094507968e-05, "loss": 3.3381, "step": 1942 }, { "epoch": 0.15938508839354343, "grad_norm": 0.6375026106834412, "learning_rate": 9.676982913175813e-05, "loss": 3.3224, "step": 1944 }, { "epoch": 0.15954906482193185, "grad_norm": 0.5718812942504883, "learning_rate": 9.676023506016112e-05, "loss": 3.3178, "step": 1946 }, { "epoch": 0.15971304125032026, "grad_norm": 0.5863606333732605, "learning_rate": 9.675062723882691e-05, "loss": 3.3181, "step": 1948 }, { "epoch": 0.15987701767870868, "grad_norm": 0.6420906782150269, "learning_rate": 9.674100567058064e-05, "loss": 3.3457, "step": 1950 }, { "epoch": 0.1600409941070971, "grad_norm": 0.6146227121353149, "learning_rate": 9.673137035825153e-05, "loss": 3.3324, "step": 1952 }, { "epoch": 0.16020497053548552, "grad_norm": 0.6017966866493225, "learning_rate": 9.672172130467281e-05, "loss": 3.3241, "step": 1954 }, { "epoch": 0.16036894696387394, "grad_norm": 0.6874404549598694, "learning_rate": 9.671205851268175e-05, "loss": 3.3436, "step": 1956 }, { "epoch": 0.16053292339226236, "grad_norm": 0.6882346868515015, "learning_rate": 9.670238198511969e-05, "loss": 3.3263, "step": 1958 }, { "epoch": 0.16069689982065077, "grad_norm": 0.7165938019752502, "learning_rate": 9.669269172483197e-05, "loss": 3.2836, "step": 1960 }, { "epoch": 0.1608608762490392, "grad_norm": 0.7419902086257935, "learning_rate": 9.668298773466802e-05, "loss": 3.3599, "step": 1962 }, { "epoch": 0.1610248526774276, "grad_norm": 0.7155027985572815, "learning_rate": 9.667327001748125e-05, "loss": 3.328, "step": 1964 }, { "epoch": 0.16118882910581603, "grad_norm": 0.5572860836982727, "learning_rate": 9.666353857612913e-05, "loss": 3.2618, "step": 1966 }, { "epoch": 0.16135280553420445, "grad_norm": 0.6426743865013123, "learning_rate": 9.665379341347318e-05, "loss": 3.3289, "step": 1968 }, { "epoch": 0.16151678196259286, "grad_norm": 0.7807396054267883, "learning_rate": 9.664403453237894e-05, "loss": 3.3503, "step": 1970 }, { "epoch": 0.16168075839098128, "grad_norm": 0.6256475448608398, "learning_rate": 9.663426193571598e-05, "loss": 3.368, "step": 1972 }, { "epoch": 0.16184473481936973, "grad_norm": 0.6786140203475952, "learning_rate": 9.662447562635791e-05, "loss": 3.3482, "step": 1974 }, { "epoch": 0.16200871124775815, "grad_norm": 0.6162734627723694, "learning_rate": 9.661467560718237e-05, "loss": 3.3226, "step": 1976 }, { "epoch": 0.16217268767614657, "grad_norm": 0.609999418258667, "learning_rate": 9.660486188107104e-05, "loss": 3.3118, "step": 1978 }, { "epoch": 0.16233666410453498, "grad_norm": 0.7471441626548767, "learning_rate": 9.659503445090963e-05, "loss": 3.3642, "step": 1980 }, { "epoch": 0.1625006405329234, "grad_norm": 0.6361717581748962, "learning_rate": 9.658519331958785e-05, "loss": 3.2547, "step": 1982 }, { "epoch": 0.16266461696131182, "grad_norm": 0.5587472915649414, "learning_rate": 9.657533848999947e-05, "loss": 3.3375, "step": 1984 }, { "epoch": 0.16282859338970024, "grad_norm": 0.6860288381576538, "learning_rate": 9.65654699650423e-05, "loss": 3.2796, "step": 1986 }, { "epoch": 0.16299256981808866, "grad_norm": 0.6986459493637085, "learning_rate": 9.655558774761813e-05, "loss": 3.3553, "step": 1988 }, { "epoch": 0.16315654624647707, "grad_norm": 0.7562621235847473, "learning_rate": 9.654569184063282e-05, "loss": 3.3779, "step": 1990 }, { "epoch": 0.1633205226748655, "grad_norm": 0.7100428938865662, "learning_rate": 9.653578224699622e-05, "loss": 3.3995, "step": 1992 }, { "epoch": 0.1634844991032539, "grad_norm": 0.776755690574646, "learning_rate": 9.652585896962223e-05, "loss": 3.3391, "step": 1994 }, { "epoch": 0.16364847553164233, "grad_norm": 0.6308813095092773, "learning_rate": 9.651592201142879e-05, "loss": 3.2949, "step": 1996 }, { "epoch": 0.16381245196003075, "grad_norm": 0.7114334106445312, "learning_rate": 9.650597137533782e-05, "loss": 3.3665, "step": 1998 }, { "epoch": 0.16397642838841917, "grad_norm": 0.6608272194862366, "learning_rate": 9.649600706427525e-05, "loss": 3.2825, "step": 2000 }, { "epoch": 0.16414040481680758, "grad_norm": 0.6543484926223755, "learning_rate": 9.648602908117112e-05, "loss": 3.3447, "step": 2002 }, { "epoch": 0.164304381245196, "grad_norm": 0.6831576228141785, "learning_rate": 9.647603742895939e-05, "loss": 3.3979, "step": 2004 }, { "epoch": 0.16446835767358442, "grad_norm": 0.717369019985199, "learning_rate": 9.646603211057809e-05, "loss": 3.3508, "step": 2006 }, { "epoch": 0.16463233410197284, "grad_norm": 0.6552402973175049, "learning_rate": 9.645601312896929e-05, "loss": 3.326, "step": 2008 }, { "epoch": 0.16479631053036126, "grad_norm": 0.7372413277626038, "learning_rate": 9.644598048707901e-05, "loss": 3.3048, "step": 2010 }, { "epoch": 0.16496028695874967, "grad_norm": 0.5617173910140991, "learning_rate": 9.643593418785734e-05, "loss": 3.3341, "step": 2012 }, { "epoch": 0.1651242633871381, "grad_norm": 0.6190782785415649, "learning_rate": 9.642587423425839e-05, "loss": 3.3441, "step": 2014 }, { "epoch": 0.1652882398155265, "grad_norm": 0.6181708574295044, "learning_rate": 9.641580062924022e-05, "loss": 3.3729, "step": 2016 }, { "epoch": 0.16545221624391493, "grad_norm": 0.5956866145133972, "learning_rate": 9.640571337576499e-05, "loss": 3.3385, "step": 2018 }, { "epoch": 0.16561619267230335, "grad_norm": 0.7407371401786804, "learning_rate": 9.639561247679883e-05, "loss": 3.2941, "step": 2020 }, { "epoch": 0.16578016910069177, "grad_norm": 0.6292521953582764, "learning_rate": 9.638549793531186e-05, "loss": 3.3027, "step": 2022 }, { "epoch": 0.16594414552908018, "grad_norm": 0.6599383354187012, "learning_rate": 9.637536975427826e-05, "loss": 3.2438, "step": 2024 }, { "epoch": 0.1661081219574686, "grad_norm": 0.6305571794509888, "learning_rate": 9.636522793667617e-05, "loss": 3.3542, "step": 2026 }, { "epoch": 0.16627209838585702, "grad_norm": 0.6023452877998352, "learning_rate": 9.635507248548781e-05, "loss": 3.2435, "step": 2028 }, { "epoch": 0.16643607481424547, "grad_norm": 0.60383540391922, "learning_rate": 9.634490340369933e-05, "loss": 3.3507, "step": 2030 }, { "epoch": 0.16660005124263388, "grad_norm": 0.6280787587165833, "learning_rate": 9.633472069430094e-05, "loss": 3.343, "step": 2032 }, { "epoch": 0.1667640276710223, "grad_norm": 0.6468386650085449, "learning_rate": 9.632452436028685e-05, "loss": 3.2778, "step": 2034 }, { "epoch": 0.16692800409941072, "grad_norm": 0.6163133978843689, "learning_rate": 9.631431440465526e-05, "loss": 3.2935, "step": 2036 }, { "epoch": 0.16709198052779914, "grad_norm": 0.6122549772262573, "learning_rate": 9.630409083040837e-05, "loss": 3.3503, "step": 2038 }, { "epoch": 0.16725595695618756, "grad_norm": 0.7354429364204407, "learning_rate": 9.629385364055242e-05, "loss": 3.3263, "step": 2040 }, { "epoch": 0.16741993338457598, "grad_norm": 0.5872082710266113, "learning_rate": 9.628360283809761e-05, "loss": 3.2704, "step": 2042 }, { "epoch": 0.1675839098129644, "grad_norm": 0.6023679375648499, "learning_rate": 9.627333842605819e-05, "loss": 3.2958, "step": 2044 }, { "epoch": 0.1677478862413528, "grad_norm": 0.5730355978012085, "learning_rate": 9.626306040745237e-05, "loss": 3.3174, "step": 2046 }, { "epoch": 0.16791186266974123, "grad_norm": 0.5722047090530396, "learning_rate": 9.625276878530237e-05, "loss": 3.3153, "step": 2048 }, { "epoch": 0.16807583909812965, "grad_norm": 0.6651049852371216, "learning_rate": 9.624246356263444e-05, "loss": 3.3299, "step": 2050 }, { "epoch": 0.16823981552651807, "grad_norm": 0.6372424364089966, "learning_rate": 9.623214474247878e-05, "loss": 3.3651, "step": 2052 }, { "epoch": 0.16840379195490648, "grad_norm": 0.5711623430252075, "learning_rate": 9.622181232786963e-05, "loss": 3.3216, "step": 2054 }, { "epoch": 0.1685677683832949, "grad_norm": 0.6460525393486023, "learning_rate": 9.621146632184521e-05, "loss": 3.3674, "step": 2056 }, { "epoch": 0.16873174481168332, "grad_norm": 0.6044295430183411, "learning_rate": 9.620110672744776e-05, "loss": 3.3083, "step": 2058 }, { "epoch": 0.16889572124007174, "grad_norm": 0.5659945011138916, "learning_rate": 9.619073354772344e-05, "loss": 3.3649, "step": 2060 }, { "epoch": 0.16905969766846016, "grad_norm": 0.5560106039047241, "learning_rate": 9.618034678572252e-05, "loss": 3.2848, "step": 2062 }, { "epoch": 0.16922367409684858, "grad_norm": 0.5644478797912598, "learning_rate": 9.616994644449915e-05, "loss": 3.3233, "step": 2064 }, { "epoch": 0.169387650525237, "grad_norm": 0.6400248408317566, "learning_rate": 9.615953252711157e-05, "loss": 3.3204, "step": 2066 }, { "epoch": 0.1695516269536254, "grad_norm": 0.5804336667060852, "learning_rate": 9.614910503662196e-05, "loss": 3.3332, "step": 2068 }, { "epoch": 0.16971560338201383, "grad_norm": 0.6843202710151672, "learning_rate": 9.613866397609646e-05, "loss": 3.3108, "step": 2070 }, { "epoch": 0.16987957981040225, "grad_norm": 0.6259203553199768, "learning_rate": 9.612820934860529e-05, "loss": 3.2955, "step": 2072 }, { "epoch": 0.17004355623879067, "grad_norm": 0.7539075016975403, "learning_rate": 9.611774115722258e-05, "loss": 3.3332, "step": 2074 }, { "epoch": 0.17020753266717908, "grad_norm": 0.6109238266944885, "learning_rate": 9.610725940502648e-05, "loss": 3.2937, "step": 2076 }, { "epoch": 0.1703715090955675, "grad_norm": 0.6348362565040588, "learning_rate": 9.609676409509912e-05, "loss": 3.2358, "step": 2078 }, { "epoch": 0.17053548552395592, "grad_norm": 0.7391447424888611, "learning_rate": 9.608625523052663e-05, "loss": 3.3637, "step": 2080 }, { "epoch": 0.17069946195234434, "grad_norm": 0.6116240620613098, "learning_rate": 9.607573281439913e-05, "loss": 3.299, "step": 2082 }, { "epoch": 0.17086343838073276, "grad_norm": 0.6683641672134399, "learning_rate": 9.60651968498107e-05, "loss": 3.3175, "step": 2084 }, { "epoch": 0.17102741480912118, "grad_norm": 0.5680612921714783, "learning_rate": 9.605464733985941e-05, "loss": 3.2804, "step": 2086 }, { "epoch": 0.17119139123750962, "grad_norm": 0.5645765066146851, "learning_rate": 9.60440842876473e-05, "loss": 3.307, "step": 2088 }, { "epoch": 0.17135536766589804, "grad_norm": 0.6338248252868652, "learning_rate": 9.603350769628045e-05, "loss": 3.2567, "step": 2090 }, { "epoch": 0.17151934409428646, "grad_norm": 0.6122543811798096, "learning_rate": 9.602291756886888e-05, "loss": 3.3027, "step": 2092 }, { "epoch": 0.17168332052267488, "grad_norm": 0.6619787216186523, "learning_rate": 9.601231390852656e-05, "loss": 3.2741, "step": 2094 }, { "epoch": 0.1718472969510633, "grad_norm": 0.5675482153892517, "learning_rate": 9.600169671837149e-05, "loss": 3.2811, "step": 2096 }, { "epoch": 0.1720112733794517, "grad_norm": 0.6895171999931335, "learning_rate": 9.599106600152563e-05, "loss": 3.3162, "step": 2098 }, { "epoch": 0.17217524980784013, "grad_norm": 0.7057105898857117, "learning_rate": 9.59804217611149e-05, "loss": 3.2611, "step": 2100 }, { "epoch": 0.17233922623622855, "grad_norm": 0.5840970277786255, "learning_rate": 9.596976400026925e-05, "loss": 3.2843, "step": 2102 }, { "epoch": 0.17250320266461697, "grad_norm": 0.6528168320655823, "learning_rate": 9.595909272212254e-05, "loss": 3.3212, "step": 2104 }, { "epoch": 0.17266717909300539, "grad_norm": 0.5484073162078857, "learning_rate": 9.594840792981265e-05, "loss": 3.2849, "step": 2106 }, { "epoch": 0.1728311555213938, "grad_norm": 0.5874817967414856, "learning_rate": 9.59377096264814e-05, "loss": 3.3081, "step": 2108 }, { "epoch": 0.17299513194978222, "grad_norm": 0.5669682025909424, "learning_rate": 9.592699781527461e-05, "loss": 3.3498, "step": 2110 }, { "epoch": 0.17315910837817064, "grad_norm": 0.6883938312530518, "learning_rate": 9.591627249934207e-05, "loss": 3.2798, "step": 2112 }, { "epoch": 0.17332308480655906, "grad_norm": 0.6597236394882202, "learning_rate": 9.590553368183753e-05, "loss": 3.2804, "step": 2114 }, { "epoch": 0.17348706123494748, "grad_norm": 0.5905894637107849, "learning_rate": 9.589478136591872e-05, "loss": 3.2825, "step": 2116 }, { "epoch": 0.1736510376633359, "grad_norm": 0.637214183807373, "learning_rate": 9.588401555474732e-05, "loss": 3.3519, "step": 2118 }, { "epoch": 0.1738150140917243, "grad_norm": 0.5943377017974854, "learning_rate": 9.587323625148899e-05, "loss": 3.2875, "step": 2120 }, { "epoch": 0.17397899052011273, "grad_norm": 0.49435749650001526, "learning_rate": 9.586244345931336e-05, "loss": 3.3332, "step": 2122 }, { "epoch": 0.17414296694850115, "grad_norm": 0.6532015800476074, "learning_rate": 9.585163718139405e-05, "loss": 3.3226, "step": 2124 }, { "epoch": 0.17430694337688957, "grad_norm": 0.6478725075721741, "learning_rate": 9.584081742090861e-05, "loss": 3.2902, "step": 2126 }, { "epoch": 0.17447091980527799, "grad_norm": 0.6059595942497253, "learning_rate": 9.582998418103854e-05, "loss": 3.3047, "step": 2128 }, { "epoch": 0.1746348962336664, "grad_norm": 0.6445087790489197, "learning_rate": 9.581913746496934e-05, "loss": 3.3007, "step": 2130 }, { "epoch": 0.17479887266205482, "grad_norm": 0.5795426964759827, "learning_rate": 9.580827727589048e-05, "loss": 3.3193, "step": 2132 }, { "epoch": 0.17496284909044324, "grad_norm": 0.6961095929145813, "learning_rate": 9.579740361699535e-05, "loss": 3.328, "step": 2134 }, { "epoch": 0.17512682551883166, "grad_norm": 0.7018781900405884, "learning_rate": 9.578651649148133e-05, "loss": 3.2589, "step": 2136 }, { "epoch": 0.17529080194722008, "grad_norm": 0.6638593077659607, "learning_rate": 9.577561590254977e-05, "loss": 3.3047, "step": 2138 }, { "epoch": 0.1754547783756085, "grad_norm": 0.8037712574005127, "learning_rate": 9.576470185340596e-05, "loss": 3.3166, "step": 2140 }, { "epoch": 0.1756187548039969, "grad_norm": 0.630042314529419, "learning_rate": 9.575377434725911e-05, "loss": 3.279, "step": 2142 }, { "epoch": 0.17578273123238533, "grad_norm": 0.5768330097198486, "learning_rate": 9.57428333873225e-05, "loss": 3.2599, "step": 2144 }, { "epoch": 0.17594670766077378, "grad_norm": 0.5603185296058655, "learning_rate": 9.573187897681322e-05, "loss": 3.2339, "step": 2146 }, { "epoch": 0.1761106840891622, "grad_norm": 0.7578685879707336, "learning_rate": 9.572091111895243e-05, "loss": 3.3329, "step": 2148 }, { "epoch": 0.1762746605175506, "grad_norm": 0.5876714587211609, "learning_rate": 9.57099298169652e-05, "loss": 3.2627, "step": 2150 }, { "epoch": 0.17643863694593903, "grad_norm": 0.6411724090576172, "learning_rate": 9.569893507408055e-05, "loss": 3.2554, "step": 2152 }, { "epoch": 0.17660261337432745, "grad_norm": 0.6768316626548767, "learning_rate": 9.568792689353143e-05, "loss": 3.2805, "step": 2154 }, { "epoch": 0.17676658980271587, "grad_norm": 0.7020912766456604, "learning_rate": 9.567690527855483e-05, "loss": 3.2498, "step": 2156 }, { "epoch": 0.17693056623110429, "grad_norm": 0.630198061466217, "learning_rate": 9.566587023239157e-05, "loss": 3.3052, "step": 2158 }, { "epoch": 0.1770945426594927, "grad_norm": 0.7058777213096619, "learning_rate": 9.565482175828653e-05, "loss": 3.2939, "step": 2160 }, { "epoch": 0.17725851908788112, "grad_norm": 0.684704601764679, "learning_rate": 9.564375985948846e-05, "loss": 3.2628, "step": 2162 }, { "epoch": 0.17742249551626954, "grad_norm": 0.6196277737617493, "learning_rate": 9.56326845392501e-05, "loss": 3.2804, "step": 2164 }, { "epoch": 0.17758647194465796, "grad_norm": 0.6597474217414856, "learning_rate": 9.562159580082808e-05, "loss": 3.357, "step": 2166 }, { "epoch": 0.17775044837304638, "grad_norm": 0.6917456388473511, "learning_rate": 9.561049364748307e-05, "loss": 3.2818, "step": 2168 }, { "epoch": 0.1779144248014348, "grad_norm": 0.6305201649665833, "learning_rate": 9.559937808247961e-05, "loss": 3.2252, "step": 2170 }, { "epoch": 0.1780784012298232, "grad_norm": 0.6192725896835327, "learning_rate": 9.55882491090862e-05, "loss": 3.3079, "step": 2172 }, { "epoch": 0.17824237765821163, "grad_norm": 0.6527867913246155, "learning_rate": 9.55771067305753e-05, "loss": 3.2065, "step": 2174 }, { "epoch": 0.17840635408660005, "grad_norm": 0.6737766265869141, "learning_rate": 9.556595095022331e-05, "loss": 3.2789, "step": 2176 }, { "epoch": 0.17857033051498847, "grad_norm": 0.5953449010848999, "learning_rate": 9.555478177131052e-05, "loss": 3.2632, "step": 2178 }, { "epoch": 0.17873430694337689, "grad_norm": 0.6226819157600403, "learning_rate": 9.554359919712124e-05, "loss": 3.2756, "step": 2180 }, { "epoch": 0.1788982833717653, "grad_norm": 0.6152170896530151, "learning_rate": 9.553240323094368e-05, "loss": 3.2677, "step": 2182 }, { "epoch": 0.17906225980015372, "grad_norm": 0.5997733473777771, "learning_rate": 9.552119387606997e-05, "loss": 3.2369, "step": 2184 }, { "epoch": 0.17922623622854214, "grad_norm": 0.5696983933448792, "learning_rate": 9.550997113579618e-05, "loss": 3.3002, "step": 2186 }, { "epoch": 0.17939021265693056, "grad_norm": 0.5793240666389465, "learning_rate": 9.549873501342237e-05, "loss": 3.224, "step": 2188 }, { "epoch": 0.17955418908531898, "grad_norm": 0.5453920364379883, "learning_rate": 9.548748551225246e-05, "loss": 3.3058, "step": 2190 }, { "epoch": 0.1797181655137074, "grad_norm": 0.558608889579773, "learning_rate": 9.547622263559437e-05, "loss": 3.3077, "step": 2192 }, { "epoch": 0.1798821419420958, "grad_norm": 0.577103853225708, "learning_rate": 9.546494638675989e-05, "loss": 3.2894, "step": 2194 }, { "epoch": 0.18004611837048423, "grad_norm": 0.5528544187545776, "learning_rate": 9.54536567690648e-05, "loss": 3.236, "step": 2196 }, { "epoch": 0.18021009479887265, "grad_norm": 0.5541223883628845, "learning_rate": 9.544235378582877e-05, "loss": 3.268, "step": 2198 }, { "epoch": 0.18037407122726107, "grad_norm": 0.5316542983055115, "learning_rate": 9.543103744037544e-05, "loss": 3.2532, "step": 2200 }, { "epoch": 0.18053804765564951, "grad_norm": 0.6354775428771973, "learning_rate": 9.541970773603233e-05, "loss": 3.2782, "step": 2202 }, { "epoch": 0.18070202408403793, "grad_norm": 0.5799155235290527, "learning_rate": 9.540836467613092e-05, "loss": 3.224, "step": 2204 }, { "epoch": 0.18086600051242635, "grad_norm": 0.6716517210006714, "learning_rate": 9.53970082640066e-05, "loss": 3.2579, "step": 2206 }, { "epoch": 0.18102997694081477, "grad_norm": 0.6734136343002319, "learning_rate": 9.53856385029987e-05, "loss": 3.2698, "step": 2208 }, { "epoch": 0.1811939533692032, "grad_norm": 0.7301097512245178, "learning_rate": 9.53742553964505e-05, "loss": 3.2738, "step": 2210 }, { "epoch": 0.1813579297975916, "grad_norm": 0.6413136720657349, "learning_rate": 9.536285894770914e-05, "loss": 3.2058, "step": 2212 }, { "epoch": 0.18152190622598002, "grad_norm": 0.6198046803474426, "learning_rate": 9.535144916012575e-05, "loss": 3.2869, "step": 2214 }, { "epoch": 0.18168588265436844, "grad_norm": 0.6237669587135315, "learning_rate": 9.534002603705532e-05, "loss": 3.223, "step": 2216 }, { "epoch": 0.18184985908275686, "grad_norm": 0.626846969127655, "learning_rate": 9.53285895818568e-05, "loss": 3.1932, "step": 2218 }, { "epoch": 0.18201383551114528, "grad_norm": 0.5577837824821472, "learning_rate": 9.531713979789308e-05, "loss": 3.2983, "step": 2220 }, { "epoch": 0.1821778119395337, "grad_norm": 0.5938622951507568, "learning_rate": 9.53056766885309e-05, "loss": 3.2237, "step": 2222 }, { "epoch": 0.18234178836792211, "grad_norm": 0.5766521096229553, "learning_rate": 9.529420025714099e-05, "loss": 3.3077, "step": 2224 }, { "epoch": 0.18250576479631053, "grad_norm": 0.5775324702262878, "learning_rate": 9.528271050709797e-05, "loss": 3.298, "step": 2226 }, { "epoch": 0.18266974122469895, "grad_norm": 0.575670599937439, "learning_rate": 9.527120744178034e-05, "loss": 3.2163, "step": 2228 }, { "epoch": 0.18283371765308737, "grad_norm": 0.5995689630508423, "learning_rate": 9.525969106457059e-05, "loss": 3.2937, "step": 2230 }, { "epoch": 0.1829976940814758, "grad_norm": 0.6333314180374146, "learning_rate": 9.524816137885506e-05, "loss": 3.2528, "step": 2232 }, { "epoch": 0.1831616705098642, "grad_norm": 0.6713608503341675, "learning_rate": 9.523661838802404e-05, "loss": 3.2981, "step": 2234 }, { "epoch": 0.18332564693825262, "grad_norm": 0.5735260844230652, "learning_rate": 9.52250620954717e-05, "loss": 3.2495, "step": 2236 }, { "epoch": 0.18348962336664104, "grad_norm": 0.5098928809165955, "learning_rate": 9.521349250459617e-05, "loss": 3.2896, "step": 2238 }, { "epoch": 0.18365359979502946, "grad_norm": 0.5835528373718262, "learning_rate": 9.520190961879942e-05, "loss": 3.3013, "step": 2240 }, { "epoch": 0.18381757622341788, "grad_norm": 0.5812976360321045, "learning_rate": 9.519031344148741e-05, "loss": 3.2155, "step": 2242 }, { "epoch": 0.1839815526518063, "grad_norm": 0.6407437324523926, "learning_rate": 9.517870397606996e-05, "loss": 3.3021, "step": 2244 }, { "epoch": 0.18414552908019471, "grad_norm": 0.6286873817443848, "learning_rate": 9.516708122596079e-05, "loss": 3.2593, "step": 2246 }, { "epoch": 0.18430950550858313, "grad_norm": 0.6462428569793701, "learning_rate": 9.515544519457755e-05, "loss": 3.2773, "step": 2248 }, { "epoch": 0.18447348193697155, "grad_norm": 0.5487723350524902, "learning_rate": 9.51437958853418e-05, "loss": 3.2368, "step": 2250 }, { "epoch": 0.18463745836535997, "grad_norm": 0.6235312819480896, "learning_rate": 9.513213330167898e-05, "loss": 3.2401, "step": 2252 }, { "epoch": 0.1848014347937484, "grad_norm": 0.6422250866889954, "learning_rate": 9.512045744701843e-05, "loss": 3.252, "step": 2254 }, { "epoch": 0.1849654112221368, "grad_norm": 0.6642846465110779, "learning_rate": 9.510876832479343e-05, "loss": 3.2247, "step": 2256 }, { "epoch": 0.18512938765052522, "grad_norm": 0.6005398631095886, "learning_rate": 9.509706593844114e-05, "loss": 3.2238, "step": 2258 }, { "epoch": 0.18529336407891367, "grad_norm": 0.5651940107345581, "learning_rate": 9.508535029140262e-05, "loss": 3.2212, "step": 2260 }, { "epoch": 0.1854573405073021, "grad_norm": 0.6252986788749695, "learning_rate": 9.507362138712282e-05, "loss": 3.2603, "step": 2262 }, { "epoch": 0.1856213169356905, "grad_norm": 0.6485080718994141, "learning_rate": 9.506187922905057e-05, "loss": 3.2465, "step": 2264 }, { "epoch": 0.18578529336407892, "grad_norm": 0.5912032723426819, "learning_rate": 9.505012382063869e-05, "loss": 3.1827, "step": 2266 }, { "epoch": 0.18594926979246734, "grad_norm": 0.6242038607597351, "learning_rate": 9.503835516534376e-05, "loss": 3.2717, "step": 2268 }, { "epoch": 0.18611324622085576, "grad_norm": 0.6092646718025208, "learning_rate": 9.502657326662637e-05, "loss": 3.2605, "step": 2270 }, { "epoch": 0.18627722264924418, "grad_norm": 0.5553577542304993, "learning_rate": 9.501477812795094e-05, "loss": 3.2083, "step": 2272 }, { "epoch": 0.1864411990776326, "grad_norm": 0.5823682546615601, "learning_rate": 9.500296975278581e-05, "loss": 3.2408, "step": 2274 }, { "epoch": 0.18660517550602101, "grad_norm": 0.6024842858314514, "learning_rate": 9.499114814460323e-05, "loss": 3.2654, "step": 2276 }, { "epoch": 0.18676915193440943, "grad_norm": 0.6093845367431641, "learning_rate": 9.497931330687926e-05, "loss": 3.2261, "step": 2278 }, { "epoch": 0.18693312836279785, "grad_norm": 0.5323441624641418, "learning_rate": 9.496746524309396e-05, "loss": 3.286, "step": 2280 }, { "epoch": 0.18709710479118627, "grad_norm": 0.6634844541549683, "learning_rate": 9.495560395673119e-05, "loss": 3.2319, "step": 2282 }, { "epoch": 0.1872610812195747, "grad_norm": 0.5159028172492981, "learning_rate": 9.494372945127873e-05, "loss": 3.2609, "step": 2284 }, { "epoch": 0.1874250576479631, "grad_norm": 0.6331459283828735, "learning_rate": 9.493184173022829e-05, "loss": 3.2177, "step": 2286 }, { "epoch": 0.18758903407635152, "grad_norm": 0.7428011894226074, "learning_rate": 9.49199407970754e-05, "loss": 3.2783, "step": 2288 }, { "epoch": 0.18775301050473994, "grad_norm": 0.5949831604957581, "learning_rate": 9.49080266553195e-05, "loss": 3.2425, "step": 2290 }, { "epoch": 0.18791698693312836, "grad_norm": 0.7588968873023987, "learning_rate": 9.489609930846391e-05, "loss": 3.2672, "step": 2292 }, { "epoch": 0.18808096336151678, "grad_norm": 0.592046320438385, "learning_rate": 9.488415876001586e-05, "loss": 3.2341, "step": 2294 }, { "epoch": 0.1882449397899052, "grad_norm": 0.5699316263198853, "learning_rate": 9.487220501348642e-05, "loss": 3.2835, "step": 2296 }, { "epoch": 0.18840891621829362, "grad_norm": 0.634863555431366, "learning_rate": 9.486023807239057e-05, "loss": 3.2315, "step": 2298 }, { "epoch": 0.18857289264668203, "grad_norm": 0.5566238760948181, "learning_rate": 9.484825794024716e-05, "loss": 3.1947, "step": 2300 }, { "epoch": 0.18873686907507045, "grad_norm": 0.6761863231658936, "learning_rate": 9.483626462057893e-05, "loss": 3.2531, "step": 2302 }, { "epoch": 0.18890084550345887, "grad_norm": 0.561553955078125, "learning_rate": 9.482425811691247e-05, "loss": 3.1987, "step": 2304 }, { "epoch": 0.1890648219318473, "grad_norm": 0.7657498121261597, "learning_rate": 9.481223843277827e-05, "loss": 3.2818, "step": 2306 }, { "epoch": 0.1892287983602357, "grad_norm": 0.5893799066543579, "learning_rate": 9.480020557171068e-05, "loss": 3.2392, "step": 2308 }, { "epoch": 0.18939277478862412, "grad_norm": 0.6204321980476379, "learning_rate": 9.478815953724796e-05, "loss": 3.1719, "step": 2310 }, { "epoch": 0.18955675121701254, "grad_norm": 0.5662544965744019, "learning_rate": 9.47761003329322e-05, "loss": 3.1849, "step": 2312 }, { "epoch": 0.18972072764540096, "grad_norm": 0.5701155662536621, "learning_rate": 9.476402796230938e-05, "loss": 3.2727, "step": 2314 }, { "epoch": 0.1898847040737894, "grad_norm": 0.5828278064727783, "learning_rate": 9.475194242892936e-05, "loss": 3.2154, "step": 2316 }, { "epoch": 0.19004868050217782, "grad_norm": 0.6154817342758179, "learning_rate": 9.473984373634586e-05, "loss": 3.2404, "step": 2318 }, { "epoch": 0.19021265693056624, "grad_norm": 0.5927799940109253, "learning_rate": 9.472773188811647e-05, "loss": 3.2507, "step": 2320 }, { "epoch": 0.19037663335895466, "grad_norm": 0.5183115601539612, "learning_rate": 9.471560688780266e-05, "loss": 3.2129, "step": 2322 }, { "epoch": 0.19054060978734308, "grad_norm": 0.5379958748817444, "learning_rate": 9.470346873896974e-05, "loss": 3.2855, "step": 2324 }, { "epoch": 0.1907045862157315, "grad_norm": 0.5806155204772949, "learning_rate": 9.46913174451869e-05, "loss": 3.2789, "step": 2326 }, { "epoch": 0.19086856264411992, "grad_norm": 0.5110951066017151, "learning_rate": 9.467915301002723e-05, "loss": 3.1858, "step": 2328 }, { "epoch": 0.19103253907250833, "grad_norm": 0.6017136573791504, "learning_rate": 9.466697543706764e-05, "loss": 3.214, "step": 2330 }, { "epoch": 0.19119651550089675, "grad_norm": 0.620817244052887, "learning_rate": 9.46547847298889e-05, "loss": 3.2424, "step": 2332 }, { "epoch": 0.19136049192928517, "grad_norm": 0.5650565028190613, "learning_rate": 9.464258089207569e-05, "loss": 3.1888, "step": 2334 }, { "epoch": 0.1915244683576736, "grad_norm": 0.690185546875, "learning_rate": 9.46303639272165e-05, "loss": 3.2409, "step": 2336 }, { "epoch": 0.191688444786062, "grad_norm": 0.5948742032051086, "learning_rate": 9.461813383890367e-05, "loss": 3.2372, "step": 2338 }, { "epoch": 0.19185242121445042, "grad_norm": 0.6050938367843628, "learning_rate": 9.460589063073349e-05, "loss": 3.1971, "step": 2340 }, { "epoch": 0.19201639764283884, "grad_norm": 0.6243955492973328, "learning_rate": 9.459363430630601e-05, "loss": 3.2438, "step": 2342 }, { "epoch": 0.19218037407122726, "grad_norm": 0.6408066153526306, "learning_rate": 9.458136486922519e-05, "loss": 3.2632, "step": 2344 }, { "epoch": 0.19234435049961568, "grad_norm": 0.6085670590400696, "learning_rate": 9.45690823230988e-05, "loss": 3.2042, "step": 2346 }, { "epoch": 0.1925083269280041, "grad_norm": 0.6089074015617371, "learning_rate": 9.455678667153853e-05, "loss": 3.1707, "step": 2348 }, { "epoch": 0.19267230335639252, "grad_norm": 0.572694718837738, "learning_rate": 9.454447791815986e-05, "loss": 3.1984, "step": 2350 }, { "epoch": 0.19283627978478093, "grad_norm": 0.5600984692573547, "learning_rate": 9.453215606658217e-05, "loss": 3.2981, "step": 2352 }, { "epoch": 0.19300025621316935, "grad_norm": 0.6271937489509583, "learning_rate": 9.451982112042866e-05, "loss": 3.2026, "step": 2354 }, { "epoch": 0.19316423264155777, "grad_norm": 0.5863776803016663, "learning_rate": 9.450747308332639e-05, "loss": 3.1766, "step": 2356 }, { "epoch": 0.1933282090699462, "grad_norm": 0.545541524887085, "learning_rate": 9.449511195890628e-05, "loss": 3.2055, "step": 2358 }, { "epoch": 0.1934921854983346, "grad_norm": 0.5626051425933838, "learning_rate": 9.44827377508031e-05, "loss": 3.2633, "step": 2360 }, { "epoch": 0.19365616192672302, "grad_norm": 0.5464023351669312, "learning_rate": 9.44703504626554e-05, "loss": 3.2158, "step": 2362 }, { "epoch": 0.19382013835511144, "grad_norm": 0.57725989818573, "learning_rate": 9.445795009810572e-05, "loss": 3.2539, "step": 2364 }, { "epoch": 0.19398411478349986, "grad_norm": 0.5377549529075623, "learning_rate": 9.444553666080029e-05, "loss": 3.2038, "step": 2366 }, { "epoch": 0.19414809121188828, "grad_norm": 0.5789408087730408, "learning_rate": 9.443311015438927e-05, "loss": 3.2505, "step": 2368 }, { "epoch": 0.1943120676402767, "grad_norm": 0.5615429878234863, "learning_rate": 9.442067058252666e-05, "loss": 3.2348, "step": 2370 }, { "epoch": 0.19447604406866512, "grad_norm": 0.5808910131454468, "learning_rate": 9.440821794887028e-05, "loss": 3.2645, "step": 2372 }, { "epoch": 0.19464002049705356, "grad_norm": 0.5342815518379211, "learning_rate": 9.439575225708179e-05, "loss": 3.1556, "step": 2374 }, { "epoch": 0.19480399692544198, "grad_norm": 0.6104359030723572, "learning_rate": 9.438327351082669e-05, "loss": 3.1896, "step": 2376 }, { "epoch": 0.1949679733538304, "grad_norm": 0.47551843523979187, "learning_rate": 9.437078171377437e-05, "loss": 3.236, "step": 2378 }, { "epoch": 0.19513194978221882, "grad_norm": 0.5371410846710205, "learning_rate": 9.435827686959795e-05, "loss": 3.2451, "step": 2380 }, { "epoch": 0.19529592621060723, "grad_norm": 0.5497537851333618, "learning_rate": 9.43457589819745e-05, "loss": 3.1947, "step": 2382 }, { "epoch": 0.19545990263899565, "grad_norm": 0.6186292767524719, "learning_rate": 9.433322805458484e-05, "loss": 3.242, "step": 2384 }, { "epoch": 0.19562387906738407, "grad_norm": 0.6111587882041931, "learning_rate": 9.43206840911137e-05, "loss": 3.2098, "step": 2386 }, { "epoch": 0.1957878554957725, "grad_norm": 0.636885941028595, "learning_rate": 9.430812709524956e-05, "loss": 3.2636, "step": 2388 }, { "epoch": 0.1959518319241609, "grad_norm": 0.5833829045295715, "learning_rate": 9.42955570706848e-05, "loss": 3.147, "step": 2390 }, { "epoch": 0.19611580835254933, "grad_norm": 0.645831823348999, "learning_rate": 9.42829740211156e-05, "loss": 3.2301, "step": 2392 }, { "epoch": 0.19627978478093774, "grad_norm": 0.5893756151199341, "learning_rate": 9.427037795024199e-05, "loss": 3.2677, "step": 2394 }, { "epoch": 0.19644376120932616, "grad_norm": 0.5982114672660828, "learning_rate": 9.425776886176778e-05, "loss": 3.1879, "step": 2396 }, { "epoch": 0.19660773763771458, "grad_norm": 0.6196883916854858, "learning_rate": 9.424514675940068e-05, "loss": 3.1708, "step": 2398 }, { "epoch": 0.196771714066103, "grad_norm": 0.5512893199920654, "learning_rate": 9.423251164685217e-05, "loss": 3.1997, "step": 2400 }, { "epoch": 0.19693569049449142, "grad_norm": 0.6537207961082458, "learning_rate": 9.421986352783759e-05, "loss": 3.2384, "step": 2402 }, { "epoch": 0.19709966692287983, "grad_norm": 0.572920024394989, "learning_rate": 9.420720240607606e-05, "loss": 3.1938, "step": 2404 }, { "epoch": 0.19726364335126825, "grad_norm": 0.5719939470291138, "learning_rate": 9.419452828529058e-05, "loss": 3.2079, "step": 2406 }, { "epoch": 0.19742761977965667, "grad_norm": 0.5642483234405518, "learning_rate": 9.418184116920794e-05, "loss": 3.2037, "step": 2408 }, { "epoch": 0.1975915962080451, "grad_norm": 0.4986971914768219, "learning_rate": 9.416914106155875e-05, "loss": 3.1913, "step": 2410 }, { "epoch": 0.1977555726364335, "grad_norm": 0.5548354983329773, "learning_rate": 9.415642796607746e-05, "loss": 3.2272, "step": 2412 }, { "epoch": 0.19791954906482193, "grad_norm": 0.6837654113769531, "learning_rate": 9.414370188650231e-05, "loss": 3.2308, "step": 2414 }, { "epoch": 0.19808352549321034, "grad_norm": 0.7141901850700378, "learning_rate": 9.413096282657538e-05, "loss": 3.1775, "step": 2416 }, { "epoch": 0.19824750192159876, "grad_norm": 0.7015743851661682, "learning_rate": 9.411821079004258e-05, "loss": 3.1921, "step": 2418 }, { "epoch": 0.19841147834998718, "grad_norm": 0.5208891034126282, "learning_rate": 9.410544578065358e-05, "loss": 3.1988, "step": 2420 }, { "epoch": 0.1985754547783756, "grad_norm": 0.6202064752578735, "learning_rate": 9.409266780216191e-05, "loss": 3.1672, "step": 2422 }, { "epoch": 0.19873943120676402, "grad_norm": 0.670427143573761, "learning_rate": 9.407987685832493e-05, "loss": 3.2011, "step": 2424 }, { "epoch": 0.19890340763515243, "grad_norm": 0.6327905058860779, "learning_rate": 9.406707295290377e-05, "loss": 3.216, "step": 2426 }, { "epoch": 0.19906738406354085, "grad_norm": 0.6474474668502808, "learning_rate": 9.405425608966338e-05, "loss": 3.2229, "step": 2428 }, { "epoch": 0.19923136049192927, "grad_norm": 0.7056578993797302, "learning_rate": 9.404142627237255e-05, "loss": 3.2508, "step": 2430 }, { "epoch": 0.19939533692031772, "grad_norm": 0.5632738471031189, "learning_rate": 9.402858350480383e-05, "loss": 3.1891, "step": 2432 }, { "epoch": 0.19955931334870614, "grad_norm": 0.5663987994194031, "learning_rate": 9.401572779073363e-05, "loss": 3.1317, "step": 2434 }, { "epoch": 0.19972328977709455, "grad_norm": 0.6044129729270935, "learning_rate": 9.400285913394213e-05, "loss": 3.1676, "step": 2436 }, { "epoch": 0.19988726620548297, "grad_norm": 0.5997135639190674, "learning_rate": 9.398997753821334e-05, "loss": 3.2122, "step": 2438 }, { "epoch": 0.2000512426338714, "grad_norm": 0.5731354355812073, "learning_rate": 9.397708300733503e-05, "loss": 3.146, "step": 2440 }, { "epoch": 0.2002152190622598, "grad_norm": 0.5749174356460571, "learning_rate": 9.396417554509885e-05, "loss": 3.2533, "step": 2442 }, { "epoch": 0.20037919549064823, "grad_norm": 0.6123077869415283, "learning_rate": 9.395125515530019e-05, "loss": 3.2046, "step": 2444 }, { "epoch": 0.20054317191903664, "grad_norm": 0.5596455335617065, "learning_rate": 9.393832184173826e-05, "loss": 3.2109, "step": 2446 }, { "epoch": 0.20070714834742506, "grad_norm": 0.6029247641563416, "learning_rate": 9.392537560821606e-05, "loss": 3.2487, "step": 2448 }, { "epoch": 0.20087112477581348, "grad_norm": 0.5740619897842407, "learning_rate": 9.391241645854041e-05, "loss": 3.1954, "step": 2450 }, { "epoch": 0.2010351012042019, "grad_norm": 0.4807377755641937, "learning_rate": 9.389944439652194e-05, "loss": 3.232, "step": 2452 }, { "epoch": 0.20119907763259032, "grad_norm": 0.4954237937927246, "learning_rate": 9.388645942597501e-05, "loss": 3.1604, "step": 2454 }, { "epoch": 0.20136305406097874, "grad_norm": 0.5956186652183533, "learning_rate": 9.387346155071785e-05, "loss": 3.2222, "step": 2456 }, { "epoch": 0.20152703048936715, "grad_norm": 0.5427073240280151, "learning_rate": 9.386045077457244e-05, "loss": 3.199, "step": 2458 }, { "epoch": 0.20169100691775557, "grad_norm": 0.6691755056381226, "learning_rate": 9.384742710136458e-05, "loss": 3.1441, "step": 2460 }, { "epoch": 0.201854983346144, "grad_norm": 0.5961546301841736, "learning_rate": 9.383439053492384e-05, "loss": 3.2128, "step": 2462 }, { "epoch": 0.2020189597745324, "grad_norm": 0.627190887928009, "learning_rate": 9.38213410790836e-05, "loss": 3.2204, "step": 2464 }, { "epoch": 0.20218293620292083, "grad_norm": 0.6342671513557434, "learning_rate": 9.380827873768101e-05, "loss": 3.1279, "step": 2466 }, { "epoch": 0.20234691263130924, "grad_norm": 0.5942181944847107, "learning_rate": 9.379520351455705e-05, "loss": 3.204, "step": 2468 }, { "epoch": 0.20251088905969766, "grad_norm": 0.5538034439086914, "learning_rate": 9.378211541355643e-05, "loss": 3.2224, "step": 2470 }, { "epoch": 0.20267486548808608, "grad_norm": 0.5361983776092529, "learning_rate": 9.376901443852767e-05, "loss": 3.1103, "step": 2472 }, { "epoch": 0.2028388419164745, "grad_norm": 0.6236636638641357, "learning_rate": 9.375590059332311e-05, "loss": 3.2047, "step": 2474 }, { "epoch": 0.20300281834486292, "grad_norm": 0.5351163148880005, "learning_rate": 9.374277388179882e-05, "loss": 3.2153, "step": 2476 }, { "epoch": 0.20316679477325134, "grad_norm": 0.5665524005889893, "learning_rate": 9.37296343078147e-05, "loss": 3.1779, "step": 2478 }, { "epoch": 0.20333077120163975, "grad_norm": 0.5840953588485718, "learning_rate": 9.371648187523439e-05, "loss": 3.1732, "step": 2480 }, { "epoch": 0.20349474763002817, "grad_norm": 0.5002064108848572, "learning_rate": 9.370331658792534e-05, "loss": 3.1473, "step": 2482 }, { "epoch": 0.2036587240584166, "grad_norm": 0.5152116417884827, "learning_rate": 9.369013844975878e-05, "loss": 3.1973, "step": 2484 }, { "epoch": 0.203822700486805, "grad_norm": 0.5382422208786011, "learning_rate": 9.367694746460969e-05, "loss": 3.1354, "step": 2486 }, { "epoch": 0.20398667691519345, "grad_norm": 0.5114589929580688, "learning_rate": 9.366374363635688e-05, "loss": 3.1416, "step": 2488 }, { "epoch": 0.20415065334358187, "grad_norm": 0.5633603930473328, "learning_rate": 9.365052696888288e-05, "loss": 3.1328, "step": 2490 }, { "epoch": 0.2043146297719703, "grad_norm": 0.6122515201568604, "learning_rate": 9.363729746607401e-05, "loss": 3.2152, "step": 2492 }, { "epoch": 0.2044786062003587, "grad_norm": 0.49608293175697327, "learning_rate": 9.36240551318204e-05, "loss": 3.1354, "step": 2494 }, { "epoch": 0.20464258262874713, "grad_norm": 0.6546223759651184, "learning_rate": 9.361079997001592e-05, "loss": 3.2126, "step": 2496 }, { "epoch": 0.20480655905713555, "grad_norm": 0.6353023648262024, "learning_rate": 9.359753198455823e-05, "loss": 3.1782, "step": 2498 }, { "epoch": 0.20497053548552396, "grad_norm": 0.5790070295333862, "learning_rate": 9.358425117934873e-05, "loss": 3.1635, "step": 2500 }, { "epoch": 0.20513451191391238, "grad_norm": 0.5316998958587646, "learning_rate": 9.357095755829259e-05, "loss": 3.1684, "step": 2502 }, { "epoch": 0.2052984883423008, "grad_norm": 0.5418469905853271, "learning_rate": 9.355765112529882e-05, "loss": 3.2322, "step": 2504 }, { "epoch": 0.20546246477068922, "grad_norm": 0.5909755229949951, "learning_rate": 9.35443318842801e-05, "loss": 3.1637, "step": 2506 }, { "epoch": 0.20562644119907764, "grad_norm": 0.5913002490997314, "learning_rate": 9.353099983915298e-05, "loss": 3.1568, "step": 2508 }, { "epoch": 0.20579041762746605, "grad_norm": 0.5578615665435791, "learning_rate": 9.351765499383764e-05, "loss": 3.1382, "step": 2510 }, { "epoch": 0.20595439405585447, "grad_norm": 0.5887861251831055, "learning_rate": 9.350429735225816e-05, "loss": 3.1946, "step": 2512 }, { "epoch": 0.2061183704842429, "grad_norm": 0.5453567504882812, "learning_rate": 9.34909269183423e-05, "loss": 3.1474, "step": 2514 }, { "epoch": 0.2062823469126313, "grad_norm": 0.5504783987998962, "learning_rate": 9.34775436960216e-05, "loss": 3.1371, "step": 2516 }, { "epoch": 0.20644632334101973, "grad_norm": 0.5522333979606628, "learning_rate": 9.346414768923138e-05, "loss": 3.1736, "step": 2518 }, { "epoch": 0.20661029976940815, "grad_norm": 0.5545846223831177, "learning_rate": 9.345073890191067e-05, "loss": 3.155, "step": 2520 }, { "epoch": 0.20677427619779656, "grad_norm": 0.5755533576011658, "learning_rate": 9.343731733800235e-05, "loss": 3.1211, "step": 2522 }, { "epoch": 0.20693825262618498, "grad_norm": 0.6164469122886658, "learning_rate": 9.342388300145294e-05, "loss": 3.1781, "step": 2524 }, { "epoch": 0.2071022290545734, "grad_norm": 0.5937029123306274, "learning_rate": 9.341043589621282e-05, "loss": 3.2039, "step": 2526 }, { "epoch": 0.20726620548296182, "grad_norm": 0.5489475727081299, "learning_rate": 9.339697602623605e-05, "loss": 3.1502, "step": 2528 }, { "epoch": 0.20743018191135024, "grad_norm": 0.6091250777244568, "learning_rate": 9.338350339548048e-05, "loss": 3.1774, "step": 2530 }, { "epoch": 0.20759415833973865, "grad_norm": 0.5674654841423035, "learning_rate": 9.337001800790773e-05, "loss": 3.1535, "step": 2532 }, { "epoch": 0.20775813476812707, "grad_norm": 0.644279420375824, "learning_rate": 9.33565198674831e-05, "loss": 3.1406, "step": 2534 }, { "epoch": 0.2079221111965155, "grad_norm": 0.6195595264434814, "learning_rate": 9.334300897817574e-05, "loss": 3.1527, "step": 2536 }, { "epoch": 0.2080860876249039, "grad_norm": 0.5304683446884155, "learning_rate": 9.332948534395846e-05, "loss": 3.1957, "step": 2538 }, { "epoch": 0.20825006405329233, "grad_norm": 0.6691213250160217, "learning_rate": 9.331594896880787e-05, "loss": 3.2468, "step": 2540 }, { "epoch": 0.20841404048168075, "grad_norm": 0.5579569339752197, "learning_rate": 9.330239985670427e-05, "loss": 3.1475, "step": 2542 }, { "epoch": 0.20857801691006916, "grad_norm": 0.6016284227371216, "learning_rate": 9.328883801163181e-05, "loss": 3.1298, "step": 2544 }, { "epoch": 0.2087419933384576, "grad_norm": 0.5903862714767456, "learning_rate": 9.327526343757826e-05, "loss": 3.1804, "step": 2546 }, { "epoch": 0.20890596976684603, "grad_norm": 0.5137822031974792, "learning_rate": 9.326167613853523e-05, "loss": 3.1662, "step": 2548 }, { "epoch": 0.20906994619523445, "grad_norm": 0.5315471887588501, "learning_rate": 9.324807611849802e-05, "loss": 3.2222, "step": 2550 }, { "epoch": 0.20923392262362286, "grad_norm": 0.5678295493125916, "learning_rate": 9.323446338146568e-05, "loss": 3.147, "step": 2552 }, { "epoch": 0.20939789905201128, "grad_norm": 0.5671442151069641, "learning_rate": 9.322083793144101e-05, "loss": 3.1514, "step": 2554 }, { "epoch": 0.2095618754803997, "grad_norm": 0.5480635166168213, "learning_rate": 9.320719977243052e-05, "loss": 3.1943, "step": 2556 }, { "epoch": 0.20972585190878812, "grad_norm": 0.573996901512146, "learning_rate": 9.319354890844451e-05, "loss": 3.1084, "step": 2558 }, { "epoch": 0.20988982833717654, "grad_norm": 0.5476592183113098, "learning_rate": 9.317988534349697e-05, "loss": 3.1328, "step": 2560 }, { "epoch": 0.21005380476556496, "grad_norm": 0.5603650808334351, "learning_rate": 9.316620908160562e-05, "loss": 3.256, "step": 2562 }, { "epoch": 0.21021778119395337, "grad_norm": 0.5470094680786133, "learning_rate": 9.315252012679198e-05, "loss": 3.2453, "step": 2564 }, { "epoch": 0.2103817576223418, "grad_norm": 0.5147728323936462, "learning_rate": 9.313881848308123e-05, "loss": 3.2067, "step": 2566 }, { "epoch": 0.2105457340507302, "grad_norm": 0.5771604776382446, "learning_rate": 9.312510415450228e-05, "loss": 3.1415, "step": 2568 }, { "epoch": 0.21070971047911863, "grad_norm": 0.5814144015312195, "learning_rate": 9.311137714508785e-05, "loss": 3.1481, "step": 2570 }, { "epoch": 0.21087368690750705, "grad_norm": 0.589153528213501, "learning_rate": 9.309763745887428e-05, "loss": 3.1297, "step": 2572 }, { "epoch": 0.21103766333589546, "grad_norm": 0.5519060492515564, "learning_rate": 9.308388509990171e-05, "loss": 3.1409, "step": 2574 }, { "epoch": 0.21120163976428388, "grad_norm": 0.5374418497085571, "learning_rate": 9.307012007221401e-05, "loss": 3.2072, "step": 2576 }, { "epoch": 0.2113656161926723, "grad_norm": 0.5539153218269348, "learning_rate": 9.305634237985874e-05, "loss": 3.1928, "step": 2578 }, { "epoch": 0.21152959262106072, "grad_norm": 0.47456660866737366, "learning_rate": 9.304255202688721e-05, "loss": 3.1669, "step": 2580 }, { "epoch": 0.21169356904944914, "grad_norm": 0.5018851161003113, "learning_rate": 9.302874901735441e-05, "loss": 3.1694, "step": 2582 }, { "epoch": 0.21185754547783756, "grad_norm": 0.5462913513183594, "learning_rate": 9.301493335531911e-05, "loss": 3.1753, "step": 2584 }, { "epoch": 0.21202152190622597, "grad_norm": 0.5394952297210693, "learning_rate": 9.300110504484377e-05, "loss": 3.1824, "step": 2586 }, { "epoch": 0.2121854983346144, "grad_norm": 0.6784063577651978, "learning_rate": 9.298726408999455e-05, "loss": 3.1717, "step": 2588 }, { "epoch": 0.2123494747630028, "grad_norm": 0.5093061923980713, "learning_rate": 9.297341049484139e-05, "loss": 3.1197, "step": 2590 }, { "epoch": 0.21251345119139123, "grad_norm": 0.5276237726211548, "learning_rate": 9.295954426345786e-05, "loss": 3.1307, "step": 2592 }, { "epoch": 0.21267742761977965, "grad_norm": 0.6057010293006897, "learning_rate": 9.294566539992132e-05, "loss": 3.1619, "step": 2594 }, { "epoch": 0.21284140404816806, "grad_norm": 0.6017722487449646, "learning_rate": 9.293177390831282e-05, "loss": 3.1196, "step": 2596 }, { "epoch": 0.21300538047655648, "grad_norm": 0.5458320379257202, "learning_rate": 9.291786979271712e-05, "loss": 3.1665, "step": 2598 }, { "epoch": 0.2131693569049449, "grad_norm": 0.6224083304405212, "learning_rate": 9.290395305722269e-05, "loss": 3.1768, "step": 2600 }, { "epoch": 0.21333333333333335, "grad_norm": 0.5466166138648987, "learning_rate": 9.289002370592168e-05, "loss": 3.135, "step": 2602 }, { "epoch": 0.21349730976172177, "grad_norm": 0.6690223217010498, "learning_rate": 9.287608174291004e-05, "loss": 3.2039, "step": 2604 }, { "epoch": 0.21366128619011018, "grad_norm": 0.6225460767745972, "learning_rate": 9.286212717228734e-05, "loss": 3.1377, "step": 2606 }, { "epoch": 0.2138252626184986, "grad_norm": 0.6038724184036255, "learning_rate": 9.28481599981569e-05, "loss": 3.1148, "step": 2608 }, { "epoch": 0.21398923904688702, "grad_norm": 0.5335320830345154, "learning_rate": 9.283418022462571e-05, "loss": 3.1726, "step": 2610 }, { "epoch": 0.21415321547527544, "grad_norm": 0.5151216387748718, "learning_rate": 9.282018785580452e-05, "loss": 3.1839, "step": 2612 }, { "epoch": 0.21431719190366386, "grad_norm": 0.5432109236717224, "learning_rate": 9.280618289580773e-05, "loss": 3.1388, "step": 2614 }, { "epoch": 0.21448116833205227, "grad_norm": 0.4836788475513458, "learning_rate": 9.27921653487535e-05, "loss": 3.133, "step": 2616 }, { "epoch": 0.2146451447604407, "grad_norm": 0.5969836115837097, "learning_rate": 9.277813521876361e-05, "loss": 3.1967, "step": 2618 }, { "epoch": 0.2148091211888291, "grad_norm": 0.5812448859214783, "learning_rate": 9.276409250996362e-05, "loss": 3.0804, "step": 2620 }, { "epoch": 0.21497309761721753, "grad_norm": 0.6391003131866455, "learning_rate": 9.275003722648274e-05, "loss": 3.1616, "step": 2622 }, { "epoch": 0.21513707404560595, "grad_norm": 0.6133304834365845, "learning_rate": 9.27359693724539e-05, "loss": 3.1099, "step": 2624 }, { "epoch": 0.21530105047399437, "grad_norm": 0.7073734402656555, "learning_rate": 9.272188895201372e-05, "loss": 3.2123, "step": 2626 }, { "epoch": 0.21546502690238278, "grad_norm": 0.6187078952789307, "learning_rate": 9.270779596930252e-05, "loss": 3.1732, "step": 2628 }, { "epoch": 0.2156290033307712, "grad_norm": 0.5692609548568726, "learning_rate": 9.269369042846428e-05, "loss": 3.1112, "step": 2630 }, { "epoch": 0.21579297975915962, "grad_norm": 0.6214010715484619, "learning_rate": 9.267957233364674e-05, "loss": 3.1889, "step": 2632 }, { "epoch": 0.21595695618754804, "grad_norm": 0.575520932674408, "learning_rate": 9.266544168900126e-05, "loss": 3.1127, "step": 2634 }, { "epoch": 0.21612093261593646, "grad_norm": 0.5261242985725403, "learning_rate": 9.265129849868294e-05, "loss": 3.1123, "step": 2636 }, { "epoch": 0.21628490904432487, "grad_norm": 0.5849714279174805, "learning_rate": 9.263714276685056e-05, "loss": 3.1153, "step": 2638 }, { "epoch": 0.2164488854727133, "grad_norm": 0.563801109790802, "learning_rate": 9.262297449766657e-05, "loss": 3.1947, "step": 2640 }, { "epoch": 0.2166128619011017, "grad_norm": 0.5794183611869812, "learning_rate": 9.260879369529711e-05, "loss": 3.1205, "step": 2642 }, { "epoch": 0.21677683832949013, "grad_norm": 0.6427027583122253, "learning_rate": 9.259460036391201e-05, "loss": 3.1642, "step": 2644 }, { "epoch": 0.21694081475787855, "grad_norm": 0.5406637191772461, "learning_rate": 9.25803945076848e-05, "loss": 3.1695, "step": 2646 }, { "epoch": 0.21710479118626697, "grad_norm": 0.5383151769638062, "learning_rate": 9.256617613079267e-05, "loss": 3.0954, "step": 2648 }, { "epoch": 0.21726876761465538, "grad_norm": 0.6771288514137268, "learning_rate": 9.25519452374165e-05, "loss": 3.1934, "step": 2650 }, { "epoch": 0.2174327440430438, "grad_norm": 0.639716386795044, "learning_rate": 9.253770183174085e-05, "loss": 3.1192, "step": 2652 }, { "epoch": 0.21759672047143222, "grad_norm": 0.5584697127342224, "learning_rate": 9.252344591795396e-05, "loss": 3.2127, "step": 2654 }, { "epoch": 0.21776069689982064, "grad_norm": 0.5381549000740051, "learning_rate": 9.250917750024777e-05, "loss": 3.1321, "step": 2656 }, { "epoch": 0.21792467332820906, "grad_norm": 0.5931708216667175, "learning_rate": 9.249489658281783e-05, "loss": 3.1762, "step": 2658 }, { "epoch": 0.2180886497565975, "grad_norm": 0.5969710946083069, "learning_rate": 9.248060316986344e-05, "loss": 3.1158, "step": 2660 }, { "epoch": 0.21825262618498592, "grad_norm": 0.6363986134529114, "learning_rate": 9.246629726558756e-05, "loss": 3.242, "step": 2662 }, { "epoch": 0.21841660261337434, "grad_norm": 0.540717363357544, "learning_rate": 9.245197887419676e-05, "loss": 3.1661, "step": 2664 }, { "epoch": 0.21858057904176276, "grad_norm": 0.589412271976471, "learning_rate": 9.243764799990136e-05, "loss": 3.1018, "step": 2666 }, { "epoch": 0.21874455547015118, "grad_norm": 0.5175191164016724, "learning_rate": 9.242330464691533e-05, "loss": 3.1773, "step": 2668 }, { "epoch": 0.2189085318985396, "grad_norm": 0.5175068974494934, "learning_rate": 9.240894881945627e-05, "loss": 3.1566, "step": 2670 }, { "epoch": 0.219072508326928, "grad_norm": 0.48125573992729187, "learning_rate": 9.239458052174551e-05, "loss": 3.106, "step": 2672 }, { "epoch": 0.21923648475531643, "grad_norm": 0.6103034019470215, "learning_rate": 9.238019975800799e-05, "loss": 3.1316, "step": 2674 }, { "epoch": 0.21940046118370485, "grad_norm": 0.5315214991569519, "learning_rate": 9.236580653247235e-05, "loss": 3.0961, "step": 2676 }, { "epoch": 0.21956443761209327, "grad_norm": 0.5600281357765198, "learning_rate": 9.235140084937086e-05, "loss": 3.0966, "step": 2678 }, { "epoch": 0.21972841404048168, "grad_norm": 0.5466108322143555, "learning_rate": 9.233698271293953e-05, "loss": 3.1325, "step": 2680 }, { "epoch": 0.2198923904688701, "grad_norm": 0.5320989489555359, "learning_rate": 9.232255212741792e-05, "loss": 3.1617, "step": 2682 }, { "epoch": 0.22005636689725852, "grad_norm": 0.4955200254917145, "learning_rate": 9.230810909704934e-05, "loss": 3.1351, "step": 2684 }, { "epoch": 0.22022034332564694, "grad_norm": 0.5174024105072021, "learning_rate": 9.229365362608074e-05, "loss": 3.1993, "step": 2686 }, { "epoch": 0.22038431975403536, "grad_norm": 0.576806366443634, "learning_rate": 9.22791857187627e-05, "loss": 3.1945, "step": 2688 }, { "epoch": 0.22054829618242378, "grad_norm": 0.6114248633384705, "learning_rate": 9.226470537934948e-05, "loss": 3.1272, "step": 2690 }, { "epoch": 0.2207122726108122, "grad_norm": 0.6304234266281128, "learning_rate": 9.225021261209898e-05, "loss": 3.1276, "step": 2692 }, { "epoch": 0.2208762490392006, "grad_norm": 0.5603763461112976, "learning_rate": 9.223570742127278e-05, "loss": 3.068, "step": 2694 }, { "epoch": 0.22104022546758903, "grad_norm": 0.5506424307823181, "learning_rate": 9.222118981113607e-05, "loss": 3.1429, "step": 2696 }, { "epoch": 0.22120420189597745, "grad_norm": 0.5758050084114075, "learning_rate": 9.220665978595775e-05, "loss": 3.137, "step": 2698 }, { "epoch": 0.22136817832436587, "grad_norm": 0.5011979341506958, "learning_rate": 9.219211735001034e-05, "loss": 3.1048, "step": 2700 }, { "epoch": 0.22153215475275428, "grad_norm": 0.65125972032547, "learning_rate": 9.217756250756996e-05, "loss": 3.1607, "step": 2702 }, { "epoch": 0.2216961311811427, "grad_norm": 0.5995632410049438, "learning_rate": 9.21629952629165e-05, "loss": 3.055, "step": 2704 }, { "epoch": 0.22186010760953112, "grad_norm": 0.7397205233573914, "learning_rate": 9.214841562033338e-05, "loss": 3.1355, "step": 2706 }, { "epoch": 0.22202408403791954, "grad_norm": 0.636867880821228, "learning_rate": 9.213382358410771e-05, "loss": 3.158, "step": 2708 }, { "epoch": 0.22218806046630796, "grad_norm": 0.6258612275123596, "learning_rate": 9.211921915853026e-05, "loss": 3.1435, "step": 2710 }, { "epoch": 0.22235203689469638, "grad_norm": 0.6227878332138062, "learning_rate": 9.210460234789542e-05, "loss": 3.115, "step": 2712 }, { "epoch": 0.2225160133230848, "grad_norm": 0.552367091178894, "learning_rate": 9.20899731565012e-05, "loss": 3.1096, "step": 2714 }, { "epoch": 0.22267998975147324, "grad_norm": 0.5357968211174011, "learning_rate": 9.207533158864934e-05, "loss": 3.1431, "step": 2716 }, { "epoch": 0.22284396617986166, "grad_norm": 0.5799859166145325, "learning_rate": 9.206067764864512e-05, "loss": 3.1239, "step": 2718 }, { "epoch": 0.22300794260825008, "grad_norm": 0.541117787361145, "learning_rate": 9.204601134079749e-05, "loss": 3.1551, "step": 2720 }, { "epoch": 0.2231719190366385, "grad_norm": 0.5454208254814148, "learning_rate": 9.203133266941906e-05, "loss": 3.1585, "step": 2722 }, { "epoch": 0.2233358954650269, "grad_norm": 0.6066946983337402, "learning_rate": 9.201664163882605e-05, "loss": 3.1415, "step": 2724 }, { "epoch": 0.22349987189341533, "grad_norm": 0.5827730298042297, "learning_rate": 9.200193825333833e-05, "loss": 3.0677, "step": 2726 }, { "epoch": 0.22366384832180375, "grad_norm": 0.6678103804588318, "learning_rate": 9.198722251727941e-05, "loss": 3.1344, "step": 2728 }, { "epoch": 0.22382782475019217, "grad_norm": 0.553477942943573, "learning_rate": 9.197249443497638e-05, "loss": 3.1403, "step": 2730 }, { "epoch": 0.22399180117858059, "grad_norm": 0.6305515766143799, "learning_rate": 9.195775401076001e-05, "loss": 3.1172, "step": 2732 }, { "epoch": 0.224155777606969, "grad_norm": 0.6065593957901001, "learning_rate": 9.194300124896471e-05, "loss": 3.1931, "step": 2734 }, { "epoch": 0.22431975403535742, "grad_norm": 0.5823774933815002, "learning_rate": 9.192823615392848e-05, "loss": 3.1133, "step": 2736 }, { "epoch": 0.22448373046374584, "grad_norm": 0.5522893667221069, "learning_rate": 9.191345872999297e-05, "loss": 3.111, "step": 2738 }, { "epoch": 0.22464770689213426, "grad_norm": 0.5573318600654602, "learning_rate": 9.189866898150343e-05, "loss": 3.1446, "step": 2740 }, { "epoch": 0.22481168332052268, "grad_norm": 0.5337832570075989, "learning_rate": 9.188386691280875e-05, "loss": 3.1285, "step": 2742 }, { "epoch": 0.2249756597489111, "grad_norm": 0.4911380112171173, "learning_rate": 9.186905252826147e-05, "loss": 3.0945, "step": 2744 }, { "epoch": 0.2251396361772995, "grad_norm": 0.546593189239502, "learning_rate": 9.18542258322177e-05, "loss": 3.1581, "step": 2746 }, { "epoch": 0.22530361260568793, "grad_norm": 0.5859017372131348, "learning_rate": 9.183938682903721e-05, "loss": 3.1265, "step": 2748 }, { "epoch": 0.22546758903407635, "grad_norm": 0.610236406326294, "learning_rate": 9.182453552308335e-05, "loss": 3.1202, "step": 2750 }, { "epoch": 0.22563156546246477, "grad_norm": 0.521246075630188, "learning_rate": 9.180967191872315e-05, "loss": 3.1225, "step": 2752 }, { "epoch": 0.22579554189085319, "grad_norm": 0.5413455367088318, "learning_rate": 9.179479602032719e-05, "loss": 3.1589, "step": 2754 }, { "epoch": 0.2259595183192416, "grad_norm": 0.5279058814048767, "learning_rate": 9.177990783226969e-05, "loss": 3.1119, "step": 2756 }, { "epoch": 0.22612349474763002, "grad_norm": 0.5241829752922058, "learning_rate": 9.17650073589285e-05, "loss": 3.0924, "step": 2758 }, { "epoch": 0.22628747117601844, "grad_norm": 0.5258579850196838, "learning_rate": 9.175009460468507e-05, "loss": 3.1179, "step": 2760 }, { "epoch": 0.22645144760440686, "grad_norm": 0.5945084095001221, "learning_rate": 9.173516957392446e-05, "loss": 3.1142, "step": 2762 }, { "epoch": 0.22661542403279528, "grad_norm": 0.49606871604919434, "learning_rate": 9.172023227103533e-05, "loss": 3.1269, "step": 2764 }, { "epoch": 0.2267794004611837, "grad_norm": 0.5690228343009949, "learning_rate": 9.170528270040996e-05, "loss": 3.1252, "step": 2766 }, { "epoch": 0.2269433768895721, "grad_norm": 0.5027005672454834, "learning_rate": 9.169032086644425e-05, "loss": 3.1549, "step": 2768 }, { "epoch": 0.22710735331796053, "grad_norm": 0.5609970092773438, "learning_rate": 9.167534677353768e-05, "loss": 3.1085, "step": 2770 }, { "epoch": 0.22727132974634895, "grad_norm": 0.5917537212371826, "learning_rate": 9.166036042609336e-05, "loss": 3.1457, "step": 2772 }, { "epoch": 0.2274353061747374, "grad_norm": 0.4679684042930603, "learning_rate": 9.164536182851797e-05, "loss": 3.1672, "step": 2774 }, { "epoch": 0.2275992826031258, "grad_norm": 0.5247348546981812, "learning_rate": 9.163035098522182e-05, "loss": 3.0683, "step": 2776 }, { "epoch": 0.22776325903151423, "grad_norm": 0.5608956813812256, "learning_rate": 9.161532790061882e-05, "loss": 3.093, "step": 2778 }, { "epoch": 0.22792723545990265, "grad_norm": 0.6004567742347717, "learning_rate": 9.160029257912646e-05, "loss": 3.0853, "step": 2780 }, { "epoch": 0.22809121188829107, "grad_norm": 0.4759785532951355, "learning_rate": 9.158524502516586e-05, "loss": 3.1141, "step": 2782 }, { "epoch": 0.22825518831667949, "grad_norm": 0.5630537867546082, "learning_rate": 9.15701852431617e-05, "loss": 3.112, "step": 2784 }, { "epoch": 0.2284191647450679, "grad_norm": 0.5082862377166748, "learning_rate": 9.15551132375423e-05, "loss": 3.0943, "step": 2786 }, { "epoch": 0.22858314117345632, "grad_norm": 0.5193647146224976, "learning_rate": 9.15400290127395e-05, "loss": 3.1019, "step": 2788 }, { "epoch": 0.22874711760184474, "grad_norm": 0.604705274105072, "learning_rate": 9.152493257318882e-05, "loss": 3.0763, "step": 2790 }, { "epoch": 0.22891109403023316, "grad_norm": 0.6304996013641357, "learning_rate": 9.150982392332932e-05, "loss": 3.1561, "step": 2792 }, { "epoch": 0.22907507045862158, "grad_norm": 0.6620004177093506, "learning_rate": 9.149470306760368e-05, "loss": 3.1256, "step": 2794 }, { "epoch": 0.22923904688701, "grad_norm": 0.5921337604522705, "learning_rate": 9.147957001045813e-05, "loss": 3.14, "step": 2796 }, { "epoch": 0.2294030233153984, "grad_norm": 0.5974056720733643, "learning_rate": 9.146442475634252e-05, "loss": 3.1025, "step": 2798 }, { "epoch": 0.22956699974378683, "grad_norm": 0.5777150988578796, "learning_rate": 9.144926730971027e-05, "loss": 3.0568, "step": 2800 }, { "epoch": 0.22973097617217525, "grad_norm": 0.5552829504013062, "learning_rate": 9.143409767501839e-05, "loss": 3.1045, "step": 2802 }, { "epoch": 0.22989495260056367, "grad_norm": 0.5311617255210876, "learning_rate": 9.141891585672748e-05, "loss": 3.1206, "step": 2804 }, { "epoch": 0.23005892902895209, "grad_norm": 0.5451520681381226, "learning_rate": 9.140372185930172e-05, "loss": 3.1293, "step": 2806 }, { "epoch": 0.2302229054573405, "grad_norm": 0.5763382315635681, "learning_rate": 9.138851568720886e-05, "loss": 3.1129, "step": 2808 }, { "epoch": 0.23038688188572892, "grad_norm": 0.5894972681999207, "learning_rate": 9.137329734492026e-05, "loss": 3.0796, "step": 2810 }, { "epoch": 0.23055085831411734, "grad_norm": 0.538650631904602, "learning_rate": 9.135806683691082e-05, "loss": 3.09, "step": 2812 }, { "epoch": 0.23071483474250576, "grad_norm": 0.5035321116447449, "learning_rate": 9.134282416765905e-05, "loss": 3.1168, "step": 2814 }, { "epoch": 0.23087881117089418, "grad_norm": 0.5038688778877258, "learning_rate": 9.132756934164699e-05, "loss": 3.1212, "step": 2816 }, { "epoch": 0.2310427875992826, "grad_norm": 0.53536057472229, "learning_rate": 9.131230236336032e-05, "loss": 3.118, "step": 2818 }, { "epoch": 0.231206764027671, "grad_norm": 0.5447813272476196, "learning_rate": 9.129702323728824e-05, "loss": 3.0866, "step": 2820 }, { "epoch": 0.23137074045605943, "grad_norm": 0.49734705686569214, "learning_rate": 9.128173196792355e-05, "loss": 3.1489, "step": 2822 }, { "epoch": 0.23153471688444785, "grad_norm": 0.5162733793258667, "learning_rate": 9.12664285597626e-05, "loss": 3.166, "step": 2824 }, { "epoch": 0.23169869331283627, "grad_norm": 0.47742417454719543, "learning_rate": 9.125111301730534e-05, "loss": 3.0757, "step": 2826 }, { "epoch": 0.2318626697412247, "grad_norm": 0.5511021018028259, "learning_rate": 9.123578534505525e-05, "loss": 3.1382, "step": 2828 }, { "epoch": 0.2320266461696131, "grad_norm": 0.6152271628379822, "learning_rate": 9.122044554751942e-05, "loss": 3.1326, "step": 2830 }, { "epoch": 0.23219062259800155, "grad_norm": 0.576244592666626, "learning_rate": 9.120509362920846e-05, "loss": 3.1151, "step": 2832 }, { "epoch": 0.23235459902638997, "grad_norm": 0.6472841501235962, "learning_rate": 9.118972959463656e-05, "loss": 3.1018, "step": 2834 }, { "epoch": 0.2325185754547784, "grad_norm": 0.5974353551864624, "learning_rate": 9.11743534483215e-05, "loss": 3.1377, "step": 2836 }, { "epoch": 0.2326825518831668, "grad_norm": 0.5625829696655273, "learning_rate": 9.115896519478458e-05, "loss": 3.1366, "step": 2838 }, { "epoch": 0.23284652831155522, "grad_norm": 0.6244992613792419, "learning_rate": 9.11435648385507e-05, "loss": 3.1386, "step": 2840 }, { "epoch": 0.23301050473994364, "grad_norm": 0.6011447310447693, "learning_rate": 9.11281523841483e-05, "loss": 3.1042, "step": 2842 }, { "epoch": 0.23317448116833206, "grad_norm": 0.6402640342712402, "learning_rate": 9.111272783610934e-05, "loss": 3.1381, "step": 2844 }, { "epoch": 0.23333845759672048, "grad_norm": 0.689268171787262, "learning_rate": 9.109729119896941e-05, "loss": 3.152, "step": 2846 }, { "epoch": 0.2335024340251089, "grad_norm": 0.6104257702827454, "learning_rate": 9.108184247726759e-05, "loss": 3.0882, "step": 2848 }, { "epoch": 0.23366641045349731, "grad_norm": 0.490530401468277, "learning_rate": 9.106638167554657e-05, "loss": 3.1375, "step": 2850 }, { "epoch": 0.23383038688188573, "grad_norm": 0.5417265892028809, "learning_rate": 9.105090879835254e-05, "loss": 3.0875, "step": 2852 }, { "epoch": 0.23399436331027415, "grad_norm": 0.5406416654586792, "learning_rate": 9.103542385023526e-05, "loss": 3.1689, "step": 2854 }, { "epoch": 0.23415833973866257, "grad_norm": 0.5073980093002319, "learning_rate": 9.101992683574805e-05, "loss": 3.1425, "step": 2856 }, { "epoch": 0.234322316167051, "grad_norm": 0.5920371413230896, "learning_rate": 9.100441775944779e-05, "loss": 3.1296, "step": 2858 }, { "epoch": 0.2344862925954394, "grad_norm": 0.4810742735862732, "learning_rate": 9.098889662589485e-05, "loss": 3.0661, "step": 2860 }, { "epoch": 0.23465026902382782, "grad_norm": 0.5148147344589233, "learning_rate": 9.097336343965321e-05, "loss": 3.0586, "step": 2862 }, { "epoch": 0.23481424545221624, "grad_norm": 0.5372908115386963, "learning_rate": 9.095781820529036e-05, "loss": 3.0886, "step": 2864 }, { "epoch": 0.23497822188060466, "grad_norm": 0.5466518402099609, "learning_rate": 9.094226092737734e-05, "loss": 3.1166, "step": 2866 }, { "epoch": 0.23514219830899308, "grad_norm": 0.6220472455024719, "learning_rate": 9.092669161048873e-05, "loss": 3.0579, "step": 2868 }, { "epoch": 0.2353061747373815, "grad_norm": 0.5313682556152344, "learning_rate": 9.091111025920266e-05, "loss": 3.0914, "step": 2870 }, { "epoch": 0.23547015116576991, "grad_norm": 0.5322121381759644, "learning_rate": 9.089551687810076e-05, "loss": 3.1197, "step": 2872 }, { "epoch": 0.23563412759415833, "grad_norm": 0.5401471257209778, "learning_rate": 9.087991147176827e-05, "loss": 3.0361, "step": 2874 }, { "epoch": 0.23579810402254675, "grad_norm": 0.5407954454421997, "learning_rate": 9.086429404479389e-05, "loss": 3.0943, "step": 2876 }, { "epoch": 0.23596208045093517, "grad_norm": 0.6507935523986816, "learning_rate": 9.084866460176991e-05, "loss": 3.1444, "step": 2878 }, { "epoch": 0.2361260568793236, "grad_norm": 0.621780276298523, "learning_rate": 9.08330231472921e-05, "loss": 3.1094, "step": 2880 }, { "epoch": 0.236290033307712, "grad_norm": 0.560219943523407, "learning_rate": 9.081736968595982e-05, "loss": 3.1433, "step": 2882 }, { "epoch": 0.23645400973610042, "grad_norm": 0.5493839979171753, "learning_rate": 9.080170422237593e-05, "loss": 3.0879, "step": 2884 }, { "epoch": 0.23661798616448884, "grad_norm": 0.4997730851173401, "learning_rate": 9.07860267611468e-05, "loss": 3.0705, "step": 2886 }, { "epoch": 0.2367819625928773, "grad_norm": 0.5000733137130737, "learning_rate": 9.077033730688239e-05, "loss": 3.0918, "step": 2888 }, { "epoch": 0.2369459390212657, "grad_norm": 0.595166027545929, "learning_rate": 9.075463586419613e-05, "loss": 3.1018, "step": 2890 }, { "epoch": 0.23710991544965412, "grad_norm": 0.5767890214920044, "learning_rate": 9.073892243770497e-05, "loss": 3.0718, "step": 2892 }, { "epoch": 0.23727389187804254, "grad_norm": 0.544654369354248, "learning_rate": 9.072319703202942e-05, "loss": 3.0892, "step": 2894 }, { "epoch": 0.23743786830643096, "grad_norm": 0.6508696675300598, "learning_rate": 9.070745965179353e-05, "loss": 3.1152, "step": 2896 }, { "epoch": 0.23760184473481938, "grad_norm": 0.6514599323272705, "learning_rate": 9.06917103016248e-05, "loss": 3.1539, "step": 2898 }, { "epoch": 0.2377658211632078, "grad_norm": 0.5342041254043579, "learning_rate": 9.06759489861543e-05, "loss": 3.0977, "step": 2900 }, { "epoch": 0.23792979759159621, "grad_norm": 0.5804237127304077, "learning_rate": 9.066017571001662e-05, "loss": 3.0767, "step": 2902 }, { "epoch": 0.23809377401998463, "grad_norm": 0.5167868137359619, "learning_rate": 9.064439047784982e-05, "loss": 3.0855, "step": 2904 }, { "epoch": 0.23825775044837305, "grad_norm": 0.5301772356033325, "learning_rate": 9.062859329429556e-05, "loss": 3.1147, "step": 2906 }, { "epoch": 0.23842172687676147, "grad_norm": 0.5427364110946655, "learning_rate": 9.061278416399895e-05, "loss": 3.1045, "step": 2908 }, { "epoch": 0.2385857033051499, "grad_norm": 0.5556970834732056, "learning_rate": 9.059696309160859e-05, "loss": 3.1235, "step": 2910 }, { "epoch": 0.2387496797335383, "grad_norm": 0.5361714363098145, "learning_rate": 9.058113008177667e-05, "loss": 3.1411, "step": 2912 }, { "epoch": 0.23891365616192672, "grad_norm": 0.7139540910720825, "learning_rate": 9.056528513915882e-05, "loss": 3.1739, "step": 2914 }, { "epoch": 0.23907763259031514, "grad_norm": 0.6499415040016174, "learning_rate": 9.054942826841427e-05, "loss": 3.0815, "step": 2916 }, { "epoch": 0.23924160901870356, "grad_norm": 0.6187708973884583, "learning_rate": 9.05335594742056e-05, "loss": 3.1197, "step": 2918 }, { "epoch": 0.23940558544709198, "grad_norm": 0.5267696976661682, "learning_rate": 9.051767876119906e-05, "loss": 3.1279, "step": 2920 }, { "epoch": 0.2395695618754804, "grad_norm": 0.5782443284988403, "learning_rate": 9.050178613406432e-05, "loss": 3.1206, "step": 2922 }, { "epoch": 0.23973353830386882, "grad_norm": 0.5910431742668152, "learning_rate": 9.048588159747457e-05, "loss": 3.11, "step": 2924 }, { "epoch": 0.23989751473225723, "grad_norm": 0.5470311641693115, "learning_rate": 9.046996515610649e-05, "loss": 3.0588, "step": 2926 }, { "epoch": 0.24006149116064565, "grad_norm": 0.6402561068534851, "learning_rate": 9.045403681464028e-05, "loss": 3.131, "step": 2928 }, { "epoch": 0.24022546758903407, "grad_norm": 0.5332674980163574, "learning_rate": 9.043809657775964e-05, "loss": 3.1398, "step": 2930 }, { "epoch": 0.2403894440174225, "grad_norm": 0.5881835222244263, "learning_rate": 9.042214445015176e-05, "loss": 3.1354, "step": 2932 }, { "epoch": 0.2405534204458109, "grad_norm": 0.6585120558738708, "learning_rate": 9.04061804365073e-05, "loss": 3.1273, "step": 2934 }, { "epoch": 0.24071739687419932, "grad_norm": 0.6403549313545227, "learning_rate": 9.039020454152047e-05, "loss": 3.1051, "step": 2936 }, { "epoch": 0.24088137330258774, "grad_norm": 0.60472571849823, "learning_rate": 9.037421676988893e-05, "loss": 3.1076, "step": 2938 }, { "epoch": 0.24104534973097616, "grad_norm": 0.5805239081382751, "learning_rate": 9.035821712631385e-05, "loss": 3.1201, "step": 2940 }, { "epoch": 0.24120932615936458, "grad_norm": 0.4733094274997711, "learning_rate": 9.034220561549988e-05, "loss": 3.127, "step": 2942 }, { "epoch": 0.241373302587753, "grad_norm": 0.6691949963569641, "learning_rate": 9.03261822421552e-05, "loss": 3.1115, "step": 2944 }, { "epoch": 0.24153727901614144, "grad_norm": 0.6166451573371887, "learning_rate": 9.031014701099139e-05, "loss": 3.1177, "step": 2946 }, { "epoch": 0.24170125544452986, "grad_norm": 0.7083244323730469, "learning_rate": 9.029409992672359e-05, "loss": 3.095, "step": 2948 }, { "epoch": 0.24186523187291828, "grad_norm": 0.6332703232765198, "learning_rate": 9.027804099407045e-05, "loss": 3.1122, "step": 2950 }, { "epoch": 0.2420292083013067, "grad_norm": 0.6068108081817627, "learning_rate": 9.026197021775402e-05, "loss": 3.0873, "step": 2952 }, { "epoch": 0.24219318472969512, "grad_norm": 0.5226258039474487, "learning_rate": 9.024588760249988e-05, "loss": 3.0131, "step": 2954 }, { "epoch": 0.24235716115808353, "grad_norm": 0.5775647163391113, "learning_rate": 9.02297931530371e-05, "loss": 3.115, "step": 2956 }, { "epoch": 0.24252113758647195, "grad_norm": 0.5021248459815979, "learning_rate": 9.021368687409819e-05, "loss": 3.0398, "step": 2958 }, { "epoch": 0.24268511401486037, "grad_norm": 0.5804054737091064, "learning_rate": 9.019756877041918e-05, "loss": 3.1158, "step": 2960 }, { "epoch": 0.2428490904432488, "grad_norm": 0.5360262989997864, "learning_rate": 9.018143884673957e-05, "loss": 3.1476, "step": 2962 }, { "epoch": 0.2430130668716372, "grad_norm": 0.5107494592666626, "learning_rate": 9.016529710780231e-05, "loss": 3.0919, "step": 2964 }, { "epoch": 0.24317704330002562, "grad_norm": 0.5997065305709839, "learning_rate": 9.014914355835384e-05, "loss": 3.08, "step": 2966 }, { "epoch": 0.24334101972841404, "grad_norm": 0.5437501668930054, "learning_rate": 9.013297820314408e-05, "loss": 3.1194, "step": 2968 }, { "epoch": 0.24350499615680246, "grad_norm": 0.5322654843330383, "learning_rate": 9.01168010469264e-05, "loss": 3.0669, "step": 2970 }, { "epoch": 0.24366897258519088, "grad_norm": 0.48639851808547974, "learning_rate": 9.010061209445769e-05, "loss": 3.1127, "step": 2972 }, { "epoch": 0.2438329490135793, "grad_norm": 0.4471394717693329, "learning_rate": 9.008441135049823e-05, "loss": 3.1262, "step": 2974 }, { "epoch": 0.24399692544196772, "grad_norm": 0.4837088882923126, "learning_rate": 9.006819881981184e-05, "loss": 3.0944, "step": 2976 }, { "epoch": 0.24416090187035613, "grad_norm": 0.4812915027141571, "learning_rate": 9.005197450716577e-05, "loss": 3.0651, "step": 2978 }, { "epoch": 0.24432487829874455, "grad_norm": 0.49380356073379517, "learning_rate": 9.003573841733075e-05, "loss": 3.1066, "step": 2980 }, { "epoch": 0.24448885472713297, "grad_norm": 0.49508631229400635, "learning_rate": 9.001949055508094e-05, "loss": 3.0422, "step": 2982 }, { "epoch": 0.2446528311555214, "grad_norm": 0.5182914137840271, "learning_rate": 9.0003230925194e-05, "loss": 3.1087, "step": 2984 }, { "epoch": 0.2448168075839098, "grad_norm": 0.5734208226203918, "learning_rate": 8.998695953245103e-05, "loss": 3.0798, "step": 2986 }, { "epoch": 0.24498078401229822, "grad_norm": 0.5737510919570923, "learning_rate": 8.99706763816366e-05, "loss": 3.1035, "step": 2988 }, { "epoch": 0.24514476044068664, "grad_norm": 0.5363655686378479, "learning_rate": 8.995438147753874e-05, "loss": 3.1577, "step": 2990 }, { "epoch": 0.24530873686907506, "grad_norm": 0.5472906231880188, "learning_rate": 8.993807482494892e-05, "loss": 3.1101, "step": 2992 }, { "epoch": 0.24547271329746348, "grad_norm": 0.5381254553794861, "learning_rate": 8.992175642866208e-05, "loss": 3.0842, "step": 2994 }, { "epoch": 0.2456366897258519, "grad_norm": 0.5339920520782471, "learning_rate": 8.990542629347658e-05, "loss": 3.0531, "step": 2996 }, { "epoch": 0.24580066615424032, "grad_norm": 0.5434836149215698, "learning_rate": 8.988908442419429e-05, "loss": 3.0259, "step": 2998 }, { "epoch": 0.24596464258262873, "grad_norm": 0.5441123247146606, "learning_rate": 8.987273082562048e-05, "loss": 3.0843, "step": 3000 } ], "logging_steps": 2, "max_steps": 12197, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.533162852810752e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }