{ "best_global_step": 2478, "best_metric": 0.08039100468158722, "best_model_checkpoint": "saves_multiple/lora/llama-3-8b-instruct/train_mrpc_101112_1760638020/checkpoint-2478", "epoch": 20.0, "eval_steps": 826, "global_step": 16520, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006053268765133172, "grad_norm": 1.9800204038619995, "learning_rate": 1.2106537530266344e-07, "loss": 0.1446, "num_input_tokens_seen": 2080, "step": 5 }, { "epoch": 0.012106537530266344, "grad_norm": 2.5265538692474365, "learning_rate": 2.7239709443099273e-07, "loss": 0.2783, "num_input_tokens_seen": 4064, "step": 10 }, { "epoch": 0.018159806295399514, "grad_norm": 4.288227558135986, "learning_rate": 4.2372881355932204e-07, "loss": 0.3542, "num_input_tokens_seen": 6144, "step": 15 }, { "epoch": 0.024213075060532687, "grad_norm": 5.7630462646484375, "learning_rate": 5.750605326876514e-07, "loss": 0.2933, "num_input_tokens_seen": 8128, "step": 20 }, { "epoch": 0.03026634382566586, "grad_norm": 9.716553688049316, "learning_rate": 7.263922518159807e-07, "loss": 0.369, "num_input_tokens_seen": 10240, "step": 25 }, { "epoch": 0.03631961259079903, "grad_norm": 3.2884416580200195, "learning_rate": 8.777239709443099e-07, "loss": 0.2213, "num_input_tokens_seen": 12288, "step": 30 }, { "epoch": 0.0423728813559322, "grad_norm": 1.5755492448806763, "learning_rate": 1.0290556900726392e-06, "loss": 0.2221, "num_input_tokens_seen": 14208, "step": 35 }, { "epoch": 0.048426150121065374, "grad_norm": 4.575976848602295, "learning_rate": 1.1803874092009686e-06, "loss": 0.2697, "num_input_tokens_seen": 16224, "step": 40 }, { "epoch": 0.05447941888619855, "grad_norm": 1.9681897163391113, "learning_rate": 1.3317191283292979e-06, "loss": 0.1707, "num_input_tokens_seen": 18112, "step": 45 }, { "epoch": 0.06053268765133172, "grad_norm": 4.186971664428711, "learning_rate": 1.4830508474576273e-06, "loss": 0.2202, "num_input_tokens_seen": 20192, "step": 50 }, { "epoch": 0.06658595641646489, "grad_norm": 1.2563527822494507, "learning_rate": 1.6343825665859565e-06, "loss": 0.211, "num_input_tokens_seen": 22176, "step": 55 }, { "epoch": 0.07263922518159806, "grad_norm": 3.6956756114959717, "learning_rate": 1.7857142857142857e-06, "loss": 0.2931, "num_input_tokens_seen": 24256, "step": 60 }, { "epoch": 0.07869249394673124, "grad_norm": 4.089221477508545, "learning_rate": 1.937046004842615e-06, "loss": 0.362, "num_input_tokens_seen": 26336, "step": 65 }, { "epoch": 0.0847457627118644, "grad_norm": 1.4154469966888428, "learning_rate": 2.0883777239709445e-06, "loss": 0.2602, "num_input_tokens_seen": 28288, "step": 70 }, { "epoch": 0.09079903147699758, "grad_norm": 2.053586483001709, "learning_rate": 2.239709443099274e-06, "loss": 0.2653, "num_input_tokens_seen": 30432, "step": 75 }, { "epoch": 0.09685230024213075, "grad_norm": 1.4030762910842896, "learning_rate": 2.391041162227603e-06, "loss": 0.201, "num_input_tokens_seen": 32352, "step": 80 }, { "epoch": 0.10290556900726393, "grad_norm": 2.171947479248047, "learning_rate": 2.5423728813559323e-06, "loss": 0.0863, "num_input_tokens_seen": 34304, "step": 85 }, { "epoch": 0.1089588377723971, "grad_norm": 4.807977199554443, "learning_rate": 2.6937046004842618e-06, "loss": 0.2264, "num_input_tokens_seen": 36288, "step": 90 }, { "epoch": 0.11501210653753027, "grad_norm": 4.449787139892578, "learning_rate": 2.8450363196125907e-06, "loss": 0.3494, "num_input_tokens_seen": 38272, "step": 95 }, { "epoch": 0.12106537530266344, "grad_norm": 1.9579466581344604, "learning_rate": 2.99636803874092e-06, "loss": 0.3296, "num_input_tokens_seen": 40192, "step": 100 }, { "epoch": 0.1271186440677966, "grad_norm": 2.2621707916259766, "learning_rate": 3.1476997578692496e-06, "loss": 0.1591, "num_input_tokens_seen": 42176, "step": 105 }, { "epoch": 0.13317191283292978, "grad_norm": 3.0984034538269043, "learning_rate": 3.2990314769975786e-06, "loss": 0.2609, "num_input_tokens_seen": 44096, "step": 110 }, { "epoch": 0.13922518159806296, "grad_norm": 3.165229320526123, "learning_rate": 3.450363196125908e-06, "loss": 0.3684, "num_input_tokens_seen": 45984, "step": 115 }, { "epoch": 0.14527845036319612, "grad_norm": 7.0474772453308105, "learning_rate": 3.6016949152542374e-06, "loss": 0.2646, "num_input_tokens_seen": 47936, "step": 120 }, { "epoch": 0.1513317191283293, "grad_norm": 1.9770607948303223, "learning_rate": 3.7530266343825673e-06, "loss": 0.2784, "num_input_tokens_seen": 49984, "step": 125 }, { "epoch": 0.15738498789346247, "grad_norm": 8.313761711120605, "learning_rate": 3.904358353510896e-06, "loss": 0.304, "num_input_tokens_seen": 52096, "step": 130 }, { "epoch": 0.16343825665859565, "grad_norm": 2.0595953464508057, "learning_rate": 4.055690072639226e-06, "loss": 0.169, "num_input_tokens_seen": 54112, "step": 135 }, { "epoch": 0.1694915254237288, "grad_norm": 2.4984781742095947, "learning_rate": 4.207021791767555e-06, "loss": 0.1728, "num_input_tokens_seen": 56192, "step": 140 }, { "epoch": 0.17554479418886199, "grad_norm": 7.156078338623047, "learning_rate": 4.358353510895884e-06, "loss": 0.2623, "num_input_tokens_seen": 58304, "step": 145 }, { "epoch": 0.18159806295399517, "grad_norm": 2.4931037425994873, "learning_rate": 4.5096852300242135e-06, "loss": 0.2427, "num_input_tokens_seen": 60448, "step": 150 }, { "epoch": 0.18765133171912832, "grad_norm": 2.7002623081207275, "learning_rate": 4.6610169491525425e-06, "loss": 0.1457, "num_input_tokens_seen": 62368, "step": 155 }, { "epoch": 0.1937046004842615, "grad_norm": 4.783320903778076, "learning_rate": 4.812348668280872e-06, "loss": 0.1251, "num_input_tokens_seen": 64448, "step": 160 }, { "epoch": 0.19975786924939468, "grad_norm": 2.822256565093994, "learning_rate": 4.963680387409201e-06, "loss": 0.2381, "num_input_tokens_seen": 66464, "step": 165 }, { "epoch": 0.20581113801452786, "grad_norm": 1.981095314025879, "learning_rate": 5.11501210653753e-06, "loss": 0.1521, "num_input_tokens_seen": 68480, "step": 170 }, { "epoch": 0.211864406779661, "grad_norm": 2.2380545139312744, "learning_rate": 5.26634382566586e-06, "loss": 0.2866, "num_input_tokens_seen": 70592, "step": 175 }, { "epoch": 0.2179176755447942, "grad_norm": 1.6467034816741943, "learning_rate": 5.417675544794189e-06, "loss": 0.2546, "num_input_tokens_seen": 72608, "step": 180 }, { "epoch": 0.22397094430992737, "grad_norm": 2.7245090007781982, "learning_rate": 5.569007263922518e-06, "loss": 0.1371, "num_input_tokens_seen": 74560, "step": 185 }, { "epoch": 0.23002421307506055, "grad_norm": 1.9143247604370117, "learning_rate": 5.720338983050848e-06, "loss": 0.1794, "num_input_tokens_seen": 76672, "step": 190 }, { "epoch": 0.2360774818401937, "grad_norm": 1.327850103378296, "learning_rate": 5.871670702179177e-06, "loss": 0.1783, "num_input_tokens_seen": 78880, "step": 195 }, { "epoch": 0.24213075060532688, "grad_norm": 3.0520005226135254, "learning_rate": 6.023002421307506e-06, "loss": 0.1389, "num_input_tokens_seen": 81056, "step": 200 }, { "epoch": 0.24818401937046006, "grad_norm": 3.0520761013031006, "learning_rate": 6.174334140435836e-06, "loss": 0.1968, "num_input_tokens_seen": 83200, "step": 205 }, { "epoch": 0.2542372881355932, "grad_norm": 1.8392207622528076, "learning_rate": 6.325665859564164e-06, "loss": 0.2201, "num_input_tokens_seen": 85152, "step": 210 }, { "epoch": 0.2602905569007264, "grad_norm": 3.8807926177978516, "learning_rate": 6.476997578692494e-06, "loss": 0.2455, "num_input_tokens_seen": 87200, "step": 215 }, { "epoch": 0.26634382566585957, "grad_norm": 3.6868693828582764, "learning_rate": 6.628329297820824e-06, "loss": 0.1815, "num_input_tokens_seen": 89280, "step": 220 }, { "epoch": 0.27239709443099275, "grad_norm": 2.5510149002075195, "learning_rate": 6.779661016949153e-06, "loss": 0.1887, "num_input_tokens_seen": 91072, "step": 225 }, { "epoch": 0.2784503631961259, "grad_norm": 2.575061321258545, "learning_rate": 6.9309927360774825e-06, "loss": 0.1125, "num_input_tokens_seen": 93120, "step": 230 }, { "epoch": 0.2845036319612591, "grad_norm": 6.728092193603516, "learning_rate": 7.0823244552058115e-06, "loss": 0.1583, "num_input_tokens_seen": 95104, "step": 235 }, { "epoch": 0.29055690072639223, "grad_norm": 3.3902955055236816, "learning_rate": 7.233656174334141e-06, "loss": 0.1716, "num_input_tokens_seen": 97152, "step": 240 }, { "epoch": 0.2966101694915254, "grad_norm": 3.7272040843963623, "learning_rate": 7.3849878934624694e-06, "loss": 0.1932, "num_input_tokens_seen": 99296, "step": 245 }, { "epoch": 0.3026634382566586, "grad_norm": 6.930401802062988, "learning_rate": 7.536319612590799e-06, "loss": 0.1886, "num_input_tokens_seen": 101376, "step": 250 }, { "epoch": 0.30871670702179177, "grad_norm": 4.180955410003662, "learning_rate": 7.687651331719129e-06, "loss": 0.2197, "num_input_tokens_seen": 103456, "step": 255 }, { "epoch": 0.31476997578692495, "grad_norm": 2.4029951095581055, "learning_rate": 7.838983050847458e-06, "loss": 0.1641, "num_input_tokens_seen": 105728, "step": 260 }, { "epoch": 0.32082324455205813, "grad_norm": 6.375380039215088, "learning_rate": 7.990314769975787e-06, "loss": 0.2097, "num_input_tokens_seen": 107968, "step": 265 }, { "epoch": 0.3268765133171913, "grad_norm": 7.2147216796875, "learning_rate": 8.141646489104116e-06, "loss": 0.24, "num_input_tokens_seen": 109984, "step": 270 }, { "epoch": 0.33292978208232443, "grad_norm": 6.215048789978027, "learning_rate": 8.292978208232447e-06, "loss": 0.1479, "num_input_tokens_seen": 112096, "step": 275 }, { "epoch": 0.3389830508474576, "grad_norm": 2.919790267944336, "learning_rate": 8.444309927360774e-06, "loss": 0.133, "num_input_tokens_seen": 114208, "step": 280 }, { "epoch": 0.3450363196125908, "grad_norm": 7.3741254806518555, "learning_rate": 8.595641646489105e-06, "loss": 0.1972, "num_input_tokens_seen": 116256, "step": 285 }, { "epoch": 0.35108958837772397, "grad_norm": 3.0361359119415283, "learning_rate": 8.746973365617434e-06, "loss": 0.1828, "num_input_tokens_seen": 118272, "step": 290 }, { "epoch": 0.35714285714285715, "grad_norm": 2.1943860054016113, "learning_rate": 8.898305084745763e-06, "loss": 0.26, "num_input_tokens_seen": 120224, "step": 295 }, { "epoch": 0.36319612590799033, "grad_norm": 2.3614513874053955, "learning_rate": 9.049636803874092e-06, "loss": 0.1286, "num_input_tokens_seen": 122240, "step": 300 }, { "epoch": 0.3692493946731235, "grad_norm": 2.0788748264312744, "learning_rate": 9.200968523002422e-06, "loss": 0.2968, "num_input_tokens_seen": 124288, "step": 305 }, { "epoch": 0.37530266343825663, "grad_norm": 1.8409334421157837, "learning_rate": 9.352300242130751e-06, "loss": 0.1741, "num_input_tokens_seen": 126240, "step": 310 }, { "epoch": 0.3813559322033898, "grad_norm": 1.3723686933517456, "learning_rate": 9.50363196125908e-06, "loss": 0.1219, "num_input_tokens_seen": 128288, "step": 315 }, { "epoch": 0.387409200968523, "grad_norm": 2.239436626434326, "learning_rate": 9.65496368038741e-06, "loss": 0.1389, "num_input_tokens_seen": 130208, "step": 320 }, { "epoch": 0.3934624697336562, "grad_norm": 3.2390482425689697, "learning_rate": 9.806295399515738e-06, "loss": 0.1594, "num_input_tokens_seen": 132384, "step": 325 }, { "epoch": 0.39951573849878935, "grad_norm": 2.712637424468994, "learning_rate": 9.957627118644067e-06, "loss": 0.1615, "num_input_tokens_seen": 134528, "step": 330 }, { "epoch": 0.40556900726392253, "grad_norm": 1.0883562564849854, "learning_rate": 1.0108958837772398e-05, "loss": 0.0799, "num_input_tokens_seen": 136544, "step": 335 }, { "epoch": 0.4116222760290557, "grad_norm": 5.525536060333252, "learning_rate": 1.0260290556900727e-05, "loss": 0.1407, "num_input_tokens_seen": 138528, "step": 340 }, { "epoch": 0.41767554479418884, "grad_norm": 2.4147253036499023, "learning_rate": 1.0411622276029056e-05, "loss": 0.1851, "num_input_tokens_seen": 140640, "step": 345 }, { "epoch": 0.423728813559322, "grad_norm": 1.946520447731018, "learning_rate": 1.0562953995157387e-05, "loss": 0.1379, "num_input_tokens_seen": 142784, "step": 350 }, { "epoch": 0.4297820823244552, "grad_norm": 6.313877105712891, "learning_rate": 1.0714285714285714e-05, "loss": 0.2062, "num_input_tokens_seen": 144864, "step": 355 }, { "epoch": 0.4358353510895884, "grad_norm": 6.465234756469727, "learning_rate": 1.0865617433414043e-05, "loss": 0.1322, "num_input_tokens_seen": 146976, "step": 360 }, { "epoch": 0.44188861985472155, "grad_norm": 4.061296463012695, "learning_rate": 1.1016949152542374e-05, "loss": 0.2014, "num_input_tokens_seen": 149024, "step": 365 }, { "epoch": 0.44794188861985473, "grad_norm": 4.017556190490723, "learning_rate": 1.1168280871670703e-05, "loss": 0.1912, "num_input_tokens_seen": 151104, "step": 370 }, { "epoch": 0.4539951573849879, "grad_norm": 4.068127632141113, "learning_rate": 1.1319612590799032e-05, "loss": 0.1466, "num_input_tokens_seen": 153248, "step": 375 }, { "epoch": 0.4600484261501211, "grad_norm": 8.310772895812988, "learning_rate": 1.1470944309927362e-05, "loss": 0.2081, "num_input_tokens_seen": 155296, "step": 380 }, { "epoch": 0.4661016949152542, "grad_norm": 3.9908905029296875, "learning_rate": 1.1622276029055691e-05, "loss": 0.2394, "num_input_tokens_seen": 157408, "step": 385 }, { "epoch": 0.4721549636803874, "grad_norm": 1.4164057970046997, "learning_rate": 1.1773607748184019e-05, "loss": 0.1448, "num_input_tokens_seen": 159424, "step": 390 }, { "epoch": 0.4782082324455206, "grad_norm": 4.86757230758667, "learning_rate": 1.192493946731235e-05, "loss": 0.2052, "num_input_tokens_seen": 161600, "step": 395 }, { "epoch": 0.48426150121065376, "grad_norm": 4.650191307067871, "learning_rate": 1.2076271186440678e-05, "loss": 0.1446, "num_input_tokens_seen": 163616, "step": 400 }, { "epoch": 0.49031476997578693, "grad_norm": 1.4177706241607666, "learning_rate": 1.2227602905569007e-05, "loss": 0.1454, "num_input_tokens_seen": 165568, "step": 405 }, { "epoch": 0.4963680387409201, "grad_norm": 5.054205417633057, "learning_rate": 1.2378934624697338e-05, "loss": 0.1734, "num_input_tokens_seen": 167840, "step": 410 }, { "epoch": 0.5024213075060533, "grad_norm": 2.702026128768921, "learning_rate": 1.2530266343825667e-05, "loss": 0.1121, "num_input_tokens_seen": 169920, "step": 415 }, { "epoch": 0.5084745762711864, "grad_norm": 2.4215333461761475, "learning_rate": 1.2681598062953994e-05, "loss": 0.167, "num_input_tokens_seen": 172064, "step": 420 }, { "epoch": 0.5145278450363197, "grad_norm": 7.523287773132324, "learning_rate": 1.2832929782082325e-05, "loss": 0.2793, "num_input_tokens_seen": 174112, "step": 425 }, { "epoch": 0.5205811138014528, "grad_norm": 2.9136404991149902, "learning_rate": 1.2984261501210654e-05, "loss": 0.1727, "num_input_tokens_seen": 176192, "step": 430 }, { "epoch": 0.5266343825665859, "grad_norm": 8.430058479309082, "learning_rate": 1.3135593220338985e-05, "loss": 0.178, "num_input_tokens_seen": 178176, "step": 435 }, { "epoch": 0.5326876513317191, "grad_norm": 1.6761995553970337, "learning_rate": 1.3286924939467312e-05, "loss": 0.1393, "num_input_tokens_seen": 180320, "step": 440 }, { "epoch": 0.5387409200968523, "grad_norm": 1.0710009336471558, "learning_rate": 1.3438256658595641e-05, "loss": 0.0716, "num_input_tokens_seen": 182272, "step": 445 }, { "epoch": 0.5447941888619855, "grad_norm": 3.777778387069702, "learning_rate": 1.3589588377723972e-05, "loss": 0.1874, "num_input_tokens_seen": 184384, "step": 450 }, { "epoch": 0.5508474576271186, "grad_norm": 4.706080913543701, "learning_rate": 1.37409200968523e-05, "loss": 0.1488, "num_input_tokens_seen": 186464, "step": 455 }, { "epoch": 0.5569007263922519, "grad_norm": 3.6153249740600586, "learning_rate": 1.3892251815980631e-05, "loss": 0.2036, "num_input_tokens_seen": 188544, "step": 460 }, { "epoch": 0.562953995157385, "grad_norm": 2.539186477661133, "learning_rate": 1.4043583535108959e-05, "loss": 0.1563, "num_input_tokens_seen": 190656, "step": 465 }, { "epoch": 0.5690072639225182, "grad_norm": 1.9568341970443726, "learning_rate": 1.419491525423729e-05, "loss": 0.1708, "num_input_tokens_seen": 192672, "step": 470 }, { "epoch": 0.5750605326876513, "grad_norm": 6.348586082458496, "learning_rate": 1.4346246973365618e-05, "loss": 0.1843, "num_input_tokens_seen": 194688, "step": 475 }, { "epoch": 0.5811138014527845, "grad_norm": 3.799901008605957, "learning_rate": 1.4497578692493946e-05, "loss": 0.1387, "num_input_tokens_seen": 196832, "step": 480 }, { "epoch": 0.5871670702179177, "grad_norm": 3.499620199203491, "learning_rate": 1.4648910411622276e-05, "loss": 0.0999, "num_input_tokens_seen": 198976, "step": 485 }, { "epoch": 0.5932203389830508, "grad_norm": 12.64189624786377, "learning_rate": 1.4800242130750605e-05, "loss": 0.1746, "num_input_tokens_seen": 201024, "step": 490 }, { "epoch": 0.5992736077481841, "grad_norm": 2.569739580154419, "learning_rate": 1.4951573849878936e-05, "loss": 0.1875, "num_input_tokens_seen": 203264, "step": 495 }, { "epoch": 0.6053268765133172, "grad_norm": 8.225655555725098, "learning_rate": 1.5102905569007263e-05, "loss": 0.2047, "num_input_tokens_seen": 205376, "step": 500 }, { "epoch": 0.6113801452784504, "grad_norm": 7.60185432434082, "learning_rate": 1.5254237288135596e-05, "loss": 0.166, "num_input_tokens_seen": 207360, "step": 505 }, { "epoch": 0.6174334140435835, "grad_norm": 2.648143768310547, "learning_rate": 1.5405569007263923e-05, "loss": 0.1884, "num_input_tokens_seen": 209472, "step": 510 }, { "epoch": 0.6234866828087167, "grad_norm": 2.6249589920043945, "learning_rate": 1.5556900726392254e-05, "loss": 0.0979, "num_input_tokens_seen": 211744, "step": 515 }, { "epoch": 0.6295399515738499, "grad_norm": 1.3475792407989502, "learning_rate": 1.570823244552058e-05, "loss": 0.0567, "num_input_tokens_seen": 213792, "step": 520 }, { "epoch": 0.635593220338983, "grad_norm": 1.3126599788665771, "learning_rate": 1.5859564164648912e-05, "loss": 0.0876, "num_input_tokens_seen": 215936, "step": 525 }, { "epoch": 0.6416464891041163, "grad_norm": 2.6036107540130615, "learning_rate": 1.6010895883777242e-05, "loss": 0.2341, "num_input_tokens_seen": 218112, "step": 530 }, { "epoch": 0.6476997578692494, "grad_norm": 3.8887522220611572, "learning_rate": 1.616222760290557e-05, "loss": 0.2664, "num_input_tokens_seen": 220256, "step": 535 }, { "epoch": 0.6537530266343826, "grad_norm": 23.43755340576172, "learning_rate": 1.63135593220339e-05, "loss": 0.1814, "num_input_tokens_seen": 222272, "step": 540 }, { "epoch": 0.6598062953995157, "grad_norm": 2.0692665576934814, "learning_rate": 1.6464891041162228e-05, "loss": 0.1469, "num_input_tokens_seen": 224288, "step": 545 }, { "epoch": 0.6658595641646489, "grad_norm": 1.9245775938034058, "learning_rate": 1.661622276029056e-05, "loss": 0.1394, "num_input_tokens_seen": 226304, "step": 550 }, { "epoch": 0.6719128329297821, "grad_norm": 1.4072096347808838, "learning_rate": 1.6767554479418886e-05, "loss": 0.1412, "num_input_tokens_seen": 228480, "step": 555 }, { "epoch": 0.6779661016949152, "grad_norm": 3.1420459747314453, "learning_rate": 1.6918886198547216e-05, "loss": 0.0896, "num_input_tokens_seen": 230528, "step": 560 }, { "epoch": 0.6840193704600485, "grad_norm": 6.924084186553955, "learning_rate": 1.7070217917675547e-05, "loss": 0.0713, "num_input_tokens_seen": 232768, "step": 565 }, { "epoch": 0.6900726392251816, "grad_norm": 4.526210308074951, "learning_rate": 1.7221549636803874e-05, "loss": 0.2103, "num_input_tokens_seen": 234720, "step": 570 }, { "epoch": 0.6961259079903148, "grad_norm": 11.115130424499512, "learning_rate": 1.7372881355932205e-05, "loss": 0.3039, "num_input_tokens_seen": 236768, "step": 575 }, { "epoch": 0.7021791767554479, "grad_norm": 2.869133710861206, "learning_rate": 1.7524213075060532e-05, "loss": 0.0456, "num_input_tokens_seen": 238848, "step": 580 }, { "epoch": 0.7082324455205811, "grad_norm": 0.3127691447734833, "learning_rate": 1.7675544794188863e-05, "loss": 0.0914, "num_input_tokens_seen": 240864, "step": 585 }, { "epoch": 0.7142857142857143, "grad_norm": 11.704195022583008, "learning_rate": 1.7826876513317194e-05, "loss": 0.2421, "num_input_tokens_seen": 242976, "step": 590 }, { "epoch": 0.7203389830508474, "grad_norm": 3.9950180053710938, "learning_rate": 1.797820823244552e-05, "loss": 0.1682, "num_input_tokens_seen": 245184, "step": 595 }, { "epoch": 0.7263922518159807, "grad_norm": 7.782661437988281, "learning_rate": 1.812953995157385e-05, "loss": 0.1031, "num_input_tokens_seen": 247264, "step": 600 }, { "epoch": 0.7324455205811138, "grad_norm": 7.236667633056641, "learning_rate": 1.828087167070218e-05, "loss": 0.2891, "num_input_tokens_seen": 249248, "step": 605 }, { "epoch": 0.738498789346247, "grad_norm": 9.128091812133789, "learning_rate": 1.843220338983051e-05, "loss": 0.0852, "num_input_tokens_seen": 251264, "step": 610 }, { "epoch": 0.7445520581113801, "grad_norm": 9.13149356842041, "learning_rate": 1.8583535108958837e-05, "loss": 0.1993, "num_input_tokens_seen": 253216, "step": 615 }, { "epoch": 0.7506053268765133, "grad_norm": 5.277822017669678, "learning_rate": 1.8734866828087168e-05, "loss": 0.1698, "num_input_tokens_seen": 255072, "step": 620 }, { "epoch": 0.7566585956416465, "grad_norm": 6.5058064460754395, "learning_rate": 1.88861985472155e-05, "loss": 0.1405, "num_input_tokens_seen": 257056, "step": 625 }, { "epoch": 0.7627118644067796, "grad_norm": 2.3638041019439697, "learning_rate": 1.9037530266343826e-05, "loss": 0.0771, "num_input_tokens_seen": 259040, "step": 630 }, { "epoch": 0.7687651331719129, "grad_norm": 9.269247055053711, "learning_rate": 1.9188861985472156e-05, "loss": 0.1483, "num_input_tokens_seen": 261248, "step": 635 }, { "epoch": 0.774818401937046, "grad_norm": 1.8186548948287964, "learning_rate": 1.9340193704600484e-05, "loss": 0.1564, "num_input_tokens_seen": 263360, "step": 640 }, { "epoch": 0.7808716707021792, "grad_norm": 6.884756565093994, "learning_rate": 1.9491525423728814e-05, "loss": 0.0586, "num_input_tokens_seen": 265472, "step": 645 }, { "epoch": 0.7869249394673123, "grad_norm": 4.52500057220459, "learning_rate": 1.9642857142857145e-05, "loss": 0.1392, "num_input_tokens_seen": 267456, "step": 650 }, { "epoch": 0.7929782082324455, "grad_norm": 9.977668762207031, "learning_rate": 1.9794188861985476e-05, "loss": 0.2266, "num_input_tokens_seen": 269440, "step": 655 }, { "epoch": 0.7990314769975787, "grad_norm": 9.654376983642578, "learning_rate": 1.9945520581113803e-05, "loss": 0.1266, "num_input_tokens_seen": 271456, "step": 660 }, { "epoch": 0.8050847457627118, "grad_norm": 0.7653871178627014, "learning_rate": 2.009685230024213e-05, "loss": 0.1411, "num_input_tokens_seen": 273472, "step": 665 }, { "epoch": 0.8111380145278451, "grad_norm": 7.34264612197876, "learning_rate": 2.024818401937046e-05, "loss": 0.1159, "num_input_tokens_seen": 275488, "step": 670 }, { "epoch": 0.8171912832929782, "grad_norm": 21.716815948486328, "learning_rate": 2.0399515738498788e-05, "loss": 0.3004, "num_input_tokens_seen": 277472, "step": 675 }, { "epoch": 0.8232445520581114, "grad_norm": 13.480831146240234, "learning_rate": 2.055084745762712e-05, "loss": 0.3068, "num_input_tokens_seen": 279488, "step": 680 }, { "epoch": 0.8292978208232445, "grad_norm": 2.9228641986846924, "learning_rate": 2.070217917675545e-05, "loss": 0.2521, "num_input_tokens_seen": 281504, "step": 685 }, { "epoch": 0.8353510895883777, "grad_norm": 8.571534156799316, "learning_rate": 2.085351089588378e-05, "loss": 0.2321, "num_input_tokens_seen": 283552, "step": 690 }, { "epoch": 0.8414043583535109, "grad_norm": 2.4278573989868164, "learning_rate": 2.1004842615012108e-05, "loss": 0.1332, "num_input_tokens_seen": 285536, "step": 695 }, { "epoch": 0.847457627118644, "grad_norm": 6.597072124481201, "learning_rate": 2.1156174334140435e-05, "loss": 0.0986, "num_input_tokens_seen": 287584, "step": 700 }, { "epoch": 0.8535108958837773, "grad_norm": 0.7041083574295044, "learning_rate": 2.1307506053268766e-05, "loss": 0.0704, "num_input_tokens_seen": 289600, "step": 705 }, { "epoch": 0.8595641646489104, "grad_norm": 6.055403709411621, "learning_rate": 2.1458837772397093e-05, "loss": 0.0899, "num_input_tokens_seen": 291744, "step": 710 }, { "epoch": 0.8656174334140436, "grad_norm": 5.187188625335693, "learning_rate": 2.1610169491525427e-05, "loss": 0.1188, "num_input_tokens_seen": 293728, "step": 715 }, { "epoch": 0.8716707021791767, "grad_norm": 6.773065090179443, "learning_rate": 2.1761501210653754e-05, "loss": 0.1339, "num_input_tokens_seen": 295840, "step": 720 }, { "epoch": 0.8777239709443099, "grad_norm": 13.379932403564453, "learning_rate": 2.1912832929782085e-05, "loss": 0.1958, "num_input_tokens_seen": 297888, "step": 725 }, { "epoch": 0.8837772397094431, "grad_norm": 8.1151704788208, "learning_rate": 2.2064164648910412e-05, "loss": 0.2288, "num_input_tokens_seen": 299904, "step": 730 }, { "epoch": 0.8898305084745762, "grad_norm": 8.06647777557373, "learning_rate": 2.221549636803874e-05, "loss": 0.1834, "num_input_tokens_seen": 302016, "step": 735 }, { "epoch": 0.8958837772397095, "grad_norm": 0.7120025753974915, "learning_rate": 2.236682808716707e-05, "loss": 0.2286, "num_input_tokens_seen": 304128, "step": 740 }, { "epoch": 0.9019370460048426, "grad_norm": 7.01760196685791, "learning_rate": 2.25181598062954e-05, "loss": 0.1706, "num_input_tokens_seen": 306112, "step": 745 }, { "epoch": 0.9079903147699758, "grad_norm": 5.26881742477417, "learning_rate": 2.266949152542373e-05, "loss": 0.1078, "num_input_tokens_seen": 308256, "step": 750 }, { "epoch": 0.914043583535109, "grad_norm": 1.5413148403167725, "learning_rate": 2.282082324455206e-05, "loss": 0.106, "num_input_tokens_seen": 310240, "step": 755 }, { "epoch": 0.9200968523002422, "grad_norm": 7.139612674713135, "learning_rate": 2.297215496368039e-05, "loss": 0.2175, "num_input_tokens_seen": 312320, "step": 760 }, { "epoch": 0.9261501210653753, "grad_norm": 6.30340051651001, "learning_rate": 2.3123486682808717e-05, "loss": 0.1848, "num_input_tokens_seen": 314400, "step": 765 }, { "epoch": 0.9322033898305084, "grad_norm": 2.6550755500793457, "learning_rate": 2.3274818401937044e-05, "loss": 0.1464, "num_input_tokens_seen": 316480, "step": 770 }, { "epoch": 0.9382566585956417, "grad_norm": 0.5587025880813599, "learning_rate": 2.342615012106538e-05, "loss": 0.115, "num_input_tokens_seen": 318560, "step": 775 }, { "epoch": 0.9443099273607748, "grad_norm": 5.275771141052246, "learning_rate": 2.3577481840193706e-05, "loss": 0.1867, "num_input_tokens_seen": 320768, "step": 780 }, { "epoch": 0.950363196125908, "grad_norm": 2.8277268409729004, "learning_rate": 2.3728813559322036e-05, "loss": 0.0592, "num_input_tokens_seen": 322784, "step": 785 }, { "epoch": 0.9564164648910412, "grad_norm": 1.8943177461624146, "learning_rate": 2.3880145278450364e-05, "loss": 0.0863, "num_input_tokens_seen": 324768, "step": 790 }, { "epoch": 0.9624697336561744, "grad_norm": 1.0839357376098633, "learning_rate": 2.4031476997578694e-05, "loss": 0.1306, "num_input_tokens_seen": 326880, "step": 795 }, { "epoch": 0.9685230024213075, "grad_norm": 3.8869221210479736, "learning_rate": 2.418280871670702e-05, "loss": 0.2429, "num_input_tokens_seen": 328800, "step": 800 }, { "epoch": 0.9745762711864406, "grad_norm": 9.761240005493164, "learning_rate": 2.4334140435835352e-05, "loss": 0.2907, "num_input_tokens_seen": 330880, "step": 805 }, { "epoch": 0.9806295399515739, "grad_norm": 9.088152885437012, "learning_rate": 2.4485472154963683e-05, "loss": 0.0799, "num_input_tokens_seen": 332896, "step": 810 }, { "epoch": 0.986682808716707, "grad_norm": 3.13368821144104, "learning_rate": 2.463680387409201e-05, "loss": 0.0494, "num_input_tokens_seen": 335040, "step": 815 }, { "epoch": 0.9927360774818402, "grad_norm": 9.361994743347168, "learning_rate": 2.478813559322034e-05, "loss": 0.241, "num_input_tokens_seen": 337120, "step": 820 }, { "epoch": 0.9987893462469734, "grad_norm": 7.153463363647461, "learning_rate": 2.4939467312348668e-05, "loss": 0.1407, "num_input_tokens_seen": 339040, "step": 825 }, { "epoch": 1.0, "eval_loss": 0.10593511909246445, "eval_runtime": 4.9446, "eval_samples_per_second": 74.222, "eval_steps_per_second": 18.606, "num_input_tokens_seen": 339144, "step": 826 }, { "epoch": 1.0048426150121066, "grad_norm": 7.769481658935547, "learning_rate": 2.5090799031476996e-05, "loss": 0.117, "num_input_tokens_seen": 340776, "step": 830 }, { "epoch": 1.0108958837772397, "grad_norm": 5.543120384216309, "learning_rate": 2.524213075060533e-05, "loss": 0.1021, "num_input_tokens_seen": 342792, "step": 835 }, { "epoch": 1.0169491525423728, "grad_norm": 0.7291125059127808, "learning_rate": 2.539346246973366e-05, "loss": 0.0523, "num_input_tokens_seen": 344840, "step": 840 }, { "epoch": 1.023002421307506, "grad_norm": 0.07052650302648544, "learning_rate": 2.5544794188861988e-05, "loss": 0.0282, "num_input_tokens_seen": 346760, "step": 845 }, { "epoch": 1.0290556900726393, "grad_norm": 0.05153718218207359, "learning_rate": 2.5696125907990315e-05, "loss": 0.222, "num_input_tokens_seen": 348776, "step": 850 }, { "epoch": 1.0351089588377724, "grad_norm": 8.406330108642578, "learning_rate": 2.5847457627118642e-05, "loss": 0.1927, "num_input_tokens_seen": 350632, "step": 855 }, { "epoch": 1.0411622276029056, "grad_norm": 4.195484638214111, "learning_rate": 2.5998789346246976e-05, "loss": 0.1109, "num_input_tokens_seen": 352584, "step": 860 }, { "epoch": 1.0472154963680387, "grad_norm": 1.0287458896636963, "learning_rate": 2.6150121065375304e-05, "loss": 0.0891, "num_input_tokens_seen": 354632, "step": 865 }, { "epoch": 1.053268765133172, "grad_norm": 1.4667588472366333, "learning_rate": 2.6301452784503634e-05, "loss": 0.0574, "num_input_tokens_seen": 356648, "step": 870 }, { "epoch": 1.0593220338983051, "grad_norm": 5.369099140167236, "learning_rate": 2.645278450363196e-05, "loss": 0.0482, "num_input_tokens_seen": 358632, "step": 875 }, { "epoch": 1.0653753026634383, "grad_norm": 7.432891368865967, "learning_rate": 2.660411622276029e-05, "loss": 0.1538, "num_input_tokens_seen": 360712, "step": 880 }, { "epoch": 1.0714285714285714, "grad_norm": 6.042410373687744, "learning_rate": 2.6755447941888623e-05, "loss": 0.1262, "num_input_tokens_seen": 362760, "step": 885 }, { "epoch": 1.0774818401937045, "grad_norm": 3.5321125984191895, "learning_rate": 2.690677966101695e-05, "loss": 0.2652, "num_input_tokens_seen": 364808, "step": 890 }, { "epoch": 1.0835351089588379, "grad_norm": 1.1720975637435913, "learning_rate": 2.7058111380145278e-05, "loss": 0.1286, "num_input_tokens_seen": 366984, "step": 895 }, { "epoch": 1.089588377723971, "grad_norm": 4.729674816131592, "learning_rate": 2.7209443099273608e-05, "loss": 0.0931, "num_input_tokens_seen": 369064, "step": 900 }, { "epoch": 1.0956416464891041, "grad_norm": 9.650104522705078, "learning_rate": 2.7360774818401942e-05, "loss": 0.1596, "num_input_tokens_seen": 371144, "step": 905 }, { "epoch": 1.1016949152542372, "grad_norm": 5.4600701332092285, "learning_rate": 2.751210653753027e-05, "loss": 0.1625, "num_input_tokens_seen": 373192, "step": 910 }, { "epoch": 1.1077481840193704, "grad_norm": 0.49121126532554626, "learning_rate": 2.7663438256658597e-05, "loss": 0.0905, "num_input_tokens_seen": 375304, "step": 915 }, { "epoch": 1.1138014527845037, "grad_norm": 2.933244466781616, "learning_rate": 2.7814769975786924e-05, "loss": 0.2035, "num_input_tokens_seen": 377384, "step": 920 }, { "epoch": 1.1198547215496368, "grad_norm": 2.2544615268707275, "learning_rate": 2.7966101694915255e-05, "loss": 0.1379, "num_input_tokens_seen": 379496, "step": 925 }, { "epoch": 1.12590799031477, "grad_norm": 6.070184707641602, "learning_rate": 2.8117433414043586e-05, "loss": 0.0848, "num_input_tokens_seen": 381384, "step": 930 }, { "epoch": 1.131961259079903, "grad_norm": 1.3605411052703857, "learning_rate": 2.8268765133171916e-05, "loss": 0.168, "num_input_tokens_seen": 383400, "step": 935 }, { "epoch": 1.1380145278450362, "grad_norm": 0.6348873376846313, "learning_rate": 2.8420096852300244e-05, "loss": 0.0957, "num_input_tokens_seen": 385416, "step": 940 }, { "epoch": 1.1440677966101696, "grad_norm": 6.059495449066162, "learning_rate": 2.857142857142857e-05, "loss": 0.1076, "num_input_tokens_seen": 387464, "step": 945 }, { "epoch": 1.1501210653753027, "grad_norm": 4.087613582611084, "learning_rate": 2.8722760290556905e-05, "loss": 0.131, "num_input_tokens_seen": 389544, "step": 950 }, { "epoch": 1.1561743341404358, "grad_norm": 1.3269816637039185, "learning_rate": 2.8874092009685232e-05, "loss": 0.0275, "num_input_tokens_seen": 391624, "step": 955 }, { "epoch": 1.162227602905569, "grad_norm": 3.5393283367156982, "learning_rate": 2.902542372881356e-05, "loss": 0.1452, "num_input_tokens_seen": 393736, "step": 960 }, { "epoch": 1.1682808716707023, "grad_norm": 3.91328501701355, "learning_rate": 2.917675544794189e-05, "loss": 0.0581, "num_input_tokens_seen": 395752, "step": 965 }, { "epoch": 1.1743341404358354, "grad_norm": 4.251715183258057, "learning_rate": 2.9328087167070218e-05, "loss": 0.1224, "num_input_tokens_seen": 397800, "step": 970 }, { "epoch": 1.1803874092009685, "grad_norm": 0.09829419106245041, "learning_rate": 2.947941888619855e-05, "loss": 0.0959, "num_input_tokens_seen": 399752, "step": 975 }, { "epoch": 1.1864406779661016, "grad_norm": 0.10955099016427994, "learning_rate": 2.963075060532688e-05, "loss": 0.1474, "num_input_tokens_seen": 401832, "step": 980 }, { "epoch": 1.192493946731235, "grad_norm": 12.049365043640137, "learning_rate": 2.9782082324455206e-05, "loss": 0.032, "num_input_tokens_seen": 403848, "step": 985 }, { "epoch": 1.1985472154963681, "grad_norm": 0.02394855208694935, "learning_rate": 2.9933414043583537e-05, "loss": 0.0295, "num_input_tokens_seen": 405800, "step": 990 }, { "epoch": 1.2046004842615012, "grad_norm": 21.266433715820312, "learning_rate": 3.0084745762711864e-05, "loss": 0.4869, "num_input_tokens_seen": 407816, "step": 995 }, { "epoch": 1.2106537530266344, "grad_norm": 8.90036392211914, "learning_rate": 3.0236077481840198e-05, "loss": 0.1715, "num_input_tokens_seen": 409896, "step": 1000 }, { "epoch": 1.2167070217917675, "grad_norm": 0.14741992950439453, "learning_rate": 3.0387409200968526e-05, "loss": 0.0748, "num_input_tokens_seen": 411880, "step": 1005 }, { "epoch": 1.2227602905569008, "grad_norm": 0.07192472368478775, "learning_rate": 3.053874092009685e-05, "loss": 0.1312, "num_input_tokens_seen": 413928, "step": 1010 }, { "epoch": 1.228813559322034, "grad_norm": 7.1197991371154785, "learning_rate": 3.069007263922518e-05, "loss": 0.0315, "num_input_tokens_seen": 416072, "step": 1015 }, { "epoch": 1.234866828087167, "grad_norm": 4.554861545562744, "learning_rate": 3.0841404358353514e-05, "loss": 0.2901, "num_input_tokens_seen": 418056, "step": 1020 }, { "epoch": 1.2409200968523002, "grad_norm": 3.653834342956543, "learning_rate": 3.099273607748184e-05, "loss": 0.1786, "num_input_tokens_seen": 420200, "step": 1025 }, { "epoch": 1.2469733656174333, "grad_norm": 7.41217565536499, "learning_rate": 3.114406779661017e-05, "loss": 0.1188, "num_input_tokens_seen": 422344, "step": 1030 }, { "epoch": 1.2530266343825667, "grad_norm": 0.7658401727676392, "learning_rate": 3.1295399515738496e-05, "loss": 0.0837, "num_input_tokens_seen": 424328, "step": 1035 }, { "epoch": 1.2590799031476998, "grad_norm": 6.2062907218933105, "learning_rate": 3.144673123486683e-05, "loss": 0.2052, "num_input_tokens_seen": 426408, "step": 1040 }, { "epoch": 1.265133171912833, "grad_norm": 7.733348846435547, "learning_rate": 3.1598062953995164e-05, "loss": 0.1868, "num_input_tokens_seen": 428424, "step": 1045 }, { "epoch": 1.271186440677966, "grad_norm": 1.089958667755127, "learning_rate": 3.174939467312349e-05, "loss": 0.0297, "num_input_tokens_seen": 430440, "step": 1050 }, { "epoch": 1.2772397094430992, "grad_norm": 12.642589569091797, "learning_rate": 3.190072639225182e-05, "loss": 0.0844, "num_input_tokens_seen": 432648, "step": 1055 }, { "epoch": 1.2832929782082325, "grad_norm": 3.6318118572235107, "learning_rate": 3.2052058111380146e-05, "loss": 0.1702, "num_input_tokens_seen": 434888, "step": 1060 }, { "epoch": 1.2893462469733656, "grad_norm": 16.81538963317871, "learning_rate": 3.2203389830508473e-05, "loss": 0.1106, "num_input_tokens_seen": 437000, "step": 1065 }, { "epoch": 1.2953995157384988, "grad_norm": 0.04348110035061836, "learning_rate": 3.235472154963681e-05, "loss": 0.1103, "num_input_tokens_seen": 439080, "step": 1070 }, { "epoch": 1.3014527845036319, "grad_norm": 15.042291641235352, "learning_rate": 3.2506053268765135e-05, "loss": 0.2333, "num_input_tokens_seen": 441160, "step": 1075 }, { "epoch": 1.307506053268765, "grad_norm": 4.896246433258057, "learning_rate": 3.265738498789346e-05, "loss": 0.2537, "num_input_tokens_seen": 443208, "step": 1080 }, { "epoch": 1.3135593220338984, "grad_norm": 4.226076126098633, "learning_rate": 3.280871670702179e-05, "loss": 0.1248, "num_input_tokens_seen": 445224, "step": 1085 }, { "epoch": 1.3196125907990315, "grad_norm": 1.5398776531219482, "learning_rate": 3.2960048426150124e-05, "loss": 0.0336, "num_input_tokens_seen": 447400, "step": 1090 }, { "epoch": 1.3256658595641646, "grad_norm": 9.211803436279297, "learning_rate": 3.311138014527845e-05, "loss": 0.2503, "num_input_tokens_seen": 449288, "step": 1095 }, { "epoch": 1.331719128329298, "grad_norm": 0.733562707901001, "learning_rate": 3.326271186440678e-05, "loss": 0.1271, "num_input_tokens_seen": 451400, "step": 1100 }, { "epoch": 1.3377723970944309, "grad_norm": 0.8373104333877563, "learning_rate": 3.341404358353511e-05, "loss": 0.0695, "num_input_tokens_seen": 453480, "step": 1105 }, { "epoch": 1.3438256658595642, "grad_norm": 0.4727541506290436, "learning_rate": 3.356537530266344e-05, "loss": 0.1173, "num_input_tokens_seen": 455272, "step": 1110 }, { "epoch": 1.3498789346246973, "grad_norm": 0.6321443915367126, "learning_rate": 3.3716707021791774e-05, "loss": 0.0872, "num_input_tokens_seen": 457288, "step": 1115 }, { "epoch": 1.3559322033898304, "grad_norm": 7.899008750915527, "learning_rate": 3.38680387409201e-05, "loss": 0.0256, "num_input_tokens_seen": 459304, "step": 1120 }, { "epoch": 1.3619854721549638, "grad_norm": 4.347733020782471, "learning_rate": 3.401937046004843e-05, "loss": 0.1007, "num_input_tokens_seen": 461288, "step": 1125 }, { "epoch": 1.368038740920097, "grad_norm": 0.12617227435112, "learning_rate": 3.4170702179176755e-05, "loss": 0.2066, "num_input_tokens_seen": 463560, "step": 1130 }, { "epoch": 1.37409200968523, "grad_norm": 8.780061721801758, "learning_rate": 3.432203389830508e-05, "loss": 0.1984, "num_input_tokens_seen": 465800, "step": 1135 }, { "epoch": 1.3801452784503632, "grad_norm": 0.11520235985517502, "learning_rate": 3.447336561743342e-05, "loss": 0.0939, "num_input_tokens_seen": 467912, "step": 1140 }, { "epoch": 1.3861985472154963, "grad_norm": 0.6378135085105896, "learning_rate": 3.4624697336561744e-05, "loss": 0.0516, "num_input_tokens_seen": 469832, "step": 1145 }, { "epoch": 1.3922518159806296, "grad_norm": 1.978184461593628, "learning_rate": 3.477602905569007e-05, "loss": 0.0874, "num_input_tokens_seen": 472008, "step": 1150 }, { "epoch": 1.3983050847457628, "grad_norm": 13.126570701599121, "learning_rate": 3.49273607748184e-05, "loss": 0.1528, "num_input_tokens_seen": 474088, "step": 1155 }, { "epoch": 1.4043583535108959, "grad_norm": 5.558993339538574, "learning_rate": 3.507869249394673e-05, "loss": 0.2051, "num_input_tokens_seen": 476008, "step": 1160 }, { "epoch": 1.410411622276029, "grad_norm": 8.514824867248535, "learning_rate": 3.523002421307507e-05, "loss": 0.1266, "num_input_tokens_seen": 478120, "step": 1165 }, { "epoch": 1.4164648910411621, "grad_norm": 4.913051128387451, "learning_rate": 3.5381355932203394e-05, "loss": 0.115, "num_input_tokens_seen": 480200, "step": 1170 }, { "epoch": 1.4225181598062955, "grad_norm": 0.6005980968475342, "learning_rate": 3.553268765133172e-05, "loss": 0.1683, "num_input_tokens_seen": 482248, "step": 1175 }, { "epoch": 1.4285714285714286, "grad_norm": 0.08805664628744125, "learning_rate": 3.568401937046005e-05, "loss": 0.0627, "num_input_tokens_seen": 484264, "step": 1180 }, { "epoch": 1.4346246973365617, "grad_norm": 3.3285725116729736, "learning_rate": 3.583535108958838e-05, "loss": 0.1508, "num_input_tokens_seen": 486312, "step": 1185 }, { "epoch": 1.4406779661016949, "grad_norm": 4.923320770263672, "learning_rate": 3.598668280871671e-05, "loss": 0.1097, "num_input_tokens_seen": 488392, "step": 1190 }, { "epoch": 1.446731234866828, "grad_norm": 7.057864189147949, "learning_rate": 3.613801452784504e-05, "loss": 0.1335, "num_input_tokens_seen": 490440, "step": 1195 }, { "epoch": 1.4527845036319613, "grad_norm": 0.43066057562828064, "learning_rate": 3.6289346246973365e-05, "loss": 0.0393, "num_input_tokens_seen": 492552, "step": 1200 }, { "epoch": 1.4588377723970944, "grad_norm": 0.2145460993051529, "learning_rate": 3.644067796610169e-05, "loss": 0.0873, "num_input_tokens_seen": 494568, "step": 1205 }, { "epoch": 1.4648910411622276, "grad_norm": 0.19442042708396912, "learning_rate": 3.6592009685230026e-05, "loss": 0.0564, "num_input_tokens_seen": 496552, "step": 1210 }, { "epoch": 1.4709443099273607, "grad_norm": 0.07670941203832626, "learning_rate": 3.6743341404358353e-05, "loss": 0.2459, "num_input_tokens_seen": 498760, "step": 1215 }, { "epoch": 1.4769975786924938, "grad_norm": 3.2059402465820312, "learning_rate": 3.689467312348668e-05, "loss": 0.0788, "num_input_tokens_seen": 500840, "step": 1220 }, { "epoch": 1.4830508474576272, "grad_norm": 2.928030014038086, "learning_rate": 3.7046004842615015e-05, "loss": 0.1555, "num_input_tokens_seen": 503048, "step": 1225 }, { "epoch": 1.4891041162227603, "grad_norm": 1.1421383619308472, "learning_rate": 3.719733656174335e-05, "loss": 0.0333, "num_input_tokens_seen": 505032, "step": 1230 }, { "epoch": 1.4951573849878934, "grad_norm": 10.540042877197266, "learning_rate": 3.7348668280871676e-05, "loss": 0.1397, "num_input_tokens_seen": 507208, "step": 1235 }, { "epoch": 1.5012106537530268, "grad_norm": 0.07583373785018921, "learning_rate": 3.7500000000000003e-05, "loss": 0.1331, "num_input_tokens_seen": 509192, "step": 1240 }, { "epoch": 1.5072639225181597, "grad_norm": 1.0351825952529907, "learning_rate": 3.765133171912833e-05, "loss": 0.1764, "num_input_tokens_seen": 511176, "step": 1245 }, { "epoch": 1.513317191283293, "grad_norm": 16.822246551513672, "learning_rate": 3.780266343825666e-05, "loss": 0.0262, "num_input_tokens_seen": 513224, "step": 1250 }, { "epoch": 1.5193704600484261, "grad_norm": 7.521337985992432, "learning_rate": 3.795399515738499e-05, "loss": 0.1774, "num_input_tokens_seen": 515176, "step": 1255 }, { "epoch": 1.5254237288135593, "grad_norm": 14.006416320800781, "learning_rate": 3.810532687651332e-05, "loss": 0.1108, "num_input_tokens_seen": 517160, "step": 1260 }, { "epoch": 1.5314769975786926, "grad_norm": 0.37980884313583374, "learning_rate": 3.825665859564165e-05, "loss": 0.0138, "num_input_tokens_seen": 519080, "step": 1265 }, { "epoch": 1.5375302663438255, "grad_norm": 5.5329155921936035, "learning_rate": 3.8407990314769974e-05, "loss": 0.0841, "num_input_tokens_seen": 521064, "step": 1270 }, { "epoch": 1.5435835351089588, "grad_norm": 0.09349199384450912, "learning_rate": 3.855932203389831e-05, "loss": 0.2595, "num_input_tokens_seen": 523080, "step": 1275 }, { "epoch": 1.549636803874092, "grad_norm": 0.05344921723008156, "learning_rate": 3.8710653753026635e-05, "loss": 0.0986, "num_input_tokens_seen": 525128, "step": 1280 }, { "epoch": 1.555690072639225, "grad_norm": 0.08712620288133621, "learning_rate": 3.886198547215496e-05, "loss": 0.133, "num_input_tokens_seen": 527176, "step": 1285 }, { "epoch": 1.5617433414043584, "grad_norm": 0.14657877385616302, "learning_rate": 3.90133171912833e-05, "loss": 0.0818, "num_input_tokens_seen": 529064, "step": 1290 }, { "epoch": 1.5677966101694916, "grad_norm": 1.7303410768508911, "learning_rate": 3.9164648910411624e-05, "loss": 0.1969, "num_input_tokens_seen": 531176, "step": 1295 }, { "epoch": 1.5738498789346247, "grad_norm": 6.266784191131592, "learning_rate": 3.931598062953996e-05, "loss": 0.357, "num_input_tokens_seen": 533160, "step": 1300 }, { "epoch": 1.5799031476997578, "grad_norm": 0.37423044443130493, "learning_rate": 3.9467312348668285e-05, "loss": 0.1919, "num_input_tokens_seen": 535336, "step": 1305 }, { "epoch": 1.585956416464891, "grad_norm": 4.699956893920898, "learning_rate": 3.961864406779661e-05, "loss": 0.1232, "num_input_tokens_seen": 537320, "step": 1310 }, { "epoch": 1.5920096852300243, "grad_norm": 3.922513246536255, "learning_rate": 3.976997578692494e-05, "loss": 0.1584, "num_input_tokens_seen": 539272, "step": 1315 }, { "epoch": 1.5980629539951574, "grad_norm": 0.31896528601646423, "learning_rate": 3.992130750605327e-05, "loss": 0.0812, "num_input_tokens_seen": 541384, "step": 1320 }, { "epoch": 1.6041162227602905, "grad_norm": 4.468360424041748, "learning_rate": 4.00726392251816e-05, "loss": 0.1531, "num_input_tokens_seen": 543464, "step": 1325 }, { "epoch": 1.6101694915254239, "grad_norm": 1.333152413368225, "learning_rate": 4.022397094430993e-05, "loss": 0.0831, "num_input_tokens_seen": 545608, "step": 1330 }, { "epoch": 1.6162227602905568, "grad_norm": 3.404475688934326, "learning_rate": 4.0375302663438256e-05, "loss": 0.1288, "num_input_tokens_seen": 547720, "step": 1335 }, { "epoch": 1.6222760290556901, "grad_norm": 1.174735188484192, "learning_rate": 4.052663438256658e-05, "loss": 0.096, "num_input_tokens_seen": 549832, "step": 1340 }, { "epoch": 1.6283292978208233, "grad_norm": 0.04824168235063553, "learning_rate": 4.067796610169492e-05, "loss": 0.1392, "num_input_tokens_seen": 551880, "step": 1345 }, { "epoch": 1.6343825665859564, "grad_norm": 0.2520720362663269, "learning_rate": 4.0829297820823245e-05, "loss": 0.2143, "num_input_tokens_seen": 554088, "step": 1350 }, { "epoch": 1.6404358353510897, "grad_norm": 0.355550616979599, "learning_rate": 4.098062953995158e-05, "loss": 0.0652, "num_input_tokens_seen": 556232, "step": 1355 }, { "epoch": 1.6464891041162226, "grad_norm": 0.1743062287569046, "learning_rate": 4.1131961259079906e-05, "loss": 0.1186, "num_input_tokens_seen": 558312, "step": 1360 }, { "epoch": 1.652542372881356, "grad_norm": 1.065703272819519, "learning_rate": 4.1283292978208233e-05, "loss": 0.0974, "num_input_tokens_seen": 560168, "step": 1365 }, { "epoch": 1.658595641646489, "grad_norm": 5.667807102203369, "learning_rate": 4.143462469733657e-05, "loss": 0.1578, "num_input_tokens_seen": 562152, "step": 1370 }, { "epoch": 1.6646489104116222, "grad_norm": 4.380779266357422, "learning_rate": 4.1585956416464895e-05, "loss": 0.1227, "num_input_tokens_seen": 564136, "step": 1375 }, { "epoch": 1.6707021791767556, "grad_norm": 0.4175266921520233, "learning_rate": 4.173728813559322e-05, "loss": 0.1342, "num_input_tokens_seen": 566184, "step": 1380 }, { "epoch": 1.6767554479418885, "grad_norm": 9.720357894897461, "learning_rate": 4.188861985472155e-05, "loss": 0.1928, "num_input_tokens_seen": 568456, "step": 1385 }, { "epoch": 1.6828087167070218, "grad_norm": 4.286689281463623, "learning_rate": 4.203995157384988e-05, "loss": 0.0652, "num_input_tokens_seen": 570440, "step": 1390 }, { "epoch": 1.688861985472155, "grad_norm": 10.233795166015625, "learning_rate": 4.219128329297821e-05, "loss": 0.1938, "num_input_tokens_seen": 572424, "step": 1395 }, { "epoch": 1.694915254237288, "grad_norm": 0.11206325143575668, "learning_rate": 4.234261501210654e-05, "loss": 0.1332, "num_input_tokens_seen": 574472, "step": 1400 }, { "epoch": 1.7009685230024214, "grad_norm": 5.004248142242432, "learning_rate": 4.2493946731234865e-05, "loss": 0.2443, "num_input_tokens_seen": 576456, "step": 1405 }, { "epoch": 1.7070217917675545, "grad_norm": 5.831644535064697, "learning_rate": 4.26452784503632e-05, "loss": 0.1216, "num_input_tokens_seen": 578472, "step": 1410 }, { "epoch": 1.7130750605326877, "grad_norm": 4.169266223907471, "learning_rate": 4.279661016949153e-05, "loss": 0.1596, "num_input_tokens_seen": 580456, "step": 1415 }, { "epoch": 1.7191283292978208, "grad_norm": 3.915609359741211, "learning_rate": 4.294794188861986e-05, "loss": 0.1499, "num_input_tokens_seen": 582664, "step": 1420 }, { "epoch": 1.725181598062954, "grad_norm": 3.3363466262817383, "learning_rate": 4.309927360774819e-05, "loss": 0.0838, "num_input_tokens_seen": 584680, "step": 1425 }, { "epoch": 1.7312348668280872, "grad_norm": 2.6740055084228516, "learning_rate": 4.3250605326876515e-05, "loss": 0.1575, "num_input_tokens_seen": 586728, "step": 1430 }, { "epoch": 1.7372881355932204, "grad_norm": 2.055663824081421, "learning_rate": 4.340193704600484e-05, "loss": 0.1424, "num_input_tokens_seen": 588808, "step": 1435 }, { "epoch": 1.7433414043583535, "grad_norm": 2.280672073364258, "learning_rate": 4.355326876513318e-05, "loss": 0.0712, "num_input_tokens_seen": 590920, "step": 1440 }, { "epoch": 1.7493946731234868, "grad_norm": 3.3333287239074707, "learning_rate": 4.3704600484261504e-05, "loss": 0.2326, "num_input_tokens_seen": 592968, "step": 1445 }, { "epoch": 1.7554479418886197, "grad_norm": 4.786584854125977, "learning_rate": 4.385593220338983e-05, "loss": 0.1063, "num_input_tokens_seen": 595144, "step": 1450 }, { "epoch": 1.761501210653753, "grad_norm": 2.1361184120178223, "learning_rate": 4.400726392251816e-05, "loss": 0.0341, "num_input_tokens_seen": 597192, "step": 1455 }, { "epoch": 1.7675544794188862, "grad_norm": 3.6753082275390625, "learning_rate": 4.4158595641646486e-05, "loss": 0.0917, "num_input_tokens_seen": 599240, "step": 1460 }, { "epoch": 1.7736077481840193, "grad_norm": 15.70644760131836, "learning_rate": 4.430992736077482e-05, "loss": 0.256, "num_input_tokens_seen": 601288, "step": 1465 }, { "epoch": 1.7796610169491527, "grad_norm": 9.818907737731934, "learning_rate": 4.446125907990315e-05, "loss": 0.0806, "num_input_tokens_seen": 603240, "step": 1470 }, { "epoch": 1.7857142857142856, "grad_norm": 0.17702531814575195, "learning_rate": 4.461259079903148e-05, "loss": 0.2258, "num_input_tokens_seen": 605288, "step": 1475 }, { "epoch": 1.791767554479419, "grad_norm": 3.491499423980713, "learning_rate": 4.476392251815981e-05, "loss": 0.1337, "num_input_tokens_seen": 607368, "step": 1480 }, { "epoch": 1.797820823244552, "grad_norm": 0.03223598748445511, "learning_rate": 4.491525423728814e-05, "loss": 0.0931, "num_input_tokens_seen": 609384, "step": 1485 }, { "epoch": 1.8038740920096852, "grad_norm": 1.5729188919067383, "learning_rate": 4.506658595641647e-05, "loss": 0.1407, "num_input_tokens_seen": 611528, "step": 1490 }, { "epoch": 1.8099273607748185, "grad_norm": 0.2596810758113861, "learning_rate": 4.52179176755448e-05, "loss": 0.0546, "num_input_tokens_seen": 613544, "step": 1495 }, { "epoch": 1.8159806295399514, "grad_norm": 7.1788129806518555, "learning_rate": 4.5369249394673125e-05, "loss": 0.0436, "num_input_tokens_seen": 615656, "step": 1500 }, { "epoch": 1.8220338983050848, "grad_norm": 13.192105293273926, "learning_rate": 4.552058111380145e-05, "loss": 0.2886, "num_input_tokens_seen": 617672, "step": 1505 }, { "epoch": 1.828087167070218, "grad_norm": 0.028203818947076797, "learning_rate": 4.5671912832929786e-05, "loss": 0.0443, "num_input_tokens_seen": 619784, "step": 1510 }, { "epoch": 1.834140435835351, "grad_norm": 2.2290990352630615, "learning_rate": 4.582324455205811e-05, "loss": 0.1949, "num_input_tokens_seen": 621736, "step": 1515 }, { "epoch": 1.8401937046004844, "grad_norm": 1.1592543125152588, "learning_rate": 4.597457627118644e-05, "loss": 0.2862, "num_input_tokens_seen": 623880, "step": 1520 }, { "epoch": 1.8462469733656173, "grad_norm": 4.421098232269287, "learning_rate": 4.612590799031477e-05, "loss": 0.278, "num_input_tokens_seen": 625768, "step": 1525 }, { "epoch": 1.8523002421307506, "grad_norm": 0.15022696554660797, "learning_rate": 4.62772397094431e-05, "loss": 0.0648, "num_input_tokens_seen": 627912, "step": 1530 }, { "epoch": 1.8583535108958837, "grad_norm": 0.24027438461780548, "learning_rate": 4.642857142857143e-05, "loss": 0.1585, "num_input_tokens_seen": 629992, "step": 1535 }, { "epoch": 1.8644067796610169, "grad_norm": 0.09815403819084167, "learning_rate": 4.6579903147699763e-05, "loss": 0.0757, "num_input_tokens_seen": 631944, "step": 1540 }, { "epoch": 1.8704600484261502, "grad_norm": 9.321256637573242, "learning_rate": 4.673123486682809e-05, "loss": 0.1218, "num_input_tokens_seen": 633928, "step": 1545 }, { "epoch": 1.8765133171912833, "grad_norm": 5.844165325164795, "learning_rate": 4.688256658595642e-05, "loss": 0.0535, "num_input_tokens_seen": 635976, "step": 1550 }, { "epoch": 1.8825665859564165, "grad_norm": 6.887495040893555, "learning_rate": 4.703389830508475e-05, "loss": 0.1713, "num_input_tokens_seen": 638152, "step": 1555 }, { "epoch": 1.8886198547215496, "grad_norm": 8.237529754638672, "learning_rate": 4.718523002421308e-05, "loss": 0.3364, "num_input_tokens_seen": 640136, "step": 1560 }, { "epoch": 1.8946731234866827, "grad_norm": 0.5328813791275024, "learning_rate": 4.733656174334141e-05, "loss": 0.0951, "num_input_tokens_seen": 642152, "step": 1565 }, { "epoch": 1.900726392251816, "grad_norm": 3.5063600540161133, "learning_rate": 4.7487893462469734e-05, "loss": 0.1417, "num_input_tokens_seen": 644200, "step": 1570 }, { "epoch": 1.9067796610169492, "grad_norm": 0.4912002384662628, "learning_rate": 4.763922518159806e-05, "loss": 0.099, "num_input_tokens_seen": 646344, "step": 1575 }, { "epoch": 1.9128329297820823, "grad_norm": 0.8073127865791321, "learning_rate": 4.7790556900726395e-05, "loss": 0.081, "num_input_tokens_seen": 648264, "step": 1580 }, { "epoch": 1.9188861985472156, "grad_norm": 1.4225828647613525, "learning_rate": 4.794188861985472e-05, "loss": 0.0615, "num_input_tokens_seen": 650248, "step": 1585 }, { "epoch": 1.9249394673123486, "grad_norm": 8.7071533203125, "learning_rate": 4.809322033898305e-05, "loss": 0.1007, "num_input_tokens_seen": 652360, "step": 1590 }, { "epoch": 1.930992736077482, "grad_norm": 4.1327948570251465, "learning_rate": 4.8244552058111384e-05, "loss": 0.1293, "num_input_tokens_seen": 654376, "step": 1595 }, { "epoch": 1.937046004842615, "grad_norm": 11.142343521118164, "learning_rate": 4.839588377723971e-05, "loss": 0.2243, "num_input_tokens_seen": 656392, "step": 1600 }, { "epoch": 1.9430992736077481, "grad_norm": 0.6282988786697388, "learning_rate": 4.8547215496368045e-05, "loss": 0.0802, "num_input_tokens_seen": 658376, "step": 1605 }, { "epoch": 1.9491525423728815, "grad_norm": 2.129737138748169, "learning_rate": 4.869854721549637e-05, "loss": 0.0926, "num_input_tokens_seen": 660328, "step": 1610 }, { "epoch": 1.9552058111380144, "grad_norm": 6.5117411613464355, "learning_rate": 4.88498789346247e-05, "loss": 0.1777, "num_input_tokens_seen": 662312, "step": 1615 }, { "epoch": 1.9612590799031477, "grad_norm": 0.49309098720550537, "learning_rate": 4.900121065375303e-05, "loss": 0.1105, "num_input_tokens_seen": 664136, "step": 1620 }, { "epoch": 1.9673123486682809, "grad_norm": 5.370323657989502, "learning_rate": 4.915254237288136e-05, "loss": 0.1081, "num_input_tokens_seen": 666088, "step": 1625 }, { "epoch": 1.973365617433414, "grad_norm": 2.3991456031799316, "learning_rate": 4.930387409200969e-05, "loss": 0.093, "num_input_tokens_seen": 668200, "step": 1630 }, { "epoch": 1.9794188861985473, "grad_norm": 0.6650813817977905, "learning_rate": 4.9455205811138016e-05, "loss": 0.0608, "num_input_tokens_seen": 670248, "step": 1635 }, { "epoch": 1.9854721549636802, "grad_norm": 9.133147239685059, "learning_rate": 4.960653753026634e-05, "loss": 0.1644, "num_input_tokens_seen": 672488, "step": 1640 }, { "epoch": 1.9915254237288136, "grad_norm": 0.03334037587046623, "learning_rate": 4.975786924939467e-05, "loss": 0.0651, "num_input_tokens_seen": 674568, "step": 1645 }, { "epoch": 1.9975786924939467, "grad_norm": 0.04566989094018936, "learning_rate": 4.9909200968523005e-05, "loss": 0.134, "num_input_tokens_seen": 676584, "step": 1650 }, { "epoch": 2.0, "eval_loss": 0.1023661345243454, "eval_runtime": 4.9629, "eval_samples_per_second": 73.949, "eval_steps_per_second": 18.538, "num_input_tokens_seen": 677000, "step": 1652 }, { "epoch": 2.00363196125908, "grad_norm": 0.8894397616386414, "learning_rate": 4.9999997767637914e-05, "loss": 0.1621, "num_input_tokens_seen": 678216, "step": 1655 }, { "epoch": 2.009685230024213, "grad_norm": 1.1072958707809448, "learning_rate": 4.9999972653568995e-05, "loss": 0.146, "num_input_tokens_seen": 680424, "step": 1660 }, { "epoch": 2.015738498789346, "grad_norm": 0.026830771937966347, "learning_rate": 4.9999919635006676e-05, "loss": 0.0271, "num_input_tokens_seen": 682568, "step": 1665 }, { "epoch": 2.0217917675544794, "grad_norm": 0.15988993644714355, "learning_rate": 4.999983871201014e-05, "loss": 0.1508, "num_input_tokens_seen": 684488, "step": 1670 }, { "epoch": 2.0278450363196128, "grad_norm": 4.612785816192627, "learning_rate": 4.99997298846697e-05, "loss": 0.1583, "num_input_tokens_seen": 686440, "step": 1675 }, { "epoch": 2.0338983050847457, "grad_norm": 0.019917165860533714, "learning_rate": 4.9999593153106825e-05, "loss": 0.1598, "num_input_tokens_seen": 688424, "step": 1680 }, { "epoch": 2.039951573849879, "grad_norm": 2.4089601039886475, "learning_rate": 4.999942851747414e-05, "loss": 0.1277, "num_input_tokens_seen": 690440, "step": 1685 }, { "epoch": 2.046004842615012, "grad_norm": 8.718679428100586, "learning_rate": 4.999923597795542e-05, "loss": 0.1415, "num_input_tokens_seen": 692488, "step": 1690 }, { "epoch": 2.0520581113801453, "grad_norm": 3.5554420948028564, "learning_rate": 4.999901553476555e-05, "loss": 0.0218, "num_input_tokens_seen": 694536, "step": 1695 }, { "epoch": 2.0581113801452786, "grad_norm": 1.5239126682281494, "learning_rate": 4.9998767188150605e-05, "loss": 0.0853, "num_input_tokens_seen": 696680, "step": 1700 }, { "epoch": 2.0641646489104115, "grad_norm": 1.0185511112213135, "learning_rate": 4.9998490938387767e-05, "loss": 0.0218, "num_input_tokens_seen": 698600, "step": 1705 }, { "epoch": 2.070217917675545, "grad_norm": 0.25121477246284485, "learning_rate": 4.99981867857854e-05, "loss": 0.0454, "num_input_tokens_seen": 700616, "step": 1710 }, { "epoch": 2.0762711864406778, "grad_norm": 1.5077519416809082, "learning_rate": 4.999785473068297e-05, "loss": 0.0817, "num_input_tokens_seen": 702728, "step": 1715 }, { "epoch": 2.082324455205811, "grad_norm": 0.02467760257422924, "learning_rate": 4.999749477345113e-05, "loss": 0.0724, "num_input_tokens_seen": 704744, "step": 1720 }, { "epoch": 2.0883777239709445, "grad_norm": 0.1022225096821785, "learning_rate": 4.9997106914491646e-05, "loss": 0.0211, "num_input_tokens_seen": 706984, "step": 1725 }, { "epoch": 2.0944309927360774, "grad_norm": 0.24314634501934052, "learning_rate": 4.999669115423745e-05, "loss": 0.2938, "num_input_tokens_seen": 709032, "step": 1730 }, { "epoch": 2.1004842615012107, "grad_norm": 0.20275698602199554, "learning_rate": 4.999624749315259e-05, "loss": 0.0893, "num_input_tokens_seen": 711176, "step": 1735 }, { "epoch": 2.106537530266344, "grad_norm": 0.08545180410146713, "learning_rate": 4.999577593173229e-05, "loss": 0.1369, "num_input_tokens_seen": 713224, "step": 1740 }, { "epoch": 2.112590799031477, "grad_norm": 0.7550906538963318, "learning_rate": 4.999527647050289e-05, "loss": 0.0766, "num_input_tokens_seen": 715208, "step": 1745 }, { "epoch": 2.1186440677966103, "grad_norm": 5.9434332847595215, "learning_rate": 4.9994749110021874e-05, "loss": 0.1718, "num_input_tokens_seen": 717256, "step": 1750 }, { "epoch": 2.124697336561743, "grad_norm": 7.2458600997924805, "learning_rate": 4.999419385087787e-05, "loss": 0.1144, "num_input_tokens_seen": 719496, "step": 1755 }, { "epoch": 2.1307506053268765, "grad_norm": 0.823076069355011, "learning_rate": 4.9993610693690666e-05, "loss": 0.024, "num_input_tokens_seen": 721576, "step": 1760 }, { "epoch": 2.13680387409201, "grad_norm": 0.25200021266937256, "learning_rate": 4.999299963911115e-05, "loss": 0.0559, "num_input_tokens_seen": 723528, "step": 1765 }, { "epoch": 2.142857142857143, "grad_norm": 2.297940969467163, "learning_rate": 4.9992360687821385e-05, "loss": 0.1001, "num_input_tokens_seen": 725672, "step": 1770 }, { "epoch": 2.148910411622276, "grad_norm": 11.231027603149414, "learning_rate": 4.999169384053454e-05, "loss": 0.0671, "num_input_tokens_seen": 727752, "step": 1775 }, { "epoch": 2.154963680387409, "grad_norm": 0.20129230618476868, "learning_rate": 4.999099909799495e-05, "loss": 0.0744, "num_input_tokens_seen": 729800, "step": 1780 }, { "epoch": 2.1610169491525424, "grad_norm": 0.09001114964485168, "learning_rate": 4.9990276460978074e-05, "loss": 0.0284, "num_input_tokens_seen": 731848, "step": 1785 }, { "epoch": 2.1670702179176757, "grad_norm": 0.46201828122138977, "learning_rate": 4.99895259302905e-05, "loss": 0.1034, "num_input_tokens_seen": 733864, "step": 1790 }, { "epoch": 2.1731234866828086, "grad_norm": 0.15420645475387573, "learning_rate": 4.998874750676996e-05, "loss": 0.0647, "num_input_tokens_seen": 735816, "step": 1795 }, { "epoch": 2.179176755447942, "grad_norm": 0.11122027039527893, "learning_rate": 4.99879411912853e-05, "loss": 0.0927, "num_input_tokens_seen": 737992, "step": 1800 }, { "epoch": 2.185230024213075, "grad_norm": 0.10964418947696686, "learning_rate": 4.998710698473654e-05, "loss": 0.1851, "num_input_tokens_seen": 740136, "step": 1805 }, { "epoch": 2.1912832929782082, "grad_norm": 11.59616470336914, "learning_rate": 4.9986244888054786e-05, "loss": 0.2468, "num_input_tokens_seen": 741960, "step": 1810 }, { "epoch": 2.1973365617433416, "grad_norm": 6.320070743560791, "learning_rate": 4.998535490220231e-05, "loss": 0.1276, "num_input_tokens_seen": 743976, "step": 1815 }, { "epoch": 2.2033898305084745, "grad_norm": 0.5940458178520203, "learning_rate": 4.998443702817248e-05, "loss": 0.0491, "num_input_tokens_seen": 746120, "step": 1820 }, { "epoch": 2.209443099273608, "grad_norm": 7.003976345062256, "learning_rate": 4.9983491266989815e-05, "loss": 0.1445, "num_input_tokens_seen": 748200, "step": 1825 }, { "epoch": 2.2154963680387407, "grad_norm": 3.633148193359375, "learning_rate": 4.998251761970997e-05, "loss": 0.1311, "num_input_tokens_seen": 750120, "step": 1830 }, { "epoch": 2.221549636803874, "grad_norm": 2.8354320526123047, "learning_rate": 4.998151608741969e-05, "loss": 0.1333, "num_input_tokens_seen": 752328, "step": 1835 }, { "epoch": 2.2276029055690074, "grad_norm": 2.127987861633301, "learning_rate": 4.998048667123688e-05, "loss": 0.0688, "num_input_tokens_seen": 754440, "step": 1840 }, { "epoch": 2.2336561743341403, "grad_norm": 5.695094585418701, "learning_rate": 4.9979429372310544e-05, "loss": 0.1762, "num_input_tokens_seen": 756328, "step": 1845 }, { "epoch": 2.2397094430992737, "grad_norm": 6.138016700744629, "learning_rate": 4.9978344191820825e-05, "loss": 0.0411, "num_input_tokens_seen": 758408, "step": 1850 }, { "epoch": 2.2457627118644066, "grad_norm": 1.80014169216156, "learning_rate": 4.9977231130978986e-05, "loss": 0.0943, "num_input_tokens_seen": 760392, "step": 1855 }, { "epoch": 2.25181598062954, "grad_norm": 0.26050910353660583, "learning_rate": 4.99760901910274e-05, "loss": 0.0633, "num_input_tokens_seen": 762472, "step": 1860 }, { "epoch": 2.2578692493946733, "grad_norm": 0.18825511634349823, "learning_rate": 4.997492137323956e-05, "loss": 0.1862, "num_input_tokens_seen": 764520, "step": 1865 }, { "epoch": 2.263922518159806, "grad_norm": 7.164376735687256, "learning_rate": 4.997372467892008e-05, "loss": 0.1474, "num_input_tokens_seen": 766504, "step": 1870 }, { "epoch": 2.2699757869249395, "grad_norm": 0.1786065399646759, "learning_rate": 4.997250010940469e-05, "loss": 0.0084, "num_input_tokens_seen": 768552, "step": 1875 }, { "epoch": 2.2760290556900724, "grad_norm": 8.683467864990234, "learning_rate": 4.997124766606023e-05, "loss": 0.0714, "num_input_tokens_seen": 770600, "step": 1880 }, { "epoch": 2.2820823244552058, "grad_norm": 0.26213735342025757, "learning_rate": 4.996996735028465e-05, "loss": 0.102, "num_input_tokens_seen": 772616, "step": 1885 }, { "epoch": 2.288135593220339, "grad_norm": 18.57248878479004, "learning_rate": 4.9968659163507014e-05, "loss": 0.0677, "num_input_tokens_seen": 774696, "step": 1890 }, { "epoch": 2.294188861985472, "grad_norm": 0.21370252966880798, "learning_rate": 4.9967323107187494e-05, "loss": 0.0569, "num_input_tokens_seen": 776712, "step": 1895 }, { "epoch": 2.3002421307506054, "grad_norm": 10.21783447265625, "learning_rate": 4.996595918281738e-05, "loss": 0.2921, "num_input_tokens_seen": 778760, "step": 1900 }, { "epoch": 2.3062953995157383, "grad_norm": 1.071658730506897, "learning_rate": 4.996456739191905e-05, "loss": 0.0133, "num_input_tokens_seen": 780936, "step": 1905 }, { "epoch": 2.3123486682808716, "grad_norm": 7.034092903137207, "learning_rate": 4.9963147736046e-05, "loss": 0.2764, "num_input_tokens_seen": 782920, "step": 1910 }, { "epoch": 2.318401937046005, "grad_norm": 0.06497315317392349, "learning_rate": 4.9961700216782816e-05, "loss": 0.0347, "num_input_tokens_seen": 784808, "step": 1915 }, { "epoch": 2.324455205811138, "grad_norm": 0.0576774887740612, "learning_rate": 4.996022483574519e-05, "loss": 0.1964, "num_input_tokens_seen": 786792, "step": 1920 }, { "epoch": 2.330508474576271, "grad_norm": 1.3441082239151, "learning_rate": 4.995872159457994e-05, "loss": 0.0383, "num_input_tokens_seen": 788776, "step": 1925 }, { "epoch": 2.3365617433414045, "grad_norm": 0.2703811824321747, "learning_rate": 4.995719049496491e-05, "loss": 0.015, "num_input_tokens_seen": 790760, "step": 1930 }, { "epoch": 2.3426150121065374, "grad_norm": 0.06731747835874557, "learning_rate": 4.9955631538609127e-05, "loss": 0.0186, "num_input_tokens_seen": 792776, "step": 1935 }, { "epoch": 2.348668280871671, "grad_norm": 4.9664692878723145, "learning_rate": 4.995404472725264e-05, "loss": 0.1019, "num_input_tokens_seen": 794952, "step": 1940 }, { "epoch": 2.3547215496368037, "grad_norm": 1.4589275121688843, "learning_rate": 4.995243006266663e-05, "loss": 0.2192, "num_input_tokens_seen": 797032, "step": 1945 }, { "epoch": 2.360774818401937, "grad_norm": 2.5762882232666016, "learning_rate": 4.9950787546653354e-05, "loss": 0.161, "num_input_tokens_seen": 799272, "step": 1950 }, { "epoch": 2.3668280871670704, "grad_norm": 2.5055031776428223, "learning_rate": 4.9949117181046156e-05, "loss": 0.1064, "num_input_tokens_seen": 801352, "step": 1955 }, { "epoch": 2.3728813559322033, "grad_norm": 4.71099853515625, "learning_rate": 4.9947418967709465e-05, "loss": 0.0771, "num_input_tokens_seen": 803368, "step": 1960 }, { "epoch": 2.3789346246973366, "grad_norm": 0.2060762494802475, "learning_rate": 4.99456929085388e-05, "loss": 0.0489, "num_input_tokens_seen": 805480, "step": 1965 }, { "epoch": 2.38498789346247, "grad_norm": 6.6143012046813965, "learning_rate": 4.994393900546074e-05, "loss": 0.0737, "num_input_tokens_seen": 807464, "step": 1970 }, { "epoch": 2.391041162227603, "grad_norm": 8.21241569519043, "learning_rate": 4.994215726043298e-05, "loss": 0.0903, "num_input_tokens_seen": 809576, "step": 1975 }, { "epoch": 2.3970944309927362, "grad_norm": 4.89132833480835, "learning_rate": 4.9940347675444254e-05, "loss": 0.1582, "num_input_tokens_seen": 811528, "step": 1980 }, { "epoch": 2.403147699757869, "grad_norm": 0.10856650769710541, "learning_rate": 4.993851025251439e-05, "loss": 0.0465, "num_input_tokens_seen": 813480, "step": 1985 }, { "epoch": 2.4092009685230025, "grad_norm": 5.713769435882568, "learning_rate": 4.993664499369429e-05, "loss": 0.1266, "num_input_tokens_seen": 815560, "step": 1990 }, { "epoch": 2.415254237288136, "grad_norm": 8.373800277709961, "learning_rate": 4.993475190106591e-05, "loss": 0.0986, "num_input_tokens_seen": 817576, "step": 1995 }, { "epoch": 2.4213075060532687, "grad_norm": 0.33537766337394714, "learning_rate": 4.9932830976742294e-05, "loss": 0.0769, "num_input_tokens_seen": 819560, "step": 2000 }, { "epoch": 2.427360774818402, "grad_norm": 6.679559230804443, "learning_rate": 4.993088222286754e-05, "loss": 0.0987, "num_input_tokens_seen": 821640, "step": 2005 }, { "epoch": 2.433414043583535, "grad_norm": 0.16475240886211395, "learning_rate": 4.9928905641616794e-05, "loss": 0.1587, "num_input_tokens_seen": 823784, "step": 2010 }, { "epoch": 2.4394673123486683, "grad_norm": 0.26706570386886597, "learning_rate": 4.992690123519631e-05, "loss": 0.0833, "num_input_tokens_seen": 825800, "step": 2015 }, { "epoch": 2.4455205811138017, "grad_norm": 5.374063014984131, "learning_rate": 4.992486900584334e-05, "loss": 0.0375, "num_input_tokens_seen": 827944, "step": 2020 }, { "epoch": 2.4515738498789346, "grad_norm": 8.693047523498535, "learning_rate": 4.992280895582623e-05, "loss": 0.0564, "num_input_tokens_seen": 830152, "step": 2025 }, { "epoch": 2.457627118644068, "grad_norm": 0.0284031443297863, "learning_rate": 4.992072108744436e-05, "loss": 0.0542, "num_input_tokens_seen": 832232, "step": 2030 }, { "epoch": 2.463680387409201, "grad_norm": 0.1019381508231163, "learning_rate": 4.99186054030282e-05, "loss": 0.0239, "num_input_tokens_seen": 834344, "step": 2035 }, { "epoch": 2.469733656174334, "grad_norm": 0.040061287581920624, "learning_rate": 4.991646190493919e-05, "loss": 0.0526, "num_input_tokens_seen": 836232, "step": 2040 }, { "epoch": 2.4757869249394675, "grad_norm": 8.05505084991455, "learning_rate": 4.9914290595569895e-05, "loss": 0.062, "num_input_tokens_seen": 838216, "step": 2045 }, { "epoch": 2.4818401937046004, "grad_norm": 10.849311828613281, "learning_rate": 4.991209147734388e-05, "loss": 0.1211, "num_input_tokens_seen": 840200, "step": 2050 }, { "epoch": 2.4878934624697338, "grad_norm": 0.7695053219795227, "learning_rate": 4.990986455271576e-05, "loss": 0.0042, "num_input_tokens_seen": 842376, "step": 2055 }, { "epoch": 2.4939467312348667, "grad_norm": 0.11878375709056854, "learning_rate": 4.990760982417118e-05, "loss": 0.0148, "num_input_tokens_seen": 844488, "step": 2060 }, { "epoch": 2.5, "grad_norm": 0.6916325688362122, "learning_rate": 4.9905327294226834e-05, "loss": 0.1321, "num_input_tokens_seen": 846536, "step": 2065 }, { "epoch": 2.5060532687651333, "grad_norm": 0.05584622547030449, "learning_rate": 4.990301696543043e-05, "loss": 0.0813, "num_input_tokens_seen": 848584, "step": 2070 }, { "epoch": 2.5121065375302662, "grad_norm": 12.514789581298828, "learning_rate": 4.9900678840360714e-05, "loss": 0.1222, "num_input_tokens_seen": 850696, "step": 2075 }, { "epoch": 2.5181598062953996, "grad_norm": 1.05078125, "learning_rate": 4.989831292162747e-05, "loss": 0.0872, "num_input_tokens_seen": 852744, "step": 2080 }, { "epoch": 2.5242130750605325, "grad_norm": 8.321377754211426, "learning_rate": 4.9895919211871465e-05, "loss": 0.1842, "num_input_tokens_seen": 854696, "step": 2085 }, { "epoch": 2.530266343825666, "grad_norm": 2.0978598594665527, "learning_rate": 4.989349771376454e-05, "loss": 0.0538, "num_input_tokens_seen": 856680, "step": 2090 }, { "epoch": 2.536319612590799, "grad_norm": 1.191090703010559, "learning_rate": 4.9891048430009515e-05, "loss": 0.0961, "num_input_tokens_seen": 858600, "step": 2095 }, { "epoch": 2.542372881355932, "grad_norm": 3.424934148788452, "learning_rate": 4.988857136334023e-05, "loss": 0.0211, "num_input_tokens_seen": 860744, "step": 2100 }, { "epoch": 2.5484261501210654, "grad_norm": 6.350499629974365, "learning_rate": 4.9886066516521535e-05, "loss": 0.1175, "num_input_tokens_seen": 862888, "step": 2105 }, { "epoch": 2.5544794188861983, "grad_norm": 0.12872959673404694, "learning_rate": 4.9883533892349306e-05, "loss": 0.0788, "num_input_tokens_seen": 864872, "step": 2110 }, { "epoch": 2.5605326876513317, "grad_norm": 13.053696632385254, "learning_rate": 4.9880973493650394e-05, "loss": 0.1924, "num_input_tokens_seen": 866856, "step": 2115 }, { "epoch": 2.566585956416465, "grad_norm": 4.3271002769470215, "learning_rate": 4.9878385323282686e-05, "loss": 0.1301, "num_input_tokens_seen": 868936, "step": 2120 }, { "epoch": 2.572639225181598, "grad_norm": 0.7733317017555237, "learning_rate": 4.987576938413504e-05, "loss": 0.0807, "num_input_tokens_seen": 870888, "step": 2125 }, { "epoch": 2.5786924939467313, "grad_norm": 0.29834604263305664, "learning_rate": 4.987312567912731e-05, "loss": 0.1588, "num_input_tokens_seen": 873000, "step": 2130 }, { "epoch": 2.584745762711864, "grad_norm": 0.23283173143863678, "learning_rate": 4.987045421121036e-05, "loss": 0.0344, "num_input_tokens_seen": 874952, "step": 2135 }, { "epoch": 2.5907990314769975, "grad_norm": 0.9966800212860107, "learning_rate": 4.986775498336602e-05, "loss": 0.0761, "num_input_tokens_seen": 877032, "step": 2140 }, { "epoch": 2.596852300242131, "grad_norm": 7.554386615753174, "learning_rate": 4.9865027998607127e-05, "loss": 0.205, "num_input_tokens_seen": 879080, "step": 2145 }, { "epoch": 2.6029055690072638, "grad_norm": 0.2937626540660858, "learning_rate": 4.986227325997749e-05, "loss": 0.0641, "num_input_tokens_seen": 880968, "step": 2150 }, { "epoch": 2.608958837772397, "grad_norm": 3.250307083129883, "learning_rate": 4.985949077055189e-05, "loss": 0.0558, "num_input_tokens_seen": 883144, "step": 2155 }, { "epoch": 2.61501210653753, "grad_norm": 0.12180660665035248, "learning_rate": 4.985668053343609e-05, "loss": 0.1168, "num_input_tokens_seen": 885160, "step": 2160 }, { "epoch": 2.6210653753026634, "grad_norm": 7.6713128089904785, "learning_rate": 4.9853842551766825e-05, "loss": 0.1212, "num_input_tokens_seen": 887272, "step": 2165 }, { "epoch": 2.6271186440677967, "grad_norm": 3.7881033420562744, "learning_rate": 4.9850976828711796e-05, "loss": 0.125, "num_input_tokens_seen": 889352, "step": 2170 }, { "epoch": 2.6331719128329296, "grad_norm": 0.11999412626028061, "learning_rate": 4.984808336746966e-05, "loss": 0.0499, "num_input_tokens_seen": 891368, "step": 2175 }, { "epoch": 2.639225181598063, "grad_norm": 0.6555036902427673, "learning_rate": 4.984516217127005e-05, "loss": 0.1242, "num_input_tokens_seen": 893352, "step": 2180 }, { "epoch": 2.645278450363196, "grad_norm": 3.24464750289917, "learning_rate": 4.984221324337356e-05, "loss": 0.1036, "num_input_tokens_seen": 895432, "step": 2185 }, { "epoch": 2.651331719128329, "grad_norm": 2.2484326362609863, "learning_rate": 4.983923658707172e-05, "loss": 0.2375, "num_input_tokens_seen": 897608, "step": 2190 }, { "epoch": 2.6573849878934626, "grad_norm": 4.118319511413574, "learning_rate": 4.9836232205687006e-05, "loss": 0.1108, "num_input_tokens_seen": 899496, "step": 2195 }, { "epoch": 2.663438256658596, "grad_norm": 2.313467502593994, "learning_rate": 4.983320010257287e-05, "loss": 0.1058, "num_input_tokens_seen": 901608, "step": 2200 }, { "epoch": 2.669491525423729, "grad_norm": 3.7684824466705322, "learning_rate": 4.9830140281113666e-05, "loss": 0.084, "num_input_tokens_seen": 903816, "step": 2205 }, { "epoch": 2.6755447941888617, "grad_norm": 2.4435553550720215, "learning_rate": 4.982705274472472e-05, "loss": 0.1471, "num_input_tokens_seen": 905864, "step": 2210 }, { "epoch": 2.681598062953995, "grad_norm": 0.20433717966079712, "learning_rate": 4.982393749685229e-05, "loss": 0.084, "num_input_tokens_seen": 908040, "step": 2215 }, { "epoch": 2.6876513317191284, "grad_norm": 0.1152157410979271, "learning_rate": 4.982079454097354e-05, "loss": 0.0681, "num_input_tokens_seen": 910152, "step": 2220 }, { "epoch": 2.6937046004842617, "grad_norm": 0.005961982067674398, "learning_rate": 4.9817623880596586e-05, "loss": 0.102, "num_input_tokens_seen": 912232, "step": 2225 }, { "epoch": 2.6997578692493946, "grad_norm": 4.668184757232666, "learning_rate": 4.981442551926047e-05, "loss": 0.0827, "num_input_tokens_seen": 914472, "step": 2230 }, { "epoch": 2.705811138014528, "grad_norm": 3.4346671104431152, "learning_rate": 4.981119946053512e-05, "loss": 0.0906, "num_input_tokens_seen": 916488, "step": 2235 }, { "epoch": 2.711864406779661, "grad_norm": 1.2049880027770996, "learning_rate": 4.980794570802141e-05, "loss": 0.0258, "num_input_tokens_seen": 918472, "step": 2240 }, { "epoch": 2.7179176755447942, "grad_norm": 12.488987922668457, "learning_rate": 4.980466426535112e-05, "loss": 0.2063, "num_input_tokens_seen": 920456, "step": 2245 }, { "epoch": 2.7239709443099276, "grad_norm": 0.08432286232709885, "learning_rate": 4.980135513618693e-05, "loss": 0.0862, "num_input_tokens_seen": 922472, "step": 2250 }, { "epoch": 2.7300242130750605, "grad_norm": 1.797545313835144, "learning_rate": 4.979801832422243e-05, "loss": 0.01, "num_input_tokens_seen": 924584, "step": 2255 }, { "epoch": 2.736077481840194, "grad_norm": 0.24521039426326752, "learning_rate": 4.9794653833182106e-05, "loss": 0.0909, "num_input_tokens_seen": 926536, "step": 2260 }, { "epoch": 2.7421307506053267, "grad_norm": 8.946980476379395, "learning_rate": 4.979126166682133e-05, "loss": 0.1686, "num_input_tokens_seen": 928520, "step": 2265 }, { "epoch": 2.74818401937046, "grad_norm": 0.7505645155906677, "learning_rate": 4.9787841828926395e-05, "loss": 0.0031, "num_input_tokens_seen": 930472, "step": 2270 }, { "epoch": 2.7542372881355934, "grad_norm": 0.42342114448547363, "learning_rate": 4.978439432331443e-05, "loss": 0.0814, "num_input_tokens_seen": 932392, "step": 2275 }, { "epoch": 2.7602905569007263, "grad_norm": 0.47326338291168213, "learning_rate": 4.9780919153833504e-05, "loss": 0.0078, "num_input_tokens_seen": 934312, "step": 2280 }, { "epoch": 2.7663438256658597, "grad_norm": 0.12262631952762604, "learning_rate": 4.977741632436251e-05, "loss": 0.1356, "num_input_tokens_seen": 936392, "step": 2285 }, { "epoch": 2.7723970944309926, "grad_norm": 0.036344945430755615, "learning_rate": 4.977388583881126e-05, "loss": 0.0357, "num_input_tokens_seen": 938408, "step": 2290 }, { "epoch": 2.778450363196126, "grad_norm": 0.02745259739458561, "learning_rate": 4.97703277011204e-05, "loss": 0.022, "num_input_tokens_seen": 940488, "step": 2295 }, { "epoch": 2.7845036319612593, "grad_norm": 0.04572382941842079, "learning_rate": 4.976674191526146e-05, "loss": 0.0467, "num_input_tokens_seen": 942632, "step": 2300 }, { "epoch": 2.790556900726392, "grad_norm": 2.7454938888549805, "learning_rate": 4.976312848523683e-05, "loss": 0.0546, "num_input_tokens_seen": 944744, "step": 2305 }, { "epoch": 2.7966101694915255, "grad_norm": 17.375802993774414, "learning_rate": 4.975948741507974e-05, "loss": 0.1246, "num_input_tokens_seen": 946952, "step": 2310 }, { "epoch": 2.8026634382566584, "grad_norm": 0.028796181082725525, "learning_rate": 4.9755818708854306e-05, "loss": 0.0609, "num_input_tokens_seen": 948968, "step": 2315 }, { "epoch": 2.8087167070217918, "grad_norm": 0.4098775088787079, "learning_rate": 4.975212237065544e-05, "loss": 0.1534, "num_input_tokens_seen": 951080, "step": 2320 }, { "epoch": 2.814769975786925, "grad_norm": 1.0437414646148682, "learning_rate": 4.974839840460895e-05, "loss": 0.0915, "num_input_tokens_seen": 953096, "step": 2325 }, { "epoch": 2.820823244552058, "grad_norm": 9.658169746398926, "learning_rate": 4.9744646814871435e-05, "loss": 0.1555, "num_input_tokens_seen": 955144, "step": 2330 }, { "epoch": 2.8268765133171914, "grad_norm": 4.012033939361572, "learning_rate": 4.974086760563036e-05, "loss": 0.1526, "num_input_tokens_seen": 957096, "step": 2335 }, { "epoch": 2.8329297820823243, "grad_norm": 4.228144645690918, "learning_rate": 4.973706078110401e-05, "loss": 0.0947, "num_input_tokens_seen": 959304, "step": 2340 }, { "epoch": 2.8389830508474576, "grad_norm": 5.867490768432617, "learning_rate": 4.973322634554147e-05, "loss": 0.0198, "num_input_tokens_seen": 961448, "step": 2345 }, { "epoch": 2.845036319612591, "grad_norm": 5.353538990020752, "learning_rate": 4.9729364303222684e-05, "loss": 0.0789, "num_input_tokens_seen": 963528, "step": 2350 }, { "epoch": 2.851089588377724, "grad_norm": 0.10336983948945999, "learning_rate": 4.972547465845839e-05, "loss": 0.0625, "num_input_tokens_seen": 965640, "step": 2355 }, { "epoch": 2.857142857142857, "grad_norm": 0.25345975160598755, "learning_rate": 4.972155741559012e-05, "loss": 0.0731, "num_input_tokens_seen": 967880, "step": 2360 }, { "epoch": 2.86319612590799, "grad_norm": 3.3928165435791016, "learning_rate": 4.971761257899024e-05, "loss": 0.219, "num_input_tokens_seen": 969928, "step": 2365 }, { "epoch": 2.8692493946731235, "grad_norm": 0.26536864042282104, "learning_rate": 4.971364015306189e-05, "loss": 0.0118, "num_input_tokens_seen": 972040, "step": 2370 }, { "epoch": 2.875302663438257, "grad_norm": 3.296708822250366, "learning_rate": 4.9709640142239036e-05, "loss": 0.0733, "num_input_tokens_seen": 974152, "step": 2375 }, { "epoch": 2.8813559322033897, "grad_norm": 0.1411980241537094, "learning_rate": 4.970561255098639e-05, "loss": 0.0142, "num_input_tokens_seen": 976136, "step": 2380 }, { "epoch": 2.887409200968523, "grad_norm": 0.11907722800970078, "learning_rate": 4.970155738379948e-05, "loss": 0.0195, "num_input_tokens_seen": 978152, "step": 2385 }, { "epoch": 2.893462469733656, "grad_norm": 17.118701934814453, "learning_rate": 4.969747464520461e-05, "loss": 0.0812, "num_input_tokens_seen": 980168, "step": 2390 }, { "epoch": 2.8995157384987893, "grad_norm": 0.4315287470817566, "learning_rate": 4.969336433975886e-05, "loss": 0.1342, "num_input_tokens_seen": 982216, "step": 2395 }, { "epoch": 2.9055690072639226, "grad_norm": 0.03750577196478844, "learning_rate": 4.968922647205007e-05, "loss": 0.0985, "num_input_tokens_seen": 984392, "step": 2400 }, { "epoch": 2.9116222760290555, "grad_norm": 0.23740816116333008, "learning_rate": 4.968506104669685e-05, "loss": 0.0616, "num_input_tokens_seen": 986376, "step": 2405 }, { "epoch": 2.917675544794189, "grad_norm": 0.03188396617770195, "learning_rate": 4.968086806834856e-05, "loss": 0.1144, "num_input_tokens_seen": 988488, "step": 2410 }, { "epoch": 2.923728813559322, "grad_norm": 17.792011260986328, "learning_rate": 4.967664754168533e-05, "loss": 0.1005, "num_input_tokens_seen": 990536, "step": 2415 }, { "epoch": 2.929782082324455, "grad_norm": 2.012704372406006, "learning_rate": 4.967239947141803e-05, "loss": 0.1737, "num_input_tokens_seen": 992680, "step": 2420 }, { "epoch": 2.9358353510895885, "grad_norm": 0.41579756140708923, "learning_rate": 4.9668123862288284e-05, "loss": 0.0729, "num_input_tokens_seen": 994728, "step": 2425 }, { "epoch": 2.9418886198547214, "grad_norm": 0.4666353166103363, "learning_rate": 4.966382071906843e-05, "loss": 0.0772, "num_input_tokens_seen": 996808, "step": 2430 }, { "epoch": 2.9479418886198547, "grad_norm": 5.378293991088867, "learning_rate": 4.9659490046561564e-05, "loss": 0.0827, "num_input_tokens_seen": 998696, "step": 2435 }, { "epoch": 2.9539951573849876, "grad_norm": 6.692707538604736, "learning_rate": 4.96551318496015e-05, "loss": 0.0982, "num_input_tokens_seen": 1000744, "step": 2440 }, { "epoch": 2.960048426150121, "grad_norm": 5.907947540283203, "learning_rate": 4.965074613305277e-05, "loss": 0.1592, "num_input_tokens_seen": 1002760, "step": 2445 }, { "epoch": 2.9661016949152543, "grad_norm": 8.462333679199219, "learning_rate": 4.9646332901810636e-05, "loss": 0.137, "num_input_tokens_seen": 1004744, "step": 2450 }, { "epoch": 2.9721549636803877, "grad_norm": 0.33113446831703186, "learning_rate": 4.9641892160801055e-05, "loss": 0.0669, "num_input_tokens_seen": 1006792, "step": 2455 }, { "epoch": 2.9782082324455206, "grad_norm": 0.17718105018138885, "learning_rate": 4.9637423914980687e-05, "loss": 0.1616, "num_input_tokens_seen": 1008776, "step": 2460 }, { "epoch": 2.9842615012106535, "grad_norm": 0.07657644897699356, "learning_rate": 4.963292816933692e-05, "loss": 0.033, "num_input_tokens_seen": 1010824, "step": 2465 }, { "epoch": 2.990314769975787, "grad_norm": 0.3621397316455841, "learning_rate": 4.962840492888783e-05, "loss": 0.1404, "num_input_tokens_seen": 1012840, "step": 2470 }, { "epoch": 2.99636803874092, "grad_norm": 0.6266226172447205, "learning_rate": 4.962385419868214e-05, "loss": 0.0748, "num_input_tokens_seen": 1014888, "step": 2475 }, { "epoch": 3.0, "eval_loss": 0.08039100468158722, "eval_runtime": 4.9448, "eval_samples_per_second": 74.219, "eval_steps_per_second": 18.605, "num_input_tokens_seen": 1015816, "step": 2478 }, { "epoch": 3.002421307506053, "grad_norm": 0.8302012085914612, "learning_rate": 4.9619275983799304e-05, "loss": 0.0547, "num_input_tokens_seen": 1016616, "step": 2480 }, { "epoch": 3.0084745762711864, "grad_norm": 6.047123432159424, "learning_rate": 4.961467028934945e-05, "loss": 0.0184, "num_input_tokens_seen": 1018760, "step": 2485 }, { "epoch": 3.0145278450363198, "grad_norm": 0.09908270090818405, "learning_rate": 4.961003712047335e-05, "loss": 0.0582, "num_input_tokens_seen": 1020872, "step": 2490 }, { "epoch": 3.0205811138014527, "grad_norm": 11.10853099822998, "learning_rate": 4.960537648234247e-05, "loss": 0.1047, "num_input_tokens_seen": 1022920, "step": 2495 }, { "epoch": 3.026634382566586, "grad_norm": 4.4050774574279785, "learning_rate": 4.960068838015892e-05, "loss": 0.104, "num_input_tokens_seen": 1025000, "step": 2500 }, { "epoch": 3.032687651331719, "grad_norm": 0.06613756716251373, "learning_rate": 4.959597281915546e-05, "loss": 0.0417, "num_input_tokens_seen": 1027272, "step": 2505 }, { "epoch": 3.0387409200968523, "grad_norm": 0.5666066408157349, "learning_rate": 4.959122980459553e-05, "loss": 0.0556, "num_input_tokens_seen": 1029320, "step": 2510 }, { "epoch": 3.0447941888619856, "grad_norm": 1.6452821493148804, "learning_rate": 4.958645934177318e-05, "loss": 0.0737, "num_input_tokens_seen": 1031272, "step": 2515 }, { "epoch": 3.0508474576271185, "grad_norm": 0.1000954732298851, "learning_rate": 4.95816614360131e-05, "loss": 0.0158, "num_input_tokens_seen": 1033256, "step": 2520 }, { "epoch": 3.056900726392252, "grad_norm": 2.132666826248169, "learning_rate": 4.957683609267065e-05, "loss": 0.0076, "num_input_tokens_seen": 1035240, "step": 2525 }, { "epoch": 3.062953995157385, "grad_norm": 14.89340877532959, "learning_rate": 4.9571983317131764e-05, "loss": 0.0918, "num_input_tokens_seen": 1037320, "step": 2530 }, { "epoch": 3.069007263922518, "grad_norm": 0.5204524993896484, "learning_rate": 4.956710311481303e-05, "loss": 0.0749, "num_input_tokens_seen": 1039304, "step": 2535 }, { "epoch": 3.0750605326876514, "grad_norm": 0.054114025086164474, "learning_rate": 4.956219549116162e-05, "loss": 0.0951, "num_input_tokens_seen": 1041480, "step": 2540 }, { "epoch": 3.0811138014527844, "grad_norm": 0.026865197345614433, "learning_rate": 4.955726045165534e-05, "loss": 0.0922, "num_input_tokens_seen": 1043432, "step": 2545 }, { "epoch": 3.0871670702179177, "grad_norm": 0.042629409581422806, "learning_rate": 4.955229800180259e-05, "loss": 0.05, "num_input_tokens_seen": 1045416, "step": 2550 }, { "epoch": 3.093220338983051, "grad_norm": 0.033739857375621796, "learning_rate": 4.954730814714236e-05, "loss": 0.0487, "num_input_tokens_seen": 1047400, "step": 2555 }, { "epoch": 3.099273607748184, "grad_norm": 0.0699935033917427, "learning_rate": 4.954229089324423e-05, "loss": 0.0013, "num_input_tokens_seen": 1049576, "step": 2560 }, { "epoch": 3.1053268765133173, "grad_norm": 16.420927047729492, "learning_rate": 4.953724624570837e-05, "loss": 0.0527, "num_input_tokens_seen": 1051592, "step": 2565 }, { "epoch": 3.11138014527845, "grad_norm": 0.08184728026390076, "learning_rate": 4.9532174210165505e-05, "loss": 0.1737, "num_input_tokens_seen": 1053640, "step": 2570 }, { "epoch": 3.1174334140435835, "grad_norm": 0.03397824615240097, "learning_rate": 4.952707479227695e-05, "loss": 0.0015, "num_input_tokens_seen": 1055656, "step": 2575 }, { "epoch": 3.123486682808717, "grad_norm": 0.06000056862831116, "learning_rate": 4.952194799773459e-05, "loss": 0.025, "num_input_tokens_seen": 1057704, "step": 2580 }, { "epoch": 3.12953995157385, "grad_norm": 0.09082405269145966, "learning_rate": 4.9516793832260836e-05, "loss": 0.0013, "num_input_tokens_seen": 1059784, "step": 2585 }, { "epoch": 3.135593220338983, "grad_norm": 0.010682323016226292, "learning_rate": 4.951161230160868e-05, "loss": 0.0458, "num_input_tokens_seen": 1061832, "step": 2590 }, { "epoch": 3.141646489104116, "grad_norm": 0.02184060402214527, "learning_rate": 4.950640341156165e-05, "loss": 0.0008, "num_input_tokens_seen": 1063848, "step": 2595 }, { "epoch": 3.1476997578692494, "grad_norm": 0.04209049046039581, "learning_rate": 4.950116716793381e-05, "loss": 0.0044, "num_input_tokens_seen": 1065896, "step": 2600 }, { "epoch": 3.1537530266343827, "grad_norm": 0.019220851361751556, "learning_rate": 4.949590357656975e-05, "loss": 0.1374, "num_input_tokens_seen": 1068040, "step": 2605 }, { "epoch": 3.1598062953995156, "grad_norm": 0.10112041980028152, "learning_rate": 4.949061264334459e-05, "loss": 0.0482, "num_input_tokens_seen": 1070152, "step": 2610 }, { "epoch": 3.165859564164649, "grad_norm": 14.931107521057129, "learning_rate": 4.948529437416397e-05, "loss": 0.1549, "num_input_tokens_seen": 1072200, "step": 2615 }, { "epoch": 3.171912832929782, "grad_norm": 1.8265745639801025, "learning_rate": 4.9479948774964055e-05, "loss": 0.0777, "num_input_tokens_seen": 1074248, "step": 2620 }, { "epoch": 3.1779661016949152, "grad_norm": 0.02708769589662552, "learning_rate": 4.947457585171148e-05, "loss": 0.004, "num_input_tokens_seen": 1076264, "step": 2625 }, { "epoch": 3.1840193704600486, "grad_norm": 0.08986163139343262, "learning_rate": 4.9469175610403406e-05, "loss": 0.0058, "num_input_tokens_seen": 1078248, "step": 2630 }, { "epoch": 3.1900726392251815, "grad_norm": 9.907031059265137, "learning_rate": 4.946374805706748e-05, "loss": 0.0705, "num_input_tokens_seen": 1080264, "step": 2635 }, { "epoch": 3.196125907990315, "grad_norm": 13.661909103393555, "learning_rate": 4.945829319776184e-05, "loss": 0.0654, "num_input_tokens_seen": 1082152, "step": 2640 }, { "epoch": 3.2021791767554477, "grad_norm": 2.1515917778015137, "learning_rate": 4.945281103857509e-05, "loss": 0.1247, "num_input_tokens_seen": 1084232, "step": 2645 }, { "epoch": 3.208232445520581, "grad_norm": 0.02746470831334591, "learning_rate": 4.9447301585626314e-05, "loss": 0.0008, "num_input_tokens_seen": 1086440, "step": 2650 }, { "epoch": 3.2142857142857144, "grad_norm": 0.008798208087682724, "learning_rate": 4.9441764845065066e-05, "loss": 0.0511, "num_input_tokens_seen": 1088520, "step": 2655 }, { "epoch": 3.2203389830508473, "grad_norm": 45.36145782470703, "learning_rate": 4.9436200823071327e-05, "loss": 0.0275, "num_input_tokens_seen": 1090600, "step": 2660 }, { "epoch": 3.2263922518159807, "grad_norm": 0.012937935069203377, "learning_rate": 4.9430609525855576e-05, "loss": 0.0021, "num_input_tokens_seen": 1092744, "step": 2665 }, { "epoch": 3.232445520581114, "grad_norm": 0.03362731263041496, "learning_rate": 4.94249909596587e-05, "loss": 0.0032, "num_input_tokens_seen": 1094856, "step": 2670 }, { "epoch": 3.238498789346247, "grad_norm": 0.05349215492606163, "learning_rate": 4.9419345130752044e-05, "loss": 0.2425, "num_input_tokens_seen": 1096872, "step": 2675 }, { "epoch": 3.2445520581113803, "grad_norm": 0.09200761467218399, "learning_rate": 4.941367204543736e-05, "loss": 0.001, "num_input_tokens_seen": 1099016, "step": 2680 }, { "epoch": 3.250605326876513, "grad_norm": 31.31439971923828, "learning_rate": 4.940797171004686e-05, "loss": 0.0571, "num_input_tokens_seen": 1101256, "step": 2685 }, { "epoch": 3.2566585956416465, "grad_norm": 0.09735773503780365, "learning_rate": 4.9402244130943135e-05, "loss": 0.0582, "num_input_tokens_seen": 1103272, "step": 2690 }, { "epoch": 3.26271186440678, "grad_norm": 1.3476866483688354, "learning_rate": 4.93964893145192e-05, "loss": 0.067, "num_input_tokens_seen": 1105352, "step": 2695 }, { "epoch": 3.2687651331719128, "grad_norm": 6.456122398376465, "learning_rate": 4.9390707267198475e-05, "loss": 0.1103, "num_input_tokens_seen": 1107368, "step": 2700 }, { "epoch": 3.274818401937046, "grad_norm": 0.05752401798963547, "learning_rate": 4.938489799543477e-05, "loss": 0.0557, "num_input_tokens_seen": 1109416, "step": 2705 }, { "epoch": 3.280871670702179, "grad_norm": 8.647043228149414, "learning_rate": 4.937906150571228e-05, "loss": 0.0128, "num_input_tokens_seen": 1111304, "step": 2710 }, { "epoch": 3.2869249394673123, "grad_norm": 0.054162342101335526, "learning_rate": 4.937319780454559e-05, "loss": 0.023, "num_input_tokens_seen": 1113448, "step": 2715 }, { "epoch": 3.2929782082324457, "grad_norm": 0.09134739637374878, "learning_rate": 4.936730689847965e-05, "loss": 0.0088, "num_input_tokens_seen": 1115464, "step": 2720 }, { "epoch": 3.2990314769975786, "grad_norm": 12.804100036621094, "learning_rate": 4.936138879408978e-05, "loss": 0.1123, "num_input_tokens_seen": 1117384, "step": 2725 }, { "epoch": 3.305084745762712, "grad_norm": 0.023572532460093498, "learning_rate": 4.935544349798164e-05, "loss": 0.0823, "num_input_tokens_seen": 1119528, "step": 2730 }, { "epoch": 3.3111380145278453, "grad_norm": 0.8664999604225159, "learning_rate": 4.934947101679128e-05, "loss": 0.1088, "num_input_tokens_seen": 1121512, "step": 2735 }, { "epoch": 3.317191283292978, "grad_norm": 0.008962500840425491, "learning_rate": 4.934347135718505e-05, "loss": 0.0629, "num_input_tokens_seen": 1123528, "step": 2740 }, { "epoch": 3.3232445520581115, "grad_norm": 0.039215296506881714, "learning_rate": 4.933744452585966e-05, "loss": 0.0575, "num_input_tokens_seen": 1125704, "step": 2745 }, { "epoch": 3.3292978208232444, "grad_norm": 29.36794090270996, "learning_rate": 4.933139052954216e-05, "loss": 0.0668, "num_input_tokens_seen": 1127496, "step": 2750 }, { "epoch": 3.335351089588378, "grad_norm": 0.0039254906587302685, "learning_rate": 4.9325309374989886e-05, "loss": 0.1172, "num_input_tokens_seen": 1129416, "step": 2755 }, { "epoch": 3.341404358353511, "grad_norm": 0.025894179940223694, "learning_rate": 4.931920106899052e-05, "loss": 0.0554, "num_input_tokens_seen": 1131496, "step": 2760 }, { "epoch": 3.347457627118644, "grad_norm": 0.09109282493591309, "learning_rate": 4.931306561836202e-05, "loss": 0.0214, "num_input_tokens_seen": 1133608, "step": 2765 }, { "epoch": 3.3535108958837774, "grad_norm": 13.913040161132812, "learning_rate": 4.930690302995268e-05, "loss": 0.0422, "num_input_tokens_seen": 1135592, "step": 2770 }, { "epoch": 3.3595641646489103, "grad_norm": 0.0414290614426136, "learning_rate": 4.930071331064104e-05, "loss": 0.0423, "num_input_tokens_seen": 1137640, "step": 2775 }, { "epoch": 3.3656174334140436, "grad_norm": 3.662402868270874, "learning_rate": 4.929449646733598e-05, "loss": 0.1291, "num_input_tokens_seen": 1139624, "step": 2780 }, { "epoch": 3.371670702179177, "grad_norm": 5.239326000213623, "learning_rate": 4.928825250697659e-05, "loss": 0.008, "num_input_tokens_seen": 1141672, "step": 2785 }, { "epoch": 3.37772397094431, "grad_norm": 0.058994635939598083, "learning_rate": 4.9281981436532275e-05, "loss": 0.0035, "num_input_tokens_seen": 1143784, "step": 2790 }, { "epoch": 3.383777239709443, "grad_norm": 0.017671583220362663, "learning_rate": 4.927568326300268e-05, "loss": 0.0297, "num_input_tokens_seen": 1145768, "step": 2795 }, { "epoch": 3.389830508474576, "grad_norm": 0.030405670404434204, "learning_rate": 4.926935799341771e-05, "loss": 0.0076, "num_input_tokens_seen": 1147848, "step": 2800 }, { "epoch": 3.3958837772397095, "grad_norm": 11.298994064331055, "learning_rate": 4.926300563483751e-05, "loss": 0.0873, "num_input_tokens_seen": 1150120, "step": 2805 }, { "epoch": 3.401937046004843, "grad_norm": 0.009805584326386452, "learning_rate": 4.9256626194352465e-05, "loss": 0.0727, "num_input_tokens_seen": 1152168, "step": 2810 }, { "epoch": 3.4079903147699757, "grad_norm": 3.441596269607544, "learning_rate": 4.925021967908316e-05, "loss": 0.0588, "num_input_tokens_seen": 1154152, "step": 2815 }, { "epoch": 3.414043583535109, "grad_norm": 9.824174880981445, "learning_rate": 4.924378609618047e-05, "loss": 0.0568, "num_input_tokens_seen": 1156232, "step": 2820 }, { "epoch": 3.420096852300242, "grad_norm": 0.08703053742647171, "learning_rate": 4.92373254528254e-05, "loss": 0.0557, "num_input_tokens_seen": 1158248, "step": 2825 }, { "epoch": 3.4261501210653753, "grad_norm": 0.07860667258501053, "learning_rate": 4.923083775622922e-05, "loss": 0.0412, "num_input_tokens_seen": 1160296, "step": 2830 }, { "epoch": 3.4322033898305087, "grad_norm": 8.061395645141602, "learning_rate": 4.922432301363335e-05, "loss": 0.2478, "num_input_tokens_seen": 1162600, "step": 2835 }, { "epoch": 3.4382566585956416, "grad_norm": 0.12018529325723648, "learning_rate": 4.921778123230945e-05, "loss": 0.0114, "num_input_tokens_seen": 1164648, "step": 2840 }, { "epoch": 3.444309927360775, "grad_norm": 0.03503362834453583, "learning_rate": 4.9211212419559305e-05, "loss": 0.039, "num_input_tokens_seen": 1166760, "step": 2845 }, { "epoch": 3.450363196125908, "grad_norm": 2.9047369956970215, "learning_rate": 4.920461658271492e-05, "loss": 0.0048, "num_input_tokens_seen": 1168808, "step": 2850 }, { "epoch": 3.456416464891041, "grad_norm": 0.029949354007840157, "learning_rate": 4.919799372913842e-05, "loss": 0.0493, "num_input_tokens_seen": 1170824, "step": 2855 }, { "epoch": 3.4624697336561745, "grad_norm": 14.441965103149414, "learning_rate": 4.919134386622214e-05, "loss": 0.2052, "num_input_tokens_seen": 1172744, "step": 2860 }, { "epoch": 3.4685230024213074, "grad_norm": 4.693414688110352, "learning_rate": 4.9184667001388504e-05, "loss": 0.1181, "num_input_tokens_seen": 1174920, "step": 2865 }, { "epoch": 3.4745762711864407, "grad_norm": 0.23777198791503906, "learning_rate": 4.9177963142090103e-05, "loss": 0.0147, "num_input_tokens_seen": 1176872, "step": 2870 }, { "epoch": 3.4806295399515736, "grad_norm": 0.372465580701828, "learning_rate": 4.917123229580967e-05, "loss": 0.0062, "num_input_tokens_seen": 1178888, "step": 2875 }, { "epoch": 3.486682808716707, "grad_norm": 0.1278444081544876, "learning_rate": 4.916447447006004e-05, "loss": 0.005, "num_input_tokens_seen": 1180936, "step": 2880 }, { "epoch": 3.4927360774818403, "grad_norm": 0.0479760505259037, "learning_rate": 4.9157689672384174e-05, "loss": 0.2056, "num_input_tokens_seen": 1182920, "step": 2885 }, { "epoch": 3.4987893462469732, "grad_norm": 0.08166644722223282, "learning_rate": 4.915087791035512e-05, "loss": 0.0354, "num_input_tokens_seen": 1184872, "step": 2890 }, { "epoch": 3.5048426150121066, "grad_norm": 0.061309389770030975, "learning_rate": 4.914403919157605e-05, "loss": 0.058, "num_input_tokens_seen": 1186984, "step": 2895 }, { "epoch": 3.5108958837772395, "grad_norm": 0.196208655834198, "learning_rate": 4.9137173523680214e-05, "loss": 0.0857, "num_input_tokens_seen": 1188936, "step": 2900 }, { "epoch": 3.516949152542373, "grad_norm": 0.338936448097229, "learning_rate": 4.913028091433093e-05, "loss": 0.0567, "num_input_tokens_seen": 1190952, "step": 2905 }, { "epoch": 3.523002421307506, "grad_norm": 15.304155349731445, "learning_rate": 4.912336137122161e-05, "loss": 0.0977, "num_input_tokens_seen": 1192936, "step": 2910 }, { "epoch": 3.529055690072639, "grad_norm": 0.06952658295631409, "learning_rate": 4.9116414902075704e-05, "loss": 0.0036, "num_input_tokens_seen": 1194888, "step": 2915 }, { "epoch": 3.5351089588377724, "grad_norm": 7.467464447021484, "learning_rate": 4.910944151464673e-05, "loss": 0.163, "num_input_tokens_seen": 1197032, "step": 2920 }, { "epoch": 3.5411622276029053, "grad_norm": 3.1003479957580566, "learning_rate": 4.9102441216718257e-05, "loss": 0.0792, "num_input_tokens_seen": 1199208, "step": 2925 }, { "epoch": 3.5472154963680387, "grad_norm": 4.725975036621094, "learning_rate": 4.909541401610387e-05, "loss": 0.0595, "num_input_tokens_seen": 1201320, "step": 2930 }, { "epoch": 3.553268765133172, "grad_norm": 2.4645166397094727, "learning_rate": 4.9088359920647216e-05, "loss": 0.0805, "num_input_tokens_seen": 1203432, "step": 2935 }, { "epoch": 3.559322033898305, "grad_norm": 0.032732367515563965, "learning_rate": 4.908127893822193e-05, "loss": 0.0459, "num_input_tokens_seen": 1205480, "step": 2940 }, { "epoch": 3.5653753026634383, "grad_norm": 0.037100136280059814, "learning_rate": 4.907417107673167e-05, "loss": 0.0023, "num_input_tokens_seen": 1207592, "step": 2945 }, { "epoch": 3.571428571428571, "grad_norm": 1.120453953742981, "learning_rate": 4.9067036344110104e-05, "loss": 0.1016, "num_input_tokens_seen": 1209640, "step": 2950 }, { "epoch": 3.5774818401937045, "grad_norm": 6.908242702484131, "learning_rate": 4.9059874748320876e-05, "loss": 0.2216, "num_input_tokens_seen": 1211752, "step": 2955 }, { "epoch": 3.583535108958838, "grad_norm": 0.21004368364810944, "learning_rate": 4.905268629735762e-05, "loss": 0.0139, "num_input_tokens_seen": 1213736, "step": 2960 }, { "epoch": 3.589588377723971, "grad_norm": 5.147643089294434, "learning_rate": 4.9045470999243956e-05, "loss": 0.0539, "num_input_tokens_seen": 1215752, "step": 2965 }, { "epoch": 3.595641646489104, "grad_norm": 12.56291389465332, "learning_rate": 4.903822886203347e-05, "loss": 0.082, "num_input_tokens_seen": 1217640, "step": 2970 }, { "epoch": 3.601694915254237, "grad_norm": 0.2580777704715729, "learning_rate": 4.903095989380968e-05, "loss": 0.0693, "num_input_tokens_seen": 1219752, "step": 2975 }, { "epoch": 3.6077481840193704, "grad_norm": 3.382721185684204, "learning_rate": 4.902366410268608e-05, "loss": 0.0196, "num_input_tokens_seen": 1221832, "step": 2980 }, { "epoch": 3.6138014527845037, "grad_norm": 0.09196113795042038, "learning_rate": 4.9016341496806087e-05, "loss": 0.0375, "num_input_tokens_seen": 1223976, "step": 2985 }, { "epoch": 3.619854721549637, "grad_norm": 0.021342476829886436, "learning_rate": 4.9008992084343064e-05, "loss": 0.0957, "num_input_tokens_seen": 1225992, "step": 2990 }, { "epoch": 3.62590799031477, "grad_norm": 3.304697275161743, "learning_rate": 4.900161587350027e-05, "loss": 0.0091, "num_input_tokens_seen": 1228136, "step": 2995 }, { "epoch": 3.6319612590799033, "grad_norm": 0.1119203194975853, "learning_rate": 4.899421287251091e-05, "loss": 0.0585, "num_input_tokens_seen": 1230024, "step": 3000 }, { "epoch": 3.638014527845036, "grad_norm": 0.04524768143892288, "learning_rate": 4.898678308963806e-05, "loss": 0.1951, "num_input_tokens_seen": 1232072, "step": 3005 }, { "epoch": 3.6440677966101696, "grad_norm": 0.07445601373910904, "learning_rate": 4.89793265331747e-05, "loss": 0.0261, "num_input_tokens_seen": 1234312, "step": 3010 }, { "epoch": 3.650121065375303, "grad_norm": 0.3859395980834961, "learning_rate": 4.8971843211443705e-05, "loss": 0.1201, "num_input_tokens_seen": 1236424, "step": 3015 }, { "epoch": 3.656174334140436, "grad_norm": 17.58110809326172, "learning_rate": 4.896433313279781e-05, "loss": 0.0421, "num_input_tokens_seen": 1238696, "step": 3020 }, { "epoch": 3.662227602905569, "grad_norm": 0.02837517485022545, "learning_rate": 4.895679630561963e-05, "loss": 0.0113, "num_input_tokens_seen": 1240744, "step": 3025 }, { "epoch": 3.668280871670702, "grad_norm": 1.153536081314087, "learning_rate": 4.894923273832162e-05, "loss": 0.0546, "num_input_tokens_seen": 1242792, "step": 3030 }, { "epoch": 3.6743341404358354, "grad_norm": 0.15960516035556793, "learning_rate": 4.894164243934609e-05, "loss": 0.0007, "num_input_tokens_seen": 1244872, "step": 3035 }, { "epoch": 3.6803874092009687, "grad_norm": 0.018164237961173058, "learning_rate": 4.8934025417165195e-05, "loss": 0.0017, "num_input_tokens_seen": 1246920, "step": 3040 }, { "epoch": 3.6864406779661016, "grad_norm": 21.57578468322754, "learning_rate": 4.8926381680280906e-05, "loss": 0.0138, "num_input_tokens_seen": 1248904, "step": 3045 }, { "epoch": 3.692493946731235, "grad_norm": 19.547895431518555, "learning_rate": 4.891871123722501e-05, "loss": 0.0461, "num_input_tokens_seen": 1250760, "step": 3050 }, { "epoch": 3.698547215496368, "grad_norm": 0.004361252766102552, "learning_rate": 4.891101409655911e-05, "loss": 0.0716, "num_input_tokens_seen": 1252936, "step": 3055 }, { "epoch": 3.7046004842615012, "grad_norm": 0.02290641888976097, "learning_rate": 4.890329026687462e-05, "loss": 0.1705, "num_input_tokens_seen": 1255112, "step": 3060 }, { "epoch": 3.7106537530266346, "grad_norm": 0.010718287900090218, "learning_rate": 4.889553975679272e-05, "loss": 0.0008, "num_input_tokens_seen": 1257160, "step": 3065 }, { "epoch": 3.7167070217917675, "grad_norm": 0.5278363227844238, "learning_rate": 4.8887762574964385e-05, "loss": 0.0015, "num_input_tokens_seen": 1259272, "step": 3070 }, { "epoch": 3.722760290556901, "grad_norm": 0.42650216817855835, "learning_rate": 4.887995873007036e-05, "loss": 0.1354, "num_input_tokens_seen": 1261192, "step": 3075 }, { "epoch": 3.7288135593220337, "grad_norm": 0.0061606140807271, "learning_rate": 4.8872128230821146e-05, "loss": 0.0007, "num_input_tokens_seen": 1263304, "step": 3080 }, { "epoch": 3.734866828087167, "grad_norm": 8.496513366699219, "learning_rate": 4.8864271085957e-05, "loss": 0.1236, "num_input_tokens_seen": 1265480, "step": 3085 }, { "epoch": 3.7409200968523004, "grad_norm": 4.784152984619141, "learning_rate": 4.8856387304247905e-05, "loss": 0.1182, "num_input_tokens_seen": 1267464, "step": 3090 }, { "epoch": 3.7469733656174333, "grad_norm": 0.13118687272071838, "learning_rate": 4.884847689449361e-05, "loss": 0.1002, "num_input_tokens_seen": 1269544, "step": 3095 }, { "epoch": 3.7530266343825667, "grad_norm": 3.91727352142334, "learning_rate": 4.884053986552355e-05, "loss": 0.0169, "num_input_tokens_seen": 1271592, "step": 3100 }, { "epoch": 3.7590799031476996, "grad_norm": 0.6115451455116272, "learning_rate": 4.883257622619688e-05, "loss": 0.0226, "num_input_tokens_seen": 1273640, "step": 3105 }, { "epoch": 3.765133171912833, "grad_norm": 0.13479582965373993, "learning_rate": 4.882458598540247e-05, "loss": 0.052, "num_input_tokens_seen": 1275784, "step": 3110 }, { "epoch": 3.7711864406779663, "grad_norm": 0.030646109953522682, "learning_rate": 4.881656915205888e-05, "loss": 0.0021, "num_input_tokens_seen": 1277864, "step": 3115 }, { "epoch": 3.777239709443099, "grad_norm": 0.023602638393640518, "learning_rate": 4.880852573511434e-05, "loss": 0.0894, "num_input_tokens_seen": 1279848, "step": 3120 }, { "epoch": 3.7832929782082325, "grad_norm": 0.15530624985694885, "learning_rate": 4.8800455743546756e-05, "loss": 0.0041, "num_input_tokens_seen": 1281768, "step": 3125 }, { "epoch": 3.7893462469733654, "grad_norm": 0.07410790026187897, "learning_rate": 4.87923591863637e-05, "loss": 0.1245, "num_input_tokens_seen": 1283784, "step": 3130 }, { "epoch": 3.7953995157384988, "grad_norm": 7.149677276611328, "learning_rate": 4.8784236072602404e-05, "loss": 0.0082, "num_input_tokens_seen": 1285896, "step": 3135 }, { "epoch": 3.801452784503632, "grad_norm": 5.109461784362793, "learning_rate": 4.8776086411329727e-05, "loss": 0.0868, "num_input_tokens_seen": 1288008, "step": 3140 }, { "epoch": 3.807506053268765, "grad_norm": 0.5222837328910828, "learning_rate": 4.8767910211642165e-05, "loss": 0.0627, "num_input_tokens_seen": 1290056, "step": 3145 }, { "epoch": 3.8135593220338984, "grad_norm": 0.015749802812933922, "learning_rate": 4.875970748266583e-05, "loss": 0.1036, "num_input_tokens_seen": 1292072, "step": 3150 }, { "epoch": 3.8196125907990313, "grad_norm": 0.11746250838041306, "learning_rate": 4.875147823355646e-05, "loss": 0.0076, "num_input_tokens_seen": 1294024, "step": 3155 }, { "epoch": 3.8256658595641646, "grad_norm": 0.022637126967310905, "learning_rate": 4.87432224734994e-05, "loss": 0.0021, "num_input_tokens_seen": 1296104, "step": 3160 }, { "epoch": 3.831719128329298, "grad_norm": 0.14612427353858948, "learning_rate": 4.873494021170953e-05, "loss": 0.1036, "num_input_tokens_seen": 1298024, "step": 3165 }, { "epoch": 3.837772397094431, "grad_norm": 0.02558349072933197, "learning_rate": 4.87266314574314e-05, "loss": 0.0767, "num_input_tokens_seen": 1300040, "step": 3170 }, { "epoch": 3.843825665859564, "grad_norm": 0.05110359191894531, "learning_rate": 4.871829621993905e-05, "loss": 0.0927, "num_input_tokens_seen": 1302216, "step": 3175 }, { "epoch": 3.849878934624697, "grad_norm": 0.11362777650356293, "learning_rate": 4.870993450853614e-05, "loss": 0.0943, "num_input_tokens_seen": 1304232, "step": 3180 }, { "epoch": 3.8559322033898304, "grad_norm": 6.187495231628418, "learning_rate": 4.870154633255583e-05, "loss": 0.0531, "num_input_tokens_seen": 1306280, "step": 3185 }, { "epoch": 3.861985472154964, "grad_norm": 0.03128233551979065, "learning_rate": 4.869313170136085e-05, "loss": 0.0197, "num_input_tokens_seen": 1308200, "step": 3190 }, { "epoch": 3.8680387409200967, "grad_norm": 0.01663835719227791, "learning_rate": 4.868469062434346e-05, "loss": 0.1009, "num_input_tokens_seen": 1310088, "step": 3195 }, { "epoch": 3.87409200968523, "grad_norm": 0.5087220668792725, "learning_rate": 4.8676223110925425e-05, "loss": 0.0292, "num_input_tokens_seen": 1312136, "step": 3200 }, { "epoch": 3.880145278450363, "grad_norm": 2.769798755645752, "learning_rate": 4.8667729170558006e-05, "loss": 0.0092, "num_input_tokens_seen": 1314184, "step": 3205 }, { "epoch": 3.8861985472154963, "grad_norm": 0.2357548177242279, "learning_rate": 4.8659208812722e-05, "loss": 0.0036, "num_input_tokens_seen": 1316200, "step": 3210 }, { "epoch": 3.8922518159806296, "grad_norm": 5.177121162414551, "learning_rate": 4.8650662046927666e-05, "loss": 0.0508, "num_input_tokens_seen": 1318248, "step": 3215 }, { "epoch": 3.898305084745763, "grad_norm": 5.829105854034424, "learning_rate": 4.864208888271472e-05, "loss": 0.1398, "num_input_tokens_seen": 1320232, "step": 3220 }, { "epoch": 3.904358353510896, "grad_norm": 0.04847264289855957, "learning_rate": 4.863348932965238e-05, "loss": 0.0529, "num_input_tokens_seen": 1322248, "step": 3225 }, { "epoch": 3.910411622276029, "grad_norm": 0.12306322157382965, "learning_rate": 4.8624863397339315e-05, "loss": 0.0476, "num_input_tokens_seen": 1324264, "step": 3230 }, { "epoch": 3.916464891041162, "grad_norm": 0.5315977931022644, "learning_rate": 4.8616211095403605e-05, "loss": 0.1572, "num_input_tokens_seen": 1326248, "step": 3235 }, { "epoch": 3.9225181598062955, "grad_norm": 6.036517143249512, "learning_rate": 4.860753243350279e-05, "loss": 0.0223, "num_input_tokens_seen": 1328328, "step": 3240 }, { "epoch": 3.928571428571429, "grad_norm": 10.970824241638184, "learning_rate": 4.859882742132383e-05, "loss": 0.0577, "num_input_tokens_seen": 1330248, "step": 3245 }, { "epoch": 3.9346246973365617, "grad_norm": 0.025540579110383987, "learning_rate": 4.85900960685831e-05, "loss": 0.0838, "num_input_tokens_seen": 1332456, "step": 3250 }, { "epoch": 3.940677966101695, "grad_norm": 0.0785602331161499, "learning_rate": 4.858133838502637e-05, "loss": 0.1115, "num_input_tokens_seen": 1334504, "step": 3255 }, { "epoch": 3.946731234866828, "grad_norm": 1.5323442220687866, "learning_rate": 4.8572554380428786e-05, "loss": 0.1659, "num_input_tokens_seen": 1336584, "step": 3260 }, { "epoch": 3.9527845036319613, "grad_norm": 3.0499684810638428, "learning_rate": 4.856374406459489e-05, "loss": 0.0177, "num_input_tokens_seen": 1338472, "step": 3265 }, { "epoch": 3.9588377723970947, "grad_norm": 0.22885937988758087, "learning_rate": 4.85549074473586e-05, "loss": 0.0433, "num_input_tokens_seen": 1340456, "step": 3270 }, { "epoch": 3.9648910411622276, "grad_norm": 10.749367713928223, "learning_rate": 4.8546044538583175e-05, "loss": 0.0419, "num_input_tokens_seen": 1342440, "step": 3275 }, { "epoch": 3.970944309927361, "grad_norm": 0.06775669753551483, "learning_rate": 4.8537155348161214e-05, "loss": 0.0793, "num_input_tokens_seen": 1344488, "step": 3280 }, { "epoch": 3.976997578692494, "grad_norm": 2.0922963619232178, "learning_rate": 4.852823988601468e-05, "loss": 0.0737, "num_input_tokens_seen": 1346600, "step": 3285 }, { "epoch": 3.983050847457627, "grad_norm": 14.65282917022705, "learning_rate": 4.851929816209483e-05, "loss": 0.0331, "num_input_tokens_seen": 1348776, "step": 3290 }, { "epoch": 3.9891041162227605, "grad_norm": 0.1719052940607071, "learning_rate": 4.8510330186382245e-05, "loss": 0.145, "num_input_tokens_seen": 1350792, "step": 3295 }, { "epoch": 3.9951573849878934, "grad_norm": 0.03229275345802307, "learning_rate": 4.850133596888682e-05, "loss": 0.0826, "num_input_tokens_seen": 1352808, "step": 3300 }, { "epoch": 4.0, "eval_loss": 0.10385129600763321, "eval_runtime": 4.9655, "eval_samples_per_second": 73.911, "eval_steps_per_second": 18.528, "num_input_tokens_seen": 1354072, "step": 3304 }, { "epoch": 4.001210653753026, "grad_norm": 0.18555189669132233, "learning_rate": 4.849231551964771e-05, "loss": 0.0965, "num_input_tokens_seen": 1354552, "step": 3305 }, { "epoch": 4.00726392251816, "grad_norm": 0.0990036353468895, "learning_rate": 4.8483268848733386e-05, "loss": 0.0066, "num_input_tokens_seen": 1356696, "step": 3310 }, { "epoch": 4.013317191283293, "grad_norm": 0.11698263138532639, "learning_rate": 4.847419596624157e-05, "loss": 0.0037, "num_input_tokens_seen": 1358712, "step": 3315 }, { "epoch": 4.019370460048426, "grad_norm": 7.631546497344971, "learning_rate": 4.846509688229923e-05, "loss": 0.0659, "num_input_tokens_seen": 1360568, "step": 3320 }, { "epoch": 4.02542372881356, "grad_norm": 0.0294627882540226, "learning_rate": 4.84559716070626e-05, "loss": 0.001, "num_input_tokens_seen": 1362680, "step": 3325 }, { "epoch": 4.031476997578692, "grad_norm": 0.0540626235306263, "learning_rate": 4.844682015071713e-05, "loss": 0.0011, "num_input_tokens_seen": 1364696, "step": 3330 }, { "epoch": 4.0375302663438255, "grad_norm": 0.04309828579425812, "learning_rate": 4.843764252347751e-05, "loss": 0.0173, "num_input_tokens_seen": 1366744, "step": 3335 }, { "epoch": 4.043583535108959, "grad_norm": 0.04923002049326897, "learning_rate": 4.8428438735587626e-05, "loss": 0.0029, "num_input_tokens_seen": 1368792, "step": 3340 }, { "epoch": 4.049636803874092, "grad_norm": 0.04220494627952576, "learning_rate": 4.8419208797320564e-05, "loss": 0.0011, "num_input_tokens_seen": 1370712, "step": 3345 }, { "epoch": 4.0556900726392255, "grad_norm": 0.008161104284226894, "learning_rate": 4.8409952718978616e-05, "loss": 0.0017, "num_input_tokens_seen": 1372728, "step": 3350 }, { "epoch": 4.061743341404358, "grad_norm": 0.007771191652864218, "learning_rate": 4.8400670510893245e-05, "loss": 0.1582, "num_input_tokens_seen": 1374904, "step": 3355 }, { "epoch": 4.067796610169491, "grad_norm": 0.07163462042808533, "learning_rate": 4.839136218342507e-05, "loss": 0.0181, "num_input_tokens_seen": 1377048, "step": 3360 }, { "epoch": 4.073849878934625, "grad_norm": 5.371501445770264, "learning_rate": 4.838202774696386e-05, "loss": 0.1738, "num_input_tokens_seen": 1378968, "step": 3365 }, { "epoch": 4.079903147699758, "grad_norm": 0.015581412240862846, "learning_rate": 4.8372667211928546e-05, "loss": 0.0092, "num_input_tokens_seen": 1381208, "step": 3370 }, { "epoch": 4.085956416464891, "grad_norm": 1.8815197944641113, "learning_rate": 4.836328058876717e-05, "loss": 0.0786, "num_input_tokens_seen": 1383224, "step": 3375 }, { "epoch": 4.092009685230024, "grad_norm": 0.08072246611118317, "learning_rate": 4.835386788795692e-05, "loss": 0.0584, "num_input_tokens_seen": 1385304, "step": 3380 }, { "epoch": 4.098062953995157, "grad_norm": 4.017838478088379, "learning_rate": 4.834442912000405e-05, "loss": 0.068, "num_input_tokens_seen": 1387320, "step": 3385 }, { "epoch": 4.1041162227602905, "grad_norm": 0.016942860558629036, "learning_rate": 4.833496429544394e-05, "loss": 0.075, "num_input_tokens_seen": 1389400, "step": 3390 }, { "epoch": 4.110169491525424, "grad_norm": 0.19460545480251312, "learning_rate": 4.832547342484106e-05, "loss": 0.0015, "num_input_tokens_seen": 1391416, "step": 3395 }, { "epoch": 4.116222760290557, "grad_norm": 0.06321632862091064, "learning_rate": 4.831595651878893e-05, "loss": 0.0014, "num_input_tokens_seen": 1393560, "step": 3400 }, { "epoch": 4.12227602905569, "grad_norm": 0.004801756236702204, "learning_rate": 4.830641358791014e-05, "loss": 0.0029, "num_input_tokens_seen": 1395672, "step": 3405 }, { "epoch": 4.128329297820823, "grad_norm": 8.673011779785156, "learning_rate": 4.829684464285632e-05, "loss": 0.0116, "num_input_tokens_seen": 1397752, "step": 3410 }, { "epoch": 4.134382566585956, "grad_norm": 0.007303446996957064, "learning_rate": 4.828724969430815e-05, "loss": 0.016, "num_input_tokens_seen": 1399992, "step": 3415 }, { "epoch": 4.14043583535109, "grad_norm": 0.009209591895341873, "learning_rate": 4.8277628752975336e-05, "loss": 0.0006, "num_input_tokens_seen": 1402200, "step": 3420 }, { "epoch": 4.146489104116223, "grad_norm": 0.023388415575027466, "learning_rate": 4.826798182959658e-05, "loss": 0.001, "num_input_tokens_seen": 1404280, "step": 3425 }, { "epoch": 4.1525423728813555, "grad_norm": 0.0015631900168955326, "learning_rate": 4.82583089349396e-05, "loss": 0.0354, "num_input_tokens_seen": 1406392, "step": 3430 }, { "epoch": 4.158595641646489, "grad_norm": 0.05450846254825592, "learning_rate": 4.824861007980109e-05, "loss": 0.0319, "num_input_tokens_seen": 1408504, "step": 3435 }, { "epoch": 4.164648910411622, "grad_norm": 0.11729560792446136, "learning_rate": 4.823888527500673e-05, "loss": 0.0003, "num_input_tokens_seen": 1410584, "step": 3440 }, { "epoch": 4.170702179176756, "grad_norm": 0.005959239788353443, "learning_rate": 4.8229134531411166e-05, "loss": 0.0814, "num_input_tokens_seen": 1412568, "step": 3445 }, { "epoch": 4.176755447941889, "grad_norm": 12.615947723388672, "learning_rate": 4.8219357859898e-05, "loss": 0.0557, "num_input_tokens_seen": 1414456, "step": 3450 }, { "epoch": 4.182808716707021, "grad_norm": 0.027701057493686676, "learning_rate": 4.8209555271379744e-05, "loss": 0.0705, "num_input_tokens_seen": 1416504, "step": 3455 }, { "epoch": 4.188861985472155, "grad_norm": 0.05782383680343628, "learning_rate": 4.819972677679788e-05, "loss": 0.0409, "num_input_tokens_seen": 1418648, "step": 3460 }, { "epoch": 4.194915254237288, "grad_norm": 0.27830183506011963, "learning_rate": 4.818987238712278e-05, "loss": 0.0028, "num_input_tokens_seen": 1420728, "step": 3465 }, { "epoch": 4.200968523002421, "grad_norm": 0.0319935604929924, "learning_rate": 4.8179992113353724e-05, "loss": 0.0008, "num_input_tokens_seen": 1422744, "step": 3470 }, { "epoch": 4.207021791767555, "grad_norm": 0.022661130875349045, "learning_rate": 4.8170085966518885e-05, "loss": 0.0018, "num_input_tokens_seen": 1424632, "step": 3475 }, { "epoch": 4.213075060532688, "grad_norm": 0.5545247793197632, "learning_rate": 4.8160153957675316e-05, "loss": 0.0042, "num_input_tokens_seen": 1426712, "step": 3480 }, { "epoch": 4.219128329297821, "grad_norm": 0.15866227447986603, "learning_rate": 4.815019609790894e-05, "loss": 0.0011, "num_input_tokens_seen": 1428664, "step": 3485 }, { "epoch": 4.225181598062954, "grad_norm": 0.011144421994686127, "learning_rate": 4.8140212398334526e-05, "loss": 0.0015, "num_input_tokens_seen": 1430680, "step": 3490 }, { "epoch": 4.231234866828087, "grad_norm": 0.01984487473964691, "learning_rate": 4.81302028700957e-05, "loss": 0.0004, "num_input_tokens_seen": 1432760, "step": 3495 }, { "epoch": 4.237288135593221, "grad_norm": 0.004704284947365522, "learning_rate": 4.812016752436489e-05, "loss": 0.0003, "num_input_tokens_seen": 1434680, "step": 3500 }, { "epoch": 4.243341404358354, "grad_norm": 0.003517726669088006, "learning_rate": 4.8110106372343366e-05, "loss": 0.0003, "num_input_tokens_seen": 1436760, "step": 3505 }, { "epoch": 4.249394673123486, "grad_norm": 0.05473700165748596, "learning_rate": 4.81000194252612e-05, "loss": 0.0004, "num_input_tokens_seen": 1438776, "step": 3510 }, { "epoch": 4.25544794188862, "grad_norm": 0.0034939744509756565, "learning_rate": 4.808990669437724e-05, "loss": 0.0165, "num_input_tokens_seen": 1440792, "step": 3515 }, { "epoch": 4.261501210653753, "grad_norm": 0.007801917847245932, "learning_rate": 4.8079768190979135e-05, "loss": 0.0005, "num_input_tokens_seen": 1442904, "step": 3520 }, { "epoch": 4.267554479418886, "grad_norm": 0.30015483498573303, "learning_rate": 4.806960392638328e-05, "loss": 0.0004, "num_input_tokens_seen": 1444888, "step": 3525 }, { "epoch": 4.27360774818402, "grad_norm": 0.038136955350637436, "learning_rate": 4.805941391193484e-05, "loss": 0.0877, "num_input_tokens_seen": 1447064, "step": 3530 }, { "epoch": 4.279661016949152, "grad_norm": 0.0045110140927135944, "learning_rate": 4.80491981590077e-05, "loss": 0.0003, "num_input_tokens_seen": 1449048, "step": 3535 }, { "epoch": 4.285714285714286, "grad_norm": 0.02299519255757332, "learning_rate": 4.803895667900451e-05, "loss": 0.0602, "num_input_tokens_seen": 1451096, "step": 3540 }, { "epoch": 4.291767554479419, "grad_norm": 52.0346794128418, "learning_rate": 4.80286894833566e-05, "loss": 0.0252, "num_input_tokens_seen": 1453176, "step": 3545 }, { "epoch": 4.297820823244552, "grad_norm": 1.5008364915847778, "learning_rate": 4.801839658352403e-05, "loss": 0.1129, "num_input_tokens_seen": 1455192, "step": 3550 }, { "epoch": 4.303874092009686, "grad_norm": 19.435768127441406, "learning_rate": 4.800807799099552e-05, "loss": 0.0648, "num_input_tokens_seen": 1457080, "step": 3555 }, { "epoch": 4.309927360774818, "grad_norm": 0.007639667019248009, "learning_rate": 4.7997733717288506e-05, "loss": 0.0006, "num_input_tokens_seen": 1459160, "step": 3560 }, { "epoch": 4.315980629539951, "grad_norm": 0.14900118112564087, "learning_rate": 4.798736377394907e-05, "loss": 0.0009, "num_input_tokens_seen": 1461272, "step": 3565 }, { "epoch": 4.322033898305085, "grad_norm": 0.0010028004180639982, "learning_rate": 4.7976968172551925e-05, "loss": 0.1503, "num_input_tokens_seen": 1463224, "step": 3570 }, { "epoch": 4.328087167070218, "grad_norm": 1.756608247756958, "learning_rate": 4.796654692470046e-05, "loss": 0.0928, "num_input_tokens_seen": 1465240, "step": 3575 }, { "epoch": 4.3341404358353515, "grad_norm": 0.03211286664009094, "learning_rate": 4.795610004202668e-05, "loss": 0.0431, "num_input_tokens_seen": 1467160, "step": 3580 }, { "epoch": 4.340193704600484, "grad_norm": 0.07110316306352615, "learning_rate": 4.7945627536191166e-05, "loss": 0.0462, "num_input_tokens_seen": 1469240, "step": 3585 }, { "epoch": 4.346246973365617, "grad_norm": 0.10647711902856827, "learning_rate": 4.793512941888316e-05, "loss": 0.0066, "num_input_tokens_seen": 1471224, "step": 3590 }, { "epoch": 4.352300242130751, "grad_norm": 1.158477544784546, "learning_rate": 4.792460570182044e-05, "loss": 0.0104, "num_input_tokens_seen": 1473368, "step": 3595 }, { "epoch": 4.358353510895884, "grad_norm": 0.36422085762023926, "learning_rate": 4.791405639674941e-05, "loss": 0.0025, "num_input_tokens_seen": 1475416, "step": 3600 }, { "epoch": 4.364406779661017, "grad_norm": 0.01873144321143627, "learning_rate": 4.790348151544497e-05, "loss": 0.0784, "num_input_tokens_seen": 1477496, "step": 3605 }, { "epoch": 4.37046004842615, "grad_norm": 0.07810250669717789, "learning_rate": 4.7892881069710606e-05, "loss": 0.0028, "num_input_tokens_seen": 1479576, "step": 3610 }, { "epoch": 4.376513317191283, "grad_norm": 0.09578061103820801, "learning_rate": 4.7882255071378346e-05, "loss": 0.0013, "num_input_tokens_seen": 1481656, "step": 3615 }, { "epoch": 4.3825665859564165, "grad_norm": 0.013224023394286633, "learning_rate": 4.787160353230873e-05, "loss": 0.0008, "num_input_tokens_seen": 1483672, "step": 3620 }, { "epoch": 4.38861985472155, "grad_norm": 33.742889404296875, "learning_rate": 4.7860926464390786e-05, "loss": 0.0353, "num_input_tokens_seen": 1485752, "step": 3625 }, { "epoch": 4.394673123486683, "grad_norm": 0.007884323596954346, "learning_rate": 4.785022387954206e-05, "loss": 0.0004, "num_input_tokens_seen": 1487672, "step": 3630 }, { "epoch": 4.400726392251816, "grad_norm": 0.020842164754867554, "learning_rate": 4.783949578970858e-05, "loss": 0.0527, "num_input_tokens_seen": 1489752, "step": 3635 }, { "epoch": 4.406779661016949, "grad_norm": 2.2368924617767334, "learning_rate": 4.782874220686483e-05, "loss": 0.0892, "num_input_tokens_seen": 1491864, "step": 3640 }, { "epoch": 4.412832929782082, "grad_norm": 0.03532866761088371, "learning_rate": 4.7817963143013755e-05, "loss": 0.0007, "num_input_tokens_seen": 1493912, "step": 3645 }, { "epoch": 4.418886198547216, "grad_norm": 0.062098462134599686, "learning_rate": 4.780715861018675e-05, "loss": 0.1166, "num_input_tokens_seen": 1495960, "step": 3650 }, { "epoch": 4.424939467312349, "grad_norm": 5.496737003326416, "learning_rate": 4.7796328620443616e-05, "loss": 0.106, "num_input_tokens_seen": 1498104, "step": 3655 }, { "epoch": 4.4309927360774815, "grad_norm": 0.08465112000703812, "learning_rate": 4.778547318587259e-05, "loss": 0.0068, "num_input_tokens_seen": 1500248, "step": 3660 }, { "epoch": 4.437046004842615, "grad_norm": 0.038109034299850464, "learning_rate": 4.777459231859028e-05, "loss": 0.0047, "num_input_tokens_seen": 1502296, "step": 3665 }, { "epoch": 4.443099273607748, "grad_norm": 0.02793186902999878, "learning_rate": 4.776368603074174e-05, "loss": 0.0036, "num_input_tokens_seen": 1504344, "step": 3670 }, { "epoch": 4.4491525423728815, "grad_norm": 0.03011367842555046, "learning_rate": 4.7752754334500326e-05, "loss": 0.0012, "num_input_tokens_seen": 1506360, "step": 3675 }, { "epoch": 4.455205811138015, "grad_norm": 0.016545677557587624, "learning_rate": 4.7741797242067816e-05, "loss": 0.0668, "num_input_tokens_seen": 1508376, "step": 3680 }, { "epoch": 4.461259079903147, "grad_norm": 0.005595378112047911, "learning_rate": 4.773081476567429e-05, "loss": 0.011, "num_input_tokens_seen": 1510424, "step": 3685 }, { "epoch": 4.467312348668281, "grad_norm": 21.55636978149414, "learning_rate": 4.771980691757819e-05, "loss": 0.0425, "num_input_tokens_seen": 1512408, "step": 3690 }, { "epoch": 4.473365617433414, "grad_norm": 1.1322370767593384, "learning_rate": 4.7708773710066255e-05, "loss": 0.0017, "num_input_tokens_seen": 1514424, "step": 3695 }, { "epoch": 4.479418886198547, "grad_norm": 0.03245539218187332, "learning_rate": 4.769771515545355e-05, "loss": 0.0005, "num_input_tokens_seen": 1516536, "step": 3700 }, { "epoch": 4.485472154963681, "grad_norm": 0.01185368187725544, "learning_rate": 4.768663126608342e-05, "loss": 0.0845, "num_input_tokens_seen": 1518488, "step": 3705 }, { "epoch": 4.491525423728813, "grad_norm": 0.018686678260564804, "learning_rate": 4.7675522054327495e-05, "loss": 0.0052, "num_input_tokens_seen": 1520696, "step": 3710 }, { "epoch": 4.4975786924939465, "grad_norm": 7.210537433624268, "learning_rate": 4.7664387532585655e-05, "loss": 0.0753, "num_input_tokens_seen": 1522584, "step": 3715 }, { "epoch": 4.50363196125908, "grad_norm": 0.0215544942766428, "learning_rate": 4.765322771328605e-05, "loss": 0.0009, "num_input_tokens_seen": 1524568, "step": 3720 }, { "epoch": 4.509685230024213, "grad_norm": 0.025423722341656685, "learning_rate": 4.7642042608885064e-05, "loss": 0.0007, "num_input_tokens_seen": 1526680, "step": 3725 }, { "epoch": 4.5157384987893465, "grad_norm": 0.013880668208003044, "learning_rate": 4.7630832231867286e-05, "loss": 0.0011, "num_input_tokens_seen": 1528600, "step": 3730 }, { "epoch": 4.521791767554479, "grad_norm": 0.12901706993579865, "learning_rate": 4.761959659474553e-05, "loss": 0.0006, "num_input_tokens_seen": 1530648, "step": 3735 }, { "epoch": 4.527845036319612, "grad_norm": 0.053421925753355026, "learning_rate": 4.76083357100608e-05, "loss": 0.0005, "num_input_tokens_seen": 1532920, "step": 3740 }, { "epoch": 4.533898305084746, "grad_norm": 0.0015764142153784633, "learning_rate": 4.759704959038228e-05, "loss": 0.0004, "num_input_tokens_seen": 1535000, "step": 3745 }, { "epoch": 4.539951573849879, "grad_norm": 0.017509447410702705, "learning_rate": 4.758573824830732e-05, "loss": 0.0004, "num_input_tokens_seen": 1537048, "step": 3750 }, { "epoch": 4.546004842615012, "grad_norm": 0.013161486014723778, "learning_rate": 4.757440169646142e-05, "loss": 0.0003, "num_input_tokens_seen": 1539064, "step": 3755 }, { "epoch": 4.552058111380145, "grad_norm": 0.4890718460083008, "learning_rate": 4.756303994749824e-05, "loss": 0.0052, "num_input_tokens_seen": 1541176, "step": 3760 }, { "epoch": 4.558111380145278, "grad_norm": 0.0018566690851002932, "learning_rate": 4.755165301409954e-05, "loss": 0.0001, "num_input_tokens_seen": 1543320, "step": 3765 }, { "epoch": 4.5641646489104115, "grad_norm": 0.002822115086019039, "learning_rate": 4.75402409089752e-05, "loss": 0.0001, "num_input_tokens_seen": 1545272, "step": 3770 }, { "epoch": 4.570217917675545, "grad_norm": 73.02928161621094, "learning_rate": 4.7528803644863184e-05, "loss": 0.1283, "num_input_tokens_seen": 1547416, "step": 3775 }, { "epoch": 4.576271186440678, "grad_norm": 4.363590240478516, "learning_rate": 4.751734123452956e-05, "loss": 0.0957, "num_input_tokens_seen": 1549432, "step": 3780 }, { "epoch": 4.582324455205811, "grad_norm": 0.005630722735077143, "learning_rate": 4.750585369076843e-05, "loss": 0.0612, "num_input_tokens_seen": 1551512, "step": 3785 }, { "epoch": 4.588377723970944, "grad_norm": 0.01621035486459732, "learning_rate": 4.7494341026402006e-05, "loss": 0.0006, "num_input_tokens_seen": 1553560, "step": 3790 }, { "epoch": 4.594430992736077, "grad_norm": 2.864713191986084, "learning_rate": 4.7482803254280484e-05, "loss": 0.0632, "num_input_tokens_seen": 1555608, "step": 3795 }, { "epoch": 4.600484261501211, "grad_norm": 10.281551361083984, "learning_rate": 4.747124038728211e-05, "loss": 0.0083, "num_input_tokens_seen": 1557624, "step": 3800 }, { "epoch": 4.606537530266344, "grad_norm": 0.35545697808265686, "learning_rate": 4.745965243831313e-05, "loss": 0.0046, "num_input_tokens_seen": 1559608, "step": 3805 }, { "epoch": 4.6125907990314765, "grad_norm": 0.049050308763980865, "learning_rate": 4.74480394203078e-05, "loss": 0.0237, "num_input_tokens_seen": 1561560, "step": 3810 }, { "epoch": 4.61864406779661, "grad_norm": 0.033277157694101334, "learning_rate": 4.743640134622835e-05, "loss": 0.0025, "num_input_tokens_seen": 1563608, "step": 3815 }, { "epoch": 4.624697336561743, "grad_norm": 0.009306500665843487, "learning_rate": 4.742473822906497e-05, "loss": 0.0506, "num_input_tokens_seen": 1565688, "step": 3820 }, { "epoch": 4.6307506053268765, "grad_norm": 0.009021952748298645, "learning_rate": 4.741305008183581e-05, "loss": 0.0005, "num_input_tokens_seen": 1567544, "step": 3825 }, { "epoch": 4.63680387409201, "grad_norm": 0.004347851499915123, "learning_rate": 4.7401336917586965e-05, "loss": 0.0004, "num_input_tokens_seen": 1569560, "step": 3830 }, { "epoch": 4.642857142857143, "grad_norm": 0.1778583526611328, "learning_rate": 4.7389598749392436e-05, "loss": 0.0008, "num_input_tokens_seen": 1571704, "step": 3835 }, { "epoch": 4.648910411622276, "grad_norm": 0.0049683270044624805, "learning_rate": 4.737783559035415e-05, "loss": 0.0004, "num_input_tokens_seen": 1573688, "step": 3840 }, { "epoch": 4.654963680387409, "grad_norm": 0.01314992643892765, "learning_rate": 4.736604745360192e-05, "loss": 0.0005, "num_input_tokens_seen": 1575544, "step": 3845 }, { "epoch": 4.661016949152542, "grad_norm": 0.27048471570014954, "learning_rate": 4.735423435229344e-05, "loss": 0.0007, "num_input_tokens_seen": 1577592, "step": 3850 }, { "epoch": 4.667070217917676, "grad_norm": 0.004713078029453754, "learning_rate": 4.734239629961426e-05, "loss": 0.0393, "num_input_tokens_seen": 1579672, "step": 3855 }, { "epoch": 4.673123486682809, "grad_norm": 0.003963234834372997, "learning_rate": 4.7330533308777804e-05, "loss": 0.0002, "num_input_tokens_seen": 1581752, "step": 3860 }, { "epoch": 4.6791767554479415, "grad_norm": 0.46759751439094543, "learning_rate": 4.731864539302531e-05, "loss": 0.0441, "num_input_tokens_seen": 1583992, "step": 3865 }, { "epoch": 4.685230024213075, "grad_norm": 2.816149950027466, "learning_rate": 4.7306732565625834e-05, "loss": 0.0869, "num_input_tokens_seen": 1586104, "step": 3870 }, { "epoch": 4.691283292978208, "grad_norm": 0.004162050783634186, "learning_rate": 4.7294794839876254e-05, "loss": 0.0003, "num_input_tokens_seen": 1588216, "step": 3875 }, { "epoch": 4.697336561743342, "grad_norm": 8.020161628723145, "learning_rate": 4.728283222910124e-05, "loss": 0.1356, "num_input_tokens_seen": 1590232, "step": 3880 }, { "epoch": 4.703389830508475, "grad_norm": 0.03209589421749115, "learning_rate": 4.727084474665322e-05, "loss": 0.0766, "num_input_tokens_seen": 1592216, "step": 3885 }, { "epoch": 4.709443099273607, "grad_norm": 0.039240144193172455, "learning_rate": 4.72588324059124e-05, "loss": 0.1007, "num_input_tokens_seen": 1594232, "step": 3890 }, { "epoch": 4.715496368038741, "grad_norm": 0.08536580950021744, "learning_rate": 4.724679522028672e-05, "loss": 0.0025, "num_input_tokens_seen": 1596312, "step": 3895 }, { "epoch": 4.721549636803874, "grad_norm": 0.007443590555340052, "learning_rate": 4.723473320321186e-05, "loss": 0.0013, "num_input_tokens_seen": 1598488, "step": 3900 }, { "epoch": 4.727602905569007, "grad_norm": 0.02005002833902836, "learning_rate": 4.722264636815121e-05, "loss": 0.0016, "num_input_tokens_seen": 1600472, "step": 3905 }, { "epoch": 4.733656174334141, "grad_norm": 0.3878021836280823, "learning_rate": 4.721053472859588e-05, "loss": 0.0095, "num_input_tokens_seen": 1602392, "step": 3910 }, { "epoch": 4.739709443099273, "grad_norm": 0.02332082763314247, "learning_rate": 4.719839829806463e-05, "loss": 0.0006, "num_input_tokens_seen": 1604568, "step": 3915 }, { "epoch": 4.745762711864407, "grad_norm": 0.004489803686738014, "learning_rate": 4.7186237090103936e-05, "loss": 0.0009, "num_input_tokens_seen": 1606680, "step": 3920 }, { "epoch": 4.75181598062954, "grad_norm": 20.505470275878906, "learning_rate": 4.717405111828788e-05, "loss": 0.0056, "num_input_tokens_seen": 1608824, "step": 3925 }, { "epoch": 4.757869249394673, "grad_norm": 0.005863486789166927, "learning_rate": 4.716184039621824e-05, "loss": 0.0003, "num_input_tokens_seen": 1610744, "step": 3930 }, { "epoch": 4.763922518159807, "grad_norm": 0.005467884708195925, "learning_rate": 4.714960493752436e-05, "loss": 0.0005, "num_input_tokens_seen": 1612728, "step": 3935 }, { "epoch": 4.76997578692494, "grad_norm": 0.00864969938993454, "learning_rate": 4.7137344755863254e-05, "loss": 0.0004, "num_input_tokens_seen": 1614680, "step": 3940 }, { "epoch": 4.776029055690072, "grad_norm": 0.0022404887713491917, "learning_rate": 4.712505986491949e-05, "loss": 0.0004, "num_input_tokens_seen": 1616792, "step": 3945 }, { "epoch": 4.782082324455206, "grad_norm": 0.2018781453371048, "learning_rate": 4.711275027840524e-05, "loss": 0.0007, "num_input_tokens_seen": 1618744, "step": 3950 }, { "epoch": 4.788135593220339, "grad_norm": 0.07796760648488998, "learning_rate": 4.710041601006021e-05, "loss": 0.0004, "num_input_tokens_seen": 1620824, "step": 3955 }, { "epoch": 4.7941888619854724, "grad_norm": 34.364383697509766, "learning_rate": 4.708805707365169e-05, "loss": 0.0964, "num_input_tokens_seen": 1622968, "step": 3960 }, { "epoch": 4.800242130750606, "grad_norm": 0.0014117968967184424, "learning_rate": 4.7075673482974494e-05, "loss": 0.0011, "num_input_tokens_seen": 1625016, "step": 3965 }, { "epoch": 4.806295399515738, "grad_norm": 0.04170842096209526, "learning_rate": 4.706326525185094e-05, "loss": 0.0002, "num_input_tokens_seen": 1627000, "step": 3970 }, { "epoch": 4.812348668280872, "grad_norm": 2.2679946422576904, "learning_rate": 4.705083239413086e-05, "loss": 0.0017, "num_input_tokens_seen": 1628984, "step": 3975 }, { "epoch": 4.818401937046005, "grad_norm": 0.0007130205631256104, "learning_rate": 4.703837492369157e-05, "loss": 0.0681, "num_input_tokens_seen": 1630968, "step": 3980 }, { "epoch": 4.824455205811138, "grad_norm": 0.00567208556458354, "learning_rate": 4.702589285443788e-05, "loss": 0.1022, "num_input_tokens_seen": 1633272, "step": 3985 }, { "epoch": 4.830508474576272, "grad_norm": 0.018829528242349625, "learning_rate": 4.701338620030201e-05, "loss": 0.0002, "num_input_tokens_seen": 1635352, "step": 3990 }, { "epoch": 4.836561743341404, "grad_norm": 0.15951110422611237, "learning_rate": 4.7000854975243676e-05, "loss": 0.0008, "num_input_tokens_seen": 1637336, "step": 3995 }, { "epoch": 4.842615012106537, "grad_norm": 3.271028757095337, "learning_rate": 4.698829919324997e-05, "loss": 0.063, "num_input_tokens_seen": 1639384, "step": 4000 }, { "epoch": 4.848668280871671, "grad_norm": 0.07503771036863327, "learning_rate": 4.697571886833544e-05, "loss": 0.0012, "num_input_tokens_seen": 1641528, "step": 4005 }, { "epoch": 4.854721549636804, "grad_norm": 0.09898870438337326, "learning_rate": 4.696311401454198e-05, "loss": 0.0018, "num_input_tokens_seen": 1643480, "step": 4010 }, { "epoch": 4.8607748184019375, "grad_norm": 0.051961272954940796, "learning_rate": 4.695048464593891e-05, "loss": 0.0098, "num_input_tokens_seen": 1645560, "step": 4015 }, { "epoch": 4.86682808716707, "grad_norm": 0.014189820736646652, "learning_rate": 4.693783077662287e-05, "loss": 0.0789, "num_input_tokens_seen": 1647480, "step": 4020 }, { "epoch": 4.872881355932203, "grad_norm": 0.0020488216541707516, "learning_rate": 4.692515242071788e-05, "loss": 0.0001, "num_input_tokens_seen": 1649496, "step": 4025 }, { "epoch": 4.878934624697337, "grad_norm": 0.013802570290863514, "learning_rate": 4.6912449592375286e-05, "loss": 0.0002, "num_input_tokens_seen": 1651480, "step": 4030 }, { "epoch": 4.88498789346247, "grad_norm": 21.692052841186523, "learning_rate": 4.689972230577373e-05, "loss": 0.0075, "num_input_tokens_seen": 1653496, "step": 4035 }, { "epoch": 4.891041162227603, "grad_norm": 0.03537558764219284, "learning_rate": 4.6886970575119174e-05, "loss": 0.0003, "num_input_tokens_seen": 1655576, "step": 4040 }, { "epoch": 4.897094430992736, "grad_norm": 0.02283511310815811, "learning_rate": 4.687419441464486e-05, "loss": 0.0004, "num_input_tokens_seen": 1657624, "step": 4045 }, { "epoch": 4.903147699757869, "grad_norm": 0.0019069024128839374, "learning_rate": 4.6861393838611284e-05, "loss": 0.0112, "num_input_tokens_seen": 1659640, "step": 4050 }, { "epoch": 4.9092009685230025, "grad_norm": 36.52399826049805, "learning_rate": 4.6848568861306215e-05, "loss": 0.1072, "num_input_tokens_seen": 1661560, "step": 4055 }, { "epoch": 4.915254237288136, "grad_norm": 0.0023445934057235718, "learning_rate": 4.683571949704464e-05, "loss": 0.0001, "num_input_tokens_seen": 1663704, "step": 4060 }, { "epoch": 4.921307506053269, "grad_norm": 0.01957731507718563, "learning_rate": 4.682284576016878e-05, "loss": 0.0007, "num_input_tokens_seen": 1665784, "step": 4065 }, { "epoch": 4.927360774818402, "grad_norm": 0.0029572800267487764, "learning_rate": 4.6809947665048057e-05, "loss": 0.0008, "num_input_tokens_seen": 1667800, "step": 4070 }, { "epoch": 4.933414043583535, "grad_norm": 0.006003914400935173, "learning_rate": 4.679702522607908e-05, "loss": 0.049, "num_input_tokens_seen": 1669944, "step": 4075 }, { "epoch": 4.939467312348668, "grad_norm": 40.71260070800781, "learning_rate": 4.678407845768562e-05, "loss": 0.0258, "num_input_tokens_seen": 1671992, "step": 4080 }, { "epoch": 4.945520581113802, "grad_norm": 0.0010937668848782778, "learning_rate": 4.677110737431862e-05, "loss": 0.0005, "num_input_tokens_seen": 1674072, "step": 4085 }, { "epoch": 4.951573849878935, "grad_norm": 0.0008237341535277665, "learning_rate": 4.675811199045616e-05, "loss": 0.0001, "num_input_tokens_seen": 1676312, "step": 4090 }, { "epoch": 4.9576271186440675, "grad_norm": 0.013605214655399323, "learning_rate": 4.674509232060343e-05, "loss": 0.0774, "num_input_tokens_seen": 1678424, "step": 4095 }, { "epoch": 4.963680387409201, "grad_norm": 0.027778100222349167, "learning_rate": 4.6732048379292745e-05, "loss": 0.0005, "num_input_tokens_seen": 1680472, "step": 4100 }, { "epoch": 4.969733656174334, "grad_norm": 0.27299264073371887, "learning_rate": 4.6718980181083504e-05, "loss": 0.0002, "num_input_tokens_seen": 1682520, "step": 4105 }, { "epoch": 4.9757869249394675, "grad_norm": 0.0037337581161409616, "learning_rate": 4.670588774056218e-05, "loss": 0.0002, "num_input_tokens_seen": 1684472, "step": 4110 }, { "epoch": 4.981840193704601, "grad_norm": 3.3837337493896484, "learning_rate": 4.66927710723423e-05, "loss": 0.0012, "num_input_tokens_seen": 1686680, "step": 4115 }, { "epoch": 4.987893462469733, "grad_norm": 0.4643261134624481, "learning_rate": 4.667963019106445e-05, "loss": 0.0895, "num_input_tokens_seen": 1688728, "step": 4120 }, { "epoch": 4.993946731234867, "grad_norm": 0.02758590318262577, "learning_rate": 4.6666465111396226e-05, "loss": 0.1152, "num_input_tokens_seen": 1690712, "step": 4125 }, { "epoch": 5.0, "grad_norm": 0.028146039694547653, "learning_rate": 4.665327584803223e-05, "loss": 0.0818, "num_input_tokens_seen": 1692608, "step": 4130 }, { "epoch": 5.0, "eval_loss": 0.16807201504707336, "eval_runtime": 4.952, "eval_samples_per_second": 74.111, "eval_steps_per_second": 18.578, "num_input_tokens_seen": 1692608, "step": 4130 }, { "epoch": 5.006053268765133, "grad_norm": 0.06156104430556297, "learning_rate": 4.664006241569409e-05, "loss": 0.0832, "num_input_tokens_seen": 1694784, "step": 4135 }, { "epoch": 5.012106537530267, "grad_norm": 0.020736213773489, "learning_rate": 4.6626824829130355e-05, "loss": 0.0648, "num_input_tokens_seen": 1696832, "step": 4140 }, { "epoch": 5.018159806295399, "grad_norm": 0.028303997591137886, "learning_rate": 4.6613563103116594e-05, "loss": 0.0017, "num_input_tokens_seen": 1698976, "step": 4145 }, { "epoch": 5.0242130750605325, "grad_norm": 0.04601948708295822, "learning_rate": 4.660027725245528e-05, "loss": 0.0225, "num_input_tokens_seen": 1700992, "step": 4150 }, { "epoch": 5.030266343825666, "grad_norm": 0.01708562858402729, "learning_rate": 4.658696729197584e-05, "loss": 0.0011, "num_input_tokens_seen": 1702976, "step": 4155 }, { "epoch": 5.036319612590799, "grad_norm": 0.04511260241270065, "learning_rate": 4.6573633236534575e-05, "loss": 0.0009, "num_input_tokens_seen": 1705024, "step": 4160 }, { "epoch": 5.0423728813559325, "grad_norm": 0.04118425399065018, "learning_rate": 4.656027510101473e-05, "loss": 0.001, "num_input_tokens_seen": 1707072, "step": 4165 }, { "epoch": 5.048426150121065, "grad_norm": 0.009535955265164375, "learning_rate": 4.654689290032638e-05, "loss": 0.0005, "num_input_tokens_seen": 1709024, "step": 4170 }, { "epoch": 5.054479418886198, "grad_norm": 0.01963157206773758, "learning_rate": 4.65334866494065e-05, "loss": 0.0005, "num_input_tokens_seen": 1711008, "step": 4175 }, { "epoch": 5.060532687651332, "grad_norm": 0.013547525741159916, "learning_rate": 4.6520056363218876e-05, "loss": 0.0757, "num_input_tokens_seen": 1713056, "step": 4180 }, { "epoch": 5.066585956416465, "grad_norm": 0.2229597121477127, "learning_rate": 4.6506602056754153e-05, "loss": 0.0009, "num_input_tokens_seen": 1715072, "step": 4185 }, { "epoch": 5.072639225181598, "grad_norm": 0.009361869655549526, "learning_rate": 4.649312374502976e-05, "loss": 0.0004, "num_input_tokens_seen": 1717120, "step": 4190 }, { "epoch": 5.078692493946731, "grad_norm": 0.0046058110892772675, "learning_rate": 4.647962144308994e-05, "loss": 0.0002, "num_input_tokens_seen": 1719264, "step": 4195 }, { "epoch": 5.084745762711864, "grad_norm": 0.01770162209868431, "learning_rate": 4.646609516600569e-05, "loss": 0.0002, "num_input_tokens_seen": 1721408, "step": 4200 }, { "epoch": 5.0907990314769975, "grad_norm": 0.00349655793979764, "learning_rate": 4.6452544928874805e-05, "loss": 0.0005, "num_input_tokens_seen": 1723328, "step": 4205 }, { "epoch": 5.096852300242131, "grad_norm": 0.011260677129030228, "learning_rate": 4.6438970746821785e-05, "loss": 0.0845, "num_input_tokens_seen": 1725344, "step": 4210 }, { "epoch": 5.102905569007264, "grad_norm": 9.36107063293457, "learning_rate": 4.642537263499788e-05, "loss": 0.013, "num_input_tokens_seen": 1727488, "step": 4215 }, { "epoch": 5.108958837772397, "grad_norm": 22.226741790771484, "learning_rate": 4.641175060858105e-05, "loss": 0.1067, "num_input_tokens_seen": 1729472, "step": 4220 }, { "epoch": 5.11501210653753, "grad_norm": 0.016993323341012, "learning_rate": 4.639810468277592e-05, "loss": 0.0025, "num_input_tokens_seen": 1731456, "step": 4225 }, { "epoch": 5.121065375302663, "grad_norm": 0.01691816747188568, "learning_rate": 4.638443487281384e-05, "loss": 0.0006, "num_input_tokens_seen": 1733504, "step": 4230 }, { "epoch": 5.127118644067797, "grad_norm": 0.005476247984915972, "learning_rate": 4.637074119395277e-05, "loss": 0.0002, "num_input_tokens_seen": 1735552, "step": 4235 }, { "epoch": 5.13317191283293, "grad_norm": 0.007180575747042894, "learning_rate": 4.635702366147735e-05, "loss": 0.0002, "num_input_tokens_seen": 1737664, "step": 4240 }, { "epoch": 5.1392251815980625, "grad_norm": 0.0030778085347265005, "learning_rate": 4.634328229069881e-05, "loss": 0.0004, "num_input_tokens_seen": 1739584, "step": 4245 }, { "epoch": 5.145278450363196, "grad_norm": 0.029078351333737373, "learning_rate": 4.632951709695503e-05, "loss": 0.0003, "num_input_tokens_seen": 1741760, "step": 4250 }, { "epoch": 5.151331719128329, "grad_norm": 0.005109527613967657, "learning_rate": 4.6315728095610434e-05, "loss": 0.0713, "num_input_tokens_seen": 1743776, "step": 4255 }, { "epoch": 5.157384987893463, "grad_norm": 0.0032690216321498156, "learning_rate": 4.6301915302056054e-05, "loss": 0.0002, "num_input_tokens_seen": 1745888, "step": 4260 }, { "epoch": 5.163438256658596, "grad_norm": 0.009336970746517181, "learning_rate": 4.6288078731709474e-05, "loss": 0.0002, "num_input_tokens_seen": 1747904, "step": 4265 }, { "epoch": 5.169491525423728, "grad_norm": 0.08976799249649048, "learning_rate": 4.627421840001479e-05, "loss": 0.0003, "num_input_tokens_seen": 1749984, "step": 4270 }, { "epoch": 5.175544794188862, "grad_norm": 0.002009397139772773, "learning_rate": 4.6260334322442656e-05, "loss": 0.0001, "num_input_tokens_seen": 1752192, "step": 4275 }, { "epoch": 5.181598062953995, "grad_norm": 0.029716290533542633, "learning_rate": 4.6246426514490214e-05, "loss": 0.0001, "num_input_tokens_seen": 1754144, "step": 4280 }, { "epoch": 5.187651331719128, "grad_norm": 8.533007621765137, "learning_rate": 4.6232494991681094e-05, "loss": 0.0034, "num_input_tokens_seen": 1756256, "step": 4285 }, { "epoch": 5.193704600484262, "grad_norm": 0.0037602800875902176, "learning_rate": 4.6218539769565385e-05, "loss": 0.0001, "num_input_tokens_seen": 1758304, "step": 4290 }, { "epoch": 5.199757869249395, "grad_norm": 0.003721450688317418, "learning_rate": 4.620456086371966e-05, "loss": 0.0002, "num_input_tokens_seen": 1760352, "step": 4295 }, { "epoch": 5.2058111380145276, "grad_norm": 0.014277982525527477, "learning_rate": 4.619055828974689e-05, "loss": 0.0011, "num_input_tokens_seen": 1762464, "step": 4300 }, { "epoch": 5.211864406779661, "grad_norm": 0.017351360991597176, "learning_rate": 4.617653206327649e-05, "loss": 0.0001, "num_input_tokens_seen": 1764480, "step": 4305 }, { "epoch": 5.217917675544794, "grad_norm": 0.0011241794563829899, "learning_rate": 4.616248219996426e-05, "loss": 0.0, "num_input_tokens_seen": 1766688, "step": 4310 }, { "epoch": 5.223970944309928, "grad_norm": 0.0042012035846710205, "learning_rate": 4.6148408715492405e-05, "loss": 0.0619, "num_input_tokens_seen": 1768640, "step": 4315 }, { "epoch": 5.230024213075061, "grad_norm": 0.0019620792008936405, "learning_rate": 4.613431162556947e-05, "loss": 0.0002, "num_input_tokens_seen": 1770656, "step": 4320 }, { "epoch": 5.236077481840193, "grad_norm": 10.916512489318848, "learning_rate": 4.612019094593035e-05, "loss": 0.057, "num_input_tokens_seen": 1772544, "step": 4325 }, { "epoch": 5.242130750605327, "grad_norm": 0.0038026098627597094, "learning_rate": 4.61060466923363e-05, "loss": 0.0001, "num_input_tokens_seen": 1774592, "step": 4330 }, { "epoch": 5.24818401937046, "grad_norm": 0.8485783934593201, "learning_rate": 4.609187888057485e-05, "loss": 0.0005, "num_input_tokens_seen": 1776704, "step": 4335 }, { "epoch": 5.254237288135593, "grad_norm": 0.011841859668493271, "learning_rate": 4.607768752645984e-05, "loss": 0.0031, "num_input_tokens_seen": 1778720, "step": 4340 }, { "epoch": 5.260290556900727, "grad_norm": 0.000771852268371731, "learning_rate": 4.60634726458314e-05, "loss": 0.0178, "num_input_tokens_seen": 1780736, "step": 4345 }, { "epoch": 5.266343825665859, "grad_norm": 0.008520427159965038, "learning_rate": 4.60492342545559e-05, "loss": 0.0002, "num_input_tokens_seen": 1782752, "step": 4350 }, { "epoch": 5.272397094430993, "grad_norm": 0.003861643373966217, "learning_rate": 4.603497236852596e-05, "loss": 0.0001, "num_input_tokens_seen": 1784768, "step": 4355 }, { "epoch": 5.278450363196126, "grad_norm": 0.00485737482085824, "learning_rate": 4.6020687003660445e-05, "loss": 0.0001, "num_input_tokens_seen": 1786816, "step": 4360 }, { "epoch": 5.284503631961259, "grad_norm": 0.002303200773894787, "learning_rate": 4.600637817590438e-05, "loss": 0.0027, "num_input_tokens_seen": 1789024, "step": 4365 }, { "epoch": 5.290556900726393, "grad_norm": 0.00895336177200079, "learning_rate": 4.599204590122903e-05, "loss": 0.0, "num_input_tokens_seen": 1791136, "step": 4370 }, { "epoch": 5.296610169491525, "grad_norm": 0.26760879158973694, "learning_rate": 4.59776901956318e-05, "loss": 0.0004, "num_input_tokens_seen": 1793120, "step": 4375 }, { "epoch": 5.302663438256658, "grad_norm": 0.00627902103587985, "learning_rate": 4.596331107513623e-05, "loss": 0.0001, "num_input_tokens_seen": 1795072, "step": 4380 }, { "epoch": 5.308716707021792, "grad_norm": 5.8242340087890625, "learning_rate": 4.594890855579206e-05, "loss": 0.1647, "num_input_tokens_seen": 1797024, "step": 4385 }, { "epoch": 5.314769975786925, "grad_norm": 0.009973247535526752, "learning_rate": 4.593448265367508e-05, "loss": 0.0002, "num_input_tokens_seen": 1799072, "step": 4390 }, { "epoch": 5.3208232445520585, "grad_norm": 0.00502827949821949, "learning_rate": 4.5920033384887215e-05, "loss": 0.0002, "num_input_tokens_seen": 1801152, "step": 4395 }, { "epoch": 5.326876513317191, "grad_norm": 0.004278827924281359, "learning_rate": 4.5905560765556476e-05, "loss": 0.0004, "num_input_tokens_seen": 1803104, "step": 4400 }, { "epoch": 5.332929782082324, "grad_norm": 0.0020008967258036137, "learning_rate": 4.589106481183691e-05, "loss": 0.1098, "num_input_tokens_seen": 1805152, "step": 4405 }, { "epoch": 5.338983050847458, "grad_norm": 0.048032619059085846, "learning_rate": 4.5876545539908636e-05, "loss": 0.0005, "num_input_tokens_seen": 1807168, "step": 4410 }, { "epoch": 5.345036319612591, "grad_norm": 0.04805127531290054, "learning_rate": 4.586200296597777e-05, "loss": 0.001, "num_input_tokens_seen": 1809152, "step": 4415 }, { "epoch": 5.351089588377724, "grad_norm": 0.016722718253731728, "learning_rate": 4.584743710627648e-05, "loss": 0.0018, "num_input_tokens_seen": 1811008, "step": 4420 }, { "epoch": 5.357142857142857, "grad_norm": 0.12908999621868134, "learning_rate": 4.5832847977062874e-05, "loss": 0.0012, "num_input_tokens_seen": 1813024, "step": 4425 }, { "epoch": 5.36319612590799, "grad_norm": 0.053991809487342834, "learning_rate": 4.581823559462108e-05, "loss": 0.0397, "num_input_tokens_seen": 1815040, "step": 4430 }, { "epoch": 5.3692493946731235, "grad_norm": 0.21373096108436584, "learning_rate": 4.5803599975261166e-05, "loss": 0.0448, "num_input_tokens_seen": 1817120, "step": 4435 }, { "epoch": 5.375302663438257, "grad_norm": 0.08897626399993896, "learning_rate": 4.578894113531912e-05, "loss": 0.0005, "num_input_tokens_seen": 1819040, "step": 4440 }, { "epoch": 5.38135593220339, "grad_norm": 0.0069953021593391895, "learning_rate": 4.577425909115686e-05, "loss": 0.0003, "num_input_tokens_seen": 1820928, "step": 4445 }, { "epoch": 5.387409200968523, "grad_norm": 0.011915040202438831, "learning_rate": 4.57595538591622e-05, "loss": 0.0003, "num_input_tokens_seen": 1822880, "step": 4450 }, { "epoch": 5.393462469733656, "grad_norm": 0.011078120209276676, "learning_rate": 4.5744825455748853e-05, "loss": 0.0006, "num_input_tokens_seen": 1824832, "step": 4455 }, { "epoch": 5.399515738498789, "grad_norm": 0.050896741449832916, "learning_rate": 4.573007389735637e-05, "loss": 0.0003, "num_input_tokens_seen": 1826848, "step": 4460 }, { "epoch": 5.405569007263923, "grad_norm": 0.003739492269232869, "learning_rate": 4.571529920045018e-05, "loss": 0.0003, "num_input_tokens_seen": 1828896, "step": 4465 }, { "epoch": 5.411622276029056, "grad_norm": 0.027269357815384865, "learning_rate": 4.5700501381521485e-05, "loss": 0.0006, "num_input_tokens_seen": 1830880, "step": 4470 }, { "epoch": 5.4176755447941884, "grad_norm": 0.01259615458548069, "learning_rate": 4.568568045708735e-05, "loss": 0.0002, "num_input_tokens_seen": 1833024, "step": 4475 }, { "epoch": 5.423728813559322, "grad_norm": 0.0037806262262165546, "learning_rate": 4.567083644369062e-05, "loss": 0.0002, "num_input_tokens_seen": 1834976, "step": 4480 }, { "epoch": 5.429782082324455, "grad_norm": 0.0029252918902784586, "learning_rate": 4.5655969357899874e-05, "loss": 0.0003, "num_input_tokens_seen": 1837024, "step": 4485 }, { "epoch": 5.4358353510895885, "grad_norm": 0.0025679077953100204, "learning_rate": 4.564107921630948e-05, "loss": 0.0004, "num_input_tokens_seen": 1839104, "step": 4490 }, { "epoch": 5.441888619854722, "grad_norm": 0.005043715704232454, "learning_rate": 4.5626166035539545e-05, "loss": 0.0001, "num_input_tokens_seen": 1841216, "step": 4495 }, { "epoch": 5.447941888619855, "grad_norm": 0.015447926707565784, "learning_rate": 4.561122983223585e-05, "loss": 0.0001, "num_input_tokens_seen": 1843168, "step": 4500 }, { "epoch": 5.453995157384988, "grad_norm": 0.07067233324050903, "learning_rate": 4.559627062306993e-05, "loss": 0.1126, "num_input_tokens_seen": 1845184, "step": 4505 }, { "epoch": 5.460048426150121, "grad_norm": 0.0018002944998443127, "learning_rate": 4.558128842473894e-05, "loss": 0.0001, "num_input_tokens_seen": 1847136, "step": 4510 }, { "epoch": 5.466101694915254, "grad_norm": 0.08698341995477676, "learning_rate": 4.5566283253965744e-05, "loss": 0.0003, "num_input_tokens_seen": 1849248, "step": 4515 }, { "epoch": 5.472154963680388, "grad_norm": 0.002953785005956888, "learning_rate": 4.555125512749883e-05, "loss": 0.0001, "num_input_tokens_seen": 1851264, "step": 4520 }, { "epoch": 5.478208232445521, "grad_norm": 0.0016207287553697824, "learning_rate": 4.55362040621123e-05, "loss": 0.0001, "num_input_tokens_seen": 1853376, "step": 4525 }, { "epoch": 5.4842615012106535, "grad_norm": 0.01589774154126644, "learning_rate": 4.552113007460586e-05, "loss": 0.0002, "num_input_tokens_seen": 1855456, "step": 4530 }, { "epoch": 5.490314769975787, "grad_norm": 1.052240014076233, "learning_rate": 4.550603318180481e-05, "loss": 0.0007, "num_input_tokens_seen": 1857568, "step": 4535 }, { "epoch": 5.49636803874092, "grad_norm": 0.0017332733841612935, "learning_rate": 4.549091340056003e-05, "loss": 0.0001, "num_input_tokens_seen": 1859712, "step": 4540 }, { "epoch": 5.5024213075060535, "grad_norm": 0.013113804161548615, "learning_rate": 4.547577074774791e-05, "loss": 0.0004, "num_input_tokens_seen": 1861696, "step": 4545 }, { "epoch": 5.508474576271187, "grad_norm": 36.4571533203125, "learning_rate": 4.546060524027041e-05, "loss": 0.1723, "num_input_tokens_seen": 1863712, "step": 4550 }, { "epoch": 5.514527845036319, "grad_norm": 0.007026940118521452, "learning_rate": 4.544541689505497e-05, "loss": 0.1439, "num_input_tokens_seen": 1865696, "step": 4555 }, { "epoch": 5.520581113801453, "grad_norm": 0.013779771514236927, "learning_rate": 4.543020572905453e-05, "loss": 0.0002, "num_input_tokens_seen": 1867680, "step": 4560 }, { "epoch": 5.526634382566586, "grad_norm": 0.1315293312072754, "learning_rate": 4.54149717592475e-05, "loss": 0.0007, "num_input_tokens_seen": 1869888, "step": 4565 }, { "epoch": 5.532687651331719, "grad_norm": 0.008989150635898113, "learning_rate": 4.539971500263777e-05, "loss": 0.0002, "num_input_tokens_seen": 1872032, "step": 4570 }, { "epoch": 5.538740920096853, "grad_norm": 0.005280650220811367, "learning_rate": 4.5384435476254616e-05, "loss": 0.0002, "num_input_tokens_seen": 1874048, "step": 4575 }, { "epoch": 5.544794188861985, "grad_norm": 0.22098782658576965, "learning_rate": 4.536913319715277e-05, "loss": 0.0005, "num_input_tokens_seen": 1876032, "step": 4580 }, { "epoch": 5.5508474576271185, "grad_norm": 0.0087638720870018, "learning_rate": 4.535380818241233e-05, "loss": 0.0002, "num_input_tokens_seen": 1878080, "step": 4585 }, { "epoch": 5.556900726392252, "grad_norm": 0.001765310880728066, "learning_rate": 4.53384604491388e-05, "loss": 0.0001, "num_input_tokens_seen": 1880096, "step": 4590 }, { "epoch": 5.562953995157385, "grad_norm": 0.07208627462387085, "learning_rate": 4.532309001446302e-05, "loss": 0.0231, "num_input_tokens_seen": 1882080, "step": 4595 }, { "epoch": 5.5690072639225185, "grad_norm": 0.0020223897881805897, "learning_rate": 4.530769689554117e-05, "loss": 0.0001, "num_input_tokens_seen": 1884160, "step": 4600 }, { "epoch": 5.575060532687651, "grad_norm": 0.03922637179493904, "learning_rate": 4.529228110955478e-05, "loss": 0.0005, "num_input_tokens_seen": 1886240, "step": 4605 }, { "epoch": 5.581113801452784, "grad_norm": 0.13005660474300385, "learning_rate": 4.527684267371063e-05, "loss": 0.0584, "num_input_tokens_seen": 1888352, "step": 4610 }, { "epoch": 5.587167070217918, "grad_norm": 0.005168238189071417, "learning_rate": 4.526138160524083e-05, "loss": 0.0004, "num_input_tokens_seen": 1890400, "step": 4615 }, { "epoch": 5.593220338983051, "grad_norm": 0.004368984140455723, "learning_rate": 4.52458979214027e-05, "loss": 0.0001, "num_input_tokens_seen": 1892352, "step": 4620 }, { "epoch": 5.599273607748184, "grad_norm": 0.0018089881632477045, "learning_rate": 4.523039163947887e-05, "loss": 0.0001, "num_input_tokens_seen": 1894528, "step": 4625 }, { "epoch": 5.605326876513317, "grad_norm": 0.009426716715097427, "learning_rate": 4.521486277677714e-05, "loss": 0.0001, "num_input_tokens_seen": 1896544, "step": 4630 }, { "epoch": 5.61138014527845, "grad_norm": 0.0007473472505807877, "learning_rate": 4.519931135063051e-05, "loss": 0.0001, "num_input_tokens_seen": 1898496, "step": 4635 }, { "epoch": 5.6174334140435835, "grad_norm": 0.000417503499193117, "learning_rate": 4.5183737378397224e-05, "loss": 0.0002, "num_input_tokens_seen": 1900608, "step": 4640 }, { "epoch": 5.623486682808717, "grad_norm": 3.5919182300567627, "learning_rate": 4.516814087746063e-05, "loss": 0.0016, "num_input_tokens_seen": 1902496, "step": 4645 }, { "epoch": 5.62953995157385, "grad_norm": 0.0010951816802844405, "learning_rate": 4.5152521865229244e-05, "loss": 0.0, "num_input_tokens_seen": 1904512, "step": 4650 }, { "epoch": 5.635593220338983, "grad_norm": 0.0034006715286523104, "learning_rate": 4.513688035913672e-05, "loss": 0.0001, "num_input_tokens_seen": 1906592, "step": 4655 }, { "epoch": 5.641646489104116, "grad_norm": 0.01648564636707306, "learning_rate": 4.512121637664181e-05, "loss": 0.0004, "num_input_tokens_seen": 1908672, "step": 4660 }, { "epoch": 5.647699757869249, "grad_norm": 28.30640411376953, "learning_rate": 4.510552993522834e-05, "loss": 0.0658, "num_input_tokens_seen": 1910624, "step": 4665 }, { "epoch": 5.653753026634383, "grad_norm": 0.001029284787364304, "learning_rate": 4.508982105240523e-05, "loss": 0.0, "num_input_tokens_seen": 1912608, "step": 4670 }, { "epoch": 5.659806295399516, "grad_norm": 1.6146907806396484, "learning_rate": 4.5074089745706434e-05, "loss": 0.0008, "num_input_tokens_seen": 1914656, "step": 4675 }, { "epoch": 5.6658595641646485, "grad_norm": 0.0004873225116170943, "learning_rate": 4.505833603269093e-05, "loss": 0.0001, "num_input_tokens_seen": 1916864, "step": 4680 }, { "epoch": 5.671912832929782, "grad_norm": 0.00237577804364264, "learning_rate": 4.504255993094272e-05, "loss": 0.068, "num_input_tokens_seen": 1918912, "step": 4685 }, { "epoch": 5.677966101694915, "grad_norm": 0.00288300565443933, "learning_rate": 4.502676145807079e-05, "loss": 0.0, "num_input_tokens_seen": 1920896, "step": 4690 }, { "epoch": 5.684019370460049, "grad_norm": 0.0012799138203263283, "learning_rate": 4.501094063170908e-05, "loss": 0.0001, "num_input_tokens_seen": 1923104, "step": 4695 }, { "epoch": 5.690072639225182, "grad_norm": 0.003274239832535386, "learning_rate": 4.4995097469516506e-05, "loss": 0.0001, "num_input_tokens_seen": 1925152, "step": 4700 }, { "epoch": 5.696125907990314, "grad_norm": 0.0017929351888597012, "learning_rate": 4.49792319891769e-05, "loss": 0.0001, "num_input_tokens_seen": 1927296, "step": 4705 }, { "epoch": 5.702179176755448, "grad_norm": 0.0025752561632543802, "learning_rate": 4.496334420839902e-05, "loss": 0.0001, "num_input_tokens_seen": 1929440, "step": 4710 }, { "epoch": 5.708232445520581, "grad_norm": 0.17037180066108704, "learning_rate": 4.49474341449165e-05, "loss": 0.0001, "num_input_tokens_seen": 1931552, "step": 4715 }, { "epoch": 5.714285714285714, "grad_norm": 0.0021306953858584166, "learning_rate": 4.493150181648783e-05, "loss": 0.0, "num_input_tokens_seen": 1933632, "step": 4720 }, { "epoch": 5.720338983050848, "grad_norm": 0.004187833517789841, "learning_rate": 4.49155472408964e-05, "loss": 0.0001, "num_input_tokens_seen": 1935776, "step": 4725 }, { "epoch": 5.72639225181598, "grad_norm": 0.0018270231084898114, "learning_rate": 4.4899570435950386e-05, "loss": 0.0, "num_input_tokens_seen": 1937824, "step": 4730 }, { "epoch": 5.732445520581114, "grad_norm": 0.0005928910104557872, "learning_rate": 4.488357141948281e-05, "loss": 0.0, "num_input_tokens_seen": 1939904, "step": 4735 }, { "epoch": 5.738498789346247, "grad_norm": 0.01229714136570692, "learning_rate": 4.486755020935144e-05, "loss": 0.0, "num_input_tokens_seen": 1941920, "step": 4740 }, { "epoch": 5.74455205811138, "grad_norm": 0.012938208878040314, "learning_rate": 4.485150682343887e-05, "loss": 0.0001, "num_input_tokens_seen": 1943968, "step": 4745 }, { "epoch": 5.750605326876514, "grad_norm": 0.0002815666957758367, "learning_rate": 4.483544127965243e-05, "loss": 0.0, "num_input_tokens_seen": 1946208, "step": 4750 }, { "epoch": 5.756658595641646, "grad_norm": 0.0013445024378597736, "learning_rate": 4.481935359592414e-05, "loss": 0.0, "num_input_tokens_seen": 1948192, "step": 4755 }, { "epoch": 5.762711864406779, "grad_norm": 20.636503219604492, "learning_rate": 4.4803243790210795e-05, "loss": 0.0962, "num_input_tokens_seen": 1950240, "step": 4760 }, { "epoch": 5.768765133171913, "grad_norm": 0.007046419195830822, "learning_rate": 4.478711188049384e-05, "loss": 0.0791, "num_input_tokens_seen": 1952256, "step": 4765 }, { "epoch": 5.774818401937046, "grad_norm": 0.0010208688909187913, "learning_rate": 4.477095788477941e-05, "loss": 0.0001, "num_input_tokens_seen": 1954368, "step": 4770 }, { "epoch": 5.780871670702179, "grad_norm": 0.007547380402684212, "learning_rate": 4.475478182109829e-05, "loss": 0.0001, "num_input_tokens_seen": 1956512, "step": 4775 }, { "epoch": 5.786924939467312, "grad_norm": 0.03931564465165138, "learning_rate": 4.4738583707505885e-05, "loss": 0.0003, "num_input_tokens_seen": 1958464, "step": 4780 }, { "epoch": 5.792978208232445, "grad_norm": 0.00022476895537693053, "learning_rate": 4.472236356208224e-05, "loss": 0.0005, "num_input_tokens_seen": 1960448, "step": 4785 }, { "epoch": 5.799031476997579, "grad_norm": 0.002734287641942501, "learning_rate": 4.4706121402931946e-05, "loss": 0.0735, "num_input_tokens_seen": 1962336, "step": 4790 }, { "epoch": 5.805084745762712, "grad_norm": 0.04597781226038933, "learning_rate": 4.468985724818421e-05, "loss": 0.0006, "num_input_tokens_seen": 1964512, "step": 4795 }, { "epoch": 5.811138014527845, "grad_norm": 0.04112609103322029, "learning_rate": 4.4673571115992766e-05, "loss": 0.0636, "num_input_tokens_seen": 1966528, "step": 4800 }, { "epoch": 5.817191283292978, "grad_norm": 0.0923021212220192, "learning_rate": 4.465726302453589e-05, "loss": 0.0002, "num_input_tokens_seen": 1968512, "step": 4805 }, { "epoch": 5.823244552058111, "grad_norm": 0.008162632584571838, "learning_rate": 4.464093299201636e-05, "loss": 0.0004, "num_input_tokens_seen": 1970752, "step": 4810 }, { "epoch": 5.829297820823244, "grad_norm": 0.25362634658813477, "learning_rate": 4.462458103666144e-05, "loss": 0.0801, "num_input_tokens_seen": 1972800, "step": 4815 }, { "epoch": 5.835351089588378, "grad_norm": 0.011706355027854443, "learning_rate": 4.4608207176722884e-05, "loss": 0.0005, "num_input_tokens_seen": 1974880, "step": 4820 }, { "epoch": 5.841404358353511, "grad_norm": 0.06953795254230499, "learning_rate": 4.459181143047687e-05, "loss": 0.0033, "num_input_tokens_seen": 1976800, "step": 4825 }, { "epoch": 5.847457627118644, "grad_norm": 12.630387306213379, "learning_rate": 4.457539381622402e-05, "loss": 0.0283, "num_input_tokens_seen": 1978944, "step": 4830 }, { "epoch": 5.853510895883777, "grad_norm": 0.0037331325002014637, "learning_rate": 4.455895435228937e-05, "loss": 0.0001, "num_input_tokens_seen": 1980960, "step": 4835 }, { "epoch": 5.85956416464891, "grad_norm": 0.006345625966787338, "learning_rate": 4.4542493057022336e-05, "loss": 0.0001, "num_input_tokens_seen": 1983008, "step": 4840 }, { "epoch": 5.865617433414044, "grad_norm": 0.009668613784015179, "learning_rate": 4.4526009948796703e-05, "loss": 0.0001, "num_input_tokens_seen": 1985024, "step": 4845 }, { "epoch": 5.871670702179177, "grad_norm": 0.001737504149787128, "learning_rate": 4.450950504601059e-05, "loss": 0.0, "num_input_tokens_seen": 1987168, "step": 4850 }, { "epoch": 5.877723970944309, "grad_norm": 0.005905769765377045, "learning_rate": 4.449297836708647e-05, "loss": 0.0716, "num_input_tokens_seen": 1989248, "step": 4855 }, { "epoch": 5.883777239709443, "grad_norm": 0.0001236728421645239, "learning_rate": 4.44764299304711e-05, "loss": 0.0001, "num_input_tokens_seen": 1991552, "step": 4860 }, { "epoch": 5.889830508474576, "grad_norm": 0.00013050688721705228, "learning_rate": 4.445985975463552e-05, "loss": 0.0657, "num_input_tokens_seen": 1993632, "step": 4865 }, { "epoch": 5.8958837772397095, "grad_norm": 0.00032983810524456203, "learning_rate": 4.444326785807507e-05, "loss": 0.0015, "num_input_tokens_seen": 1995744, "step": 4870 }, { "epoch": 5.901937046004843, "grad_norm": 0.0016883015632629395, "learning_rate": 4.442665425930929e-05, "loss": 0.0197, "num_input_tokens_seen": 1997888, "step": 4875 }, { "epoch": 5.907990314769976, "grad_norm": 0.0457405187189579, "learning_rate": 4.4410018976881966e-05, "loss": 0.0005, "num_input_tokens_seen": 2000032, "step": 4880 }, { "epoch": 5.914043583535109, "grad_norm": 0.0525968074798584, "learning_rate": 4.4393362029361086e-05, "loss": 0.0001, "num_input_tokens_seen": 2002112, "step": 4885 }, { "epoch": 5.920096852300242, "grad_norm": 0.007332314271479845, "learning_rate": 4.4376683435338816e-05, "loss": 0.0001, "num_input_tokens_seen": 2004096, "step": 4890 }, { "epoch": 5.926150121065375, "grad_norm": 3.328958034515381, "learning_rate": 4.435998321343149e-05, "loss": 0.0017, "num_input_tokens_seen": 2006144, "step": 4895 }, { "epoch": 5.932203389830509, "grad_norm": 0.01501663401722908, "learning_rate": 4.434326138227957e-05, "loss": 0.0202, "num_input_tokens_seen": 2008128, "step": 4900 }, { "epoch": 5.938256658595642, "grad_norm": 0.0005250175017863512, "learning_rate": 4.4326517960547644e-05, "loss": 0.0002, "num_input_tokens_seen": 2010080, "step": 4905 }, { "epoch": 5.9443099273607745, "grad_norm": 0.002377455122768879, "learning_rate": 4.43097529669244e-05, "loss": 0.0001, "num_input_tokens_seen": 2012000, "step": 4910 }, { "epoch": 5.950363196125908, "grad_norm": 0.002046670764684677, "learning_rate": 4.4292966420122614e-05, "loss": 0.0001, "num_input_tokens_seen": 2014016, "step": 4915 }, { "epoch": 5.956416464891041, "grad_norm": 0.0004662233986891806, "learning_rate": 4.427615833887911e-05, "loss": 0.0677, "num_input_tokens_seen": 2016064, "step": 4920 }, { "epoch": 5.9624697336561745, "grad_norm": 0.0009716207860037684, "learning_rate": 4.4259328741954744e-05, "loss": 0.0128, "num_input_tokens_seen": 2018112, "step": 4925 }, { "epoch": 5.968523002421308, "grad_norm": 0.0017461152747273445, "learning_rate": 4.424247764813439e-05, "loss": 0.0002, "num_input_tokens_seen": 2020128, "step": 4930 }, { "epoch": 5.97457627118644, "grad_norm": 0.10973256826400757, "learning_rate": 4.422560507622692e-05, "loss": 0.0078, "num_input_tokens_seen": 2022144, "step": 4935 }, { "epoch": 5.980629539951574, "grad_norm": 0.0006367381429299712, "learning_rate": 4.4208711045065174e-05, "loss": 0.0001, "num_input_tokens_seen": 2024064, "step": 4940 }, { "epoch": 5.986682808716707, "grad_norm": 0.006334998644888401, "learning_rate": 4.419179557350594e-05, "loss": 0.0402, "num_input_tokens_seen": 2026144, "step": 4945 }, { "epoch": 5.99273607748184, "grad_norm": 0.0003036895941477269, "learning_rate": 4.417485868042998e-05, "loss": 0.0001, "num_input_tokens_seen": 2028352, "step": 4950 }, { "epoch": 5.998789346246974, "grad_norm": 0.004526303615421057, "learning_rate": 4.415790038474189e-05, "loss": 0.1482, "num_input_tokens_seen": 2030336, "step": 4955 }, { "epoch": 6.0, "eval_loss": 0.17872284352779388, "eval_runtime": 5.2284, "eval_samples_per_second": 70.194, "eval_steps_per_second": 17.596, "num_input_tokens_seen": 2030416, "step": 4956 }, { "epoch": 6.004842615012106, "grad_norm": 0.017790528014302254, "learning_rate": 4.414092070537022e-05, "loss": 0.0002, "num_input_tokens_seen": 2032080, "step": 4960 }, { "epoch": 6.0108958837772395, "grad_norm": 0.033765822649002075, "learning_rate": 4.412391966126735e-05, "loss": 0.0608, "num_input_tokens_seen": 2034160, "step": 4965 }, { "epoch": 6.016949152542373, "grad_norm": 0.08563821017742157, "learning_rate": 4.4106897271409544e-05, "loss": 0.0035, "num_input_tokens_seen": 2036208, "step": 4970 }, { "epoch": 6.023002421307506, "grad_norm": 1.340657114982605, "learning_rate": 4.408985355479685e-05, "loss": 0.0023, "num_input_tokens_seen": 2038288, "step": 4975 }, { "epoch": 6.0290556900726395, "grad_norm": 0.1492939442396164, "learning_rate": 4.4072788530453146e-05, "loss": 0.002, "num_input_tokens_seen": 2040272, "step": 4980 }, { "epoch": 6.035108958837772, "grad_norm": 0.03115965612232685, "learning_rate": 4.4055702217426084e-05, "loss": 0.0013, "num_input_tokens_seen": 2042416, "step": 4985 }, { "epoch": 6.041162227602905, "grad_norm": 0.027966121211647987, "learning_rate": 4.403859463478709e-05, "loss": 0.0005, "num_input_tokens_seen": 2044528, "step": 4990 }, { "epoch": 6.047215496368039, "grad_norm": 40.94167709350586, "learning_rate": 4.402146580163131e-05, "loss": 0.02, "num_input_tokens_seen": 2046576, "step": 4995 }, { "epoch": 6.053268765133172, "grad_norm": 0.01921970583498478, "learning_rate": 4.400431573707764e-05, "loss": 0.0865, "num_input_tokens_seen": 2048528, "step": 5000 }, { "epoch": 6.059322033898305, "grad_norm": 0.0007571494206786156, "learning_rate": 4.398714446026863e-05, "loss": 0.0012, "num_input_tokens_seen": 2050640, "step": 5005 }, { "epoch": 6.065375302663438, "grad_norm": 0.08183982223272324, "learning_rate": 4.396995199037055e-05, "loss": 0.002, "num_input_tokens_seen": 2052624, "step": 5010 }, { "epoch": 6.071428571428571, "grad_norm": 0.11652474850416183, "learning_rate": 4.3952738346573296e-05, "loss": 0.0018, "num_input_tokens_seen": 2054640, "step": 5015 }, { "epoch": 6.0774818401937045, "grad_norm": 0.00061530846869573, "learning_rate": 4.393550354809043e-05, "loss": 0.0006, "num_input_tokens_seen": 2056720, "step": 5020 }, { "epoch": 6.083535108958838, "grad_norm": 0.004143508151173592, "learning_rate": 4.391824761415908e-05, "loss": 0.0008, "num_input_tokens_seen": 2058800, "step": 5025 }, { "epoch": 6.089588377723971, "grad_norm": 13.468663215637207, "learning_rate": 4.390097056404001e-05, "loss": 0.0769, "num_input_tokens_seen": 2060816, "step": 5030 }, { "epoch": 6.095641646489105, "grad_norm": 0.0020933959167450666, "learning_rate": 4.388367241701753e-05, "loss": 0.0003, "num_input_tokens_seen": 2062896, "step": 5035 }, { "epoch": 6.101694915254237, "grad_norm": 0.0018664263188838959, "learning_rate": 4.386635319239949e-05, "loss": 0.0002, "num_input_tokens_seen": 2064816, "step": 5040 }, { "epoch": 6.10774818401937, "grad_norm": 0.007543828804045916, "learning_rate": 4.3849012909517286e-05, "loss": 0.0002, "num_input_tokens_seen": 2066768, "step": 5045 }, { "epoch": 6.113801452784504, "grad_norm": 0.001508082146756351, "learning_rate": 4.383165158772582e-05, "loss": 0.0001, "num_input_tokens_seen": 2068848, "step": 5050 }, { "epoch": 6.119854721549637, "grad_norm": 0.0047316476702690125, "learning_rate": 4.381426924640346e-05, "loss": 0.0031, "num_input_tokens_seen": 2070928, "step": 5055 }, { "epoch": 6.12590799031477, "grad_norm": 0.004892949480563402, "learning_rate": 4.3796865904952056e-05, "loss": 0.0001, "num_input_tokens_seen": 2073136, "step": 5060 }, { "epoch": 6.131961259079903, "grad_norm": 0.0004887140239588916, "learning_rate": 4.377944158279687e-05, "loss": 0.0001, "num_input_tokens_seen": 2075184, "step": 5065 }, { "epoch": 6.138014527845036, "grad_norm": 0.029134487733244896, "learning_rate": 4.37619962993866e-05, "loss": 0.0002, "num_input_tokens_seen": 2077136, "step": 5070 }, { "epoch": 6.1440677966101696, "grad_norm": 0.0021965617779642344, "learning_rate": 4.374453007419336e-05, "loss": 0.0576, "num_input_tokens_seen": 2079248, "step": 5075 }, { "epoch": 6.150121065375303, "grad_norm": 0.0038314389530569315, "learning_rate": 4.372704292671259e-05, "loss": 0.0002, "num_input_tokens_seen": 2081232, "step": 5080 }, { "epoch": 6.156174334140436, "grad_norm": 0.001621271250769496, "learning_rate": 4.370953487646313e-05, "loss": 0.0002, "num_input_tokens_seen": 2083376, "step": 5085 }, { "epoch": 6.162227602905569, "grad_norm": 0.006201398558914661, "learning_rate": 4.3692005942987125e-05, "loss": 0.0878, "num_input_tokens_seen": 2085552, "step": 5090 }, { "epoch": 6.168280871670702, "grad_norm": 0.014844809658825397, "learning_rate": 4.367445614585004e-05, "loss": 0.0003, "num_input_tokens_seen": 2087504, "step": 5095 }, { "epoch": 6.174334140435835, "grad_norm": 0.002202034695073962, "learning_rate": 4.365688550464062e-05, "loss": 0.0003, "num_input_tokens_seen": 2089520, "step": 5100 }, { "epoch": 6.180387409200969, "grad_norm": 0.0016478793695569038, "learning_rate": 4.363929403897088e-05, "loss": 0.0001, "num_input_tokens_seen": 2091664, "step": 5105 }, { "epoch": 6.186440677966102, "grad_norm": 0.004547104239463806, "learning_rate": 4.362168176847608e-05, "loss": 0.0002, "num_input_tokens_seen": 2093648, "step": 5110 }, { "epoch": 6.1924939467312345, "grad_norm": 0.0032591107301414013, "learning_rate": 4.3604048712814704e-05, "loss": 0.0003, "num_input_tokens_seen": 2095760, "step": 5115 }, { "epoch": 6.198547215496368, "grad_norm": 0.0007241108687594533, "learning_rate": 4.3586394891668436e-05, "loss": 0.0001, "num_input_tokens_seen": 2097712, "step": 5120 }, { "epoch": 6.204600484261501, "grad_norm": 0.0012535958085209131, "learning_rate": 4.356872032474213e-05, "loss": 0.0002, "num_input_tokens_seen": 2099760, "step": 5125 }, { "epoch": 6.210653753026635, "grad_norm": 0.0003163469664286822, "learning_rate": 4.3551025031763795e-05, "loss": 0.0002, "num_input_tokens_seen": 2101808, "step": 5130 }, { "epoch": 6.216707021791768, "grad_norm": 0.0031842999160289764, "learning_rate": 4.353330903248459e-05, "loss": 0.0003, "num_input_tokens_seen": 2103760, "step": 5135 }, { "epoch": 6.2227602905569, "grad_norm": 0.0029747977387160063, "learning_rate": 4.351557234667877e-05, "loss": 0.0003, "num_input_tokens_seen": 2105808, "step": 5140 }, { "epoch": 6.228813559322034, "grad_norm": 0.0024027784820646048, "learning_rate": 4.349781499414369e-05, "loss": 0.1055, "num_input_tokens_seen": 2107952, "step": 5145 }, { "epoch": 6.234866828087167, "grad_norm": 0.0023099014069885015, "learning_rate": 4.348003699469977e-05, "loss": 0.0002, "num_input_tokens_seen": 2110096, "step": 5150 }, { "epoch": 6.2409200968523, "grad_norm": 0.009512676857411861, "learning_rate": 4.3462238368190466e-05, "loss": 0.0017, "num_input_tokens_seen": 2112240, "step": 5155 }, { "epoch": 6.246973365617434, "grad_norm": 0.006422645412385464, "learning_rate": 4.344441913448228e-05, "loss": 0.0002, "num_input_tokens_seen": 2114288, "step": 5160 }, { "epoch": 6.253026634382566, "grad_norm": 0.006033272482454777, "learning_rate": 4.3426579313464686e-05, "loss": 0.0003, "num_input_tokens_seen": 2116272, "step": 5165 }, { "epoch": 6.2590799031477, "grad_norm": 0.12086091190576553, "learning_rate": 4.340871892505016e-05, "loss": 0.0003, "num_input_tokens_seen": 2118128, "step": 5170 }, { "epoch": 6.265133171912833, "grad_norm": 0.005981640424579382, "learning_rate": 4.339083798917413e-05, "loss": 0.0002, "num_input_tokens_seen": 2120112, "step": 5175 }, { "epoch": 6.271186440677966, "grad_norm": 0.0026503654662519693, "learning_rate": 4.337293652579495e-05, "loss": 0.0001, "num_input_tokens_seen": 2122192, "step": 5180 }, { "epoch": 6.2772397094431, "grad_norm": 0.011570453643798828, "learning_rate": 4.335501455489389e-05, "loss": 0.0003, "num_input_tokens_seen": 2124240, "step": 5185 }, { "epoch": 6.283292978208232, "grad_norm": 0.0041961465030908585, "learning_rate": 4.333707209647513e-05, "loss": 0.0001, "num_input_tokens_seen": 2126256, "step": 5190 }, { "epoch": 6.289346246973365, "grad_norm": 0.0008600951987318695, "learning_rate": 4.331910917056568e-05, "loss": 0.0001, "num_input_tokens_seen": 2128272, "step": 5195 }, { "epoch": 6.295399515738499, "grad_norm": 0.035136736929416656, "learning_rate": 4.330112579721542e-05, "loss": 0.0717, "num_input_tokens_seen": 2130320, "step": 5200 }, { "epoch": 6.301452784503632, "grad_norm": 0.015916310250759125, "learning_rate": 4.3283121996497064e-05, "loss": 0.0002, "num_input_tokens_seen": 2132368, "step": 5205 }, { "epoch": 6.3075060532687655, "grad_norm": 0.04831964150071144, "learning_rate": 4.32650977885061e-05, "loss": 0.0002, "num_input_tokens_seen": 2134384, "step": 5210 }, { "epoch": 6.313559322033898, "grad_norm": 0.0017921063117682934, "learning_rate": 4.3247053193360796e-05, "loss": 0.0001, "num_input_tokens_seen": 2136464, "step": 5215 }, { "epoch": 6.319612590799031, "grad_norm": 0.002758686663582921, "learning_rate": 4.3228988231202206e-05, "loss": 0.0001, "num_input_tokens_seen": 2138544, "step": 5220 }, { "epoch": 6.325665859564165, "grad_norm": 0.04718875512480736, "learning_rate": 4.3210902922194086e-05, "loss": 0.0001, "num_input_tokens_seen": 2140656, "step": 5225 }, { "epoch": 6.331719128329298, "grad_norm": 0.03436395525932312, "learning_rate": 4.319279728652291e-05, "loss": 0.0002, "num_input_tokens_seen": 2142896, "step": 5230 }, { "epoch": 6.337772397094431, "grad_norm": 0.012346488423645496, "learning_rate": 4.317467134439785e-05, "loss": 0.0001, "num_input_tokens_seen": 2144912, "step": 5235 }, { "epoch": 6.343825665859564, "grad_norm": 6.920846939086914, "learning_rate": 4.315652511605073e-05, "loss": 0.1178, "num_input_tokens_seen": 2146960, "step": 5240 }, { "epoch": 6.349878934624697, "grad_norm": 0.014269179664552212, "learning_rate": 4.313835862173603e-05, "loss": 0.0001, "num_input_tokens_seen": 2149072, "step": 5245 }, { "epoch": 6.3559322033898304, "grad_norm": 0.0038484425749629736, "learning_rate": 4.3120171881730854e-05, "loss": 0.0001, "num_input_tokens_seen": 2151056, "step": 5250 }, { "epoch": 6.361985472154964, "grad_norm": 0.0012674182653427124, "learning_rate": 4.310196491633488e-05, "loss": 0.0004, "num_input_tokens_seen": 2153168, "step": 5255 }, { "epoch": 6.368038740920097, "grad_norm": 0.009002907201647758, "learning_rate": 4.308373774587038e-05, "loss": 0.0003, "num_input_tokens_seen": 2155248, "step": 5260 }, { "epoch": 6.37409200968523, "grad_norm": 0.01977350562810898, "learning_rate": 4.3065490390682186e-05, "loss": 0.0004, "num_input_tokens_seen": 2157232, "step": 5265 }, { "epoch": 6.380145278450363, "grad_norm": 0.004840109497308731, "learning_rate": 4.3047222871137625e-05, "loss": 0.0842, "num_input_tokens_seen": 2159408, "step": 5270 }, { "epoch": 6.386198547215496, "grad_norm": 0.0003074906417168677, "learning_rate": 4.302893520762657e-05, "loss": 0.0002, "num_input_tokens_seen": 2161392, "step": 5275 }, { "epoch": 6.39225181598063, "grad_norm": 0.0031716725789010525, "learning_rate": 4.301062742056137e-05, "loss": 0.0006, "num_input_tokens_seen": 2163408, "step": 5280 }, { "epoch": 6.398305084745763, "grad_norm": 0.002030724659562111, "learning_rate": 4.299229953037681e-05, "loss": 0.0001, "num_input_tokens_seen": 2165360, "step": 5285 }, { "epoch": 6.404358353510895, "grad_norm": 0.0012285809498280287, "learning_rate": 4.297395155753015e-05, "loss": 0.0002, "num_input_tokens_seen": 2167568, "step": 5290 }, { "epoch": 6.410411622276029, "grad_norm": 0.001803940744139254, "learning_rate": 4.2955583522501035e-05, "loss": 0.0017, "num_input_tokens_seen": 2169552, "step": 5295 }, { "epoch": 6.416464891041162, "grad_norm": 0.004859514068812132, "learning_rate": 4.293719544579152e-05, "loss": 0.0001, "num_input_tokens_seen": 2171568, "step": 5300 }, { "epoch": 6.4225181598062955, "grad_norm": 0.005489617120474577, "learning_rate": 4.2918787347926024e-05, "loss": 0.0001, "num_input_tokens_seen": 2173616, "step": 5305 }, { "epoch": 6.428571428571429, "grad_norm": 0.001192908501252532, "learning_rate": 4.290035924945132e-05, "loss": 0.0005, "num_input_tokens_seen": 2175696, "step": 5310 }, { "epoch": 6.434624697336561, "grad_norm": 0.0010187341831624508, "learning_rate": 4.28819111709365e-05, "loss": 0.0, "num_input_tokens_seen": 2177904, "step": 5315 }, { "epoch": 6.440677966101695, "grad_norm": 0.0026205589529126883, "learning_rate": 4.286344313297295e-05, "loss": 0.0018, "num_input_tokens_seen": 2180016, "step": 5320 }, { "epoch": 6.446731234866828, "grad_norm": 0.0016283408040180802, "learning_rate": 4.284495515617435e-05, "loss": 0.0, "num_input_tokens_seen": 2182032, "step": 5325 }, { "epoch": 6.452784503631961, "grad_norm": 0.0024198030587285757, "learning_rate": 4.282644726117663e-05, "loss": 0.0002, "num_input_tokens_seen": 2184016, "step": 5330 }, { "epoch": 6.458837772397095, "grad_norm": 0.0030000603292137384, "learning_rate": 4.280791946863794e-05, "loss": 0.0001, "num_input_tokens_seen": 2185936, "step": 5335 }, { "epoch": 6.464891041162228, "grad_norm": 0.0013251692289486527, "learning_rate": 4.278937179923867e-05, "loss": 0.0001, "num_input_tokens_seen": 2188048, "step": 5340 }, { "epoch": 6.4709443099273605, "grad_norm": 0.0009033224196173251, "learning_rate": 4.2770804273681344e-05, "loss": 0.0, "num_input_tokens_seen": 2190096, "step": 5345 }, { "epoch": 6.476997578692494, "grad_norm": 0.003739095525816083, "learning_rate": 4.27522169126907e-05, "loss": 0.0002, "num_input_tokens_seen": 2192144, "step": 5350 }, { "epoch": 6.483050847457627, "grad_norm": 0.002320936182513833, "learning_rate": 4.273360973701359e-05, "loss": 0.0001, "num_input_tokens_seen": 2194096, "step": 5355 }, { "epoch": 6.4891041162227605, "grad_norm": 0.0020048567093908787, "learning_rate": 4.2714982767419e-05, "loss": 0.0001, "num_input_tokens_seen": 2196112, "step": 5360 }, { "epoch": 6.495157384987894, "grad_norm": 4.302217483520508, "learning_rate": 4.269633602469798e-05, "loss": 0.0032, "num_input_tokens_seen": 2198160, "step": 5365 }, { "epoch": 6.501210653753026, "grad_norm": 0.004352893680334091, "learning_rate": 4.267766952966369e-05, "loss": 0.0732, "num_input_tokens_seen": 2200144, "step": 5370 }, { "epoch": 6.50726392251816, "grad_norm": 0.00563370855525136, "learning_rate": 4.265898330315131e-05, "loss": 0.1097, "num_input_tokens_seen": 2202224, "step": 5375 }, { "epoch": 6.513317191283293, "grad_norm": 0.0018228809349238873, "learning_rate": 4.264027736601803e-05, "loss": 0.0657, "num_input_tokens_seen": 2204400, "step": 5380 }, { "epoch": 6.519370460048426, "grad_norm": 0.002972212852910161, "learning_rate": 4.262155173914309e-05, "loss": 0.0002, "num_input_tokens_seen": 2206640, "step": 5385 }, { "epoch": 6.52542372881356, "grad_norm": 0.23389305174350739, "learning_rate": 4.2602806443427665e-05, "loss": 0.0006, "num_input_tokens_seen": 2208720, "step": 5390 }, { "epoch": 6.531476997578692, "grad_norm": 0.03129570931196213, "learning_rate": 4.2584041499794904e-05, "loss": 0.0006, "num_input_tokens_seen": 2210800, "step": 5395 }, { "epoch": 6.5375302663438255, "grad_norm": 0.019101643934845924, "learning_rate": 4.2565256929189854e-05, "loss": 0.0005, "num_input_tokens_seen": 2212752, "step": 5400 }, { "epoch": 6.543583535108959, "grad_norm": 0.003007529303431511, "learning_rate": 4.2546452752579536e-05, "loss": 0.0002, "num_input_tokens_seen": 2214928, "step": 5405 }, { "epoch": 6.549636803874092, "grad_norm": 0.011285200715065002, "learning_rate": 4.252762899095278e-05, "loss": 0.0003, "num_input_tokens_seen": 2216944, "step": 5410 }, { "epoch": 6.5556900726392255, "grad_norm": 0.008727551437914371, "learning_rate": 4.2508785665320325e-05, "loss": 0.0004, "num_input_tokens_seen": 2218896, "step": 5415 }, { "epoch": 6.561743341404358, "grad_norm": 0.005425586365163326, "learning_rate": 4.248992279671473e-05, "loss": 0.0005, "num_input_tokens_seen": 2220848, "step": 5420 }, { "epoch": 6.567796610169491, "grad_norm": 0.0049201734364032745, "learning_rate": 4.247104040619037e-05, "loss": 0.0006, "num_input_tokens_seen": 2222896, "step": 5425 }, { "epoch": 6.573849878934625, "grad_norm": 0.004254171624779701, "learning_rate": 4.245213851482342e-05, "loss": 0.0001, "num_input_tokens_seen": 2224912, "step": 5430 }, { "epoch": 6.579903147699758, "grad_norm": 0.17312206327915192, "learning_rate": 4.2433217143711776e-05, "loss": 0.0005, "num_input_tokens_seen": 2226992, "step": 5435 }, { "epoch": 6.585956416464891, "grad_norm": 0.00013170906458981335, "learning_rate": 4.2414276313975157e-05, "loss": 0.0003, "num_input_tokens_seen": 2229104, "step": 5440 }, { "epoch": 6.592009685230024, "grad_norm": 39.104522705078125, "learning_rate": 4.239531604675493e-05, "loss": 0.2714, "num_input_tokens_seen": 2230992, "step": 5445 }, { "epoch": 6.598062953995157, "grad_norm": 0.0007710062200203538, "learning_rate": 4.237633636321419e-05, "loss": 0.0001, "num_input_tokens_seen": 2233072, "step": 5450 }, { "epoch": 6.6041162227602905, "grad_norm": 0.004520764108747244, "learning_rate": 4.2357337284537696e-05, "loss": 0.019, "num_input_tokens_seen": 2235280, "step": 5455 }, { "epoch": 6.610169491525424, "grad_norm": 43.39153289794922, "learning_rate": 4.2338318831931864e-05, "loss": 0.1153, "num_input_tokens_seen": 2237328, "step": 5460 }, { "epoch": 6.616222760290557, "grad_norm": 2.6975409984588623, "learning_rate": 4.231928102662473e-05, "loss": 0.0022, "num_input_tokens_seen": 2239472, "step": 5465 }, { "epoch": 6.622276029055691, "grad_norm": 0.0060306936502456665, "learning_rate": 4.230022388986591e-05, "loss": 0.0003, "num_input_tokens_seen": 2241552, "step": 5470 }, { "epoch": 6.628329297820823, "grad_norm": 0.02299831248819828, "learning_rate": 4.228114744292664e-05, "loss": 0.0578, "num_input_tokens_seen": 2243600, "step": 5475 }, { "epoch": 6.634382566585956, "grad_norm": 0.0036517640110105276, "learning_rate": 4.226205170709968e-05, "loss": 0.001, "num_input_tokens_seen": 2245584, "step": 5480 }, { "epoch": 6.64043583535109, "grad_norm": 0.0029327417723834515, "learning_rate": 4.224293670369932e-05, "loss": 0.0006, "num_input_tokens_seen": 2247600, "step": 5485 }, { "epoch": 6.646489104116223, "grad_norm": 0.0068062362261116505, "learning_rate": 4.2223802454061375e-05, "loss": 0.0003, "num_input_tokens_seen": 2249648, "step": 5490 }, { "epoch": 6.652542372881356, "grad_norm": 0.01573433354496956, "learning_rate": 4.2204648979543124e-05, "loss": 0.0002, "num_input_tokens_seen": 2251600, "step": 5495 }, { "epoch": 6.658595641646489, "grad_norm": 0.021931927651166916, "learning_rate": 4.218547630152331e-05, "loss": 0.0002, "num_input_tokens_seen": 2253712, "step": 5500 }, { "epoch": 6.664648910411622, "grad_norm": 0.005814232397824526, "learning_rate": 4.21662844414021e-05, "loss": 0.0264, "num_input_tokens_seen": 2255856, "step": 5505 }, { "epoch": 6.670702179176756, "grad_norm": 0.002112648682668805, "learning_rate": 4.21470734206011e-05, "loss": 0.0001, "num_input_tokens_seen": 2257872, "step": 5510 }, { "epoch": 6.676755447941889, "grad_norm": 0.2650308310985565, "learning_rate": 4.212784326056329e-05, "loss": 0.0002, "num_input_tokens_seen": 2259824, "step": 5515 }, { "epoch": 6.682808716707022, "grad_norm": 0.005764856468886137, "learning_rate": 4.210859398275299e-05, "loss": 0.0001, "num_input_tokens_seen": 2261776, "step": 5520 }, { "epoch": 6.688861985472155, "grad_norm": 0.005123717710375786, "learning_rate": 4.208932560865588e-05, "loss": 0.0005, "num_input_tokens_seen": 2263856, "step": 5525 }, { "epoch": 6.694915254237288, "grad_norm": 0.0007002182537689805, "learning_rate": 4.2070038159778955e-05, "loss": 0.0001, "num_input_tokens_seen": 2265904, "step": 5530 }, { "epoch": 6.700968523002421, "grad_norm": 0.0033462117426097393, "learning_rate": 4.205073165765051e-05, "loss": 0.0001, "num_input_tokens_seen": 2267984, "step": 5535 }, { "epoch": 6.707021791767555, "grad_norm": 0.0014917402295395732, "learning_rate": 4.2031406123820084e-05, "loss": 0.0001, "num_input_tokens_seen": 2269936, "step": 5540 }, { "epoch": 6.713075060532688, "grad_norm": 0.43545395135879517, "learning_rate": 4.2012061579858465e-05, "loss": 0.0004, "num_input_tokens_seen": 2272016, "step": 5545 }, { "epoch": 6.719128329297821, "grad_norm": 0.0012776181101799011, "learning_rate": 4.1992698047357676e-05, "loss": 0.0001, "num_input_tokens_seen": 2273904, "step": 5550 }, { "epoch": 6.725181598062954, "grad_norm": 0.0010561866220086813, "learning_rate": 4.197331554793092e-05, "loss": 0.0001, "num_input_tokens_seen": 2276016, "step": 5555 }, { "epoch": 6.731234866828087, "grad_norm": 49.616695404052734, "learning_rate": 4.195391410321257e-05, "loss": 0.0569, "num_input_tokens_seen": 2277904, "step": 5560 }, { "epoch": 6.737288135593221, "grad_norm": 0.0008764867670834064, "learning_rate": 4.193449373485815e-05, "loss": 0.0001, "num_input_tokens_seen": 2279952, "step": 5565 }, { "epoch": 6.743341404358354, "grad_norm": 0.0014052917249500751, "learning_rate": 4.1915054464544305e-05, "loss": 0.0002, "num_input_tokens_seen": 2282096, "step": 5570 }, { "epoch": 6.749394673123486, "grad_norm": 0.002466606441885233, "learning_rate": 4.189559631396878e-05, "loss": 0.0001, "num_input_tokens_seen": 2284464, "step": 5575 }, { "epoch": 6.75544794188862, "grad_norm": 0.00033389453892596066, "learning_rate": 4.187611930485039e-05, "loss": 0.0001, "num_input_tokens_seen": 2286480, "step": 5580 }, { "epoch": 6.761501210653753, "grad_norm": 0.0018565910868346691, "learning_rate": 4.1856623458929015e-05, "loss": 0.0004, "num_input_tokens_seen": 2288592, "step": 5585 }, { "epoch": 6.767554479418886, "grad_norm": 0.0008962545543909073, "learning_rate": 4.183710879796552e-05, "loss": 0.0001, "num_input_tokens_seen": 2290640, "step": 5590 }, { "epoch": 6.77360774818402, "grad_norm": 0.2315467745065689, "learning_rate": 4.181757534374182e-05, "loss": 0.0001, "num_input_tokens_seen": 2292624, "step": 5595 }, { "epoch": 6.779661016949152, "grad_norm": 0.008910829201340675, "learning_rate": 4.179802311806078e-05, "loss": 0.0, "num_input_tokens_seen": 2294576, "step": 5600 }, { "epoch": 6.785714285714286, "grad_norm": 0.0037748320028185844, "learning_rate": 4.177845214274622e-05, "loss": 0.0436, "num_input_tokens_seen": 2296720, "step": 5605 }, { "epoch": 6.791767554479419, "grad_norm": 0.011328236199915409, "learning_rate": 4.175886243964289e-05, "loss": 0.0001, "num_input_tokens_seen": 2298832, "step": 5610 }, { "epoch": 6.797820823244552, "grad_norm": 0.0005386667908169329, "learning_rate": 4.173925403061644e-05, "loss": 0.0, "num_input_tokens_seen": 2300688, "step": 5615 }, { "epoch": 6.803874092009686, "grad_norm": 0.005308668129146099, "learning_rate": 4.171962693755341e-05, "loss": 0.0001, "num_input_tokens_seen": 2302704, "step": 5620 }, { "epoch": 6.809927360774818, "grad_norm": 0.001562476740218699, "learning_rate": 4.169998118236118e-05, "loss": 0.0, "num_input_tokens_seen": 2304784, "step": 5625 }, { "epoch": 6.815980629539951, "grad_norm": 0.00040922037442214787, "learning_rate": 4.1680316786967976e-05, "loss": 0.0, "num_input_tokens_seen": 2306800, "step": 5630 }, { "epoch": 6.822033898305085, "grad_norm": 0.000716723152436316, "learning_rate": 4.1660633773322815e-05, "loss": 0.0001, "num_input_tokens_seen": 2308848, "step": 5635 }, { "epoch": 6.828087167070218, "grad_norm": 0.0013146416749805212, "learning_rate": 4.1640932163395506e-05, "loss": 0.0, "num_input_tokens_seen": 2310992, "step": 5640 }, { "epoch": 6.8341404358353515, "grad_norm": 0.003001063596457243, "learning_rate": 4.162121197917661e-05, "loss": 0.0, "num_input_tokens_seen": 2313008, "step": 5645 }, { "epoch": 6.840193704600484, "grad_norm": 0.00036141107557341456, "learning_rate": 4.160147324267742e-05, "loss": 0.0001, "num_input_tokens_seen": 2315056, "step": 5650 }, { "epoch": 6.846246973365617, "grad_norm": 0.006815361324697733, "learning_rate": 4.158171597592994e-05, "loss": 0.0, "num_input_tokens_seen": 2317232, "step": 5655 }, { "epoch": 6.852300242130751, "grad_norm": 0.005204624962061644, "learning_rate": 4.1561940200986846e-05, "loss": 0.0, "num_input_tokens_seen": 2319248, "step": 5660 }, { "epoch": 6.858353510895884, "grad_norm": 0.0011420993832871318, "learning_rate": 4.154214593992149e-05, "loss": 0.0, "num_input_tokens_seen": 2321552, "step": 5665 }, { "epoch": 6.864406779661017, "grad_norm": 0.00039608904626220465, "learning_rate": 4.152233321482785e-05, "loss": 0.1036, "num_input_tokens_seen": 2323568, "step": 5670 }, { "epoch": 6.87046004842615, "grad_norm": 0.024403704330325127, "learning_rate": 4.150250204782051e-05, "loss": 0.0022, "num_input_tokens_seen": 2325552, "step": 5675 }, { "epoch": 6.876513317191283, "grad_norm": 0.0004395530268084258, "learning_rate": 4.148265246103463e-05, "loss": 0.0001, "num_input_tokens_seen": 2327536, "step": 5680 }, { "epoch": 6.8825665859564165, "grad_norm": 0.0021085436455905437, "learning_rate": 4.146278447662597e-05, "loss": 0.0053, "num_input_tokens_seen": 2329616, "step": 5685 }, { "epoch": 6.88861985472155, "grad_norm": 0.0005031941691413522, "learning_rate": 4.1442898116770767e-05, "loss": 0.0001, "num_input_tokens_seen": 2331632, "step": 5690 }, { "epoch": 6.894673123486683, "grad_norm": 67.36676025390625, "learning_rate": 4.142299340366581e-05, "loss": 0.0586, "num_input_tokens_seen": 2333552, "step": 5695 }, { "epoch": 6.900726392251816, "grad_norm": 0.00015601926133967936, "learning_rate": 4.140307035952836e-05, "loss": 0.0, "num_input_tokens_seen": 2335568, "step": 5700 }, { "epoch": 6.906779661016949, "grad_norm": 0.00016049797704908997, "learning_rate": 4.1383129006596137e-05, "loss": 0.0, "num_input_tokens_seen": 2337808, "step": 5705 }, { "epoch": 6.912832929782082, "grad_norm": 0.006333585828542709, "learning_rate": 4.1363169367127294e-05, "loss": 0.0, "num_input_tokens_seen": 2339792, "step": 5710 }, { "epoch": 6.918886198547216, "grad_norm": 25.012189865112305, "learning_rate": 4.134319146340042e-05, "loss": 0.0327, "num_input_tokens_seen": 2341936, "step": 5715 }, { "epoch": 6.924939467312349, "grad_norm": 1.1268092393875122, "learning_rate": 4.1323195317714446e-05, "loss": 0.0006, "num_input_tokens_seen": 2343952, "step": 5720 }, { "epoch": 6.9309927360774815, "grad_norm": 0.0011024216655641794, "learning_rate": 4.130318095238871e-05, "loss": 0.0, "num_input_tokens_seen": 2345968, "step": 5725 }, { "epoch": 6.937046004842615, "grad_norm": 0.0017318957252427936, "learning_rate": 4.128314838976286e-05, "loss": 0.0, "num_input_tokens_seen": 2348016, "step": 5730 }, { "epoch": 6.943099273607748, "grad_norm": 0.0025124181993305683, "learning_rate": 4.126309765219685e-05, "loss": 0.0451, "num_input_tokens_seen": 2350128, "step": 5735 }, { "epoch": 6.9491525423728815, "grad_norm": 0.019062070176005363, "learning_rate": 4.1243028762070936e-05, "loss": 0.0001, "num_input_tokens_seen": 2352208, "step": 5740 }, { "epoch": 6.955205811138015, "grad_norm": 0.0019064603839069605, "learning_rate": 4.1222941741785634e-05, "loss": 0.0, "num_input_tokens_seen": 2354224, "step": 5745 }, { "epoch": 6.961259079903147, "grad_norm": 0.0015580112813040614, "learning_rate": 4.120283661376169e-05, "loss": 0.0, "num_input_tokens_seen": 2356208, "step": 5750 }, { "epoch": 6.967312348668281, "grad_norm": 40.65341567993164, "learning_rate": 4.1182713400440074e-05, "loss": 0.0106, "num_input_tokens_seen": 2358256, "step": 5755 }, { "epoch": 6.973365617433414, "grad_norm": 0.000553587160538882, "learning_rate": 4.116257212428192e-05, "loss": 0.0, "num_input_tokens_seen": 2360464, "step": 5760 }, { "epoch": 6.979418886198547, "grad_norm": 0.0020452761091291904, "learning_rate": 4.1142412807768546e-05, "loss": 0.0128, "num_input_tokens_seen": 2362512, "step": 5765 }, { "epoch": 6.985472154963681, "grad_norm": 0.005388008430600166, "learning_rate": 4.11222354734014e-05, "loss": 0.0, "num_input_tokens_seen": 2364624, "step": 5770 }, { "epoch": 6.991525423728813, "grad_norm": 0.02689669467508793, "learning_rate": 4.1102040143702036e-05, "loss": 0.0004, "num_input_tokens_seen": 2366640, "step": 5775 }, { "epoch": 6.9975786924939465, "grad_norm": 0.0007072792504914105, "learning_rate": 4.108182684121209e-05, "loss": 0.0, "num_input_tokens_seen": 2368592, "step": 5780 }, { "epoch": 7.0, "eval_loss": 0.26290085911750793, "eval_runtime": 4.9692, "eval_samples_per_second": 73.855, "eval_steps_per_second": 18.514, "num_input_tokens_seen": 2369136, "step": 5782 }, { "epoch": 7.00363196125908, "grad_norm": 0.0007495933678001165, "learning_rate": 4.106159558849327e-05, "loss": 0.0, "num_input_tokens_seen": 2370416, "step": 5785 }, { "epoch": 7.009685230024213, "grad_norm": 0.000303404638543725, "learning_rate": 4.1041346408127324e-05, "loss": 0.0, "num_input_tokens_seen": 2372560, "step": 5790 }, { "epoch": 7.0157384987893465, "grad_norm": 0.002223500283434987, "learning_rate": 4.1021079322715984e-05, "loss": 0.0, "num_input_tokens_seen": 2374512, "step": 5795 }, { "epoch": 7.021791767554479, "grad_norm": 0.0006260905065573752, "learning_rate": 4.1000794354880986e-05, "loss": 0.0, "num_input_tokens_seen": 2376688, "step": 5800 }, { "epoch": 7.027845036319612, "grad_norm": 0.0017831063596531749, "learning_rate": 4.098049152726405e-05, "loss": 0.0, "num_input_tokens_seen": 2378608, "step": 5805 }, { "epoch": 7.033898305084746, "grad_norm": 0.00170576898381114, "learning_rate": 4.096017086252678e-05, "loss": 0.0001, "num_input_tokens_seen": 2380560, "step": 5810 }, { "epoch": 7.039951573849879, "grad_norm": 8.025336865102872e-05, "learning_rate": 4.093983238335072e-05, "loss": 0.0, "num_input_tokens_seen": 2382512, "step": 5815 }, { "epoch": 7.046004842615012, "grad_norm": 0.00046767707681283355, "learning_rate": 4.091947611243729e-05, "loss": 0.0, "num_input_tokens_seen": 2384496, "step": 5820 }, { "epoch": 7.052058111380146, "grad_norm": 0.004678878467530012, "learning_rate": 4.089910207250778e-05, "loss": 0.0, "num_input_tokens_seen": 2386544, "step": 5825 }, { "epoch": 7.058111380145278, "grad_norm": 0.0014446449931710958, "learning_rate": 4.087871028630331e-05, "loss": 0.0, "num_input_tokens_seen": 2388688, "step": 5830 }, { "epoch": 7.0641646489104115, "grad_norm": 0.0002800100774038583, "learning_rate": 4.085830077658479e-05, "loss": 0.0, "num_input_tokens_seen": 2390832, "step": 5835 }, { "epoch": 7.070217917675545, "grad_norm": 0.004072246141731739, "learning_rate": 4.083787356613293e-05, "loss": 0.0, "num_input_tokens_seen": 2392848, "step": 5840 }, { "epoch": 7.076271186440678, "grad_norm": 0.0004666007007472217, "learning_rate": 4.0817428677748205e-05, "loss": 0.0, "num_input_tokens_seen": 2394832, "step": 5845 }, { "epoch": 7.0823244552058116, "grad_norm": 0.00015167807578109205, "learning_rate": 4.079696613425079e-05, "loss": 0.0, "num_input_tokens_seen": 2396784, "step": 5850 }, { "epoch": 7.088377723970944, "grad_norm": 0.0022774655371904373, "learning_rate": 4.0776485958480607e-05, "loss": 0.0, "num_input_tokens_seen": 2398672, "step": 5855 }, { "epoch": 7.094430992736077, "grad_norm": 0.0038957325741648674, "learning_rate": 4.075598817329723e-05, "loss": 0.0001, "num_input_tokens_seen": 2400560, "step": 5860 }, { "epoch": 7.100484261501211, "grad_norm": 0.00021162173652555794, "learning_rate": 4.0735472801579885e-05, "loss": 0.0, "num_input_tokens_seen": 2402704, "step": 5865 }, { "epoch": 7.106537530266344, "grad_norm": 0.0005934094660915434, "learning_rate": 4.071493986622745e-05, "loss": 0.0, "num_input_tokens_seen": 2404720, "step": 5870 }, { "epoch": 7.112590799031477, "grad_norm": 0.00019318584236316383, "learning_rate": 4.06943893901584e-05, "loss": 0.0, "num_input_tokens_seen": 2406768, "step": 5875 }, { "epoch": 7.11864406779661, "grad_norm": 0.00043496093712747097, "learning_rate": 4.0673821396310786e-05, "loss": 0.0, "num_input_tokens_seen": 2408976, "step": 5880 }, { "epoch": 7.124697336561743, "grad_norm": 0.0003512675466481596, "learning_rate": 4.06532359076422e-05, "loss": 0.0, "num_input_tokens_seen": 2411216, "step": 5885 }, { "epoch": 7.1307506053268765, "grad_norm": 0.000284861889667809, "learning_rate": 4.063263294712978e-05, "loss": 0.0, "num_input_tokens_seen": 2413232, "step": 5890 }, { "epoch": 7.13680387409201, "grad_norm": 0.00023269825032912195, "learning_rate": 4.061201253777015e-05, "loss": 0.0, "num_input_tokens_seen": 2415248, "step": 5895 }, { "epoch": 7.142857142857143, "grad_norm": 0.001344350166618824, "learning_rate": 4.0591374702579435e-05, "loss": 0.0, "num_input_tokens_seen": 2417200, "step": 5900 }, { "epoch": 7.148910411622276, "grad_norm": 0.00019860056636389345, "learning_rate": 4.0570719464593173e-05, "loss": 0.0, "num_input_tokens_seen": 2419344, "step": 5905 }, { "epoch": 7.154963680387409, "grad_norm": 0.0065568978898227215, "learning_rate": 4.055004684686636e-05, "loss": 0.1167, "num_input_tokens_seen": 2421456, "step": 5910 }, { "epoch": 7.161016949152542, "grad_norm": 0.013063457794487476, "learning_rate": 4.052935687247338e-05, "loss": 0.0001, "num_input_tokens_seen": 2423568, "step": 5915 }, { "epoch": 7.167070217917676, "grad_norm": 0.010746856220066547, "learning_rate": 4.0508649564507975e-05, "loss": 0.0002, "num_input_tokens_seen": 2425584, "step": 5920 }, { "epoch": 7.173123486682809, "grad_norm": 0.0025115464814007282, "learning_rate": 4.048792494608327e-05, "loss": 0.0001, "num_input_tokens_seen": 2427536, "step": 5925 }, { "epoch": 7.1791767554479415, "grad_norm": 0.3207343816757202, "learning_rate": 4.046718304033167e-05, "loss": 0.0007, "num_input_tokens_seen": 2429520, "step": 5930 }, { "epoch": 7.185230024213075, "grad_norm": 0.00020461823442019522, "learning_rate": 4.04464238704049e-05, "loss": 0.0005, "num_input_tokens_seen": 2431728, "step": 5935 }, { "epoch": 7.191283292978208, "grad_norm": 0.010312157683074474, "learning_rate": 4.0425647459473954e-05, "loss": 0.0002, "num_input_tokens_seen": 2433776, "step": 5940 }, { "epoch": 7.197336561743342, "grad_norm": 0.015538225881755352, "learning_rate": 4.040485383072906e-05, "loss": 0.0002, "num_input_tokens_seen": 2435664, "step": 5945 }, { "epoch": 7.203389830508475, "grad_norm": 0.00039714761078357697, "learning_rate": 4.038404300737968e-05, "loss": 0.0001, "num_input_tokens_seen": 2437680, "step": 5950 }, { "epoch": 7.209443099273607, "grad_norm": 0.018917817622423172, "learning_rate": 4.0363215012654457e-05, "loss": 0.0001, "num_input_tokens_seen": 2439792, "step": 5955 }, { "epoch": 7.215496368038741, "grad_norm": 0.022907501086592674, "learning_rate": 4.034236986980119e-05, "loss": 0.0001, "num_input_tokens_seen": 2441872, "step": 5960 }, { "epoch": 7.221549636803874, "grad_norm": 0.0011693151900544763, "learning_rate": 4.032150760208684e-05, "loss": 0.0001, "num_input_tokens_seen": 2444016, "step": 5965 }, { "epoch": 7.227602905569007, "grad_norm": 0.0007009989931248128, "learning_rate": 4.030062823279747e-05, "loss": 0.0001, "num_input_tokens_seen": 2446160, "step": 5970 }, { "epoch": 7.233656174334141, "grad_norm": 0.0008654180564917624, "learning_rate": 4.027973178523824e-05, "loss": 0.0, "num_input_tokens_seen": 2448240, "step": 5975 }, { "epoch": 7.239709443099273, "grad_norm": 0.0016530976863577962, "learning_rate": 4.025881828273336e-05, "loss": 0.0001, "num_input_tokens_seen": 2450384, "step": 5980 }, { "epoch": 7.245762711864407, "grad_norm": 0.0007901392527855933, "learning_rate": 4.023788774862609e-05, "loss": 0.0001, "num_input_tokens_seen": 2452400, "step": 5985 }, { "epoch": 7.25181598062954, "grad_norm": 0.0006840389687567949, "learning_rate": 4.0216940206278686e-05, "loss": 0.0, "num_input_tokens_seen": 2454416, "step": 5990 }, { "epoch": 7.257869249394673, "grad_norm": 0.00015940601588226855, "learning_rate": 4.01959756790724e-05, "loss": 0.0, "num_input_tokens_seen": 2456368, "step": 5995 }, { "epoch": 7.263922518159807, "grad_norm": 0.002903133165091276, "learning_rate": 4.017499419040745e-05, "loss": 0.0, "num_input_tokens_seen": 2458288, "step": 6000 }, { "epoch": 7.269975786924939, "grad_norm": 0.001490480499342084, "learning_rate": 4.0153995763702945e-05, "loss": 0.0001, "num_input_tokens_seen": 2460304, "step": 6005 }, { "epoch": 7.276029055690072, "grad_norm": 0.004345349967479706, "learning_rate": 4.0132980422396956e-05, "loss": 0.0002, "num_input_tokens_seen": 2462384, "step": 6010 }, { "epoch": 7.282082324455206, "grad_norm": 0.0071560973301529884, "learning_rate": 4.011194818994639e-05, "loss": 0.0, "num_input_tokens_seen": 2464592, "step": 6015 }, { "epoch": 7.288135593220339, "grad_norm": 0.022113891318440437, "learning_rate": 4.0090899089827034e-05, "loss": 0.0007, "num_input_tokens_seen": 2466640, "step": 6020 }, { "epoch": 7.2941888619854724, "grad_norm": 0.0013496901374310255, "learning_rate": 4.0069833145533485e-05, "loss": 0.0, "num_input_tokens_seen": 2468752, "step": 6025 }, { "epoch": 7.300242130750606, "grad_norm": 0.0013476748717948794, "learning_rate": 4.004875038057916e-05, "loss": 0.079, "num_input_tokens_seen": 2470832, "step": 6030 }, { "epoch": 7.306295399515738, "grad_norm": 0.00226809224113822, "learning_rate": 4.002765081849623e-05, "loss": 0.0013, "num_input_tokens_seen": 2472912, "step": 6035 }, { "epoch": 7.312348668280872, "grad_norm": 0.0036513470113277435, "learning_rate": 4.000653448283563e-05, "loss": 0.0, "num_input_tokens_seen": 2475120, "step": 6040 }, { "epoch": 7.318401937046005, "grad_norm": 0.00015385322330985218, "learning_rate": 3.998540139716701e-05, "loss": 0.0004, "num_input_tokens_seen": 2477040, "step": 6045 }, { "epoch": 7.324455205811138, "grad_norm": 0.042947228997945786, "learning_rate": 3.996425158507872e-05, "loss": 0.0001, "num_input_tokens_seen": 2479120, "step": 6050 }, { "epoch": 7.330508474576272, "grad_norm": 0.0005220049060881138, "learning_rate": 3.994308507017776e-05, "loss": 0.0, "num_input_tokens_seen": 2481040, "step": 6055 }, { "epoch": 7.336561743341404, "grad_norm": 0.0008834113250486553, "learning_rate": 3.992190187608982e-05, "loss": 0.0, "num_input_tokens_seen": 2482992, "step": 6060 }, { "epoch": 7.342615012106537, "grad_norm": 0.0005985181778669357, "learning_rate": 3.9900702026459156e-05, "loss": 0.0, "num_input_tokens_seen": 2484944, "step": 6065 }, { "epoch": 7.348668280871671, "grad_norm": 0.0005386218545027077, "learning_rate": 3.9879485544948644e-05, "loss": 0.0, "num_input_tokens_seen": 2487056, "step": 6070 }, { "epoch": 7.354721549636804, "grad_norm": 133.8189697265625, "learning_rate": 3.985825245523972e-05, "loss": 0.0581, "num_input_tokens_seen": 2489008, "step": 6075 }, { "epoch": 7.3607748184019375, "grad_norm": 0.000301299529382959, "learning_rate": 3.9837002781032354e-05, "loss": 0.0, "num_input_tokens_seen": 2490992, "step": 6080 }, { "epoch": 7.36682808716707, "grad_norm": 0.00036130441003479064, "learning_rate": 3.981573654604501e-05, "loss": 0.1116, "num_input_tokens_seen": 2493104, "step": 6085 }, { "epoch": 7.372881355932203, "grad_norm": 0.010991279035806656, "learning_rate": 3.979445377401469e-05, "loss": 0.0, "num_input_tokens_seen": 2495216, "step": 6090 }, { "epoch": 7.378934624697337, "grad_norm": 0.0010617804946377873, "learning_rate": 3.9773154488696785e-05, "loss": 0.0, "num_input_tokens_seen": 2497328, "step": 6095 }, { "epoch": 7.38498789346247, "grad_norm": 0.0042671579867601395, "learning_rate": 3.975183871386516e-05, "loss": 0.0001, "num_input_tokens_seen": 2499440, "step": 6100 }, { "epoch": 7.391041162227603, "grad_norm": 0.003802667371928692, "learning_rate": 3.973050647331209e-05, "loss": 0.0001, "num_input_tokens_seen": 2501456, "step": 6105 }, { "epoch": 7.397094430992736, "grad_norm": 0.009245689027011395, "learning_rate": 3.9709157790848207e-05, "loss": 0.0002, "num_input_tokens_seen": 2503504, "step": 6110 }, { "epoch": 7.403147699757869, "grad_norm": 0.0002992156951222569, "learning_rate": 3.9687792690302505e-05, "loss": 0.0002, "num_input_tokens_seen": 2505488, "step": 6115 }, { "epoch": 7.4092009685230025, "grad_norm": 0.002614593831822276, "learning_rate": 3.9666411195522296e-05, "loss": 0.003, "num_input_tokens_seen": 2507536, "step": 6120 }, { "epoch": 7.415254237288136, "grad_norm": 0.0010831720428541303, "learning_rate": 3.964501333037321e-05, "loss": 0.0, "num_input_tokens_seen": 2509648, "step": 6125 }, { "epoch": 7.421307506053269, "grad_norm": 0.4864160418510437, "learning_rate": 3.962359911873913e-05, "loss": 0.0001, "num_input_tokens_seen": 2511696, "step": 6130 }, { "epoch": 7.427360774818402, "grad_norm": 0.0002497600798960775, "learning_rate": 3.960216858452218e-05, "loss": 0.0001, "num_input_tokens_seen": 2513648, "step": 6135 }, { "epoch": 7.433414043583535, "grad_norm": 0.0011273428099229932, "learning_rate": 3.9580721751642736e-05, "loss": 0.0, "num_input_tokens_seen": 2515696, "step": 6140 }, { "epoch": 7.439467312348668, "grad_norm": 0.00044208287727087736, "learning_rate": 3.9559258644039324e-05, "loss": 0.0, "num_input_tokens_seen": 2517776, "step": 6145 }, { "epoch": 7.445520581113802, "grad_norm": 0.03854494169354439, "learning_rate": 3.953777928566867e-05, "loss": 0.0089, "num_input_tokens_seen": 2519920, "step": 6150 }, { "epoch": 7.451573849878935, "grad_norm": 0.0009528912487439811, "learning_rate": 3.951628370050562e-05, "loss": 0.0, "num_input_tokens_seen": 2521968, "step": 6155 }, { "epoch": 7.4576271186440675, "grad_norm": 0.002739922609180212, "learning_rate": 3.9494771912543133e-05, "loss": 0.0828, "num_input_tokens_seen": 2523984, "step": 6160 }, { "epoch": 7.463680387409201, "grad_norm": 6.932223186595365e-05, "learning_rate": 3.947324394579226e-05, "loss": 0.0001, "num_input_tokens_seen": 2526224, "step": 6165 }, { "epoch": 7.469733656174334, "grad_norm": 0.005590303335338831, "learning_rate": 3.945169982428212e-05, "loss": 0.0002, "num_input_tokens_seen": 2528240, "step": 6170 }, { "epoch": 7.4757869249394675, "grad_norm": 0.0028543483931571245, "learning_rate": 3.943013957205982e-05, "loss": 0.0001, "num_input_tokens_seen": 2530224, "step": 6175 }, { "epoch": 7.481840193704601, "grad_norm": 0.40496939420700073, "learning_rate": 3.940856321319054e-05, "loss": 0.1294, "num_input_tokens_seen": 2532368, "step": 6180 }, { "epoch": 7.487893462469733, "grad_norm": 0.012197851203382015, "learning_rate": 3.938697077175738e-05, "loss": 0.0102, "num_input_tokens_seen": 2534320, "step": 6185 }, { "epoch": 7.493946731234867, "grad_norm": 0.009416533634066582, "learning_rate": 3.936536227186141e-05, "loss": 0.1108, "num_input_tokens_seen": 2536400, "step": 6190 }, { "epoch": 7.5, "grad_norm": 0.06311731040477753, "learning_rate": 3.9343737737621636e-05, "loss": 0.0117, "num_input_tokens_seen": 2538416, "step": 6195 }, { "epoch": 7.506053268765133, "grad_norm": 0.009147203527390957, "learning_rate": 3.932209719317494e-05, "loss": 0.1222, "num_input_tokens_seen": 2540272, "step": 6200 }, { "epoch": 7.512106537530267, "grad_norm": 0.11105341464281082, "learning_rate": 3.93004406626761e-05, "loss": 0.0011, "num_input_tokens_seen": 2542352, "step": 6205 }, { "epoch": 7.518159806295399, "grad_norm": 8.560646057128906, "learning_rate": 3.927876817029772e-05, "loss": 0.0487, "num_input_tokens_seen": 2544432, "step": 6210 }, { "epoch": 7.5242130750605325, "grad_norm": 10.606590270996094, "learning_rate": 3.925707974023021e-05, "loss": 0.0793, "num_input_tokens_seen": 2546448, "step": 6215 }, { "epoch": 7.530266343825666, "grad_norm": 0.02260608971118927, "learning_rate": 3.923537539668179e-05, "loss": 0.0338, "num_input_tokens_seen": 2548496, "step": 6220 }, { "epoch": 7.536319612590799, "grad_norm": 0.03656446933746338, "learning_rate": 3.9213655163878436e-05, "loss": 0.0006, "num_input_tokens_seen": 2550544, "step": 6225 }, { "epoch": 7.5423728813559325, "grad_norm": 0.043454360216856, "learning_rate": 3.9191919066063866e-05, "loss": 0.0021, "num_input_tokens_seen": 2552592, "step": 6230 }, { "epoch": 7.548426150121065, "grad_norm": 0.008705888874828815, "learning_rate": 3.917016712749948e-05, "loss": 0.0003, "num_input_tokens_seen": 2554672, "step": 6235 }, { "epoch": 7.554479418886198, "grad_norm": 0.0194411501288414, "learning_rate": 3.914839937246439e-05, "loss": 0.0004, "num_input_tokens_seen": 2556656, "step": 6240 }, { "epoch": 7.560532687651332, "grad_norm": 0.028150435537099838, "learning_rate": 3.9126615825255364e-05, "loss": 0.0005, "num_input_tokens_seen": 2558704, "step": 6245 }, { "epoch": 7.566585956416465, "grad_norm": 0.024516141042113304, "learning_rate": 3.910481651018675e-05, "loss": 0.0002, "num_input_tokens_seen": 2560784, "step": 6250 }, { "epoch": 7.572639225181598, "grad_norm": 0.0002598543360363692, "learning_rate": 3.908300145159055e-05, "loss": 0.0013, "num_input_tokens_seen": 2562864, "step": 6255 }, { "epoch": 7.578692493946731, "grad_norm": 0.011110860854387283, "learning_rate": 3.906117067381632e-05, "loss": 0.0002, "num_input_tokens_seen": 2564880, "step": 6260 }, { "epoch": 7.584745762711864, "grad_norm": 0.004354414995759726, "learning_rate": 3.903932420123114e-05, "loss": 0.0004, "num_input_tokens_seen": 2566864, "step": 6265 }, { "epoch": 7.5907990314769975, "grad_norm": 0.005037966184318066, "learning_rate": 3.901746205821964e-05, "loss": 0.0001, "num_input_tokens_seen": 2568912, "step": 6270 }, { "epoch": 7.596852300242131, "grad_norm": 0.00184596236795187, "learning_rate": 3.899558426918392e-05, "loss": 0.0004, "num_input_tokens_seen": 2570864, "step": 6275 }, { "epoch": 7.602905569007264, "grad_norm": 0.00164370599668473, "learning_rate": 3.8973690858543564e-05, "loss": 0.0, "num_input_tokens_seen": 2572912, "step": 6280 }, { "epoch": 7.608958837772397, "grad_norm": 0.0003264097322244197, "learning_rate": 3.8951781850735576e-05, "loss": 0.0, "num_input_tokens_seen": 2574992, "step": 6285 }, { "epoch": 7.61501210653753, "grad_norm": 0.0024356769863516092, "learning_rate": 3.892985727021436e-05, "loss": 0.0, "num_input_tokens_seen": 2577008, "step": 6290 }, { "epoch": 7.621065375302663, "grad_norm": 0.010540425777435303, "learning_rate": 3.890791714145173e-05, "loss": 0.0005, "num_input_tokens_seen": 2579056, "step": 6295 }, { "epoch": 7.627118644067797, "grad_norm": 0.00021326263959053904, "learning_rate": 3.888596148893683e-05, "loss": 0.0, "num_input_tokens_seen": 2581104, "step": 6300 }, { "epoch": 7.63317191283293, "grad_norm": 7.203565473901108e-05, "learning_rate": 3.886399033717615e-05, "loss": 0.0, "num_input_tokens_seen": 2583184, "step": 6305 }, { "epoch": 7.6392251815980625, "grad_norm": 0.14131547510623932, "learning_rate": 3.884200371069347e-05, "loss": 0.0001, "num_input_tokens_seen": 2585168, "step": 6310 }, { "epoch": 7.645278450363196, "grad_norm": 0.002195041161030531, "learning_rate": 3.882000163402984e-05, "loss": 0.0, "num_input_tokens_seen": 2587216, "step": 6315 }, { "epoch": 7.651331719128329, "grad_norm": 0.03707418963313103, "learning_rate": 3.879798413174356e-05, "loss": 0.0002, "num_input_tokens_seen": 2589264, "step": 6320 }, { "epoch": 7.657384987893463, "grad_norm": 0.0016859326278790832, "learning_rate": 3.877595122841014e-05, "loss": 0.0, "num_input_tokens_seen": 2591280, "step": 6325 }, { "epoch": 7.663438256658596, "grad_norm": 0.0011793742887675762, "learning_rate": 3.87539029486223e-05, "loss": 0.0, "num_input_tokens_seen": 2593296, "step": 6330 }, { "epoch": 7.669491525423728, "grad_norm": 0.0004162766272202134, "learning_rate": 3.8731839316989904e-05, "loss": 0.0, "num_input_tokens_seen": 2595248, "step": 6335 }, { "epoch": 7.675544794188862, "grad_norm": 0.0024376874789595604, "learning_rate": 3.8709760358139954e-05, "loss": 0.0012, "num_input_tokens_seen": 2597392, "step": 6340 }, { "epoch": 7.681598062953995, "grad_norm": 0.00030885564046911895, "learning_rate": 3.8687666096716566e-05, "loss": 0.0, "num_input_tokens_seen": 2599504, "step": 6345 }, { "epoch": 7.687651331719128, "grad_norm": 0.05105634033679962, "learning_rate": 3.866555655738094e-05, "loss": 0.0001, "num_input_tokens_seen": 2601680, "step": 6350 }, { "epoch": 7.693704600484262, "grad_norm": 0.0005574512761086226, "learning_rate": 3.8643431764811325e-05, "loss": 0.0001, "num_input_tokens_seen": 2603632, "step": 6355 }, { "epoch": 7.699757869249394, "grad_norm": 0.0011336562456563115, "learning_rate": 3.8621291743702976e-05, "loss": 0.0004, "num_input_tokens_seen": 2605712, "step": 6360 }, { "epoch": 7.7058111380145276, "grad_norm": 0.0005665639182552695, "learning_rate": 3.859913651876817e-05, "loss": 0.0, "num_input_tokens_seen": 2607792, "step": 6365 }, { "epoch": 7.711864406779661, "grad_norm": 0.0012208983534947038, "learning_rate": 3.8576966114736156e-05, "loss": 0.0, "num_input_tokens_seen": 2609840, "step": 6370 }, { "epoch": 7.717917675544794, "grad_norm": 0.00012163189967395738, "learning_rate": 3.8554780556353104e-05, "loss": 0.0, "num_input_tokens_seen": 2611984, "step": 6375 }, { "epoch": 7.723970944309928, "grad_norm": 3.0446681194007397e-05, "learning_rate": 3.853257986838212e-05, "loss": 0.0, "num_input_tokens_seen": 2614000, "step": 6380 }, { "epoch": 7.73002421307506, "grad_norm": 0.0027235010638833046, "learning_rate": 3.851036407560319e-05, "loss": 0.0, "num_input_tokens_seen": 2616016, "step": 6385 }, { "epoch": 7.736077481840193, "grad_norm": 2.7811454856419004e-05, "learning_rate": 3.848813320281316e-05, "loss": 0.0, "num_input_tokens_seen": 2617968, "step": 6390 }, { "epoch": 7.742130750605327, "grad_norm": 28.257314682006836, "learning_rate": 3.846588727482569e-05, "loss": 0.0752, "num_input_tokens_seen": 2620048, "step": 6395 }, { "epoch": 7.74818401937046, "grad_norm": 0.0006298269145190716, "learning_rate": 3.844362631647129e-05, "loss": 0.0, "num_input_tokens_seen": 2622160, "step": 6400 }, { "epoch": 7.754237288135593, "grad_norm": 1.156023621559143, "learning_rate": 3.84213503525972e-05, "loss": 0.0002, "num_input_tokens_seen": 2624176, "step": 6405 }, { "epoch": 7.760290556900727, "grad_norm": 4.478592018131167e-05, "learning_rate": 3.839905940806742e-05, "loss": 0.0, "num_input_tokens_seen": 2626416, "step": 6410 }, { "epoch": 7.766343825665859, "grad_norm": 0.14681881666183472, "learning_rate": 3.8376753507762695e-05, "loss": 0.0001, "num_input_tokens_seen": 2628464, "step": 6415 }, { "epoch": 7.772397094430993, "grad_norm": 0.0011973108630627394, "learning_rate": 3.835443267658043e-05, "loss": 0.0003, "num_input_tokens_seen": 2630608, "step": 6420 }, { "epoch": 7.778450363196126, "grad_norm": 0.0008692708797752857, "learning_rate": 3.833209693943473e-05, "loss": 0.0, "num_input_tokens_seen": 2632624, "step": 6425 }, { "epoch": 7.784503631961259, "grad_norm": 0.0006774618523195386, "learning_rate": 3.83097463212563e-05, "loss": 0.0, "num_input_tokens_seen": 2634704, "step": 6430 }, { "epoch": 7.790556900726393, "grad_norm": 0.0005341502837836742, "learning_rate": 3.828738084699249e-05, "loss": 0.0, "num_input_tokens_seen": 2636656, "step": 6435 }, { "epoch": 7.796610169491525, "grad_norm": 0.0005444041453301907, "learning_rate": 3.826500054160721e-05, "loss": 0.0, "num_input_tokens_seen": 2638704, "step": 6440 }, { "epoch": 7.802663438256658, "grad_norm": 0.00047881106729619205, "learning_rate": 3.8242605430080924e-05, "loss": 0.0019, "num_input_tokens_seen": 2640816, "step": 6445 }, { "epoch": 7.808716707021792, "grad_norm": 1.8236231803894043, "learning_rate": 3.822019553741064e-05, "loss": 0.0007, "num_input_tokens_seen": 2642832, "step": 6450 }, { "epoch": 7.814769975786925, "grad_norm": 112.80652618408203, "learning_rate": 3.819777088860985e-05, "loss": 0.0732, "num_input_tokens_seen": 2644912, "step": 6455 }, { "epoch": 7.8208232445520585, "grad_norm": 0.00025545264361426234, "learning_rate": 3.817533150870852e-05, "loss": 0.0, "num_input_tokens_seen": 2646992, "step": 6460 }, { "epoch": 7.826876513317191, "grad_norm": 0.0009630437125451863, "learning_rate": 3.8152877422753055e-05, "loss": 0.0, "num_input_tokens_seen": 2649072, "step": 6465 }, { "epoch": 7.832929782082324, "grad_norm": 4.2462248529773206e-05, "learning_rate": 3.813040865580627e-05, "loss": 0.0, "num_input_tokens_seen": 2650992, "step": 6470 }, { "epoch": 7.838983050847458, "grad_norm": 8.108100882964209e-05, "learning_rate": 3.81079252329474e-05, "loss": 0.0, "num_input_tokens_seen": 2653040, "step": 6475 }, { "epoch": 7.845036319612591, "grad_norm": 0.00010462252976140007, "learning_rate": 3.8085427179271996e-05, "loss": 0.0, "num_input_tokens_seen": 2655152, "step": 6480 }, { "epoch": 7.851089588377724, "grad_norm": 2.986617801070679e-05, "learning_rate": 3.806291451989196e-05, "loss": 0.0961, "num_input_tokens_seen": 2657072, "step": 6485 }, { "epoch": 7.857142857142857, "grad_norm": 6.254755973815918, "learning_rate": 3.80403872799355e-05, "loss": 0.0938, "num_input_tokens_seen": 2658992, "step": 6490 }, { "epoch": 7.86319612590799, "grad_norm": 0.005089545156806707, "learning_rate": 3.8017845484547065e-05, "loss": 0.0, "num_input_tokens_seen": 2661168, "step": 6495 }, { "epoch": 7.8692493946731235, "grad_norm": 0.00023714212875347584, "learning_rate": 3.799528915888742e-05, "loss": 0.0001, "num_input_tokens_seen": 2663184, "step": 6500 }, { "epoch": 7.875302663438257, "grad_norm": 0.0017551170894876122, "learning_rate": 3.7972718328133475e-05, "loss": 0.0001, "num_input_tokens_seen": 2665168, "step": 6505 }, { "epoch": 7.88135593220339, "grad_norm": 0.06001663580536842, "learning_rate": 3.7950133017478374e-05, "loss": 0.0006, "num_input_tokens_seen": 2667216, "step": 6510 }, { "epoch": 7.8874092009685235, "grad_norm": 0.0007589400047436357, "learning_rate": 3.79275332521314e-05, "loss": 0.0, "num_input_tokens_seen": 2669424, "step": 6515 }, { "epoch": 7.893462469733656, "grad_norm": 0.006991546135395765, "learning_rate": 3.7904919057318004e-05, "loss": 0.0002, "num_input_tokens_seen": 2671536, "step": 6520 }, { "epoch": 7.899515738498789, "grad_norm": 0.050778795033693314, "learning_rate": 3.78822904582797e-05, "loss": 0.0001, "num_input_tokens_seen": 2673648, "step": 6525 }, { "epoch": 7.905569007263923, "grad_norm": 0.0010742994491010904, "learning_rate": 3.78596474802741e-05, "loss": 0.0, "num_input_tokens_seen": 2675696, "step": 6530 }, { "epoch": 7.911622276029056, "grad_norm": 0.04046130180358887, "learning_rate": 3.783699014857487e-05, "loss": 0.0001, "num_input_tokens_seen": 2677680, "step": 6535 }, { "epoch": 7.917675544794189, "grad_norm": 0.004519982263445854, "learning_rate": 3.781431848847169e-05, "loss": 0.0129, "num_input_tokens_seen": 2679568, "step": 6540 }, { "epoch": 7.923728813559322, "grad_norm": 0.0015804548747837543, "learning_rate": 3.779163252527023e-05, "loss": 0.0, "num_input_tokens_seen": 2681776, "step": 6545 }, { "epoch": 7.929782082324455, "grad_norm": 5.1174709369661286e-05, "learning_rate": 3.7768932284292146e-05, "loss": 0.1396, "num_input_tokens_seen": 2683856, "step": 6550 }, { "epoch": 7.9358353510895885, "grad_norm": 0.02380639873445034, "learning_rate": 3.7746217790874996e-05, "loss": 0.0006, "num_input_tokens_seen": 2685840, "step": 6555 }, { "epoch": 7.941888619854722, "grad_norm": 0.0004704659222625196, "learning_rate": 3.772348907037228e-05, "loss": 0.0009, "num_input_tokens_seen": 2687984, "step": 6560 }, { "epoch": 7.947941888619855, "grad_norm": 0.022342612966895103, "learning_rate": 3.7700746148153356e-05, "loss": 0.0041, "num_input_tokens_seen": 2690192, "step": 6565 }, { "epoch": 7.953995157384988, "grad_norm": 0.0003692159370984882, "learning_rate": 3.767798904960343e-05, "loss": 0.0004, "num_input_tokens_seen": 2692400, "step": 6570 }, { "epoch": 7.960048426150121, "grad_norm": 0.011208659037947655, "learning_rate": 3.765521780012356e-05, "loss": 0.0002, "num_input_tokens_seen": 2694288, "step": 6575 }, { "epoch": 7.966101694915254, "grad_norm": 0.003948112949728966, "learning_rate": 3.763243242513059e-05, "loss": 0.001, "num_input_tokens_seen": 2696528, "step": 6580 }, { "epoch": 7.972154963680388, "grad_norm": 0.0016221614787355065, "learning_rate": 3.760963295005709e-05, "loss": 0.0003, "num_input_tokens_seen": 2698640, "step": 6585 }, { "epoch": 7.978208232445521, "grad_norm": 0.019991328939795494, "learning_rate": 3.7586819400351426e-05, "loss": 0.0002, "num_input_tokens_seen": 2700688, "step": 6590 }, { "epoch": 7.9842615012106535, "grad_norm": 0.019952060654759407, "learning_rate": 3.756399180147763e-05, "loss": 0.0003, "num_input_tokens_seen": 2702768, "step": 6595 }, { "epoch": 7.990314769975787, "grad_norm": 0.0002515793894417584, "learning_rate": 3.754115017891545e-05, "loss": 0.0001, "num_input_tokens_seen": 2704944, "step": 6600 }, { "epoch": 7.99636803874092, "grad_norm": 0.011229642666876316, "learning_rate": 3.7518294558160255e-05, "loss": 0.0002, "num_input_tokens_seen": 2706928, "step": 6605 }, { "epoch": 8.0, "eval_loss": 0.25781524181365967, "eval_runtime": 4.9535, "eval_samples_per_second": 74.089, "eval_steps_per_second": 18.573, "num_input_tokens_seen": 2707920, "step": 6608 }, { "epoch": 8.002421307506053, "grad_norm": 6.633478187723085e-05, "learning_rate": 3.749542496472306e-05, "loss": 0.0001, "num_input_tokens_seen": 2708688, "step": 6610 }, { "epoch": 8.008474576271187, "grad_norm": 0.0017740853363648057, "learning_rate": 3.747254142413047e-05, "loss": 0.0, "num_input_tokens_seen": 2710736, "step": 6615 }, { "epoch": 8.01452784503632, "grad_norm": 0.00032436009496450424, "learning_rate": 3.744964396192465e-05, "loss": 0.0001, "num_input_tokens_seen": 2712688, "step": 6620 }, { "epoch": 8.020581113801454, "grad_norm": 0.002401673933491111, "learning_rate": 3.7426732603663325e-05, "loss": 0.0001, "num_input_tokens_seen": 2714672, "step": 6625 }, { "epoch": 8.026634382566586, "grad_norm": 9.320827666670084e-05, "learning_rate": 3.7403807374919715e-05, "loss": 0.0, "num_input_tokens_seen": 2716816, "step": 6630 }, { "epoch": 8.032687651331718, "grad_norm": 0.00016926413809414953, "learning_rate": 3.738086830128251e-05, "loss": 0.0, "num_input_tokens_seen": 2718832, "step": 6635 }, { "epoch": 8.038740920096853, "grad_norm": 0.008838827721774578, "learning_rate": 3.7357915408355876e-05, "loss": 0.0001, "num_input_tokens_seen": 2720880, "step": 6640 }, { "epoch": 8.044794188861985, "grad_norm": 0.0007607583538629115, "learning_rate": 3.73349487217594e-05, "loss": 0.0001, "num_input_tokens_seen": 2723152, "step": 6645 }, { "epoch": 8.05084745762712, "grad_norm": 0.0007890012348070741, "learning_rate": 3.731196826712805e-05, "loss": 0.0, "num_input_tokens_seen": 2725296, "step": 6650 }, { "epoch": 8.056900726392252, "grad_norm": 0.0019491879502311349, "learning_rate": 3.7288974070112174e-05, "loss": 0.0, "num_input_tokens_seen": 2727312, "step": 6655 }, { "epoch": 8.062953995157384, "grad_norm": 5.102329669171013e-05, "learning_rate": 3.726596615637746e-05, "loss": 0.0, "num_input_tokens_seen": 2729552, "step": 6660 }, { "epoch": 8.069007263922519, "grad_norm": 0.07864350825548172, "learning_rate": 3.7242944551604914e-05, "loss": 0.0002, "num_input_tokens_seen": 2731440, "step": 6665 }, { "epoch": 8.075060532687651, "grad_norm": 0.004083891864866018, "learning_rate": 3.72199092814908e-05, "loss": 0.0, "num_input_tokens_seen": 2733456, "step": 6670 }, { "epoch": 8.081113801452785, "grad_norm": 9.3402064521797e-05, "learning_rate": 3.719686037174664e-05, "loss": 0.0, "num_input_tokens_seen": 2735504, "step": 6675 }, { "epoch": 8.087167070217918, "grad_norm": 0.0017140019917860627, "learning_rate": 3.7173797848099204e-05, "loss": 0.0, "num_input_tokens_seen": 2737648, "step": 6680 }, { "epoch": 8.09322033898305, "grad_norm": 0.0006112480768933892, "learning_rate": 3.715072173629043e-05, "loss": 0.0, "num_input_tokens_seen": 2739696, "step": 6685 }, { "epoch": 8.099273607748184, "grad_norm": 0.002141549251973629, "learning_rate": 3.7127632062077446e-05, "loss": 0.0, "num_input_tokens_seen": 2741776, "step": 6690 }, { "epoch": 8.105326876513317, "grad_norm": 0.0001234413357451558, "learning_rate": 3.7104528851232496e-05, "loss": 0.0489, "num_input_tokens_seen": 2743792, "step": 6695 }, { "epoch": 8.111380145278451, "grad_norm": 0.003932436462491751, "learning_rate": 3.708141212954295e-05, "loss": 0.0, "num_input_tokens_seen": 2745872, "step": 6700 }, { "epoch": 8.117433414043584, "grad_norm": 0.008946097455918789, "learning_rate": 3.705828192281126e-05, "loss": 0.0003, "num_input_tokens_seen": 2747952, "step": 6705 }, { "epoch": 8.123486682808716, "grad_norm": 0.0010494788875803351, "learning_rate": 3.703513825685489e-05, "loss": 0.0001, "num_input_tokens_seen": 2749968, "step": 6710 }, { "epoch": 8.12953995157385, "grad_norm": 35.26046371459961, "learning_rate": 3.7011981157506405e-05, "loss": 0.0163, "num_input_tokens_seen": 2752080, "step": 6715 }, { "epoch": 8.135593220338983, "grad_norm": 0.00022349573555402458, "learning_rate": 3.6988810650613286e-05, "loss": 0.0, "num_input_tokens_seen": 2754128, "step": 6720 }, { "epoch": 8.141646489104117, "grad_norm": 0.00020378545741550624, "learning_rate": 3.6965626762038024e-05, "loss": 0.0001, "num_input_tokens_seen": 2756048, "step": 6725 }, { "epoch": 8.14769975786925, "grad_norm": 8.073065691860393e-05, "learning_rate": 3.694242951765803e-05, "loss": 0.0, "num_input_tokens_seen": 2758064, "step": 6730 }, { "epoch": 8.153753026634382, "grad_norm": 4.472523141885176e-05, "learning_rate": 3.691921894336563e-05, "loss": 0.0, "num_input_tokens_seen": 2760048, "step": 6735 }, { "epoch": 8.159806295399516, "grad_norm": 0.0003433637320995331, "learning_rate": 3.689599506506802e-05, "loss": 0.0, "num_input_tokens_seen": 2762128, "step": 6740 }, { "epoch": 8.165859564164649, "grad_norm": 0.0006836464744992554, "learning_rate": 3.6872757908687255e-05, "loss": 0.0, "num_input_tokens_seen": 2764176, "step": 6745 }, { "epoch": 8.171912832929783, "grad_norm": 6.334993668133393e-05, "learning_rate": 3.684950750016021e-05, "loss": 0.0, "num_input_tokens_seen": 2766224, "step": 6750 }, { "epoch": 8.177966101694915, "grad_norm": 0.00018937444838229567, "learning_rate": 3.6826243865438534e-05, "loss": 0.0008, "num_input_tokens_seen": 2768176, "step": 6755 }, { "epoch": 8.184019370460048, "grad_norm": 0.0004946189583279192, "learning_rate": 3.680296703048867e-05, "loss": 0.0, "num_input_tokens_seen": 2770128, "step": 6760 }, { "epoch": 8.190072639225182, "grad_norm": 0.000338507816195488, "learning_rate": 3.677967702129177e-05, "loss": 0.1218, "num_input_tokens_seen": 2772144, "step": 6765 }, { "epoch": 8.196125907990314, "grad_norm": 0.00011888053268194199, "learning_rate": 3.67563738638437e-05, "loss": 0.0, "num_input_tokens_seen": 2774288, "step": 6770 }, { "epoch": 8.202179176755449, "grad_norm": 0.00010498164192540571, "learning_rate": 3.673305758415499e-05, "loss": 0.0, "num_input_tokens_seen": 2776208, "step": 6775 }, { "epoch": 8.208232445520581, "grad_norm": 6.685663538519293e-05, "learning_rate": 3.6709728208250845e-05, "loss": 0.0, "num_input_tokens_seen": 2778288, "step": 6780 }, { "epoch": 8.214285714285714, "grad_norm": 0.000162960757734254, "learning_rate": 3.6686385762171055e-05, "loss": 0.0514, "num_input_tokens_seen": 2780240, "step": 6785 }, { "epoch": 8.220338983050848, "grad_norm": 0.00024212882271967828, "learning_rate": 3.666303027197003e-05, "loss": 0.0, "num_input_tokens_seen": 2782448, "step": 6790 }, { "epoch": 8.22639225181598, "grad_norm": 7.282512524398044e-05, "learning_rate": 3.663966176371671e-05, "loss": 0.0, "num_input_tokens_seen": 2784496, "step": 6795 }, { "epoch": 8.232445520581114, "grad_norm": 0.0006717280484735966, "learning_rate": 3.661628026349458e-05, "loss": 0.0, "num_input_tokens_seen": 2786576, "step": 6800 }, { "epoch": 8.238498789346247, "grad_norm": 0.0002839941589627415, "learning_rate": 3.659288579740163e-05, "loss": 0.0, "num_input_tokens_seen": 2788688, "step": 6805 }, { "epoch": 8.24455205811138, "grad_norm": 0.009168500080704689, "learning_rate": 3.656947839155032e-05, "loss": 0.0, "num_input_tokens_seen": 2790704, "step": 6810 }, { "epoch": 8.250605326876514, "grad_norm": 0.00018597522284835577, "learning_rate": 3.654605807206754e-05, "loss": 0.0, "num_input_tokens_seen": 2792720, "step": 6815 }, { "epoch": 8.256658595641646, "grad_norm": 8.1178019172512e-05, "learning_rate": 3.652262486509462e-05, "loss": 0.0, "num_input_tokens_seen": 2794672, "step": 6820 }, { "epoch": 8.26271186440678, "grad_norm": 7.061540964059532e-05, "learning_rate": 3.649917879678724e-05, "loss": 0.0214, "num_input_tokens_seen": 2796656, "step": 6825 }, { "epoch": 8.268765133171913, "grad_norm": 0.0006195157766342163, "learning_rate": 3.647571989331548e-05, "loss": 0.0, "num_input_tokens_seen": 2798704, "step": 6830 }, { "epoch": 8.274818401937045, "grad_norm": 0.00014304769865702838, "learning_rate": 3.6452248180863694e-05, "loss": 0.0151, "num_input_tokens_seen": 2800752, "step": 6835 }, { "epoch": 8.28087167070218, "grad_norm": 0.014064022339880466, "learning_rate": 3.642876368563059e-05, "loss": 0.1269, "num_input_tokens_seen": 2802960, "step": 6840 }, { "epoch": 8.286924939467312, "grad_norm": 0.0029092663899064064, "learning_rate": 3.6405266433829075e-05, "loss": 0.0751, "num_input_tokens_seen": 2805136, "step": 6845 }, { "epoch": 8.292978208232446, "grad_norm": 0.05522635951638222, "learning_rate": 3.6381756451686375e-05, "loss": 0.0001, "num_input_tokens_seen": 2807120, "step": 6850 }, { "epoch": 8.299031476997579, "grad_norm": 0.0008122457074932754, "learning_rate": 3.635823376544385e-05, "loss": 0.0001, "num_input_tokens_seen": 2809328, "step": 6855 }, { "epoch": 8.305084745762711, "grad_norm": 0.0006026147166267037, "learning_rate": 3.6334698401357107e-05, "loss": 0.0001, "num_input_tokens_seen": 2811280, "step": 6860 }, { "epoch": 8.311138014527845, "grad_norm": 12.917444229125977, "learning_rate": 3.6311150385695845e-05, "loss": 0.0083, "num_input_tokens_seen": 2813232, "step": 6865 }, { "epoch": 8.317191283292978, "grad_norm": 0.0010906484676524997, "learning_rate": 3.6287589744743925e-05, "loss": 0.0, "num_input_tokens_seen": 2815312, "step": 6870 }, { "epoch": 8.323244552058112, "grad_norm": 0.00027743770624510944, "learning_rate": 3.6264016504799274e-05, "loss": 0.0, "num_input_tokens_seen": 2817424, "step": 6875 }, { "epoch": 8.329297820823244, "grad_norm": 0.0004438970354385674, "learning_rate": 3.624043069217391e-05, "loss": 0.0002, "num_input_tokens_seen": 2819536, "step": 6880 }, { "epoch": 8.335351089588377, "grad_norm": 0.0003189349372405559, "learning_rate": 3.621683233319386e-05, "loss": 0.0, "num_input_tokens_seen": 2821616, "step": 6885 }, { "epoch": 8.341404358353511, "grad_norm": 0.0006260967929847538, "learning_rate": 3.619322145419915e-05, "loss": 0.0, "num_input_tokens_seen": 2823792, "step": 6890 }, { "epoch": 8.347457627118644, "grad_norm": 0.00034123347722925246, "learning_rate": 3.616959808154381e-05, "loss": 0.0715, "num_input_tokens_seen": 2825904, "step": 6895 }, { "epoch": 8.353510895883778, "grad_norm": 0.0003090897807851434, "learning_rate": 3.61459622415958e-05, "loss": 0.0, "num_input_tokens_seen": 2827952, "step": 6900 }, { "epoch": 8.35956416464891, "grad_norm": 0.010767419822514057, "learning_rate": 3.6122313960736983e-05, "loss": 0.0, "num_input_tokens_seen": 2830096, "step": 6905 }, { "epoch": 8.365617433414043, "grad_norm": 0.018134720623493195, "learning_rate": 3.609865326536312e-05, "loss": 0.0, "num_input_tokens_seen": 2832048, "step": 6910 }, { "epoch": 8.371670702179177, "grad_norm": 0.001811730326153338, "learning_rate": 3.607498018188385e-05, "loss": 0.0, "num_input_tokens_seen": 2834128, "step": 6915 }, { "epoch": 8.37772397094431, "grad_norm": 0.0002285801019752398, "learning_rate": 3.605129473672259e-05, "loss": 0.0, "num_input_tokens_seen": 2836016, "step": 6920 }, { "epoch": 8.383777239709444, "grad_norm": 0.0007630673935636878, "learning_rate": 3.602759695631659e-05, "loss": 0.0, "num_input_tokens_seen": 2837968, "step": 6925 }, { "epoch": 8.389830508474576, "grad_norm": 0.0011573403608053923, "learning_rate": 3.6003886867116875e-05, "loss": 0.0001, "num_input_tokens_seen": 2840016, "step": 6930 }, { "epoch": 8.39588377723971, "grad_norm": 0.0020566501189023256, "learning_rate": 3.5980164495588176e-05, "loss": 0.0003, "num_input_tokens_seen": 2842160, "step": 6935 }, { "epoch": 8.401937046004843, "grad_norm": 35.19996643066406, "learning_rate": 3.5956429868208974e-05, "loss": 0.155, "num_input_tokens_seen": 2844304, "step": 6940 }, { "epoch": 8.407990314769975, "grad_norm": 0.002214475767686963, "learning_rate": 3.593268301147139e-05, "loss": 0.0, "num_input_tokens_seen": 2846416, "step": 6945 }, { "epoch": 8.41404358353511, "grad_norm": 0.00048597907880321145, "learning_rate": 3.590892395188122e-05, "loss": 0.0001, "num_input_tokens_seen": 2848304, "step": 6950 }, { "epoch": 8.420096852300242, "grad_norm": 0.00035236135590821505, "learning_rate": 3.5885152715957874e-05, "loss": 0.0002, "num_input_tokens_seen": 2850416, "step": 6955 }, { "epoch": 8.426150121065376, "grad_norm": 0.0021403199061751366, "learning_rate": 3.5861369330234345e-05, "loss": 0.0, "num_input_tokens_seen": 2852464, "step": 6960 }, { "epoch": 8.432203389830509, "grad_norm": 0.0005099152331240475, "learning_rate": 3.583757382125721e-05, "loss": 0.0001, "num_input_tokens_seen": 2854416, "step": 6965 }, { "epoch": 8.438256658595641, "grad_norm": 0.000702406105119735, "learning_rate": 3.5813766215586554e-05, "loss": 0.0, "num_input_tokens_seen": 2856272, "step": 6970 }, { "epoch": 8.444309927360775, "grad_norm": 0.0005587338237091899, "learning_rate": 3.578994653979598e-05, "loss": 0.0, "num_input_tokens_seen": 2858384, "step": 6975 }, { "epoch": 8.450363196125908, "grad_norm": 0.001404477283358574, "learning_rate": 3.576611482047254e-05, "loss": 0.0, "num_input_tokens_seen": 2860432, "step": 6980 }, { "epoch": 8.456416464891042, "grad_norm": 0.017548901960253716, "learning_rate": 3.574227108421676e-05, "loss": 0.0, "num_input_tokens_seen": 2862544, "step": 6985 }, { "epoch": 8.462469733656174, "grad_norm": 0.020258037373423576, "learning_rate": 3.5718415357642567e-05, "loss": 0.0001, "num_input_tokens_seen": 2864528, "step": 6990 }, { "epoch": 8.468523002421307, "grad_norm": 0.002602284774184227, "learning_rate": 3.5694547667377256e-05, "loss": 0.0, "num_input_tokens_seen": 2866640, "step": 6995 }, { "epoch": 8.474576271186441, "grad_norm": 0.021261239424347878, "learning_rate": 3.56706680400615e-05, "loss": 0.0001, "num_input_tokens_seen": 2868816, "step": 7000 }, { "epoch": 8.480629539951574, "grad_norm": 0.0003067507641389966, "learning_rate": 3.5646776502349274e-05, "loss": 0.0231, "num_input_tokens_seen": 2870992, "step": 7005 }, { "epoch": 8.486682808716708, "grad_norm": 0.0021530231460928917, "learning_rate": 3.562287308090786e-05, "loss": 0.0, "num_input_tokens_seen": 2873040, "step": 7010 }, { "epoch": 8.49273607748184, "grad_norm": 0.0017084956634789705, "learning_rate": 3.559895780241781e-05, "loss": 0.0787, "num_input_tokens_seen": 2875120, "step": 7015 }, { "epoch": 8.498789346246973, "grad_norm": 0.011427367106080055, "learning_rate": 3.55750306935729e-05, "loss": 0.0003, "num_input_tokens_seen": 2877168, "step": 7020 }, { "epoch": 8.504842615012107, "grad_norm": 0.0003647661942522973, "learning_rate": 3.5551091781080104e-05, "loss": 0.0001, "num_input_tokens_seen": 2879216, "step": 7025 }, { "epoch": 8.51089588377724, "grad_norm": 6.398409366607666, "learning_rate": 3.552714109165958e-05, "loss": 0.0852, "num_input_tokens_seen": 2881168, "step": 7030 }, { "epoch": 8.516949152542374, "grad_norm": 0.014873812906444073, "learning_rate": 3.550317865204465e-05, "loss": 0.0488, "num_input_tokens_seen": 2883280, "step": 7035 }, { "epoch": 8.523002421307506, "grad_norm": 0.013136486522853374, "learning_rate": 3.547920448898171e-05, "loss": 0.022, "num_input_tokens_seen": 2885360, "step": 7040 }, { "epoch": 8.529055690072639, "grad_norm": 0.002396492287516594, "learning_rate": 3.545521862923028e-05, "loss": 0.0001, "num_input_tokens_seen": 2887568, "step": 7045 }, { "epoch": 8.535108958837773, "grad_norm": 0.008753519505262375, "learning_rate": 3.5431221099562914e-05, "loss": 0.0005, "num_input_tokens_seen": 2889616, "step": 7050 }, { "epoch": 8.541162227602905, "grad_norm": 0.015840912237763405, "learning_rate": 3.540721192676521e-05, "loss": 0.0001, "num_input_tokens_seen": 2891728, "step": 7055 }, { "epoch": 8.54721549636804, "grad_norm": 0.02818777784705162, "learning_rate": 3.538319113763571e-05, "loss": 0.0011, "num_input_tokens_seen": 2893872, "step": 7060 }, { "epoch": 8.553268765133172, "grad_norm": 0.0009582428610883653, "learning_rate": 3.535915875898601e-05, "loss": 0.0001, "num_input_tokens_seen": 2895696, "step": 7065 }, { "epoch": 8.559322033898304, "grad_norm": 0.0009683466050773859, "learning_rate": 3.533511481764057e-05, "loss": 0.0001, "num_input_tokens_seen": 2897680, "step": 7070 }, { "epoch": 8.565375302663439, "grad_norm": 0.0003288947918917984, "learning_rate": 3.531105934043678e-05, "loss": 0.0002, "num_input_tokens_seen": 2899632, "step": 7075 }, { "epoch": 8.571428571428571, "grad_norm": 0.0021188203245401382, "learning_rate": 3.5286992354224904e-05, "loss": 0.0001, "num_input_tokens_seen": 2901712, "step": 7080 }, { "epoch": 8.577481840193705, "grad_norm": 9.077924187295139e-05, "learning_rate": 3.5262913885868066e-05, "loss": 0.0, "num_input_tokens_seen": 2903696, "step": 7085 }, { "epoch": 8.583535108958838, "grad_norm": 0.001047432655468583, "learning_rate": 3.5238823962242176e-05, "loss": 0.0, "num_input_tokens_seen": 2905648, "step": 7090 }, { "epoch": 8.58958837772397, "grad_norm": 0.00039717197068966925, "learning_rate": 3.521472261023596e-05, "loss": 0.0001, "num_input_tokens_seen": 2907536, "step": 7095 }, { "epoch": 8.595641646489105, "grad_norm": 0.0009328247397206724, "learning_rate": 3.519060985675089e-05, "loss": 0.0001, "num_input_tokens_seen": 2909488, "step": 7100 }, { "epoch": 8.601694915254237, "grad_norm": 0.007346242666244507, "learning_rate": 3.5166485728701145e-05, "loss": 0.0, "num_input_tokens_seen": 2911632, "step": 7105 }, { "epoch": 8.607748184019371, "grad_norm": 0.0007871069828979671, "learning_rate": 3.514235025301365e-05, "loss": 0.0, "num_input_tokens_seen": 2913648, "step": 7110 }, { "epoch": 8.613801452784504, "grad_norm": 5.1819737564073876e-05, "learning_rate": 3.511820345662793e-05, "loss": 0.0001, "num_input_tokens_seen": 2915664, "step": 7115 }, { "epoch": 8.619854721549636, "grad_norm": 0.007218606304377317, "learning_rate": 3.5094045366496184e-05, "loss": 0.0, "num_input_tokens_seen": 2917808, "step": 7120 }, { "epoch": 8.62590799031477, "grad_norm": 0.00912031065672636, "learning_rate": 3.506987600958324e-05, "loss": 0.0, "num_input_tokens_seen": 2919728, "step": 7125 }, { "epoch": 8.631961259079903, "grad_norm": 0.00014321667549666017, "learning_rate": 3.504569541286644e-05, "loss": 0.0557, "num_input_tokens_seen": 2921840, "step": 7130 }, { "epoch": 8.638014527845037, "grad_norm": 0.002319933380931616, "learning_rate": 3.5021503603335725e-05, "loss": 0.0835, "num_input_tokens_seen": 2923984, "step": 7135 }, { "epoch": 8.64406779661017, "grad_norm": 0.04582304134964943, "learning_rate": 3.499730060799352e-05, "loss": 0.0002, "num_input_tokens_seen": 2926000, "step": 7140 }, { "epoch": 8.650121065375302, "grad_norm": 0.0010568444849923253, "learning_rate": 3.497308645385476e-05, "loss": 0.0002, "num_input_tokens_seen": 2928144, "step": 7145 }, { "epoch": 8.656174334140436, "grad_norm": 0.02851634845137596, "learning_rate": 3.494886116794683e-05, "loss": 0.082, "num_input_tokens_seen": 2930128, "step": 7150 }, { "epoch": 8.662227602905569, "grad_norm": 0.00010292309161741287, "learning_rate": 3.4924624777309505e-05, "loss": 0.0001, "num_input_tokens_seen": 2932112, "step": 7155 }, { "epoch": 8.668280871670703, "grad_norm": 0.0886460468173027, "learning_rate": 3.490037730899501e-05, "loss": 0.0006, "num_input_tokens_seen": 2934192, "step": 7160 }, { "epoch": 8.674334140435835, "grad_norm": 0.014489387162029743, "learning_rate": 3.4876118790067887e-05, "loss": 0.0003, "num_input_tokens_seen": 2936240, "step": 7165 }, { "epoch": 8.680387409200968, "grad_norm": 0.040073543787002563, "learning_rate": 3.485184924760504e-05, "loss": 0.0003, "num_input_tokens_seen": 2938160, "step": 7170 }, { "epoch": 8.686440677966102, "grad_norm": 0.06954536586999893, "learning_rate": 3.482756870869568e-05, "loss": 0.0002, "num_input_tokens_seen": 2940080, "step": 7175 }, { "epoch": 8.692493946731235, "grad_norm": 0.0019082295475527644, "learning_rate": 3.4803277200441256e-05, "loss": 0.0, "num_input_tokens_seen": 2942192, "step": 7180 }, { "epoch": 8.698547215496369, "grad_norm": 0.0026267068460583687, "learning_rate": 3.477897474995552e-05, "loss": 0.0006, "num_input_tokens_seen": 2944240, "step": 7185 }, { "epoch": 8.704600484261501, "grad_norm": 0.0014541213167831302, "learning_rate": 3.475466138436438e-05, "loss": 0.0, "num_input_tokens_seen": 2946224, "step": 7190 }, { "epoch": 8.710653753026634, "grad_norm": 0.04576864466071129, "learning_rate": 3.473033713080597e-05, "loss": 0.0005, "num_input_tokens_seen": 2948208, "step": 7195 }, { "epoch": 8.716707021791768, "grad_norm": 0.000489784637466073, "learning_rate": 3.4706002016430543e-05, "loss": 0.0, "num_input_tokens_seen": 2950416, "step": 7200 }, { "epoch": 8.7227602905569, "grad_norm": 0.001003463170491159, "learning_rate": 3.4681656068400496e-05, "loss": 0.0402, "num_input_tokens_seen": 2952528, "step": 7205 }, { "epoch": 8.728813559322035, "grad_norm": 0.004167179577052593, "learning_rate": 3.465729931389032e-05, "loss": 0.0, "num_input_tokens_seen": 2954640, "step": 7210 }, { "epoch": 8.734866828087167, "grad_norm": 0.0001774187694536522, "learning_rate": 3.463293178008655e-05, "loss": 0.0, "num_input_tokens_seen": 2956720, "step": 7215 }, { "epoch": 8.7409200968523, "grad_norm": 0.006750156637281179, "learning_rate": 3.460855349418776e-05, "loss": 0.0007, "num_input_tokens_seen": 2958704, "step": 7220 }, { "epoch": 8.746973365617434, "grad_norm": 0.0031560384668409824, "learning_rate": 3.4584164483404544e-05, "loss": 0.0, "num_input_tokens_seen": 2960784, "step": 7225 }, { "epoch": 8.753026634382566, "grad_norm": 0.025723541155457497, "learning_rate": 3.455976477495944e-05, "loss": 0.0, "num_input_tokens_seen": 2962768, "step": 7230 }, { "epoch": 8.7590799031477, "grad_norm": 0.00013877365563530475, "learning_rate": 3.453535439608694e-05, "loss": 0.0, "num_input_tokens_seen": 2964784, "step": 7235 }, { "epoch": 8.765133171912833, "grad_norm": 0.000686564773786813, "learning_rate": 3.4510933374033445e-05, "loss": 0.0, "num_input_tokens_seen": 2966896, "step": 7240 }, { "epoch": 8.771186440677965, "grad_norm": 8.443445403827354e-05, "learning_rate": 3.448650173605723e-05, "loss": 0.0, "num_input_tokens_seen": 2968944, "step": 7245 }, { "epoch": 8.7772397094431, "grad_norm": 0.00215071439743042, "learning_rate": 3.4462059509428435e-05, "loss": 0.0, "num_input_tokens_seen": 2970960, "step": 7250 }, { "epoch": 8.783292978208232, "grad_norm": 7.122565875761211e-05, "learning_rate": 3.443760672142901e-05, "loss": 0.0, "num_input_tokens_seen": 2972976, "step": 7255 }, { "epoch": 8.789346246973366, "grad_norm": 0.00039136107079684734, "learning_rate": 3.441314339935266e-05, "loss": 0.0005, "num_input_tokens_seen": 2975056, "step": 7260 }, { "epoch": 8.795399515738499, "grad_norm": 0.00020482081163208932, "learning_rate": 3.438866957050492e-05, "loss": 0.0, "num_input_tokens_seen": 2976944, "step": 7265 }, { "epoch": 8.801452784503631, "grad_norm": 0.0003187559195794165, "learning_rate": 3.4364185262202984e-05, "loss": 0.0, "num_input_tokens_seen": 2978960, "step": 7270 }, { "epoch": 8.807506053268765, "grad_norm": 0.0007181736873462796, "learning_rate": 3.4339690501775784e-05, "loss": 0.0001, "num_input_tokens_seen": 2980880, "step": 7275 }, { "epoch": 8.813559322033898, "grad_norm": 0.0007199262036010623, "learning_rate": 3.43151853165639e-05, "loss": 0.0, "num_input_tokens_seen": 2982800, "step": 7280 }, { "epoch": 8.819612590799032, "grad_norm": 6.719122029608116e-05, "learning_rate": 3.429066973391955e-05, "loss": 0.0, "num_input_tokens_seen": 2984784, "step": 7285 }, { "epoch": 8.825665859564165, "grad_norm": 0.0011888773879036307, "learning_rate": 3.426614378120657e-05, "loss": 0.0, "num_input_tokens_seen": 2986864, "step": 7290 }, { "epoch": 8.831719128329297, "grad_norm": 0.00018073819228447974, "learning_rate": 3.4241607485800365e-05, "loss": 0.0, "num_input_tokens_seen": 2988848, "step": 7295 }, { "epoch": 8.837772397094431, "grad_norm": 0.0003553138521965593, "learning_rate": 3.4217060875087856e-05, "loss": 0.0, "num_input_tokens_seen": 2990896, "step": 7300 }, { "epoch": 8.843825665859564, "grad_norm": 2.2333085536956787, "learning_rate": 3.419250397646753e-05, "loss": 0.0003, "num_input_tokens_seen": 2993008, "step": 7305 }, { "epoch": 8.849878934624698, "grad_norm": 0.0017578557599335909, "learning_rate": 3.416793681734932e-05, "loss": 0.0525, "num_input_tokens_seen": 2995088, "step": 7310 }, { "epoch": 8.85593220338983, "grad_norm": 0.0024352779146283865, "learning_rate": 3.414335942515461e-05, "loss": 0.0, "num_input_tokens_seen": 2997136, "step": 7315 }, { "epoch": 8.861985472154963, "grad_norm": 0.0040014758706092834, "learning_rate": 3.411877182731623e-05, "loss": 0.0487, "num_input_tokens_seen": 2999248, "step": 7320 }, { "epoch": 8.868038740920097, "grad_norm": 0.03610173240303993, "learning_rate": 3.409417405127839e-05, "loss": 0.0003, "num_input_tokens_seen": 3001264, "step": 7325 }, { "epoch": 8.87409200968523, "grad_norm": 78.4838638305664, "learning_rate": 3.406956612449665e-05, "loss": 0.0675, "num_input_tokens_seen": 3003312, "step": 7330 }, { "epoch": 8.880145278450364, "grad_norm": 0.002449462888762355, "learning_rate": 3.404494807443791e-05, "loss": 0.0001, "num_input_tokens_seen": 3005328, "step": 7335 }, { "epoch": 8.886198547215496, "grad_norm": 0.0029758999589830637, "learning_rate": 3.402031992858037e-05, "loss": 0.0, "num_input_tokens_seen": 3007472, "step": 7340 }, { "epoch": 8.892251815980629, "grad_norm": 0.0013645639410242438, "learning_rate": 3.3995681714413505e-05, "loss": 0.0, "num_input_tokens_seen": 3009616, "step": 7345 }, { "epoch": 8.898305084745763, "grad_norm": 0.00044110306771472096, "learning_rate": 3.397103345943802e-05, "loss": 0.0, "num_input_tokens_seen": 3011696, "step": 7350 }, { "epoch": 8.904358353510895, "grad_norm": 0.00562626589089632, "learning_rate": 3.394637519116581e-05, "loss": 0.0, "num_input_tokens_seen": 3013808, "step": 7355 }, { "epoch": 8.91041162227603, "grad_norm": 0.0015300029190257192, "learning_rate": 3.3921706937119984e-05, "loss": 0.0, "num_input_tokens_seen": 3015888, "step": 7360 }, { "epoch": 8.916464891041162, "grad_norm": 0.009315161034464836, "learning_rate": 3.389702872483477e-05, "loss": 0.0, "num_input_tokens_seen": 3017968, "step": 7365 }, { "epoch": 8.922518159806295, "grad_norm": 0.0032013999298214912, "learning_rate": 3.387234058185553e-05, "loss": 0.0011, "num_input_tokens_seen": 3019952, "step": 7370 }, { "epoch": 8.928571428571429, "grad_norm": 2.9613544029416516e-05, "learning_rate": 3.3847642535738675e-05, "loss": 0.0, "num_input_tokens_seen": 3022160, "step": 7375 }, { "epoch": 8.934624697336561, "grad_norm": 0.00037169179995544255, "learning_rate": 3.382293461405171e-05, "loss": 0.0, "num_input_tokens_seen": 3024272, "step": 7380 }, { "epoch": 8.940677966101696, "grad_norm": 0.006450560409575701, "learning_rate": 3.379821684437314e-05, "loss": 0.0, "num_input_tokens_seen": 3026416, "step": 7385 }, { "epoch": 8.946731234866828, "grad_norm": 0.004157371819019318, "learning_rate": 3.377348925429249e-05, "loss": 0.0, "num_input_tokens_seen": 3028496, "step": 7390 }, { "epoch": 8.95278450363196, "grad_norm": 0.0008347603143192828, "learning_rate": 3.3748751871410193e-05, "loss": 0.0004, "num_input_tokens_seen": 3030704, "step": 7395 }, { "epoch": 8.958837772397095, "grad_norm": 0.0021274928003549576, "learning_rate": 3.372400472333765e-05, "loss": 0.0, "num_input_tokens_seen": 3032720, "step": 7400 }, { "epoch": 8.964891041162227, "grad_norm": 0.00018449118942953646, "learning_rate": 3.369924783769719e-05, "loss": 0.0, "num_input_tokens_seen": 3034736, "step": 7405 }, { "epoch": 8.970944309927361, "grad_norm": 0.0005598808056674898, "learning_rate": 3.3674481242121934e-05, "loss": 0.0, "num_input_tokens_seen": 3036912, "step": 7410 }, { "epoch": 8.976997578692494, "grad_norm": 0.0002398116048425436, "learning_rate": 3.364970496425591e-05, "loss": 0.0001, "num_input_tokens_seen": 3039056, "step": 7415 }, { "epoch": 8.983050847457626, "grad_norm": 0.000758154783397913, "learning_rate": 3.362491903175392e-05, "loss": 0.0068, "num_input_tokens_seen": 3041008, "step": 7420 }, { "epoch": 8.98910411622276, "grad_norm": 6.626932736253366e-05, "learning_rate": 3.360012347228155e-05, "loss": 0.0, "num_input_tokens_seen": 3043056, "step": 7425 }, { "epoch": 8.995157384987893, "grad_norm": 0.025791382417082787, "learning_rate": 3.357531831351514e-05, "loss": 0.0, "num_input_tokens_seen": 3045104, "step": 7430 }, { "epoch": 9.0, "eval_loss": 0.28868547081947327, "eval_runtime": 4.9594, "eval_samples_per_second": 74.002, "eval_steps_per_second": 18.551, "num_input_tokens_seen": 3046392, "step": 7434 }, { "epoch": 9.001210653753027, "grad_norm": 0.00032273810938932, "learning_rate": 3.355050358314172e-05, "loss": 0.1039, "num_input_tokens_seen": 3046808, "step": 7435 }, { "epoch": 9.00726392251816, "grad_norm": 0.00013340302393771708, "learning_rate": 3.352567930885902e-05, "loss": 0.0, "num_input_tokens_seen": 3048920, "step": 7440 }, { "epoch": 9.013317191283292, "grad_norm": 0.023667339235544205, "learning_rate": 3.350084551837545e-05, "loss": 0.0001, "num_input_tokens_seen": 3050936, "step": 7445 }, { "epoch": 9.019370460048426, "grad_norm": 0.0005186949856579304, "learning_rate": 3.347600223940998e-05, "loss": 0.0001, "num_input_tokens_seen": 3052984, "step": 7450 }, { "epoch": 9.025423728813559, "grad_norm": 4.5040505938231945e-05, "learning_rate": 3.345114949969222e-05, "loss": 0.0, "num_input_tokens_seen": 3055032, "step": 7455 }, { "epoch": 9.031476997578693, "grad_norm": 8.572825754527003e-05, "learning_rate": 3.3426287326962334e-05, "loss": 0.0, "num_input_tokens_seen": 3057208, "step": 7460 }, { "epoch": 9.037530266343826, "grad_norm": 0.00048485692241229117, "learning_rate": 3.3401415748970984e-05, "loss": 0.0043, "num_input_tokens_seen": 3059352, "step": 7465 }, { "epoch": 9.043583535108958, "grad_norm": 8.235067070927471e-05, "learning_rate": 3.337653479347937e-05, "loss": 0.0, "num_input_tokens_seen": 3061400, "step": 7470 }, { "epoch": 9.049636803874092, "grad_norm": 0.00015645675011910498, "learning_rate": 3.335164448825913e-05, "loss": 0.0001, "num_input_tokens_seen": 3063352, "step": 7475 }, { "epoch": 9.055690072639225, "grad_norm": 0.02544788457453251, "learning_rate": 3.332674486109235e-05, "loss": 0.0, "num_input_tokens_seen": 3065272, "step": 7480 }, { "epoch": 9.061743341404359, "grad_norm": 5.6374268751824275e-05, "learning_rate": 3.330183593977152e-05, "loss": 0.0, "num_input_tokens_seen": 3067256, "step": 7485 }, { "epoch": 9.067796610169491, "grad_norm": 3.491274037514813e-05, "learning_rate": 3.3276917752099514e-05, "loss": 0.0002, "num_input_tokens_seen": 3069304, "step": 7490 }, { "epoch": 9.073849878934624, "grad_norm": 0.000453268556157127, "learning_rate": 3.325199032588953e-05, "loss": 0.0, "num_input_tokens_seen": 3071128, "step": 7495 }, { "epoch": 9.079903147699758, "grad_norm": 0.0001051233266480267, "learning_rate": 3.322705368896508e-05, "loss": 0.0, "num_input_tokens_seen": 3073208, "step": 7500 }, { "epoch": 9.08595641646489, "grad_norm": 6.343272980302572e-05, "learning_rate": 3.320210786915997e-05, "loss": 0.0, "num_input_tokens_seen": 3075128, "step": 7505 }, { "epoch": 9.092009685230025, "grad_norm": 7.950684812385589e-05, "learning_rate": 3.317715289431825e-05, "loss": 0.0, "num_input_tokens_seen": 3077112, "step": 7510 }, { "epoch": 9.098062953995157, "grad_norm": 0.0002240114117739722, "learning_rate": 3.315218879229419e-05, "loss": 0.0, "num_input_tokens_seen": 3079064, "step": 7515 }, { "epoch": 9.104116222760291, "grad_norm": 0.0013114140601828694, "learning_rate": 3.312721559095224e-05, "loss": 0.0, "num_input_tokens_seen": 3080984, "step": 7520 }, { "epoch": 9.110169491525424, "grad_norm": 0.00010681986168492585, "learning_rate": 3.3102233318167015e-05, "loss": 0.0, "num_input_tokens_seen": 3082968, "step": 7525 }, { "epoch": 9.116222760290556, "grad_norm": 9.089532977668568e-05, "learning_rate": 3.307724200182325e-05, "loss": 0.0, "num_input_tokens_seen": 3085176, "step": 7530 }, { "epoch": 9.12227602905569, "grad_norm": 4.5369153667706996e-05, "learning_rate": 3.305224166981577e-05, "loss": 0.0, "num_input_tokens_seen": 3087192, "step": 7535 }, { "epoch": 9.128329297820823, "grad_norm": 0.00017911555187311023, "learning_rate": 3.30272323500495e-05, "loss": 0.0, "num_input_tokens_seen": 3089112, "step": 7540 }, { "epoch": 9.134382566585957, "grad_norm": 0.0005718666943721473, "learning_rate": 3.3002214070439327e-05, "loss": 0.0, "num_input_tokens_seen": 3091192, "step": 7545 }, { "epoch": 9.14043583535109, "grad_norm": 2.1719601136283018e-05, "learning_rate": 3.2977186858910194e-05, "loss": 0.0, "num_input_tokens_seen": 3093240, "step": 7550 }, { "epoch": 9.146489104116222, "grad_norm": 2.6765366783365607e-05, "learning_rate": 3.2952150743397e-05, "loss": 0.0, "num_input_tokens_seen": 3095192, "step": 7555 }, { "epoch": 9.152542372881356, "grad_norm": 4.876573075307533e-05, "learning_rate": 3.292710575184457e-05, "loss": 0.0, "num_input_tokens_seen": 3097240, "step": 7560 }, { "epoch": 9.158595641646489, "grad_norm": 3.427916090004146e-05, "learning_rate": 3.2902051912207664e-05, "loss": 0.0, "num_input_tokens_seen": 3099256, "step": 7565 }, { "epoch": 9.164648910411623, "grad_norm": 7.394626300083473e-05, "learning_rate": 3.287698925245089e-05, "loss": 0.0, "num_input_tokens_seen": 3101272, "step": 7570 }, { "epoch": 9.170702179176756, "grad_norm": 3.077953078900464e-05, "learning_rate": 3.2851917800548724e-05, "loss": 0.0, "num_input_tokens_seen": 3103288, "step": 7575 }, { "epoch": 9.176755447941888, "grad_norm": 0.00025474984431639314, "learning_rate": 3.282683758448542e-05, "loss": 0.0, "num_input_tokens_seen": 3105400, "step": 7580 }, { "epoch": 9.182808716707022, "grad_norm": 3.320527321193367e-05, "learning_rate": 3.280174863225506e-05, "loss": 0.0, "num_input_tokens_seen": 3107448, "step": 7585 }, { "epoch": 9.188861985472155, "grad_norm": 0.0012508806539699435, "learning_rate": 3.277665097186144e-05, "loss": 0.0001, "num_input_tokens_seen": 3109624, "step": 7590 }, { "epoch": 9.194915254237289, "grad_norm": 2.894778299378231e-05, "learning_rate": 3.27515446313181e-05, "loss": 0.0, "num_input_tokens_seen": 3111864, "step": 7595 }, { "epoch": 9.200968523002421, "grad_norm": 2.581955777714029e-05, "learning_rate": 3.272642963864825e-05, "loss": 0.0, "num_input_tokens_seen": 3113880, "step": 7600 }, { "epoch": 9.207021791767554, "grad_norm": 0.0034752096980810165, "learning_rate": 3.270130602188477e-05, "loss": 0.0003, "num_input_tokens_seen": 3115832, "step": 7605 }, { "epoch": 9.213075060532688, "grad_norm": 6.765152647858486e-05, "learning_rate": 3.267617380907017e-05, "loss": 0.0, "num_input_tokens_seen": 3117880, "step": 7610 }, { "epoch": 9.21912832929782, "grad_norm": 9.524734923616052e-05, "learning_rate": 3.265103302825654e-05, "loss": 0.0, "num_input_tokens_seen": 3119832, "step": 7615 }, { "epoch": 9.225181598062955, "grad_norm": 3.62019709427841e-05, "learning_rate": 3.262588370750554e-05, "loss": 0.0001, "num_input_tokens_seen": 3121816, "step": 7620 }, { "epoch": 9.231234866828087, "grad_norm": 1.9341659935889766e-05, "learning_rate": 3.2600725874888374e-05, "loss": 0.0, "num_input_tokens_seen": 3123896, "step": 7625 }, { "epoch": 9.23728813559322, "grad_norm": 2.3447862986358814e-05, "learning_rate": 3.2575559558485714e-05, "loss": 0.0, "num_input_tokens_seen": 3125880, "step": 7630 }, { "epoch": 9.243341404358354, "grad_norm": 2.236803265986964e-05, "learning_rate": 3.255038478638774e-05, "loss": 0.0, "num_input_tokens_seen": 3127928, "step": 7635 }, { "epoch": 9.249394673123486, "grad_norm": 0.00010414859571028501, "learning_rate": 3.252520158669405e-05, "loss": 0.0, "num_input_tokens_seen": 3130008, "step": 7640 }, { "epoch": 9.25544794188862, "grad_norm": 1.400205565005308e-05, "learning_rate": 3.2500009987513655e-05, "loss": 0.0, "num_input_tokens_seen": 3132344, "step": 7645 }, { "epoch": 9.261501210653753, "grad_norm": 7.644093420822173e-05, "learning_rate": 3.247481001696493e-05, "loss": 0.0, "num_input_tokens_seen": 3134424, "step": 7650 }, { "epoch": 9.267554479418886, "grad_norm": 4.542397800832987e-05, "learning_rate": 3.244960170317561e-05, "loss": 0.0, "num_input_tokens_seen": 3136504, "step": 7655 }, { "epoch": 9.27360774818402, "grad_norm": 3.1843726901570335e-05, "learning_rate": 3.242438507428273e-05, "loss": 0.0, "num_input_tokens_seen": 3138488, "step": 7660 }, { "epoch": 9.279661016949152, "grad_norm": 0.0027228249236941338, "learning_rate": 3.2399160158432604e-05, "loss": 0.0, "num_input_tokens_seen": 3140600, "step": 7665 }, { "epoch": 9.285714285714286, "grad_norm": 2.9152604838600382e-05, "learning_rate": 3.237392698378082e-05, "loss": 0.0, "num_input_tokens_seen": 3142520, "step": 7670 }, { "epoch": 9.291767554479419, "grad_norm": 6.288797885645181e-05, "learning_rate": 3.2348685578492166e-05, "loss": 0.0, "num_input_tokens_seen": 3144600, "step": 7675 }, { "epoch": 9.297820823244551, "grad_norm": 1.766624700394459e-05, "learning_rate": 3.2323435970740603e-05, "loss": 0.0, "num_input_tokens_seen": 3146552, "step": 7680 }, { "epoch": 9.303874092009686, "grad_norm": 2.0331865016487427e-05, "learning_rate": 3.229817818870926e-05, "loss": 0.0, "num_input_tokens_seen": 3148600, "step": 7685 }, { "epoch": 9.309927360774818, "grad_norm": 0.00017340504564344883, "learning_rate": 3.227291226059042e-05, "loss": 0.0, "num_input_tokens_seen": 3150584, "step": 7690 }, { "epoch": 9.315980629539952, "grad_norm": 1.8849435946322046e-05, "learning_rate": 3.22476382145854e-05, "loss": 0.0, "num_input_tokens_seen": 3152536, "step": 7695 }, { "epoch": 9.322033898305085, "grad_norm": 2.3937671357998624e-05, "learning_rate": 3.222235607890463e-05, "loss": 0.0, "num_input_tokens_seen": 3154616, "step": 7700 }, { "epoch": 9.328087167070217, "grad_norm": 2.2054444343666546e-05, "learning_rate": 3.219706588176753e-05, "loss": 0.0, "num_input_tokens_seen": 3156536, "step": 7705 }, { "epoch": 9.334140435835351, "grad_norm": 2.8765514798578806e-05, "learning_rate": 3.217176765140255e-05, "loss": 0.0, "num_input_tokens_seen": 3158680, "step": 7710 }, { "epoch": 9.340193704600484, "grad_norm": 2.0451201635296457e-05, "learning_rate": 3.2146461416047094e-05, "loss": 0.0, "num_input_tokens_seen": 3160632, "step": 7715 }, { "epoch": 9.346246973365618, "grad_norm": 1.987189352803398e-05, "learning_rate": 3.2121147203947483e-05, "loss": 0.0, "num_input_tokens_seen": 3162712, "step": 7720 }, { "epoch": 9.35230024213075, "grad_norm": 0.00024628956452943385, "learning_rate": 3.209582504335898e-05, "loss": 0.0, "num_input_tokens_seen": 3164824, "step": 7725 }, { "epoch": 9.358353510895883, "grad_norm": 0.0003751046897377819, "learning_rate": 3.207049496254569e-05, "loss": 0.0, "num_input_tokens_seen": 3166904, "step": 7730 }, { "epoch": 9.364406779661017, "grad_norm": 2.5274399376939982e-05, "learning_rate": 3.2045156989780566e-05, "loss": 0.0, "num_input_tokens_seen": 3169048, "step": 7735 }, { "epoch": 9.37046004842615, "grad_norm": 1.407626950822305e-05, "learning_rate": 3.201981115334537e-05, "loss": 0.0, "num_input_tokens_seen": 3171128, "step": 7740 }, { "epoch": 9.376513317191284, "grad_norm": 0.00018505325715523213, "learning_rate": 3.199445748153064e-05, "loss": 0.0, "num_input_tokens_seen": 3173112, "step": 7745 }, { "epoch": 9.382566585956416, "grad_norm": 1.7790645870263688e-05, "learning_rate": 3.196909600263567e-05, "loss": 0.0, "num_input_tokens_seen": 3175128, "step": 7750 }, { "epoch": 9.388619854721549, "grad_norm": 2.026636866503395e-05, "learning_rate": 3.194372674496845e-05, "loss": 0.0, "num_input_tokens_seen": 3177112, "step": 7755 }, { "epoch": 9.394673123486683, "grad_norm": 3.219114296371117e-05, "learning_rate": 3.1918349736845667e-05, "loss": 0.0, "num_input_tokens_seen": 3179160, "step": 7760 }, { "epoch": 9.400726392251816, "grad_norm": 2.1380385078373365e-05, "learning_rate": 3.1892965006592646e-05, "loss": 0.0, "num_input_tokens_seen": 3181112, "step": 7765 }, { "epoch": 9.40677966101695, "grad_norm": 1.465060358896153e-05, "learning_rate": 3.186757258254336e-05, "loss": 0.0, "num_input_tokens_seen": 3183224, "step": 7770 }, { "epoch": 9.412832929782082, "grad_norm": 0.00022562374942936003, "learning_rate": 3.184217249304033e-05, "loss": 0.0, "num_input_tokens_seen": 3185272, "step": 7775 }, { "epoch": 9.418886198547215, "grad_norm": 1.486188375565689e-05, "learning_rate": 3.181676476643466e-05, "loss": 0.0, "num_input_tokens_seen": 3187256, "step": 7780 }, { "epoch": 9.424939467312349, "grad_norm": 1.9991537556052208e-05, "learning_rate": 3.179134943108597e-05, "loss": 0.0, "num_input_tokens_seen": 3189240, "step": 7785 }, { "epoch": 9.430992736077481, "grad_norm": 4.8414949560537934e-05, "learning_rate": 3.1765926515362375e-05, "loss": 0.0, "num_input_tokens_seen": 3191288, "step": 7790 }, { "epoch": 9.437046004842616, "grad_norm": 1.7155407476820983e-05, "learning_rate": 3.174049604764045e-05, "loss": 0.0, "num_input_tokens_seen": 3193208, "step": 7795 }, { "epoch": 9.443099273607748, "grad_norm": 5.2553616114892066e-05, "learning_rate": 3.171505805630518e-05, "loss": 0.0, "num_input_tokens_seen": 3195288, "step": 7800 }, { "epoch": 9.44915254237288, "grad_norm": 3.359544643899426e-05, "learning_rate": 3.168961256974999e-05, "loss": 0.0, "num_input_tokens_seen": 3197304, "step": 7805 }, { "epoch": 9.455205811138015, "grad_norm": 1.5992696717148647e-05, "learning_rate": 3.166415961637664e-05, "loss": 0.0, "num_input_tokens_seen": 3199512, "step": 7810 }, { "epoch": 9.461259079903147, "grad_norm": 2.372195740463212e-05, "learning_rate": 3.1638699224595234e-05, "loss": 0.0, "num_input_tokens_seen": 3201560, "step": 7815 }, { "epoch": 9.467312348668282, "grad_norm": 7.621572876814753e-05, "learning_rate": 3.161323142282418e-05, "loss": 0.0, "num_input_tokens_seen": 3203576, "step": 7820 }, { "epoch": 9.473365617433414, "grad_norm": 0.00015587112284265459, "learning_rate": 3.158775623949013e-05, "loss": 0.0, "num_input_tokens_seen": 3205688, "step": 7825 }, { "epoch": 9.479418886198546, "grad_norm": 1.6708399925846606e-05, "learning_rate": 3.1562273703028036e-05, "loss": 0.0, "num_input_tokens_seen": 3207704, "step": 7830 }, { "epoch": 9.48547215496368, "grad_norm": 3.365362135809846e-05, "learning_rate": 3.153678384188099e-05, "loss": 0.0, "num_input_tokens_seen": 3209784, "step": 7835 }, { "epoch": 9.491525423728813, "grad_norm": 1.8518001525080763e-05, "learning_rate": 3.15112866845003e-05, "loss": 0.0, "num_input_tokens_seen": 3212056, "step": 7840 }, { "epoch": 9.497578692493947, "grad_norm": 0.0013937385519966483, "learning_rate": 3.148578225934541e-05, "loss": 0.0, "num_input_tokens_seen": 3214008, "step": 7845 }, { "epoch": 9.50363196125908, "grad_norm": 0.0013519435888156295, "learning_rate": 3.146027059488387e-05, "loss": 0.0, "num_input_tokens_seen": 3216056, "step": 7850 }, { "epoch": 9.509685230024212, "grad_norm": 1.4383177585841622e-05, "learning_rate": 3.143475171959131e-05, "loss": 0.0, "num_input_tokens_seen": 3218072, "step": 7855 }, { "epoch": 9.515738498789347, "grad_norm": 2.3452419554814696e-05, "learning_rate": 3.1409225661951416e-05, "loss": 0.0, "num_input_tokens_seen": 3220088, "step": 7860 }, { "epoch": 9.521791767554479, "grad_norm": 5.123946539242752e-05, "learning_rate": 3.1383692450455896e-05, "loss": 0.0, "num_input_tokens_seen": 3222232, "step": 7865 }, { "epoch": 9.527845036319613, "grad_norm": 1.6736494217184372e-05, "learning_rate": 3.135815211360443e-05, "loss": 0.0, "num_input_tokens_seen": 3223992, "step": 7870 }, { "epoch": 9.533898305084746, "grad_norm": 0.0005167039926163852, "learning_rate": 3.133260467990465e-05, "loss": 0.0, "num_input_tokens_seen": 3226104, "step": 7875 }, { "epoch": 9.539951573849878, "grad_norm": 1.1415429071348626e-05, "learning_rate": 3.130705017787211e-05, "loss": 0.0001, "num_input_tokens_seen": 3228024, "step": 7880 }, { "epoch": 9.546004842615012, "grad_norm": 1.4197374184732325e-05, "learning_rate": 3.128148863603027e-05, "loss": 0.0, "num_input_tokens_seen": 3229912, "step": 7885 }, { "epoch": 9.552058111380145, "grad_norm": 1.3878849131288007e-05, "learning_rate": 3.125592008291044e-05, "loss": 0.0, "num_input_tokens_seen": 3231928, "step": 7890 }, { "epoch": 9.558111380145279, "grad_norm": 7.216184894787148e-05, "learning_rate": 3.1230344547051735e-05, "loss": 0.0, "num_input_tokens_seen": 3233944, "step": 7895 }, { "epoch": 9.564164648910412, "grad_norm": 0.00031896759173832834, "learning_rate": 3.12047620570011e-05, "loss": 0.0, "num_input_tokens_seen": 3235928, "step": 7900 }, { "epoch": 9.570217917675544, "grad_norm": 1.1721442206180654e-05, "learning_rate": 3.1179172641313206e-05, "loss": 0.0, "num_input_tokens_seen": 3238040, "step": 7905 }, { "epoch": 9.576271186440678, "grad_norm": 1.1782076398958452e-05, "learning_rate": 3.115357632855048e-05, "loss": 0.0, "num_input_tokens_seen": 3240120, "step": 7910 }, { "epoch": 9.58232445520581, "grad_norm": 0.000502390437759459, "learning_rate": 3.112797314728305e-05, "loss": 0.0, "num_input_tokens_seen": 3242200, "step": 7915 }, { "epoch": 9.588377723970945, "grad_norm": 2.482449053786695e-05, "learning_rate": 3.1102363126088675e-05, "loss": 0.0, "num_input_tokens_seen": 3244248, "step": 7920 }, { "epoch": 9.594430992736077, "grad_norm": 1.6599840819253586e-05, "learning_rate": 3.1076746293552786e-05, "loss": 0.0, "num_input_tokens_seen": 3246360, "step": 7925 }, { "epoch": 9.600484261501212, "grad_norm": 0.00015346526924986392, "learning_rate": 3.1051122678268414e-05, "loss": 0.0, "num_input_tokens_seen": 3248312, "step": 7930 }, { "epoch": 9.606537530266344, "grad_norm": 0.00036525429459288716, "learning_rate": 3.1025492308836135e-05, "loss": 0.0, "num_input_tokens_seen": 3250424, "step": 7935 }, { "epoch": 9.612590799031477, "grad_norm": 0.0004209598118904978, "learning_rate": 3.099985521386408e-05, "loss": 0.1322, "num_input_tokens_seen": 3252504, "step": 7940 }, { "epoch": 9.61864406779661, "grad_norm": 0.024231789633631706, "learning_rate": 3.0974211421967897e-05, "loss": 0.1313, "num_input_tokens_seen": 3254616, "step": 7945 }, { "epoch": 9.624697336561743, "grad_norm": 0.008134456351399422, "learning_rate": 3.094856096177069e-05, "loss": 0.0009, "num_input_tokens_seen": 3256824, "step": 7950 }, { "epoch": 9.630750605326877, "grad_norm": 0.007181561551988125, "learning_rate": 3.0922903861903014e-05, "loss": 0.0004, "num_input_tokens_seen": 3258936, "step": 7955 }, { "epoch": 9.63680387409201, "grad_norm": 4.8087663650512695, "learning_rate": 3.0897240151002836e-05, "loss": 0.0488, "num_input_tokens_seen": 3260984, "step": 7960 }, { "epoch": 9.642857142857142, "grad_norm": 0.005012081004679203, "learning_rate": 3.0871569857715496e-05, "loss": 0.0099, "num_input_tokens_seen": 3263096, "step": 7965 }, { "epoch": 9.648910411622277, "grad_norm": 0.5535081624984741, "learning_rate": 3.0845893010693706e-05, "loss": 0.0077, "num_input_tokens_seen": 3265048, "step": 7970 }, { "epoch": 9.654963680387409, "grad_norm": 0.0015317696379497647, "learning_rate": 3.0820209638597456e-05, "loss": 0.0001, "num_input_tokens_seen": 3267288, "step": 7975 }, { "epoch": 9.661016949152543, "grad_norm": 0.001742516877129674, "learning_rate": 3.079451977009404e-05, "loss": 0.0003, "num_input_tokens_seen": 3269368, "step": 7980 }, { "epoch": 9.667070217917676, "grad_norm": 0.0008200365118682384, "learning_rate": 3.0768823433858e-05, "loss": 0.0152, "num_input_tokens_seen": 3271448, "step": 7985 }, { "epoch": 9.673123486682808, "grad_norm": 0.002086331369355321, "learning_rate": 3.07431206585711e-05, "loss": 0.0002, "num_input_tokens_seen": 3273656, "step": 7990 }, { "epoch": 9.679176755447942, "grad_norm": 0.0016468135872855783, "learning_rate": 3.0717411472922294e-05, "loss": 0.0007, "num_input_tokens_seen": 3275576, "step": 7995 }, { "epoch": 9.685230024213075, "grad_norm": 0.0006713018519803882, "learning_rate": 3.069169590560767e-05, "loss": 0.0, "num_input_tokens_seen": 3277592, "step": 8000 }, { "epoch": 9.69128329297821, "grad_norm": 0.005557028576731682, "learning_rate": 3.0665973985330456e-05, "loss": 0.0117, "num_input_tokens_seen": 3279800, "step": 8005 }, { "epoch": 9.697336561743342, "grad_norm": 0.0008497940143570304, "learning_rate": 3.064024574080099e-05, "loss": 0.0, "num_input_tokens_seen": 3281848, "step": 8010 }, { "epoch": 9.703389830508474, "grad_norm": 0.0008371492731384933, "learning_rate": 3.061451120073664e-05, "loss": 0.0, "num_input_tokens_seen": 3283832, "step": 8015 }, { "epoch": 9.709443099273608, "grad_norm": 0.0005449040327221155, "learning_rate": 3.05887703938618e-05, "loss": 0.0, "num_input_tokens_seen": 3286072, "step": 8020 }, { "epoch": 9.71549636803874, "grad_norm": 8.043519483180717e-05, "learning_rate": 3.056302334890786e-05, "loss": 0.0, "num_input_tokens_seen": 3287928, "step": 8025 }, { "epoch": 9.721549636803875, "grad_norm": 0.02071470394730568, "learning_rate": 3.053727009461321e-05, "loss": 0.0001, "num_input_tokens_seen": 3289912, "step": 8030 }, { "epoch": 9.727602905569007, "grad_norm": 0.0002592724922578782, "learning_rate": 3.051151065972312e-05, "loss": 0.0, "num_input_tokens_seen": 3291832, "step": 8035 }, { "epoch": 9.73365617433414, "grad_norm": 0.0003126137889921665, "learning_rate": 3.0485745072989786e-05, "loss": 0.0, "num_input_tokens_seen": 3293976, "step": 8040 }, { "epoch": 9.739709443099274, "grad_norm": 0.00032684224424883723, "learning_rate": 3.0459973363172267e-05, "loss": 0.0397, "num_input_tokens_seen": 3295928, "step": 8045 }, { "epoch": 9.745762711864407, "grad_norm": 0.00165075424592942, "learning_rate": 3.0434195559036453e-05, "loss": 0.0, "num_input_tokens_seen": 3297976, "step": 8050 }, { "epoch": 9.75181598062954, "grad_norm": 0.007779262028634548, "learning_rate": 3.0408411689355042e-05, "loss": 0.0, "num_input_tokens_seen": 3299928, "step": 8055 }, { "epoch": 9.757869249394673, "grad_norm": 0.0004615871876012534, "learning_rate": 3.0382621782907498e-05, "loss": 0.0, "num_input_tokens_seen": 3301976, "step": 8060 }, { "epoch": 9.763922518159806, "grad_norm": 0.0009391659987159073, "learning_rate": 3.0356825868480017e-05, "loss": 0.0525, "num_input_tokens_seen": 3304120, "step": 8065 }, { "epoch": 9.76997578692494, "grad_norm": 0.00018064089817926288, "learning_rate": 3.0331023974865514e-05, "loss": 0.0, "num_input_tokens_seen": 3306104, "step": 8070 }, { "epoch": 9.776029055690072, "grad_norm": 0.007917432114481926, "learning_rate": 3.0305216130863568e-05, "loss": 0.0, "num_input_tokens_seen": 3308088, "step": 8075 }, { "epoch": 9.782082324455207, "grad_norm": 0.00028762713191099465, "learning_rate": 3.0279402365280403e-05, "loss": 0.0, "num_input_tokens_seen": 3310040, "step": 8080 }, { "epoch": 9.788135593220339, "grad_norm": 0.0006541379261761904, "learning_rate": 3.025358270692885e-05, "loss": 0.0012, "num_input_tokens_seen": 3312184, "step": 8085 }, { "epoch": 9.794188861985472, "grad_norm": 0.0010701349237933755, "learning_rate": 3.0227757184628325e-05, "loss": 0.0, "num_input_tokens_seen": 3314168, "step": 8090 }, { "epoch": 9.800242130750606, "grad_norm": 0.00012713308387901634, "learning_rate": 3.0201925827204785e-05, "loss": 0.0003, "num_input_tokens_seen": 3316184, "step": 8095 }, { "epoch": 9.806295399515738, "grad_norm": 8.155262912623584e-05, "learning_rate": 3.0176088663490704e-05, "loss": 0.0, "num_input_tokens_seen": 3318168, "step": 8100 }, { "epoch": 9.812348668280872, "grad_norm": 6.286461575655267e-05, "learning_rate": 3.0150245722325016e-05, "loss": 0.0, "num_input_tokens_seen": 3320280, "step": 8105 }, { "epoch": 9.818401937046005, "grad_norm": 0.001239868113771081, "learning_rate": 3.012439703255313e-05, "loss": 0.0, "num_input_tokens_seen": 3322328, "step": 8110 }, { "epoch": 9.824455205811137, "grad_norm": 0.0002320308267371729, "learning_rate": 3.009854262302686e-05, "loss": 0.0, "num_input_tokens_seen": 3324408, "step": 8115 }, { "epoch": 9.830508474576272, "grad_norm": 7.742171146674082e-05, "learning_rate": 3.0072682522604405e-05, "loss": 0.0, "num_input_tokens_seen": 3326584, "step": 8120 }, { "epoch": 9.836561743341404, "grad_norm": 0.0011774206068366766, "learning_rate": 3.004681676015032e-05, "loss": 0.0, "num_input_tokens_seen": 3328568, "step": 8125 }, { "epoch": 9.842615012106538, "grad_norm": 0.0004978084471076727, "learning_rate": 3.0020945364535468e-05, "loss": 0.0001, "num_input_tokens_seen": 3330488, "step": 8130 }, { "epoch": 9.84866828087167, "grad_norm": 0.07202856987714767, "learning_rate": 2.9995068364637024e-05, "loss": 0.0001, "num_input_tokens_seen": 3332472, "step": 8135 }, { "epoch": 9.854721549636803, "grad_norm": 0.012568481266498566, "learning_rate": 2.9969185789338382e-05, "loss": 0.0, "num_input_tokens_seen": 3334392, "step": 8140 }, { "epoch": 9.860774818401937, "grad_norm": 0.0005212469841353595, "learning_rate": 2.9943297667529197e-05, "loss": 0.0, "num_input_tokens_seen": 3336408, "step": 8145 }, { "epoch": 9.86682808716707, "grad_norm": 0.00013586344721261412, "learning_rate": 2.9917404028105285e-05, "loss": 0.0, "num_input_tokens_seen": 3338424, "step": 8150 }, { "epoch": 9.872881355932204, "grad_norm": 0.00010274600208504125, "learning_rate": 2.989150489996864e-05, "loss": 0.0, "num_input_tokens_seen": 3340440, "step": 8155 }, { "epoch": 9.878934624697337, "grad_norm": 0.0002893543569371104, "learning_rate": 2.9865600312027382e-05, "loss": 0.0, "num_input_tokens_seen": 3342520, "step": 8160 }, { "epoch": 9.884987893462469, "grad_norm": 0.0007382083567790687, "learning_rate": 2.9839690293195705e-05, "loss": 0.0202, "num_input_tokens_seen": 3344536, "step": 8165 }, { "epoch": 9.891041162227603, "grad_norm": 5.629880979540758e-05, "learning_rate": 2.981377487239388e-05, "loss": 0.0, "num_input_tokens_seen": 3346584, "step": 8170 }, { "epoch": 9.897094430992736, "grad_norm": 0.00017873429169412702, "learning_rate": 2.9787854078548215e-05, "loss": 0.0, "num_input_tokens_seen": 3348632, "step": 8175 }, { "epoch": 9.90314769975787, "grad_norm": 0.041393131017684937, "learning_rate": 2.9761927940591017e-05, "loss": 0.0, "num_input_tokens_seen": 3350744, "step": 8180 }, { "epoch": 9.909200968523002, "grad_norm": 3.304830897832289e-05, "learning_rate": 2.973599648746051e-05, "loss": 0.0, "num_input_tokens_seen": 3352856, "step": 8185 }, { "epoch": 9.915254237288135, "grad_norm": 0.0001203859064844437, "learning_rate": 2.9710059748100925e-05, "loss": 0.0, "num_input_tokens_seen": 3354840, "step": 8190 }, { "epoch": 9.92130750605327, "grad_norm": 5.8073092077393085e-05, "learning_rate": 2.9684117751462337e-05, "loss": 0.0, "num_input_tokens_seen": 3357048, "step": 8195 }, { "epoch": 9.927360774818402, "grad_norm": 0.0012996257282793522, "learning_rate": 2.9658170526500723e-05, "loss": 0.0, "num_input_tokens_seen": 3359160, "step": 8200 }, { "epoch": 9.933414043583536, "grad_norm": 0.00015111951506696641, "learning_rate": 2.9632218102177862e-05, "loss": 0.0, "num_input_tokens_seen": 3361304, "step": 8205 }, { "epoch": 9.939467312348668, "grad_norm": 0.001165767665952444, "learning_rate": 2.960626050746137e-05, "loss": 0.0, "num_input_tokens_seen": 3363416, "step": 8210 }, { "epoch": 9.9455205811138, "grad_norm": 0.00020653658430092037, "learning_rate": 2.9580297771324618e-05, "loss": 0.0767, "num_input_tokens_seen": 3365400, "step": 8215 }, { "epoch": 9.951573849878935, "grad_norm": 0.0025889805983752012, "learning_rate": 2.9554329922746714e-05, "loss": 0.0027, "num_input_tokens_seen": 3367512, "step": 8220 }, { "epoch": 9.957627118644067, "grad_norm": 0.006459138356149197, "learning_rate": 2.952835699071248e-05, "loss": 0.0, "num_input_tokens_seen": 3369656, "step": 8225 }, { "epoch": 9.963680387409202, "grad_norm": 0.0019356763223186135, "learning_rate": 2.9502379004212407e-05, "loss": 0.0001, "num_input_tokens_seen": 3371704, "step": 8230 }, { "epoch": 9.969733656174334, "grad_norm": 0.0006762559060007334, "learning_rate": 2.9476395992242622e-05, "loss": 0.0, "num_input_tokens_seen": 3373752, "step": 8235 }, { "epoch": 9.975786924939467, "grad_norm": 0.0073732840828597546, "learning_rate": 2.9450407983804883e-05, "loss": 0.001, "num_input_tokens_seen": 3375832, "step": 8240 }, { "epoch": 9.9818401937046, "grad_norm": 0.15406082570552826, "learning_rate": 2.9424415007906497e-05, "loss": 0.0001, "num_input_tokens_seen": 3377880, "step": 8245 }, { "epoch": 9.987893462469733, "grad_norm": 0.00018870746134780347, "learning_rate": 2.939841709356033e-05, "loss": 0.0, "num_input_tokens_seen": 3379800, "step": 8250 }, { "epoch": 9.993946731234868, "grad_norm": 3.623337507247925, "learning_rate": 2.937241426978477e-05, "loss": 0.0007, "num_input_tokens_seen": 3381912, "step": 8255 }, { "epoch": 10.0, "grad_norm": 0.00010719951387727633, "learning_rate": 2.934640656560367e-05, "loss": 0.0, "num_input_tokens_seen": 3383592, "step": 8260 }, { "epoch": 10.0, "eval_loss": 0.31217461824417114, "eval_runtime": 4.9479, "eval_samples_per_second": 74.172, "eval_steps_per_second": 18.594, "num_input_tokens_seen": 3383592, "step": 8260 }, { "epoch": 10.006053268765132, "grad_norm": 64.49374389648438, "learning_rate": 2.9320394010046314e-05, "loss": 0.0557, "num_input_tokens_seen": 3385640, "step": 8265 }, { "epoch": 10.012106537530267, "grad_norm": 2.578636303951498e-05, "learning_rate": 2.9294376632147447e-05, "loss": 0.0, "num_input_tokens_seen": 3387720, "step": 8270 }, { "epoch": 10.0181598062954, "grad_norm": 0.0011242808541283011, "learning_rate": 2.926835446094716e-05, "loss": 0.0, "num_input_tokens_seen": 3389608, "step": 8275 }, { "epoch": 10.024213075060533, "grad_norm": 8.538281690562144e-05, "learning_rate": 2.924232752549091e-05, "loss": 0.0001, "num_input_tokens_seen": 3391560, "step": 8280 }, { "epoch": 10.030266343825666, "grad_norm": 6.0571113863261417e-05, "learning_rate": 2.9216295854829455e-05, "loss": 0.0001, "num_input_tokens_seen": 3393704, "step": 8285 }, { "epoch": 10.036319612590798, "grad_norm": 0.00013585608394350857, "learning_rate": 2.919025947801886e-05, "loss": 0.0, "num_input_tokens_seen": 3395816, "step": 8290 }, { "epoch": 10.042372881355933, "grad_norm": 5.016162685933523e-05, "learning_rate": 2.9164218424120432e-05, "loss": 0.0, "num_input_tokens_seen": 3397864, "step": 8295 }, { "epoch": 10.048426150121065, "grad_norm": 55.56088638305664, "learning_rate": 2.9138172722200696e-05, "loss": 0.0127, "num_input_tokens_seen": 3399912, "step": 8300 }, { "epoch": 10.0544794188862, "grad_norm": 0.0005393848405219615, "learning_rate": 2.9112122401331375e-05, "loss": 0.0, "num_input_tokens_seen": 3401896, "step": 8305 }, { "epoch": 10.060532687651332, "grad_norm": 0.0003322197590023279, "learning_rate": 2.908606749058933e-05, "loss": 0.1346, "num_input_tokens_seen": 3404008, "step": 8310 }, { "epoch": 10.066585956416464, "grad_norm": 0.00016666868759784847, "learning_rate": 2.9060008019056566e-05, "loss": 0.0, "num_input_tokens_seen": 3406120, "step": 8315 }, { "epoch": 10.072639225181598, "grad_norm": 0.0002759807975962758, "learning_rate": 2.903394401582017e-05, "loss": 0.0, "num_input_tokens_seen": 3408328, "step": 8320 }, { "epoch": 10.07869249394673, "grad_norm": 0.24228928983211517, "learning_rate": 2.9007875509972275e-05, "loss": 0.0001, "num_input_tokens_seen": 3410376, "step": 8325 }, { "epoch": 10.084745762711865, "grad_norm": 0.005394752137362957, "learning_rate": 2.8981802530610057e-05, "loss": 0.0, "num_input_tokens_seen": 3412392, "step": 8330 }, { "epoch": 10.090799031476998, "grad_norm": 0.00315763708204031, "learning_rate": 2.8955725106835686e-05, "loss": 0.0, "num_input_tokens_seen": 3414472, "step": 8335 }, { "epoch": 10.09685230024213, "grad_norm": 0.0085788294672966, "learning_rate": 2.8929643267756286e-05, "loss": 0.0001, "num_input_tokens_seen": 3416328, "step": 8340 }, { "epoch": 10.102905569007264, "grad_norm": 0.00066661078017205, "learning_rate": 2.8903557042483887e-05, "loss": 0.0001, "num_input_tokens_seen": 3418248, "step": 8345 }, { "epoch": 10.108958837772397, "grad_norm": 0.0004536838096100837, "learning_rate": 2.887746646013546e-05, "loss": 0.0, "num_input_tokens_seen": 3420392, "step": 8350 }, { "epoch": 10.115012106537531, "grad_norm": 0.0002469899773132056, "learning_rate": 2.8851371549832813e-05, "loss": 0.0001, "num_input_tokens_seen": 3422376, "step": 8355 }, { "epoch": 10.121065375302663, "grad_norm": 0.000454829481896013, "learning_rate": 2.8825272340702574e-05, "loss": 0.0, "num_input_tokens_seen": 3424392, "step": 8360 }, { "epoch": 10.127118644067796, "grad_norm": 0.0008198734722100198, "learning_rate": 2.8799168861876203e-05, "loss": 0.0, "num_input_tokens_seen": 3426472, "step": 8365 }, { "epoch": 10.13317191283293, "grad_norm": 0.0016623978735879064, "learning_rate": 2.8773061142489882e-05, "loss": 0.0014, "num_input_tokens_seen": 3428584, "step": 8370 }, { "epoch": 10.139225181598063, "grad_norm": 0.0005793138989247382, "learning_rate": 2.8746949211684577e-05, "loss": 0.0, "num_input_tokens_seen": 3430632, "step": 8375 }, { "epoch": 10.145278450363197, "grad_norm": 0.001019827090203762, "learning_rate": 2.872083309860591e-05, "loss": 0.0001, "num_input_tokens_seen": 3432712, "step": 8380 }, { "epoch": 10.15133171912833, "grad_norm": 0.00021755440684501082, "learning_rate": 2.8694712832404198e-05, "loss": 0.0, "num_input_tokens_seen": 3434792, "step": 8385 }, { "epoch": 10.157384987893462, "grad_norm": 0.0002812929742503911, "learning_rate": 2.8668588442234373e-05, "loss": 0.0, "num_input_tokens_seen": 3436840, "step": 8390 }, { "epoch": 10.163438256658596, "grad_norm": 0.0017326445085927844, "learning_rate": 2.8642459957255996e-05, "loss": 0.084, "num_input_tokens_seen": 3438888, "step": 8395 }, { "epoch": 10.169491525423728, "grad_norm": 0.00021969800582155585, "learning_rate": 2.8616327406633175e-05, "loss": 0.0, "num_input_tokens_seen": 3440936, "step": 8400 }, { "epoch": 10.175544794188863, "grad_norm": 0.00020917141227982938, "learning_rate": 2.8590190819534567e-05, "loss": 0.0, "num_input_tokens_seen": 3443016, "step": 8405 }, { "epoch": 10.181598062953995, "grad_norm": 0.0001378294255118817, "learning_rate": 2.8564050225133337e-05, "loss": 0.0, "num_input_tokens_seen": 3445128, "step": 8410 }, { "epoch": 10.187651331719128, "grad_norm": 0.0003894153342116624, "learning_rate": 2.8537905652607122e-05, "loss": 0.0004, "num_input_tokens_seen": 3446984, "step": 8415 }, { "epoch": 10.193704600484262, "grad_norm": 2.3743084966554306e-05, "learning_rate": 2.8511757131137982e-05, "loss": 0.0, "num_input_tokens_seen": 3449032, "step": 8420 }, { "epoch": 10.199757869249394, "grad_norm": 0.0012310536112636328, "learning_rate": 2.848560468991241e-05, "loss": 0.0, "num_input_tokens_seen": 3450984, "step": 8425 }, { "epoch": 10.205811138014528, "grad_norm": 0.016525069251656532, "learning_rate": 2.8459448358121253e-05, "loss": 0.0001, "num_input_tokens_seen": 3452968, "step": 8430 }, { "epoch": 10.211864406779661, "grad_norm": 0.003942083101719618, "learning_rate": 2.843328816495972e-05, "loss": 0.0, "num_input_tokens_seen": 3455016, "step": 8435 }, { "epoch": 10.217917675544793, "grad_norm": 0.00010805801866808906, "learning_rate": 2.8407124139627323e-05, "loss": 0.0327, "num_input_tokens_seen": 3457000, "step": 8440 }, { "epoch": 10.223970944309928, "grad_norm": 0.00030730347498320043, "learning_rate": 2.838095631132784e-05, "loss": 0.0, "num_input_tokens_seen": 3459080, "step": 8445 }, { "epoch": 10.23002421307506, "grad_norm": 0.005312125198543072, "learning_rate": 2.83547847092693e-05, "loss": 0.0, "num_input_tokens_seen": 3461192, "step": 8450 }, { "epoch": 10.236077481840194, "grad_norm": 0.0004981564707122743, "learning_rate": 2.8328609362663956e-05, "loss": 0.0, "num_input_tokens_seen": 3463144, "step": 8455 }, { "epoch": 10.242130750605327, "grad_norm": 0.0004926573019474745, "learning_rate": 2.8302430300728227e-05, "loss": 0.0004, "num_input_tokens_seen": 3465288, "step": 8460 }, { "epoch": 10.24818401937046, "grad_norm": 0.000135244510602206, "learning_rate": 2.827624755268269e-05, "loss": 0.0, "num_input_tokens_seen": 3467272, "step": 8465 }, { "epoch": 10.254237288135593, "grad_norm": 2.8375037800287828e-05, "learning_rate": 2.8250061147752015e-05, "loss": 0.0, "num_input_tokens_seen": 3469480, "step": 8470 }, { "epoch": 10.260290556900726, "grad_norm": 8.951556810643524e-05, "learning_rate": 2.8223871115164985e-05, "loss": 0.0, "num_input_tokens_seen": 3471560, "step": 8475 }, { "epoch": 10.26634382566586, "grad_norm": 0.00013692761422134936, "learning_rate": 2.8197677484154407e-05, "loss": 0.0, "num_input_tokens_seen": 3473512, "step": 8480 }, { "epoch": 10.272397094430993, "grad_norm": 0.0010790692176669836, "learning_rate": 2.8171480283957118e-05, "loss": 0.0, "num_input_tokens_seen": 3475464, "step": 8485 }, { "epoch": 10.278450363196125, "grad_norm": 6.249060243135318e-05, "learning_rate": 2.8145279543813918e-05, "loss": 0.0004, "num_input_tokens_seen": 3477448, "step": 8490 }, { "epoch": 10.28450363196126, "grad_norm": 9.309209417551756e-05, "learning_rate": 2.8119075292969606e-05, "loss": 0.0, "num_input_tokens_seen": 3479528, "step": 8495 }, { "epoch": 10.290556900726392, "grad_norm": 5.988131046295166, "learning_rate": 2.8092867560672836e-05, "loss": 0.0402, "num_input_tokens_seen": 3481448, "step": 8500 }, { "epoch": 10.296610169491526, "grad_norm": 0.00015514239203184843, "learning_rate": 2.80666563761762e-05, "loss": 0.0, "num_input_tokens_seen": 3483464, "step": 8505 }, { "epoch": 10.302663438256658, "grad_norm": 0.0002144550089724362, "learning_rate": 2.8040441768736104e-05, "loss": 0.0, "num_input_tokens_seen": 3485544, "step": 8510 }, { "epoch": 10.30871670702179, "grad_norm": 0.00817590020596981, "learning_rate": 2.8014223767612807e-05, "loss": 0.0, "num_input_tokens_seen": 3487496, "step": 8515 }, { "epoch": 10.314769975786925, "grad_norm": 0.0003842135483864695, "learning_rate": 2.7988002402070347e-05, "loss": 0.0, "num_input_tokens_seen": 3489736, "step": 8520 }, { "epoch": 10.320823244552058, "grad_norm": 2.0272884285077453e-05, "learning_rate": 2.7961777701376502e-05, "loss": 0.0, "num_input_tokens_seen": 3491816, "step": 8525 }, { "epoch": 10.326876513317192, "grad_norm": 5.594160393229686e-05, "learning_rate": 2.7935549694802788e-05, "loss": 0.0001, "num_input_tokens_seen": 3493768, "step": 8530 }, { "epoch": 10.332929782082324, "grad_norm": 4.803948104381561e-05, "learning_rate": 2.790931841162441e-05, "loss": 0.0, "num_input_tokens_seen": 3495848, "step": 8535 }, { "epoch": 10.338983050847457, "grad_norm": 5.3308998758438975e-05, "learning_rate": 2.7883083881120214e-05, "loss": 0.0, "num_input_tokens_seen": 3498024, "step": 8540 }, { "epoch": 10.345036319612591, "grad_norm": 0.0006762113771401346, "learning_rate": 2.78568461325727e-05, "loss": 0.1294, "num_input_tokens_seen": 3500104, "step": 8545 }, { "epoch": 10.351089588377723, "grad_norm": 4.036940663354471e-05, "learning_rate": 2.7830605195267933e-05, "loss": 0.0, "num_input_tokens_seen": 3502120, "step": 8550 }, { "epoch": 10.357142857142858, "grad_norm": 0.044009264558553696, "learning_rate": 2.780436109849555e-05, "loss": 0.0003, "num_input_tokens_seen": 3504136, "step": 8555 }, { "epoch": 10.36319612590799, "grad_norm": 0.0024590673856437206, "learning_rate": 2.7778113871548716e-05, "loss": 0.0001, "num_input_tokens_seen": 3506152, "step": 8560 }, { "epoch": 10.369249394673124, "grad_norm": 28.14794158935547, "learning_rate": 2.7751863543724076e-05, "loss": 0.008, "num_input_tokens_seen": 3508360, "step": 8565 }, { "epoch": 10.375302663438257, "grad_norm": 0.023538419976830482, "learning_rate": 2.772561014432176e-05, "loss": 0.084, "num_input_tokens_seen": 3510408, "step": 8570 }, { "epoch": 10.38135593220339, "grad_norm": 0.001658266643062234, "learning_rate": 2.7699353702645305e-05, "loss": 0.0, "num_input_tokens_seen": 3512456, "step": 8575 }, { "epoch": 10.387409200968523, "grad_norm": 0.020313898101449013, "learning_rate": 2.7673094248001646e-05, "loss": 0.0001, "num_input_tokens_seen": 3514440, "step": 8580 }, { "epoch": 10.393462469733656, "grad_norm": 0.0006491821841336787, "learning_rate": 2.764683180970109e-05, "loss": 0.0002, "num_input_tokens_seen": 3516488, "step": 8585 }, { "epoch": 10.39951573849879, "grad_norm": 9.719777153804898e-05, "learning_rate": 2.762056641705727e-05, "loss": 0.0, "num_input_tokens_seen": 3518504, "step": 8590 }, { "epoch": 10.405569007263923, "grad_norm": 3.21035367960576e-05, "learning_rate": 2.759429809938712e-05, "loss": 0.0, "num_input_tokens_seen": 3520520, "step": 8595 }, { "epoch": 10.411622276029055, "grad_norm": 6.795322406105697e-05, "learning_rate": 2.756802688601084e-05, "loss": 0.0, "num_input_tokens_seen": 3522440, "step": 8600 }, { "epoch": 10.41767554479419, "grad_norm": 0.0010736285476014018, "learning_rate": 2.7541752806251848e-05, "loss": 0.0014, "num_input_tokens_seen": 3524392, "step": 8605 }, { "epoch": 10.423728813559322, "grad_norm": 0.06693597882986069, "learning_rate": 2.7515475889436775e-05, "loss": 0.0001, "num_input_tokens_seen": 3526408, "step": 8610 }, { "epoch": 10.429782082324456, "grad_norm": 7.387235164642334, "learning_rate": 2.748919616489542e-05, "loss": 0.0022, "num_input_tokens_seen": 3528488, "step": 8615 }, { "epoch": 10.435835351089588, "grad_norm": 1.8952745449496433e-05, "learning_rate": 2.7462913661960703e-05, "loss": 0.0, "num_input_tokens_seen": 3530664, "step": 8620 }, { "epoch": 10.441888619854721, "grad_norm": 0.00026919005904346704, "learning_rate": 2.7436628409968664e-05, "loss": 0.0, "num_input_tokens_seen": 3532648, "step": 8625 }, { "epoch": 10.447941888619855, "grad_norm": 0.0020718795713037252, "learning_rate": 2.7410340438258388e-05, "loss": 0.0, "num_input_tokens_seen": 3534600, "step": 8630 }, { "epoch": 10.453995157384988, "grad_norm": 0.00017520984692964703, "learning_rate": 2.7384049776172015e-05, "loss": 0.0, "num_input_tokens_seen": 3536776, "step": 8635 }, { "epoch": 10.460048426150122, "grad_norm": 4.8071800847537816e-05, "learning_rate": 2.7357756453054688e-05, "loss": 0.0, "num_input_tokens_seen": 3538664, "step": 8640 }, { "epoch": 10.466101694915254, "grad_norm": 0.00022258618264459074, "learning_rate": 2.73314604982545e-05, "loss": 0.0, "num_input_tokens_seen": 3540648, "step": 8645 }, { "epoch": 10.472154963680387, "grad_norm": 7.30229148757644e-05, "learning_rate": 2.730516194112251e-05, "loss": 0.0, "num_input_tokens_seen": 3542760, "step": 8650 }, { "epoch": 10.478208232445521, "grad_norm": 0.00023320975014939904, "learning_rate": 2.7278860811012652e-05, "loss": 0.0, "num_input_tokens_seen": 3544808, "step": 8655 }, { "epoch": 10.484261501210653, "grad_norm": 0.0018569659441709518, "learning_rate": 2.725255713728176e-05, "loss": 0.0009, "num_input_tokens_seen": 3546760, "step": 8660 }, { "epoch": 10.490314769975788, "grad_norm": 7.380015449598432e-05, "learning_rate": 2.722625094928948e-05, "loss": 0.0, "num_input_tokens_seen": 3548936, "step": 8665 }, { "epoch": 10.49636803874092, "grad_norm": 0.002682202495634556, "learning_rate": 2.7199942276398284e-05, "loss": 0.0, "num_input_tokens_seen": 3550824, "step": 8670 }, { "epoch": 10.502421307506053, "grad_norm": 0.0001416607847204432, "learning_rate": 2.7173631147973412e-05, "loss": 0.0, "num_input_tokens_seen": 3552904, "step": 8675 }, { "epoch": 10.508474576271187, "grad_norm": 5.915201109019108e-05, "learning_rate": 2.714731759338285e-05, "loss": 0.0, "num_input_tokens_seen": 3554952, "step": 8680 }, { "epoch": 10.51452784503632, "grad_norm": 4.405226354720071e-05, "learning_rate": 2.712100164199728e-05, "loss": 0.0001, "num_input_tokens_seen": 3557000, "step": 8685 }, { "epoch": 10.520581113801454, "grad_norm": 0.0001012082866509445, "learning_rate": 2.7094683323190063e-05, "loss": 0.0318, "num_input_tokens_seen": 3559080, "step": 8690 }, { "epoch": 10.526634382566586, "grad_norm": 3.620338611654006e-05, "learning_rate": 2.7068362666337217e-05, "loss": 0.0, "num_input_tokens_seen": 3561192, "step": 8695 }, { "epoch": 10.532687651331718, "grad_norm": 7.832014671294019e-05, "learning_rate": 2.7042039700817347e-05, "loss": 0.0, "num_input_tokens_seen": 3563080, "step": 8700 }, { "epoch": 10.538740920096853, "grad_norm": 25.469947814941406, "learning_rate": 2.7015714456011647e-05, "loss": 0.0766, "num_input_tokens_seen": 3565160, "step": 8705 }, { "epoch": 10.544794188861985, "grad_norm": 0.0017125722952187061, "learning_rate": 2.698938696130386e-05, "loss": 0.0001, "num_input_tokens_seen": 3567240, "step": 8710 }, { "epoch": 10.55084745762712, "grad_norm": 6.311330071184784e-05, "learning_rate": 2.6963057246080227e-05, "loss": 0.0, "num_input_tokens_seen": 3569544, "step": 8715 }, { "epoch": 10.556900726392252, "grad_norm": 7.708146586082876e-05, "learning_rate": 2.693672533972948e-05, "loss": 0.0, "num_input_tokens_seen": 3571560, "step": 8720 }, { "epoch": 10.562953995157384, "grad_norm": 0.0002731763815972954, "learning_rate": 2.6910391271642793e-05, "loss": 0.0001, "num_input_tokens_seen": 3573640, "step": 8725 }, { "epoch": 10.569007263922519, "grad_norm": 0.4055250883102417, "learning_rate": 2.6884055071213742e-05, "loss": 0.0008, "num_input_tokens_seen": 3575688, "step": 8730 }, { "epoch": 10.575060532687651, "grad_norm": 1.7044001817703247, "learning_rate": 2.685771676783831e-05, "loss": 0.0005, "num_input_tokens_seen": 3577800, "step": 8735 }, { "epoch": 10.581113801452785, "grad_norm": 0.00017048532026819885, "learning_rate": 2.6831376390914785e-05, "loss": 0.0045, "num_input_tokens_seen": 3579944, "step": 8740 }, { "epoch": 10.587167070217918, "grad_norm": 0.00010408971866127104, "learning_rate": 2.680503396984382e-05, "loss": 0.0, "num_input_tokens_seen": 3581896, "step": 8745 }, { "epoch": 10.59322033898305, "grad_norm": 9.095996938413009e-05, "learning_rate": 2.6778689534028295e-05, "loss": 0.0, "num_input_tokens_seen": 3583880, "step": 8750 }, { "epoch": 10.599273607748184, "grad_norm": 0.011030416935682297, "learning_rate": 2.6752343112873397e-05, "loss": 0.0, "num_input_tokens_seen": 3586056, "step": 8755 }, { "epoch": 10.605326876513317, "grad_norm": 8.145711763063446e-05, "learning_rate": 2.672599473578648e-05, "loss": 0.0, "num_input_tokens_seen": 3588040, "step": 8760 }, { "epoch": 10.611380145278451, "grad_norm": 0.002551580546423793, "learning_rate": 2.6699644432177112e-05, "loss": 0.0, "num_input_tokens_seen": 3590056, "step": 8765 }, { "epoch": 10.617433414043584, "grad_norm": 0.0011730380356311798, "learning_rate": 2.6673292231456993e-05, "loss": 0.0, "num_input_tokens_seen": 3592168, "step": 8770 }, { "epoch": 10.623486682808716, "grad_norm": 9.810436313273385e-05, "learning_rate": 2.6646938163039942e-05, "loss": 0.0, "num_input_tokens_seen": 3594280, "step": 8775 }, { "epoch": 10.62953995157385, "grad_norm": 0.0005169520154595375, "learning_rate": 2.6620582256341885e-05, "loss": 0.0, "num_input_tokens_seen": 3596424, "step": 8780 }, { "epoch": 10.635593220338983, "grad_norm": 0.0016520090866833925, "learning_rate": 2.659422454078077e-05, "loss": 0.0, "num_input_tokens_seen": 3598600, "step": 8785 }, { "epoch": 10.641646489104117, "grad_norm": 0.00023395127209369093, "learning_rate": 2.6567865045776586e-05, "loss": 0.0345, "num_input_tokens_seen": 3600616, "step": 8790 }, { "epoch": 10.64769975786925, "grad_norm": 93.25843811035156, "learning_rate": 2.654150380075129e-05, "loss": 0.0258, "num_input_tokens_seen": 3602696, "step": 8795 }, { "epoch": 10.653753026634382, "grad_norm": 7.120957161532715e-05, "learning_rate": 2.651514083512881e-05, "loss": 0.0, "num_input_tokens_seen": 3604808, "step": 8800 }, { "epoch": 10.659806295399516, "grad_norm": 0.0032605985179543495, "learning_rate": 2.6488776178334978e-05, "loss": 0.0001, "num_input_tokens_seen": 3606920, "step": 8805 }, { "epoch": 10.665859564164649, "grad_norm": 0.0005470658652484417, "learning_rate": 2.646240985979753e-05, "loss": 0.0, "num_input_tokens_seen": 3608776, "step": 8810 }, { "epoch": 10.671912832929783, "grad_norm": 0.00433291494846344, "learning_rate": 2.6436041908946046e-05, "loss": 0.0214, "num_input_tokens_seen": 3610824, "step": 8815 }, { "epoch": 10.677966101694915, "grad_norm": 9.321128891315311e-05, "learning_rate": 2.6409672355211936e-05, "loss": 0.0, "num_input_tokens_seen": 3612968, "step": 8820 }, { "epoch": 10.684019370460048, "grad_norm": 0.08674860000610352, "learning_rate": 2.638330122802838e-05, "loss": 0.0002, "num_input_tokens_seen": 3615144, "step": 8825 }, { "epoch": 10.690072639225182, "grad_norm": 0.0013023039791733027, "learning_rate": 2.635692855683033e-05, "loss": 0.0, "num_input_tokens_seen": 3617160, "step": 8830 }, { "epoch": 10.696125907990314, "grad_norm": 0.000728488143067807, "learning_rate": 2.6330554371054467e-05, "loss": 0.0003, "num_input_tokens_seen": 3619368, "step": 8835 }, { "epoch": 10.702179176755449, "grad_norm": 0.01648154854774475, "learning_rate": 2.630417870013916e-05, "loss": 0.0, "num_input_tokens_seen": 3621384, "step": 8840 }, { "epoch": 10.708232445520581, "grad_norm": 0.00012495359987951815, "learning_rate": 2.627780157352442e-05, "loss": 0.0896, "num_input_tokens_seen": 3623496, "step": 8845 }, { "epoch": 10.714285714285714, "grad_norm": 0.0003292164474260062, "learning_rate": 2.6251423020651893e-05, "loss": 0.0, "num_input_tokens_seen": 3625512, "step": 8850 }, { "epoch": 10.720338983050848, "grad_norm": 0.0012340544490143657, "learning_rate": 2.622504307096482e-05, "loss": 0.0001, "num_input_tokens_seen": 3627752, "step": 8855 }, { "epoch": 10.72639225181598, "grad_norm": 0.009047150611877441, "learning_rate": 2.6198661753908004e-05, "loss": 0.0, "num_input_tokens_seen": 3629896, "step": 8860 }, { "epoch": 10.732445520581114, "grad_norm": 0.0006859235581941903, "learning_rate": 2.6172279098927772e-05, "loss": 0.0, "num_input_tokens_seen": 3631848, "step": 8865 }, { "epoch": 10.738498789346247, "grad_norm": 0.0008055937360040843, "learning_rate": 2.614589513547192e-05, "loss": 0.0, "num_input_tokens_seen": 3633896, "step": 8870 }, { "epoch": 10.74455205811138, "grad_norm": 0.06315652281045914, "learning_rate": 2.6119509892989747e-05, "loss": 0.0001, "num_input_tokens_seen": 3635912, "step": 8875 }, { "epoch": 10.750605326876514, "grad_norm": 0.00898653082549572, "learning_rate": 2.6093123400931945e-05, "loss": 0.0015, "num_input_tokens_seen": 3637864, "step": 8880 }, { "epoch": 10.756658595641646, "grad_norm": 0.0008300360641442239, "learning_rate": 2.6066735688750626e-05, "loss": 0.0, "num_input_tokens_seen": 3639912, "step": 8885 }, { "epoch": 10.76271186440678, "grad_norm": 0.0001667337492108345, "learning_rate": 2.604034678589924e-05, "loss": 0.0, "num_input_tokens_seen": 3642120, "step": 8890 }, { "epoch": 10.768765133171913, "grad_norm": 2.9101747713866644e-05, "learning_rate": 2.6013956721832582e-05, "loss": 0.0, "num_input_tokens_seen": 3644136, "step": 8895 }, { "epoch": 10.774818401937045, "grad_norm": 0.00042919389670714736, "learning_rate": 2.5987565526006748e-05, "loss": 0.0, "num_input_tokens_seen": 3646216, "step": 8900 }, { "epoch": 10.78087167070218, "grad_norm": 1.8186599845648743e-05, "learning_rate": 2.596117322787907e-05, "loss": 0.0, "num_input_tokens_seen": 3648264, "step": 8905 }, { "epoch": 10.786924939467312, "grad_norm": 3.180598497390747, "learning_rate": 2.593477985690815e-05, "loss": 0.0081, "num_input_tokens_seen": 3650408, "step": 8910 }, { "epoch": 10.792978208232446, "grad_norm": 2.5794754037633538e-05, "learning_rate": 2.590838544255374e-05, "loss": 0.0, "num_input_tokens_seen": 3652488, "step": 8915 }, { "epoch": 10.799031476997579, "grad_norm": 0.4029621183872223, "learning_rate": 2.5881990014276808e-05, "loss": 0.0001, "num_input_tokens_seen": 3654600, "step": 8920 }, { "epoch": 10.805084745762711, "grad_norm": 0.00012829207116737962, "learning_rate": 2.5855593601539412e-05, "loss": 0.0, "num_input_tokens_seen": 3656744, "step": 8925 }, { "epoch": 10.811138014527845, "grad_norm": 4.421450648806058e-05, "learning_rate": 2.5829196233804738e-05, "loss": 0.0, "num_input_tokens_seen": 3658760, "step": 8930 }, { "epoch": 10.817191283292978, "grad_norm": 6.178219337016344e-05, "learning_rate": 2.5802797940537004e-05, "loss": 0.0, "num_input_tokens_seen": 3660712, "step": 8935 }, { "epoch": 10.823244552058112, "grad_norm": 6.172007124405354e-05, "learning_rate": 2.5776398751201507e-05, "loss": 0.0, "num_input_tokens_seen": 3662760, "step": 8940 }, { "epoch": 10.829297820823244, "grad_norm": 3.427137926337309e-05, "learning_rate": 2.574999869526451e-05, "loss": 0.0, "num_input_tokens_seen": 3664776, "step": 8945 }, { "epoch": 10.835351089588377, "grad_norm": 6.4447260228917e-05, "learning_rate": 2.5723597802193256e-05, "loss": 0.0, "num_input_tokens_seen": 3666824, "step": 8950 }, { "epoch": 10.841404358353511, "grad_norm": 2.2775440811528824e-05, "learning_rate": 2.56971961014559e-05, "loss": 0.0, "num_input_tokens_seen": 3668936, "step": 8955 }, { "epoch": 10.847457627118644, "grad_norm": 2.9469310902641155e-05, "learning_rate": 2.5670793622521544e-05, "loss": 0.0, "num_input_tokens_seen": 3671048, "step": 8960 }, { "epoch": 10.853510895883778, "grad_norm": 3.450909935054369e-05, "learning_rate": 2.564439039486012e-05, "loss": 0.0, "num_input_tokens_seen": 3673160, "step": 8965 }, { "epoch": 10.85956416464891, "grad_norm": 4.153323607170023e-05, "learning_rate": 2.5617986447942406e-05, "loss": 0.0, "num_input_tokens_seen": 3675176, "step": 8970 }, { "epoch": 10.865617433414045, "grad_norm": 0.00020979178952984512, "learning_rate": 2.5591581811239983e-05, "loss": 0.0, "num_input_tokens_seen": 3677192, "step": 8975 }, { "epoch": 10.871670702179177, "grad_norm": 3.1750998459756374e-05, "learning_rate": 2.5565176514225213e-05, "loss": 0.0, "num_input_tokens_seen": 3679464, "step": 8980 }, { "epoch": 10.87772397094431, "grad_norm": 0.0003036274865735322, "learning_rate": 2.553877058637118e-05, "loss": 0.0, "num_input_tokens_seen": 3681448, "step": 8985 }, { "epoch": 10.883777239709444, "grad_norm": 8.318477921420708e-05, "learning_rate": 2.5512364057151676e-05, "loss": 0.0, "num_input_tokens_seen": 3683464, "step": 8990 }, { "epoch": 10.889830508474576, "grad_norm": 3.187478432664648e-05, "learning_rate": 2.5485956956041167e-05, "loss": 0.0, "num_input_tokens_seen": 3685576, "step": 8995 }, { "epoch": 10.89588377723971, "grad_norm": 4.8251255066134036e-05, "learning_rate": 2.5459549312514764e-05, "loss": 0.0, "num_input_tokens_seen": 3687592, "step": 9000 }, { "epoch": 10.901937046004843, "grad_norm": 9.007652261061594e-05, "learning_rate": 2.5433141156048163e-05, "loss": 0.0, "num_input_tokens_seen": 3689608, "step": 9005 }, { "epoch": 10.907990314769975, "grad_norm": 0.28160756826400757, "learning_rate": 2.5406732516117655e-05, "loss": 0.0001, "num_input_tokens_seen": 3691656, "step": 9010 }, { "epoch": 10.91404358353511, "grad_norm": 7.015546725597233e-05, "learning_rate": 2.5380323422200053e-05, "loss": 0.0, "num_input_tokens_seen": 3693640, "step": 9015 }, { "epoch": 10.920096852300242, "grad_norm": 4.654492295230739e-05, "learning_rate": 2.5353913903772696e-05, "loss": 0.0, "num_input_tokens_seen": 3695688, "step": 9020 }, { "epoch": 10.926150121065376, "grad_norm": 4.172200715402141e-05, "learning_rate": 2.5327503990313377e-05, "loss": 0.0, "num_input_tokens_seen": 3697608, "step": 9025 }, { "epoch": 10.932203389830509, "grad_norm": 3.3909091143868864e-05, "learning_rate": 2.5301093711300344e-05, "loss": 0.0075, "num_input_tokens_seen": 3699688, "step": 9030 }, { "epoch": 10.938256658595641, "grad_norm": 3.844270031549968e-05, "learning_rate": 2.5274683096212237e-05, "loss": 0.0, "num_input_tokens_seen": 3701768, "step": 9035 }, { "epoch": 10.944309927360775, "grad_norm": 0.06774739921092987, "learning_rate": 2.5248272174528093e-05, "loss": 0.0001, "num_input_tokens_seen": 3703624, "step": 9040 }, { "epoch": 10.950363196125908, "grad_norm": 3.125051443930715e-05, "learning_rate": 2.5221860975727275e-05, "loss": 0.0, "num_input_tokens_seen": 3705704, "step": 9045 }, { "epoch": 10.956416464891042, "grad_norm": 2.50362263614079e-05, "learning_rate": 2.5195449529289472e-05, "loss": 0.0, "num_input_tokens_seen": 3707656, "step": 9050 }, { "epoch": 10.962469733656174, "grad_norm": 3.07604786939919e-05, "learning_rate": 2.516903786469461e-05, "loss": 0.0001, "num_input_tokens_seen": 3709704, "step": 9055 }, { "epoch": 10.968523002421307, "grad_norm": 2.2126607291284017e-05, "learning_rate": 2.514262601142291e-05, "loss": 0.0, "num_input_tokens_seen": 3711560, "step": 9060 }, { "epoch": 10.974576271186441, "grad_norm": 2.2552911104867235e-05, "learning_rate": 2.5116213998954768e-05, "loss": 0.0938, "num_input_tokens_seen": 3713704, "step": 9065 }, { "epoch": 10.980629539951574, "grad_norm": 2.5582512535038404e-05, "learning_rate": 2.5089801856770778e-05, "loss": 0.0, "num_input_tokens_seen": 3715880, "step": 9070 }, { "epoch": 10.986682808716708, "grad_norm": 1.8466918845660985e-05, "learning_rate": 2.5063389614351656e-05, "loss": 0.0, "num_input_tokens_seen": 3717960, "step": 9075 }, { "epoch": 10.99273607748184, "grad_norm": 2.3098917154129595e-05, "learning_rate": 2.5036977301178266e-05, "loss": 0.0, "num_input_tokens_seen": 3720040, "step": 9080 }, { "epoch": 10.998789346246973, "grad_norm": 2.4976721761049703e-05, "learning_rate": 2.5010564946731512e-05, "loss": 0.0, "num_input_tokens_seen": 3721992, "step": 9085 }, { "epoch": 11.0, "eval_loss": 0.33854931592941284, "eval_runtime": 4.955, "eval_samples_per_second": 74.066, "eval_steps_per_second": 18.567, "num_input_tokens_seen": 3722064, "step": 9086 }, { "epoch": 11.004842615012107, "grad_norm": 4.7891695430735126e-05, "learning_rate": 2.4984152580492366e-05, "loss": 0.0, "num_input_tokens_seen": 3723760, "step": 9090 }, { "epoch": 11.01089588377724, "grad_norm": 0.0001609539322089404, "learning_rate": 2.49577402319418e-05, "loss": 0.0, "num_input_tokens_seen": 3725776, "step": 9095 }, { "epoch": 11.016949152542374, "grad_norm": 2.478426722518634e-05, "learning_rate": 2.4931327930560798e-05, "loss": 0.0, "num_input_tokens_seen": 3727824, "step": 9100 }, { "epoch": 11.023002421307506, "grad_norm": 1.5165376680670306e-05, "learning_rate": 2.4904915705830238e-05, "loss": 0.0, "num_input_tokens_seen": 3729904, "step": 9105 }, { "epoch": 11.029055690072639, "grad_norm": 3.664302494144067e-05, "learning_rate": 2.487850358723097e-05, "loss": 0.0, "num_input_tokens_seen": 3732048, "step": 9110 }, { "epoch": 11.035108958837773, "grad_norm": 0.00023666920606046915, "learning_rate": 2.4852091604243663e-05, "loss": 0.0, "num_input_tokens_seen": 3734128, "step": 9115 }, { "epoch": 11.041162227602905, "grad_norm": 3.353881766088307e-05, "learning_rate": 2.482567978634891e-05, "loss": 0.0, "num_input_tokens_seen": 3736240, "step": 9120 }, { "epoch": 11.04721549636804, "grad_norm": 7.671180355828255e-05, "learning_rate": 2.479926816302705e-05, "loss": 0.0, "num_input_tokens_seen": 3738256, "step": 9125 }, { "epoch": 11.053268765133172, "grad_norm": 0.0001040056231431663, "learning_rate": 2.4772856763758252e-05, "loss": 0.0, "num_input_tokens_seen": 3740304, "step": 9130 }, { "epoch": 11.059322033898304, "grad_norm": 3.77040696144104, "learning_rate": 2.47464456180224e-05, "loss": 0.0562, "num_input_tokens_seen": 3742320, "step": 9135 }, { "epoch": 11.065375302663439, "grad_norm": 0.00031497920281253755, "learning_rate": 2.472003475529913e-05, "loss": 0.0, "num_input_tokens_seen": 3744432, "step": 9140 }, { "epoch": 11.071428571428571, "grad_norm": 0.3521515130996704, "learning_rate": 2.4693624205067723e-05, "loss": 0.0001, "num_input_tokens_seen": 3746544, "step": 9145 }, { "epoch": 11.077481840193705, "grad_norm": 1.8888926206273027e-05, "learning_rate": 2.466721399680716e-05, "loss": 0.0026, "num_input_tokens_seen": 3748528, "step": 9150 }, { "epoch": 11.083535108958838, "grad_norm": 0.00014508521417155862, "learning_rate": 2.464080415999598e-05, "loss": 0.0, "num_input_tokens_seen": 3750576, "step": 9155 }, { "epoch": 11.08958837772397, "grad_norm": 0.005011416971683502, "learning_rate": 2.4614394724112366e-05, "loss": 0.0, "num_input_tokens_seen": 3752560, "step": 9160 }, { "epoch": 11.095641646489105, "grad_norm": 8.095463999779895e-05, "learning_rate": 2.458798571863401e-05, "loss": 0.0, "num_input_tokens_seen": 3754672, "step": 9165 }, { "epoch": 11.101694915254237, "grad_norm": 0.008825930766761303, "learning_rate": 2.4561577173038164e-05, "loss": 0.0, "num_input_tokens_seen": 3756656, "step": 9170 }, { "epoch": 11.107748184019371, "grad_norm": 4.882080611423589e-05, "learning_rate": 2.4535169116801517e-05, "loss": 0.0, "num_input_tokens_seen": 3758544, "step": 9175 }, { "epoch": 11.113801452784504, "grad_norm": 5.973516817903146e-05, "learning_rate": 2.4508761579400248e-05, "loss": 0.0, "num_input_tokens_seen": 3760720, "step": 9180 }, { "epoch": 11.119854721549636, "grad_norm": 1.5565867215627804e-05, "learning_rate": 2.4482354590309962e-05, "loss": 0.0, "num_input_tokens_seen": 3762704, "step": 9185 }, { "epoch": 11.12590799031477, "grad_norm": 0.0012663902016356587, "learning_rate": 2.4455948179005613e-05, "loss": 0.0, "num_input_tokens_seen": 3764848, "step": 9190 }, { "epoch": 11.131961259079903, "grad_norm": 5.6862732890294865e-05, "learning_rate": 2.4429542374961552e-05, "loss": 0.0, "num_input_tokens_seen": 3766768, "step": 9195 }, { "epoch": 11.138014527845037, "grad_norm": 4.022721986984834e-05, "learning_rate": 2.440313720765142e-05, "loss": 0.0, "num_input_tokens_seen": 3768720, "step": 9200 }, { "epoch": 11.14406779661017, "grad_norm": 2.8991589715587907e-05, "learning_rate": 2.4376732706548183e-05, "loss": 0.0, "num_input_tokens_seen": 3770672, "step": 9205 }, { "epoch": 11.150121065375302, "grad_norm": 0.03153045102953911, "learning_rate": 2.435032890112402e-05, "loss": 0.0003, "num_input_tokens_seen": 3772848, "step": 9210 }, { "epoch": 11.156174334140436, "grad_norm": 1.648285979172215e-05, "learning_rate": 2.432392582085037e-05, "loss": 0.0, "num_input_tokens_seen": 3775088, "step": 9215 }, { "epoch": 11.162227602905569, "grad_norm": 0.0003068851656280458, "learning_rate": 2.4297523495197845e-05, "loss": 0.0, "num_input_tokens_seen": 3777104, "step": 9220 }, { "epoch": 11.168280871670703, "grad_norm": 6.31972579867579e-05, "learning_rate": 2.427112195363622e-05, "loss": 0.0, "num_input_tokens_seen": 3779216, "step": 9225 }, { "epoch": 11.174334140435835, "grad_norm": 1.4779735465708654e-05, "learning_rate": 2.4244721225634384e-05, "loss": 0.0, "num_input_tokens_seen": 3781232, "step": 9230 }, { "epoch": 11.180387409200968, "grad_norm": 2.5826120690908283e-05, "learning_rate": 2.421832134066034e-05, "loss": 0.0, "num_input_tokens_seen": 3783344, "step": 9235 }, { "epoch": 11.186440677966102, "grad_norm": 3.876216942444444e-05, "learning_rate": 2.4191922328181118e-05, "loss": 0.0, "num_input_tokens_seen": 3785296, "step": 9240 }, { "epoch": 11.192493946731235, "grad_norm": 3.877134804497473e-05, "learning_rate": 2.4165524217662822e-05, "loss": 0.0, "num_input_tokens_seen": 3787280, "step": 9245 }, { "epoch": 11.198547215496369, "grad_norm": 0.00018550120876170695, "learning_rate": 2.4139127038570504e-05, "loss": 0.0, "num_input_tokens_seen": 3789296, "step": 9250 }, { "epoch": 11.204600484261501, "grad_norm": 1.905765566334594e-05, "learning_rate": 2.4112730820368176e-05, "loss": 0.0, "num_input_tokens_seen": 3791280, "step": 9255 }, { "epoch": 11.210653753026634, "grad_norm": 0.0002530152560211718, "learning_rate": 2.4086335592518817e-05, "loss": 0.0, "num_input_tokens_seen": 3793232, "step": 9260 }, { "epoch": 11.216707021791768, "grad_norm": 0.0013109195278957486, "learning_rate": 2.4059941384484266e-05, "loss": 0.0, "num_input_tokens_seen": 3795184, "step": 9265 }, { "epoch": 11.2227602905569, "grad_norm": 1.4877853573125321e-05, "learning_rate": 2.4033548225725252e-05, "loss": 0.0, "num_input_tokens_seen": 3797232, "step": 9270 }, { "epoch": 11.228813559322035, "grad_norm": 1.807385342544876e-05, "learning_rate": 2.4007156145701294e-05, "loss": 0.0, "num_input_tokens_seen": 3799248, "step": 9275 }, { "epoch": 11.234866828087167, "grad_norm": 2.017890255956445e-05, "learning_rate": 2.3980765173870745e-05, "loss": 0.0, "num_input_tokens_seen": 3801328, "step": 9280 }, { "epoch": 11.2409200968523, "grad_norm": 1.800991412892472e-05, "learning_rate": 2.3954375339690692e-05, "loss": 0.0, "num_input_tokens_seen": 3803280, "step": 9285 }, { "epoch": 11.246973365617434, "grad_norm": 0.021041389554739, "learning_rate": 2.3927986672616985e-05, "loss": 0.0, "num_input_tokens_seen": 3805296, "step": 9290 }, { "epoch": 11.253026634382566, "grad_norm": 2.842288085957989e-05, "learning_rate": 2.3901599202104127e-05, "loss": 0.0, "num_input_tokens_seen": 3807312, "step": 9295 }, { "epoch": 11.2590799031477, "grad_norm": 2.0464431145228446e-05, "learning_rate": 2.387521295760533e-05, "loss": 0.0, "num_input_tokens_seen": 3809360, "step": 9300 }, { "epoch": 11.265133171912833, "grad_norm": 1.742266431392636e-05, "learning_rate": 2.3848827968572405e-05, "loss": 0.0, "num_input_tokens_seen": 3811408, "step": 9305 }, { "epoch": 11.271186440677965, "grad_norm": 2.2232004994293675e-05, "learning_rate": 2.3822444264455793e-05, "loss": 0.0, "num_input_tokens_seen": 3813520, "step": 9310 }, { "epoch": 11.2772397094431, "grad_norm": 1.9144979887641966e-05, "learning_rate": 2.3796061874704454e-05, "loss": 0.0, "num_input_tokens_seen": 3815504, "step": 9315 }, { "epoch": 11.283292978208232, "grad_norm": 1.6232337657129392e-05, "learning_rate": 2.3769680828765932e-05, "loss": 0.0, "num_input_tokens_seen": 3817712, "step": 9320 }, { "epoch": 11.289346246973366, "grad_norm": 1.7060925529222004e-05, "learning_rate": 2.3743301156086244e-05, "loss": 0.0, "num_input_tokens_seen": 3819760, "step": 9325 }, { "epoch": 11.295399515738499, "grad_norm": 1.663145303609781e-05, "learning_rate": 2.3716922886109864e-05, "loss": 0.0, "num_input_tokens_seen": 3821808, "step": 9330 }, { "epoch": 11.301452784503631, "grad_norm": 1.586178404977545e-05, "learning_rate": 2.3690546048279728e-05, "loss": 0.0, "num_input_tokens_seen": 3824016, "step": 9335 }, { "epoch": 11.307506053268765, "grad_norm": 1.4502108570013661e-05, "learning_rate": 2.3664170672037154e-05, "loss": 0.0, "num_input_tokens_seen": 3826064, "step": 9340 }, { "epoch": 11.313559322033898, "grad_norm": 2.1317147911759093e-05, "learning_rate": 2.363779678682185e-05, "loss": 0.0, "num_input_tokens_seen": 3828304, "step": 9345 }, { "epoch": 11.319612590799032, "grad_norm": 0.00042562055750750005, "learning_rate": 2.3611424422071816e-05, "loss": 0.0, "num_input_tokens_seen": 3830320, "step": 9350 }, { "epoch": 11.325665859564165, "grad_norm": 2.869475974875968e-05, "learning_rate": 2.358505360722341e-05, "loss": 0.0, "num_input_tokens_seen": 3832368, "step": 9355 }, { "epoch": 11.331719128329297, "grad_norm": 2.375262920395471e-05, "learning_rate": 2.3558684371711215e-05, "loss": 0.0, "num_input_tokens_seen": 3834448, "step": 9360 }, { "epoch": 11.337772397094431, "grad_norm": 2.2795828044763766e-05, "learning_rate": 2.3532316744968088e-05, "loss": 0.0, "num_input_tokens_seen": 3836528, "step": 9365 }, { "epoch": 11.343825665859564, "grad_norm": 5.193407560000196e-05, "learning_rate": 2.3505950756425052e-05, "loss": 0.0003, "num_input_tokens_seen": 3838576, "step": 9370 }, { "epoch": 11.349878934624698, "grad_norm": 0.002772793173789978, "learning_rate": 2.3479586435511337e-05, "loss": 0.0, "num_input_tokens_seen": 3840560, "step": 9375 }, { "epoch": 11.35593220338983, "grad_norm": 2.0030665837111883e-05, "learning_rate": 2.3453223811654282e-05, "loss": 0.0, "num_input_tokens_seen": 3842512, "step": 9380 }, { "epoch": 11.361985472154963, "grad_norm": 6.169628613861278e-05, "learning_rate": 2.3426862914279364e-05, "loss": 0.0, "num_input_tokens_seen": 3844592, "step": 9385 }, { "epoch": 11.368038740920097, "grad_norm": 2.757170113909524e-05, "learning_rate": 2.340050377281009e-05, "loss": 0.0, "num_input_tokens_seen": 3846512, "step": 9390 }, { "epoch": 11.37409200968523, "grad_norm": 6.615056190639734e-05, "learning_rate": 2.3374146416668063e-05, "loss": 0.0, "num_input_tokens_seen": 3848560, "step": 9395 }, { "epoch": 11.380145278450364, "grad_norm": 1.3037676581006963e-05, "learning_rate": 2.3347790875272822e-05, "loss": 0.0, "num_input_tokens_seen": 3850640, "step": 9400 }, { "epoch": 11.386198547215496, "grad_norm": 3.275153721915558e-05, "learning_rate": 2.3321437178041966e-05, "loss": 0.0, "num_input_tokens_seen": 3852656, "step": 9405 }, { "epoch": 11.392251815980629, "grad_norm": 1.523134960734751e-05, "learning_rate": 2.3295085354390964e-05, "loss": 0.0, "num_input_tokens_seen": 3854768, "step": 9410 }, { "epoch": 11.398305084745763, "grad_norm": 7.246857421705499e-05, "learning_rate": 2.326873543373321e-05, "loss": 0.0, "num_input_tokens_seen": 3856816, "step": 9415 }, { "epoch": 11.404358353510895, "grad_norm": 1.8530079614720307e-05, "learning_rate": 2.324238744548e-05, "loss": 0.0, "num_input_tokens_seen": 3859056, "step": 9420 }, { "epoch": 11.41041162227603, "grad_norm": 1.3032731658313423e-05, "learning_rate": 2.3216041419040455e-05, "loss": 0.0, "num_input_tokens_seen": 3861072, "step": 9425 }, { "epoch": 11.416464891041162, "grad_norm": 1.4223420294001698e-05, "learning_rate": 2.3189697383821512e-05, "loss": 0.0, "num_input_tokens_seen": 3863280, "step": 9430 }, { "epoch": 11.422518159806295, "grad_norm": 1.6850093743414618e-05, "learning_rate": 2.3163355369227865e-05, "loss": 0.0, "num_input_tokens_seen": 3865328, "step": 9435 }, { "epoch": 11.428571428571429, "grad_norm": 1.4311099221231416e-05, "learning_rate": 2.3137015404661988e-05, "loss": 0.0, "num_input_tokens_seen": 3867440, "step": 9440 }, { "epoch": 11.434624697336561, "grad_norm": 2.278971442137845e-05, "learning_rate": 2.3110677519524032e-05, "loss": 0.0, "num_input_tokens_seen": 3869424, "step": 9445 }, { "epoch": 11.440677966101696, "grad_norm": 1.8291824744665064e-05, "learning_rate": 2.308434174321186e-05, "loss": 0.0, "num_input_tokens_seen": 3871536, "step": 9450 }, { "epoch": 11.446731234866828, "grad_norm": 1.7364061932312325e-05, "learning_rate": 2.3058008105120946e-05, "loss": 0.0, "num_input_tokens_seen": 3873616, "step": 9455 }, { "epoch": 11.45278450363196, "grad_norm": 5.2127556800842285, "learning_rate": 2.3031676634644406e-05, "loss": 0.0385, "num_input_tokens_seen": 3875504, "step": 9460 }, { "epoch": 11.458837772397095, "grad_norm": 2.3118016542866826e-05, "learning_rate": 2.300534736117292e-05, "loss": 0.0, "num_input_tokens_seen": 3877648, "step": 9465 }, { "epoch": 11.464891041162227, "grad_norm": 0.005908558610826731, "learning_rate": 2.2979020314094738e-05, "loss": 0.0001, "num_input_tokens_seen": 3879536, "step": 9470 }, { "epoch": 11.470944309927361, "grad_norm": 0.412965327501297, "learning_rate": 2.2952695522795583e-05, "loss": 0.0003, "num_input_tokens_seen": 3881552, "step": 9475 }, { "epoch": 11.476997578692494, "grad_norm": 2.393739669059869e-05, "learning_rate": 2.2926373016658703e-05, "loss": 0.0, "num_input_tokens_seen": 3883568, "step": 9480 }, { "epoch": 11.483050847457626, "grad_norm": 1.2144771972089075e-05, "learning_rate": 2.2900052825064782e-05, "loss": 0.0109, "num_input_tokens_seen": 3885584, "step": 9485 }, { "epoch": 11.48910411622276, "grad_norm": 0.00034260383108630776, "learning_rate": 2.2873734977391898e-05, "loss": 0.0, "num_input_tokens_seen": 3887664, "step": 9490 }, { "epoch": 11.495157384987893, "grad_norm": 6.389649934135377e-05, "learning_rate": 2.2847419503015543e-05, "loss": 0.0, "num_input_tokens_seen": 3889648, "step": 9495 }, { "epoch": 11.501210653753027, "grad_norm": 0.0001433730503777042, "learning_rate": 2.2821106431308544e-05, "loss": 0.0, "num_input_tokens_seen": 3891536, "step": 9500 }, { "epoch": 11.50726392251816, "grad_norm": 2.8006252250634134e-05, "learning_rate": 2.2794795791641065e-05, "loss": 0.0217, "num_input_tokens_seen": 3893680, "step": 9505 }, { "epoch": 11.513317191283292, "grad_norm": 8.234289998654276e-05, "learning_rate": 2.276848761338052e-05, "loss": 0.0, "num_input_tokens_seen": 3895760, "step": 9510 }, { "epoch": 11.519370460048426, "grad_norm": 0.00012510614760685712, "learning_rate": 2.2742181925891608e-05, "loss": 0.0, "num_input_tokens_seen": 3897648, "step": 9515 }, { "epoch": 11.525423728813559, "grad_norm": 8.431117021245882e-05, "learning_rate": 2.2715878758536236e-05, "loss": 0.0, "num_input_tokens_seen": 3899664, "step": 9520 }, { "epoch": 11.531476997578693, "grad_norm": 0.0001579879899509251, "learning_rate": 2.26895781406735e-05, "loss": 0.0, "num_input_tokens_seen": 3901616, "step": 9525 }, { "epoch": 11.537530266343826, "grad_norm": 2.390149893471971e-05, "learning_rate": 2.2663280101659643e-05, "loss": 0.0, "num_input_tokens_seen": 3903792, "step": 9530 }, { "epoch": 11.543583535108958, "grad_norm": 0.004874860867857933, "learning_rate": 2.2636984670848044e-05, "loss": 0.0, "num_input_tokens_seen": 3905968, "step": 9535 }, { "epoch": 11.549636803874092, "grad_norm": 1.8770364476949908e-05, "learning_rate": 2.2610691877589145e-05, "loss": 0.0, "num_input_tokens_seen": 3908080, "step": 9540 }, { "epoch": 11.555690072639225, "grad_norm": 8.174572030839045e-06, "learning_rate": 2.258440175123048e-05, "loss": 0.0, "num_input_tokens_seen": 3910160, "step": 9545 }, { "epoch": 11.561743341404359, "grad_norm": 1.2014251296932343e-05, "learning_rate": 2.255811432111658e-05, "loss": 0.0, "num_input_tokens_seen": 3912144, "step": 9550 }, { "epoch": 11.567796610169491, "grad_norm": 2.0298488379921764e-05, "learning_rate": 2.2531829616588977e-05, "loss": 0.0, "num_input_tokens_seen": 3914256, "step": 9555 }, { "epoch": 11.573849878934624, "grad_norm": 1.3922021025791764e-05, "learning_rate": 2.2505547666986145e-05, "loss": 0.0, "num_input_tokens_seen": 3916336, "step": 9560 }, { "epoch": 11.579903147699758, "grad_norm": 1.693330159469042e-05, "learning_rate": 2.2479268501643512e-05, "loss": 0.0, "num_input_tokens_seen": 3918384, "step": 9565 }, { "epoch": 11.58595641646489, "grad_norm": 4.9706268310546875, "learning_rate": 2.245299214989338e-05, "loss": 0.0385, "num_input_tokens_seen": 3920464, "step": 9570 }, { "epoch": 11.592009685230025, "grad_norm": 2.9893732062191702e-05, "learning_rate": 2.24267186410649e-05, "loss": 0.0, "num_input_tokens_seen": 3922544, "step": 9575 }, { "epoch": 11.598062953995157, "grad_norm": 2.8954729714314453e-05, "learning_rate": 2.240044800448407e-05, "loss": 0.0, "num_input_tokens_seen": 3924528, "step": 9580 }, { "epoch": 11.60411622276029, "grad_norm": 82.52793884277344, "learning_rate": 2.2374180269473675e-05, "loss": 0.0128, "num_input_tokens_seen": 3926576, "step": 9585 }, { "epoch": 11.610169491525424, "grad_norm": 0.03718245401978493, "learning_rate": 2.2347915465353268e-05, "loss": 0.0, "num_input_tokens_seen": 3928592, "step": 9590 }, { "epoch": 11.616222760290556, "grad_norm": 1.6921965652727522e-05, "learning_rate": 2.2321653621439103e-05, "loss": 0.0, "num_input_tokens_seen": 3930640, "step": 9595 }, { "epoch": 11.62227602905569, "grad_norm": 1.808140950743109e-05, "learning_rate": 2.2295394767044167e-05, "loss": 0.0609, "num_input_tokens_seen": 3932688, "step": 9600 }, { "epoch": 11.628329297820823, "grad_norm": 2.4833294446580112e-05, "learning_rate": 2.2269138931478084e-05, "loss": 0.0, "num_input_tokens_seen": 3934736, "step": 9605 }, { "epoch": 11.634382566585955, "grad_norm": 2.2461585103883408e-05, "learning_rate": 2.2242886144047133e-05, "loss": 0.0, "num_input_tokens_seen": 3936816, "step": 9610 }, { "epoch": 11.64043583535109, "grad_norm": 5.669087840942666e-05, "learning_rate": 2.221663643405415e-05, "loss": 0.0003, "num_input_tokens_seen": 3938960, "step": 9615 }, { "epoch": 11.646489104116222, "grad_norm": 1.594521563674789e-05, "learning_rate": 2.2190389830798585e-05, "loss": 0.0, "num_input_tokens_seen": 3940976, "step": 9620 }, { "epoch": 11.652542372881356, "grad_norm": 0.0005740828928537667, "learning_rate": 2.2164146363576383e-05, "loss": 0.0, "num_input_tokens_seen": 3942832, "step": 9625 }, { "epoch": 11.658595641646489, "grad_norm": 5.836095806444064e-05, "learning_rate": 2.2137906061680018e-05, "loss": 0.0, "num_input_tokens_seen": 3944816, "step": 9630 }, { "epoch": 11.664648910411623, "grad_norm": 2.1335010387701914e-05, "learning_rate": 2.211166895439839e-05, "loss": 0.0, "num_input_tokens_seen": 3946800, "step": 9635 }, { "epoch": 11.670702179176756, "grad_norm": 0.00010105661203851923, "learning_rate": 2.208543507101688e-05, "loss": 0.0588, "num_input_tokens_seen": 3948912, "step": 9640 }, { "epoch": 11.676755447941888, "grad_norm": 0.018837088719010353, "learning_rate": 2.2059204440817245e-05, "loss": 0.0, "num_input_tokens_seen": 3951056, "step": 9645 }, { "epoch": 11.682808716707022, "grad_norm": 0.012493186630308628, "learning_rate": 2.2032977093077602e-05, "loss": 0.0001, "num_input_tokens_seen": 3953040, "step": 9650 }, { "epoch": 11.688861985472155, "grad_norm": 3.66298045264557e-05, "learning_rate": 2.2006753057072435e-05, "loss": 0.0, "num_input_tokens_seen": 3955152, "step": 9655 }, { "epoch": 11.694915254237289, "grad_norm": 2.6551523208618164, "learning_rate": 2.19805323620725e-05, "loss": 0.0022, "num_input_tokens_seen": 3957200, "step": 9660 }, { "epoch": 11.700968523002421, "grad_norm": 0.03903164342045784, "learning_rate": 2.195431503734485e-05, "loss": 0.0001, "num_input_tokens_seen": 3959440, "step": 9665 }, { "epoch": 11.707021791767554, "grad_norm": 0.13719822466373444, "learning_rate": 2.1928101112152746e-05, "loss": 0.0002, "num_input_tokens_seen": 3961616, "step": 9670 }, { "epoch": 11.713075060532688, "grad_norm": 0.061190977692604065, "learning_rate": 2.1901890615755694e-05, "loss": 0.0003, "num_input_tokens_seen": 3963728, "step": 9675 }, { "epoch": 11.71912832929782, "grad_norm": 2.098979166476056e-05, "learning_rate": 2.1875683577409327e-05, "loss": 0.0, "num_input_tokens_seen": 3965776, "step": 9680 }, { "epoch": 11.725181598062955, "grad_norm": 4.67002010345459, "learning_rate": 2.1849480026365462e-05, "loss": 0.01, "num_input_tokens_seen": 3967824, "step": 9685 }, { "epoch": 11.731234866828087, "grad_norm": 0.0001350611273664981, "learning_rate": 2.182327999187199e-05, "loss": 0.0001, "num_input_tokens_seen": 3969776, "step": 9690 }, { "epoch": 11.73728813559322, "grad_norm": 0.001544412225484848, "learning_rate": 2.179708350317291e-05, "loss": 0.0001, "num_input_tokens_seen": 3971632, "step": 9695 }, { "epoch": 11.743341404358354, "grad_norm": 0.0007562396349385381, "learning_rate": 2.177089058950822e-05, "loss": 0.0, "num_input_tokens_seen": 3973776, "step": 9700 }, { "epoch": 11.749394673123486, "grad_norm": 0.00011784287926275283, "learning_rate": 2.1744701280113963e-05, "loss": 0.0, "num_input_tokens_seen": 3975792, "step": 9705 }, { "epoch": 11.75544794188862, "grad_norm": 1.4307860510598402e-05, "learning_rate": 2.1718515604222144e-05, "loss": 0.0, "num_input_tokens_seen": 3977776, "step": 9710 }, { "epoch": 11.761501210653753, "grad_norm": 1.95810898730997e-05, "learning_rate": 2.169233359106073e-05, "loss": 0.0, "num_input_tokens_seen": 3979888, "step": 9715 }, { "epoch": 11.767554479418886, "grad_norm": 0.0002076903183478862, "learning_rate": 2.1666155269853567e-05, "loss": 0.0, "num_input_tokens_seen": 3982000, "step": 9720 }, { "epoch": 11.77360774818402, "grad_norm": 4.314665784477256e-05, "learning_rate": 2.1639980669820402e-05, "loss": 0.0049, "num_input_tokens_seen": 3984176, "step": 9725 }, { "epoch": 11.779661016949152, "grad_norm": 1.3659635442309082e-05, "learning_rate": 2.1613809820176837e-05, "loss": 0.0, "num_input_tokens_seen": 3986320, "step": 9730 }, { "epoch": 11.785714285714286, "grad_norm": 0.0012939319713041186, "learning_rate": 2.1587642750134256e-05, "loss": 0.0, "num_input_tokens_seen": 3988432, "step": 9735 }, { "epoch": 11.791767554479419, "grad_norm": 3.85846033168491e-05, "learning_rate": 2.1561479488899868e-05, "loss": 0.0, "num_input_tokens_seen": 3990288, "step": 9740 }, { "epoch": 11.797820823244551, "grad_norm": 0.0007375862332992256, "learning_rate": 2.153532006567658e-05, "loss": 0.0, "num_input_tokens_seen": 3992400, "step": 9745 }, { "epoch": 11.803874092009686, "grad_norm": 2.6863459424930625e-05, "learning_rate": 2.150916450966307e-05, "loss": 0.0, "num_input_tokens_seen": 3994416, "step": 9750 }, { "epoch": 11.809927360774818, "grad_norm": 0.003914542030543089, "learning_rate": 2.1483012850053653e-05, "loss": 0.0, "num_input_tokens_seen": 3996624, "step": 9755 }, { "epoch": 11.815980629539952, "grad_norm": 0.0023728152737021446, "learning_rate": 2.1456865116038322e-05, "loss": 0.0, "num_input_tokens_seen": 3998736, "step": 9760 }, { "epoch": 11.822033898305085, "grad_norm": 7.886349339969456e-05, "learning_rate": 2.1430721336802667e-05, "loss": 0.0, "num_input_tokens_seen": 4000752, "step": 9765 }, { "epoch": 11.828087167070217, "grad_norm": 1.526314372313209e-05, "learning_rate": 2.1404581541527902e-05, "loss": 0.0, "num_input_tokens_seen": 4002832, "step": 9770 }, { "epoch": 11.834140435835351, "grad_norm": 2.666442196641583e-05, "learning_rate": 2.1378445759390738e-05, "loss": 0.0, "num_input_tokens_seen": 4004752, "step": 9775 }, { "epoch": 11.840193704600484, "grad_norm": 3.236914926674217e-05, "learning_rate": 2.135231401956346e-05, "loss": 0.0003, "num_input_tokens_seen": 4006960, "step": 9780 }, { "epoch": 11.846246973365618, "grad_norm": 5.189806688576937e-05, "learning_rate": 2.1326186351213807e-05, "loss": 0.0, "num_input_tokens_seen": 4008976, "step": 9785 }, { "epoch": 11.85230024213075, "grad_norm": 2.891885378630832e-05, "learning_rate": 2.1300062783504994e-05, "loss": 0.0, "num_input_tokens_seen": 4011120, "step": 9790 }, { "epoch": 11.858353510895883, "grad_norm": 1.3061786376056261e-05, "learning_rate": 2.1273943345595637e-05, "loss": 0.0, "num_input_tokens_seen": 4013136, "step": 9795 }, { "epoch": 11.864406779661017, "grad_norm": 2.199769915023353e-05, "learning_rate": 2.1247828066639768e-05, "loss": 0.0, "num_input_tokens_seen": 4015088, "step": 9800 }, { "epoch": 11.87046004842615, "grad_norm": 0.0001563088590046391, "learning_rate": 2.1221716975786764e-05, "loss": 0.0, "num_input_tokens_seen": 4017072, "step": 9805 }, { "epoch": 11.876513317191284, "grad_norm": 0.0002168263599742204, "learning_rate": 2.119561010218131e-05, "loss": 0.0, "num_input_tokens_seen": 4019152, "step": 9810 }, { "epoch": 11.882566585956416, "grad_norm": 0.0002631130919326097, "learning_rate": 2.1169507474963422e-05, "loss": 0.0, "num_input_tokens_seen": 4020976, "step": 9815 }, { "epoch": 11.888619854721549, "grad_norm": 9.973086889658589e-06, "learning_rate": 2.1143409123268342e-05, "loss": 0.0, "num_input_tokens_seen": 4022928, "step": 9820 }, { "epoch": 11.894673123486683, "grad_norm": 1.6398964362451807e-05, "learning_rate": 2.1117315076226558e-05, "loss": 0.0, "num_input_tokens_seen": 4025104, "step": 9825 }, { "epoch": 11.900726392251816, "grad_norm": 1.0180465324083343e-05, "learning_rate": 2.109122536296374e-05, "loss": 0.0, "num_input_tokens_seen": 4027152, "step": 9830 }, { "epoch": 11.90677966101695, "grad_norm": 3.749215466086753e-05, "learning_rate": 2.1065140012600752e-05, "loss": 0.0, "num_input_tokens_seen": 4029264, "step": 9835 }, { "epoch": 11.912832929782082, "grad_norm": 2.4953416868811473e-05, "learning_rate": 2.103905905425354e-05, "loss": 0.0, "num_input_tokens_seen": 4031344, "step": 9840 }, { "epoch": 11.918886198547215, "grad_norm": 1.3687426871911157e-05, "learning_rate": 2.1012982517033188e-05, "loss": 0.0, "num_input_tokens_seen": 4033424, "step": 9845 }, { "epoch": 11.924939467312349, "grad_norm": 1.4664564332633745e-05, "learning_rate": 2.0986910430045818e-05, "loss": 0.0, "num_input_tokens_seen": 4035504, "step": 9850 }, { "epoch": 11.930992736077481, "grad_norm": 3.127561285509728e-05, "learning_rate": 2.096084282239262e-05, "loss": 0.0, "num_input_tokens_seen": 4037488, "step": 9855 }, { "epoch": 11.937046004842616, "grad_norm": 7.41943294997327e-05, "learning_rate": 2.0934779723169735e-05, "loss": 0.0, "num_input_tokens_seen": 4039536, "step": 9860 }, { "epoch": 11.943099273607748, "grad_norm": 9.87857929430902e-05, "learning_rate": 2.0908721161468308e-05, "loss": 0.0, "num_input_tokens_seen": 4041584, "step": 9865 }, { "epoch": 11.94915254237288, "grad_norm": 5.5216311011463404e-05, "learning_rate": 2.088266716637441e-05, "loss": 0.0, "num_input_tokens_seen": 4043600, "step": 9870 }, { "epoch": 11.955205811138015, "grad_norm": 8.445164894510526e-06, "learning_rate": 2.0856617766969027e-05, "loss": 0.0, "num_input_tokens_seen": 4045680, "step": 9875 }, { "epoch": 11.961259079903147, "grad_norm": 9.861905709840357e-06, "learning_rate": 2.083057299232798e-05, "loss": 0.0, "num_input_tokens_seen": 4047728, "step": 9880 }, { "epoch": 11.967312348668282, "grad_norm": 1.9064213120145723e-05, "learning_rate": 2.080453287152196e-05, "loss": 0.0, "num_input_tokens_seen": 4050000, "step": 9885 }, { "epoch": 11.973365617433414, "grad_norm": 1.0160306374018546e-05, "learning_rate": 2.0778497433616463e-05, "loss": 0.0, "num_input_tokens_seen": 4051920, "step": 9890 }, { "epoch": 11.979418886198546, "grad_norm": 0.00015105846978258342, "learning_rate": 2.075246670767173e-05, "loss": 0.0, "num_input_tokens_seen": 4054096, "step": 9895 }, { "epoch": 11.98547215496368, "grad_norm": 1.8995233403984457e-05, "learning_rate": 2.072644072274278e-05, "loss": 0.0, "num_input_tokens_seen": 4056240, "step": 9900 }, { "epoch": 11.991525423728813, "grad_norm": 4.325750342104584e-05, "learning_rate": 2.0700419507879303e-05, "loss": 0.0, "num_input_tokens_seen": 4058096, "step": 9905 }, { "epoch": 11.997578692493947, "grad_norm": 1.9126464394503273e-05, "learning_rate": 2.067440309212571e-05, "loss": 0.0, "num_input_tokens_seen": 4060048, "step": 9910 }, { "epoch": 12.0, "eval_loss": 0.362910658121109, "eval_runtime": 4.9677, "eval_samples_per_second": 73.878, "eval_steps_per_second": 18.52, "num_input_tokens_seen": 4060512, "step": 9912 }, { "epoch": 12.00363196125908, "grad_norm": 1.0686765563150402e-05, "learning_rate": 2.0648391504521e-05, "loss": 0.0, "num_input_tokens_seen": 4061728, "step": 9915 }, { "epoch": 12.009685230024212, "grad_norm": 2.1454645320773125e-05, "learning_rate": 2.0622384774098834e-05, "loss": 0.0, "num_input_tokens_seen": 4063648, "step": 9920 }, { "epoch": 12.015738498789347, "grad_norm": 3.2837640901561826e-05, "learning_rate": 2.0596382929887412e-05, "loss": 0.0, "num_input_tokens_seen": 4065632, "step": 9925 }, { "epoch": 12.021791767554479, "grad_norm": 1.608816637599375e-05, "learning_rate": 2.057038600090952e-05, "loss": 0.0562, "num_input_tokens_seen": 4067712, "step": 9930 }, { "epoch": 12.027845036319613, "grad_norm": 9.299180419475306e-06, "learning_rate": 2.0544394016182405e-05, "loss": 0.0, "num_input_tokens_seen": 4069632, "step": 9935 }, { "epoch": 12.033898305084746, "grad_norm": 0.0019151880405843258, "learning_rate": 2.051840700471785e-05, "loss": 0.0, "num_input_tokens_seen": 4071680, "step": 9940 }, { "epoch": 12.039951573849878, "grad_norm": 1.8713088138611056e-05, "learning_rate": 2.0492424995522044e-05, "loss": 0.0, "num_input_tokens_seen": 4073632, "step": 9945 }, { "epoch": 12.046004842615012, "grad_norm": 2.6233508833684027e-05, "learning_rate": 2.0466448017595635e-05, "loss": 0.0, "num_input_tokens_seen": 4075712, "step": 9950 }, { "epoch": 12.052058111380145, "grad_norm": 0.00224795239046216, "learning_rate": 2.0440476099933604e-05, "loss": 0.0, "num_input_tokens_seen": 4077760, "step": 9955 }, { "epoch": 12.058111380145279, "grad_norm": 0.0005712060374207795, "learning_rate": 2.0414509271525333e-05, "loss": 0.0, "num_input_tokens_seen": 4079936, "step": 9960 }, { "epoch": 12.064164648910412, "grad_norm": 1.550230990687851e-05, "learning_rate": 2.038854756135449e-05, "loss": 0.0, "num_input_tokens_seen": 4081952, "step": 9965 }, { "epoch": 12.070217917675544, "grad_norm": 0.00042263444629497826, "learning_rate": 2.0362590998399032e-05, "loss": 0.0, "num_input_tokens_seen": 4084064, "step": 9970 }, { "epoch": 12.076271186440678, "grad_norm": 1.9614899429143406e-05, "learning_rate": 2.033663961163121e-05, "loss": 0.0001, "num_input_tokens_seen": 4086208, "step": 9975 }, { "epoch": 12.08232445520581, "grad_norm": 1.4654462574981153e-05, "learning_rate": 2.0310693430017434e-05, "loss": 0.0, "num_input_tokens_seen": 4088256, "step": 9980 }, { "epoch": 12.088377723970945, "grad_norm": 4.908870323561132e-05, "learning_rate": 2.0284752482518357e-05, "loss": 0.0141, "num_input_tokens_seen": 4090368, "step": 9985 }, { "epoch": 12.094430992736077, "grad_norm": 3.3759893995011225e-05, "learning_rate": 2.0258816798088765e-05, "loss": 0.0, "num_input_tokens_seen": 4092480, "step": 9990 }, { "epoch": 12.10048426150121, "grad_norm": 0.0027922699227929115, "learning_rate": 2.0232886405677583e-05, "loss": 0.0, "num_input_tokens_seen": 4094592, "step": 9995 }, { "epoch": 12.106537530266344, "grad_norm": 8.319485641550273e-05, "learning_rate": 2.0206961334227808e-05, "loss": 0.0, "num_input_tokens_seen": 4096768, "step": 10000 }, { "epoch": 12.112590799031477, "grad_norm": 0.11691424250602722, "learning_rate": 2.0181041612676523e-05, "loss": 0.0008, "num_input_tokens_seen": 4098784, "step": 10005 }, { "epoch": 12.11864406779661, "grad_norm": 5.779175626230426e-05, "learning_rate": 2.0155127269954815e-05, "loss": 0.0, "num_input_tokens_seen": 4100736, "step": 10010 }, { "epoch": 12.124697336561743, "grad_norm": 0.00010700618440750986, "learning_rate": 2.0129218334987798e-05, "loss": 0.0, "num_input_tokens_seen": 4102848, "step": 10015 }, { "epoch": 12.130750605326876, "grad_norm": 0.0010081524960696697, "learning_rate": 2.0103314836694513e-05, "loss": 0.0, "num_input_tokens_seen": 4104800, "step": 10020 }, { "epoch": 12.13680387409201, "grad_norm": 0.00016175293421838433, "learning_rate": 2.0077416803987965e-05, "loss": 0.0, "num_input_tokens_seen": 4106848, "step": 10025 }, { "epoch": 12.142857142857142, "grad_norm": 0.00014327243843581527, "learning_rate": 2.0051524265775034e-05, "loss": 0.0, "num_input_tokens_seen": 4109120, "step": 10030 }, { "epoch": 12.148910411622277, "grad_norm": 6.528737867483869e-05, "learning_rate": 2.0025637250956494e-05, "loss": 0.0, "num_input_tokens_seen": 4111264, "step": 10035 }, { "epoch": 12.154963680387409, "grad_norm": 0.00021048351482022554, "learning_rate": 1.9999755788426922e-05, "loss": 0.0, "num_input_tokens_seen": 4113344, "step": 10040 }, { "epoch": 12.161016949152541, "grad_norm": 0.00010846980876522139, "learning_rate": 1.9973879907074716e-05, "loss": 0.0, "num_input_tokens_seen": 4115488, "step": 10045 }, { "epoch": 12.167070217917676, "grad_norm": 9.053030953509733e-05, "learning_rate": 1.9948009635782053e-05, "loss": 0.0, "num_input_tokens_seen": 4117504, "step": 10050 }, { "epoch": 12.173123486682808, "grad_norm": 0.00022978673223406076, "learning_rate": 1.9922145003424822e-05, "loss": 0.0, "num_input_tokens_seen": 4119584, "step": 10055 }, { "epoch": 12.179176755447942, "grad_norm": 6.051102536730468e-05, "learning_rate": 1.9896286038872645e-05, "loss": 0.0, "num_input_tokens_seen": 4121696, "step": 10060 }, { "epoch": 12.185230024213075, "grad_norm": 0.0007039514021016657, "learning_rate": 1.9870432770988795e-05, "loss": 0.0, "num_input_tokens_seen": 4123776, "step": 10065 }, { "epoch": 12.19128329297821, "grad_norm": 4.81213464809116e-05, "learning_rate": 1.9844585228630214e-05, "loss": 0.0, "num_input_tokens_seen": 4125888, "step": 10070 }, { "epoch": 12.197336561743342, "grad_norm": 0.07528390735387802, "learning_rate": 1.9818743440647415e-05, "loss": 0.0, "num_input_tokens_seen": 4127872, "step": 10075 }, { "epoch": 12.203389830508474, "grad_norm": 0.0004561438981909305, "learning_rate": 1.979290743588453e-05, "loss": 0.0, "num_input_tokens_seen": 4129952, "step": 10080 }, { "epoch": 12.209443099273608, "grad_norm": 0.0037748899776488543, "learning_rate": 1.9767077243179198e-05, "loss": 0.0, "num_input_tokens_seen": 4132032, "step": 10085 }, { "epoch": 12.21549636803874, "grad_norm": 5.4098753025755286e-05, "learning_rate": 1.9741252891362612e-05, "loss": 0.0, "num_input_tokens_seen": 4134112, "step": 10090 }, { "epoch": 12.221549636803875, "grad_norm": 3.160798587487079e-05, "learning_rate": 1.9715434409259392e-05, "loss": 0.0, "num_input_tokens_seen": 4136096, "step": 10095 }, { "epoch": 12.227602905569007, "grad_norm": 0.00031315060914494097, "learning_rate": 1.968962182568766e-05, "loss": 0.0, "num_input_tokens_seen": 4138144, "step": 10100 }, { "epoch": 12.23365617433414, "grad_norm": 4.2419556848471984e-05, "learning_rate": 1.9663815169458913e-05, "loss": 0.0, "num_input_tokens_seen": 4140160, "step": 10105 }, { "epoch": 12.239709443099274, "grad_norm": 0.0013262138236314058, "learning_rate": 1.9638014469378062e-05, "loss": 0.0, "num_input_tokens_seen": 4142144, "step": 10110 }, { "epoch": 12.245762711864407, "grad_norm": 0.0019090176792815328, "learning_rate": 1.9612219754243344e-05, "loss": 0.0, "num_input_tokens_seen": 4144256, "step": 10115 }, { "epoch": 12.25181598062954, "grad_norm": 0.0005147655610926449, "learning_rate": 1.958643105284635e-05, "loss": 0.0, "num_input_tokens_seen": 4146368, "step": 10120 }, { "epoch": 12.257869249394673, "grad_norm": 0.00010768322681542486, "learning_rate": 1.956064839397192e-05, "loss": 0.0, "num_input_tokens_seen": 4148384, "step": 10125 }, { "epoch": 12.263922518159806, "grad_norm": 3.442894740146585e-05, "learning_rate": 1.9534871806398163e-05, "loss": 0.0, "num_input_tokens_seen": 4150432, "step": 10130 }, { "epoch": 12.26997578692494, "grad_norm": 4.995283597963862e-05, "learning_rate": 1.9509101318896434e-05, "loss": 0.0, "num_input_tokens_seen": 4152480, "step": 10135 }, { "epoch": 12.276029055690072, "grad_norm": 5.166860137251206e-05, "learning_rate": 1.9483336960231242e-05, "loss": 0.0, "num_input_tokens_seen": 4154496, "step": 10140 }, { "epoch": 12.282082324455207, "grad_norm": 4.373360570752993e-05, "learning_rate": 1.9457578759160287e-05, "loss": 0.0, "num_input_tokens_seen": 4156576, "step": 10145 }, { "epoch": 12.288135593220339, "grad_norm": 2.6753663405543193e-05, "learning_rate": 1.9431826744434367e-05, "loss": 0.0, "num_input_tokens_seen": 4158528, "step": 10150 }, { "epoch": 12.294188861985472, "grad_norm": 7.479215128114447e-05, "learning_rate": 1.9406080944797415e-05, "loss": 0.0, "num_input_tokens_seen": 4160544, "step": 10155 }, { "epoch": 12.300242130750606, "grad_norm": 2.8919692340423353e-05, "learning_rate": 1.9380341388986376e-05, "loss": 0.0, "num_input_tokens_seen": 4162656, "step": 10160 }, { "epoch": 12.306295399515738, "grad_norm": 4.7063585952855647e-05, "learning_rate": 1.935460810573127e-05, "loss": 0.0, "num_input_tokens_seen": 4164768, "step": 10165 }, { "epoch": 12.312348668280872, "grad_norm": 4.2971692892024294e-05, "learning_rate": 1.9328881123755083e-05, "loss": 0.0, "num_input_tokens_seen": 4166816, "step": 10170 }, { "epoch": 12.318401937046005, "grad_norm": 0.00018292498134542257, "learning_rate": 1.93031604717738e-05, "loss": 0.0, "num_input_tokens_seen": 4168768, "step": 10175 }, { "epoch": 12.324455205811137, "grad_norm": 0.0006129827816039324, "learning_rate": 1.9277446178496305e-05, "loss": 0.0, "num_input_tokens_seen": 4170880, "step": 10180 }, { "epoch": 12.330508474576272, "grad_norm": 6.036024205968715e-05, "learning_rate": 1.9251738272624416e-05, "loss": 0.0, "num_input_tokens_seen": 4172864, "step": 10185 }, { "epoch": 12.336561743341404, "grad_norm": 2.120635508617852e-05, "learning_rate": 1.92260367828528e-05, "loss": 0.0, "num_input_tokens_seen": 4174976, "step": 10190 }, { "epoch": 12.342615012106538, "grad_norm": 3.27652451233007e-05, "learning_rate": 1.9200341737868983e-05, "loss": 0.0, "num_input_tokens_seen": 4176992, "step": 10195 }, { "epoch": 12.34866828087167, "grad_norm": 0.0002923154388554394, "learning_rate": 1.9174653166353272e-05, "loss": 0.0, "num_input_tokens_seen": 4179104, "step": 10200 }, { "epoch": 12.354721549636803, "grad_norm": 3.362069764989428e-05, "learning_rate": 1.9148971096978757e-05, "loss": 0.0, "num_input_tokens_seen": 4181280, "step": 10205 }, { "epoch": 12.360774818401937, "grad_norm": 0.00012481598241720349, "learning_rate": 1.9123295558411292e-05, "loss": 0.0, "num_input_tokens_seen": 4183328, "step": 10210 }, { "epoch": 12.36682808716707, "grad_norm": 1.2416145182214677e-05, "learning_rate": 1.9097626579309407e-05, "loss": 0.0, "num_input_tokens_seen": 4185344, "step": 10215 }, { "epoch": 12.372881355932204, "grad_norm": 1.5140096365939826e-05, "learning_rate": 1.9071964188324343e-05, "loss": 0.0, "num_input_tokens_seen": 4187424, "step": 10220 }, { "epoch": 12.378934624697337, "grad_norm": 0.000146130783832632, "learning_rate": 1.904630841409995e-05, "loss": 0.0, "num_input_tokens_seen": 4189440, "step": 10225 }, { "epoch": 12.384987893462469, "grad_norm": 2.6325215003453195e-05, "learning_rate": 1.9020659285272745e-05, "loss": 0.0, "num_input_tokens_seen": 4191520, "step": 10230 }, { "epoch": 12.391041162227603, "grad_norm": 3.687796197482385e-05, "learning_rate": 1.899501683047177e-05, "loss": 0.0, "num_input_tokens_seen": 4193344, "step": 10235 }, { "epoch": 12.397094430992736, "grad_norm": 4.873079524259083e-05, "learning_rate": 1.8969381078318672e-05, "loss": 0.0, "num_input_tokens_seen": 4195392, "step": 10240 }, { "epoch": 12.40314769975787, "grad_norm": 2.536422834964469e-05, "learning_rate": 1.8943752057427555e-05, "loss": 0.0, "num_input_tokens_seen": 4197376, "step": 10245 }, { "epoch": 12.409200968523002, "grad_norm": 1.891673127829563e-05, "learning_rate": 1.891812979640507e-05, "loss": 0.0, "num_input_tokens_seen": 4199488, "step": 10250 }, { "epoch": 12.415254237288135, "grad_norm": 0.0001729373907437548, "learning_rate": 1.8892514323850285e-05, "loss": 0.0, "num_input_tokens_seen": 4201536, "step": 10255 }, { "epoch": 12.42130750605327, "grad_norm": 0.00011098497634520754, "learning_rate": 1.886690566835472e-05, "loss": 0.0, "num_input_tokens_seen": 4203616, "step": 10260 }, { "epoch": 12.427360774818402, "grad_norm": 4.286110197426751e-05, "learning_rate": 1.8841303858502245e-05, "loss": 0.0, "num_input_tokens_seen": 4205696, "step": 10265 }, { "epoch": 12.433414043583536, "grad_norm": 1.54096596816089e-05, "learning_rate": 1.8815708922869124e-05, "loss": 0.0, "num_input_tokens_seen": 4207488, "step": 10270 }, { "epoch": 12.439467312348668, "grad_norm": 3.432882294873707e-05, "learning_rate": 1.8790120890023945e-05, "loss": 0.0, "num_input_tokens_seen": 4209536, "step": 10275 }, { "epoch": 12.4455205811138, "grad_norm": 0.00010934738384094089, "learning_rate": 1.876453978852756e-05, "loss": 0.0, "num_input_tokens_seen": 4211424, "step": 10280 }, { "epoch": 12.451573849878935, "grad_norm": 4.999416705686599e-05, "learning_rate": 1.873896564693313e-05, "loss": 0.0, "num_input_tokens_seen": 4213504, "step": 10285 }, { "epoch": 12.457627118644067, "grad_norm": 3.663169627543539e-05, "learning_rate": 1.871339849378601e-05, "loss": 0.0, "num_input_tokens_seen": 4215808, "step": 10290 }, { "epoch": 12.463680387409202, "grad_norm": 1.2581605915329419e-05, "learning_rate": 1.8687838357623783e-05, "loss": 0.0, "num_input_tokens_seen": 4217952, "step": 10295 }, { "epoch": 12.469733656174334, "grad_norm": 2.4377401132369414e-05, "learning_rate": 1.866228526697617e-05, "loss": 0.0, "num_input_tokens_seen": 4220096, "step": 10300 }, { "epoch": 12.475786924939467, "grad_norm": 1.2599651199707296e-05, "learning_rate": 1.863673925036506e-05, "loss": 0.0, "num_input_tokens_seen": 4222176, "step": 10305 }, { "epoch": 12.4818401937046, "grad_norm": 2.746887184912339e-05, "learning_rate": 1.861120033630442e-05, "loss": 0.0, "num_input_tokens_seen": 4224256, "step": 10310 }, { "epoch": 12.487893462469733, "grad_norm": 3.747353912331164e-05, "learning_rate": 1.8585668553300308e-05, "loss": 0.0, "num_input_tokens_seen": 4226240, "step": 10315 }, { "epoch": 12.493946731234868, "grad_norm": 2.5060791813302785e-05, "learning_rate": 1.8560143929850804e-05, "loss": 0.0, "num_input_tokens_seen": 4228128, "step": 10320 }, { "epoch": 12.5, "grad_norm": 1.3093461348034907e-05, "learning_rate": 1.853462649444601e-05, "loss": 0.0, "num_input_tokens_seen": 4230176, "step": 10325 }, { "epoch": 12.506053268765132, "grad_norm": 4.9833255616249517e-05, "learning_rate": 1.8509116275568003e-05, "loss": 0.0, "num_input_tokens_seen": 4232320, "step": 10330 }, { "epoch": 12.512106537530267, "grad_norm": 1.588548911968246e-05, "learning_rate": 1.848361330169081e-05, "loss": 0.0, "num_input_tokens_seen": 4234432, "step": 10335 }, { "epoch": 12.5181598062954, "grad_norm": 4.306585469748825e-05, "learning_rate": 1.845811760128035e-05, "loss": 0.0, "num_input_tokens_seen": 4236672, "step": 10340 }, { "epoch": 12.524213075060533, "grad_norm": 0.01463165320456028, "learning_rate": 1.8432629202794456e-05, "loss": 0.0, "num_input_tokens_seen": 4238816, "step": 10345 }, { "epoch": 12.530266343825666, "grad_norm": 2.3828917619539425e-05, "learning_rate": 1.8407148134682772e-05, "loss": 0.0, "num_input_tokens_seen": 4240800, "step": 10350 }, { "epoch": 12.536319612590798, "grad_norm": 1.8164122593589127e-05, "learning_rate": 1.8381674425386806e-05, "loss": 0.0, "num_input_tokens_seen": 4242880, "step": 10355 }, { "epoch": 12.542372881355933, "grad_norm": 3.299604213680141e-05, "learning_rate": 1.8356208103339813e-05, "loss": 0.0, "num_input_tokens_seen": 4244832, "step": 10360 }, { "epoch": 12.548426150121065, "grad_norm": 0.00028421200113371015, "learning_rate": 1.8330749196966807e-05, "loss": 0.0, "num_input_tokens_seen": 4246880, "step": 10365 }, { "epoch": 12.5544794188862, "grad_norm": 2.0855435650446452e-05, "learning_rate": 1.8305297734684548e-05, "loss": 0.0, "num_input_tokens_seen": 4249024, "step": 10370 }, { "epoch": 12.560532687651332, "grad_norm": 1.8246821127831936e-05, "learning_rate": 1.8279853744901464e-05, "loss": 0.0, "num_input_tokens_seen": 4251072, "step": 10375 }, { "epoch": 12.566585956416464, "grad_norm": 2.3361408238997683e-05, "learning_rate": 1.8254417256017676e-05, "loss": 0.0, "num_input_tokens_seen": 4253120, "step": 10380 }, { "epoch": 12.572639225181598, "grad_norm": 0.00045257125748321414, "learning_rate": 1.8228988296424877e-05, "loss": 0.0, "num_input_tokens_seen": 4255072, "step": 10385 }, { "epoch": 12.57869249394673, "grad_norm": 4.498262933338992e-05, "learning_rate": 1.8203566894506406e-05, "loss": 0.0, "num_input_tokens_seen": 4256992, "step": 10390 }, { "epoch": 12.584745762711865, "grad_norm": 1.61935604410246e-05, "learning_rate": 1.8178153078637136e-05, "loss": 0.0001, "num_input_tokens_seen": 4259200, "step": 10395 }, { "epoch": 12.590799031476998, "grad_norm": 0.0005618418799713254, "learning_rate": 1.81527468771835e-05, "loss": 0.0, "num_input_tokens_seen": 4261120, "step": 10400 }, { "epoch": 12.59685230024213, "grad_norm": 1.718502062431071e-05, "learning_rate": 1.8127348318503405e-05, "loss": 0.0, "num_input_tokens_seen": 4263168, "step": 10405 }, { "epoch": 12.602905569007264, "grad_norm": 2.8149997888249345e-05, "learning_rate": 1.810195743094624e-05, "loss": 0.0, "num_input_tokens_seen": 4265152, "step": 10410 }, { "epoch": 12.608958837772397, "grad_norm": 2.077953286061529e-05, "learning_rate": 1.8076574242852828e-05, "loss": 0.0, "num_input_tokens_seen": 4267200, "step": 10415 }, { "epoch": 12.615012106537531, "grad_norm": 0.00033457373501732945, "learning_rate": 1.8051198782555416e-05, "loss": 0.0439, "num_input_tokens_seen": 4269152, "step": 10420 }, { "epoch": 12.621065375302663, "grad_norm": 9.433024388272315e-06, "learning_rate": 1.8025831078377582e-05, "loss": 0.0, "num_input_tokens_seen": 4271200, "step": 10425 }, { "epoch": 12.627118644067796, "grad_norm": 1.1820740837720223e-05, "learning_rate": 1.8000471158634298e-05, "loss": 0.0, "num_input_tokens_seen": 4273184, "step": 10430 }, { "epoch": 12.63317191283293, "grad_norm": 1.0398781341791619e-05, "learning_rate": 1.7975119051631817e-05, "loss": 0.0, "num_input_tokens_seen": 4275232, "step": 10435 }, { "epoch": 12.639225181598063, "grad_norm": 4.67398131149821e-05, "learning_rate": 1.794977478566767e-05, "loss": 0.0, "num_input_tokens_seen": 4277216, "step": 10440 }, { "epoch": 12.645278450363197, "grad_norm": 1.4703760825796053e-05, "learning_rate": 1.792443838903065e-05, "loss": 0.0, "num_input_tokens_seen": 4279168, "step": 10445 }, { "epoch": 12.65133171912833, "grad_norm": 0.0023041425738483667, "learning_rate": 1.7899109890000758e-05, "loss": 0.0, "num_input_tokens_seen": 4281312, "step": 10450 }, { "epoch": 12.657384987893462, "grad_norm": 1.971116542816162, "learning_rate": 1.7873789316849196e-05, "loss": 0.0079, "num_input_tokens_seen": 4283232, "step": 10455 }, { "epoch": 12.663438256658596, "grad_norm": 7.158876542234793e-06, "learning_rate": 1.7848476697838283e-05, "loss": 0.0, "num_input_tokens_seen": 4285248, "step": 10460 }, { "epoch": 12.669491525423728, "grad_norm": 0.0008716965676285326, "learning_rate": 1.78231720612215e-05, "loss": 0.0, "num_input_tokens_seen": 4287360, "step": 10465 }, { "epoch": 12.675544794188863, "grad_norm": 9.71276585914893e-06, "learning_rate": 1.7797875435243387e-05, "loss": 0.0, "num_input_tokens_seen": 4289312, "step": 10470 }, { "epoch": 12.681598062953995, "grad_norm": 5.483183940668823e-06, "learning_rate": 1.777258684813957e-05, "loss": 0.0003, "num_input_tokens_seen": 4291360, "step": 10475 }, { "epoch": 12.687651331719128, "grad_norm": 5.6895969464676455e-06, "learning_rate": 1.774730632813668e-05, "loss": 0.0, "num_input_tokens_seen": 4293536, "step": 10480 }, { "epoch": 12.693704600484262, "grad_norm": 6.882925845275167e-06, "learning_rate": 1.772203390345235e-05, "loss": 0.0, "num_input_tokens_seen": 4295488, "step": 10485 }, { "epoch": 12.699757869249394, "grad_norm": 8.680508472025394e-06, "learning_rate": 1.7696769602295183e-05, "loss": 0.2462, "num_input_tokens_seen": 4297600, "step": 10490 }, { "epoch": 12.705811138014528, "grad_norm": 1.4937509298324585, "learning_rate": 1.7671513452864723e-05, "loss": 0.0001, "num_input_tokens_seen": 4299648, "step": 10495 }, { "epoch": 12.711864406779661, "grad_norm": 4.549182631308213e-05, "learning_rate": 1.7646265483351377e-05, "loss": 0.0, "num_input_tokens_seen": 4301536, "step": 10500 }, { "epoch": 12.717917675544793, "grad_norm": 8.852637984091416e-05, "learning_rate": 1.762102572193647e-05, "loss": 0.0, "num_input_tokens_seen": 4303456, "step": 10505 }, { "epoch": 12.723970944309928, "grad_norm": 0.00024285539984703064, "learning_rate": 1.759579419679212e-05, "loss": 0.0, "num_input_tokens_seen": 4305600, "step": 10510 }, { "epoch": 12.73002421307506, "grad_norm": 0.00017585925525054336, "learning_rate": 1.7570570936081306e-05, "loss": 0.0, "num_input_tokens_seen": 4307584, "step": 10515 }, { "epoch": 12.736077481840194, "grad_norm": 0.00012365978909656405, "learning_rate": 1.7545355967957736e-05, "loss": 0.0, "num_input_tokens_seen": 4309696, "step": 10520 }, { "epoch": 12.742130750605327, "grad_norm": 0.00042490861960686743, "learning_rate": 1.7520149320565862e-05, "loss": 0.0, "num_input_tokens_seen": 4311808, "step": 10525 }, { "epoch": 12.74818401937046, "grad_norm": 0.0003286628343630582, "learning_rate": 1.7494951022040885e-05, "loss": 0.0, "num_input_tokens_seen": 4313888, "step": 10530 }, { "epoch": 12.754237288135593, "grad_norm": 0.00017284319619648159, "learning_rate": 1.746976110050865e-05, "loss": 0.0, "num_input_tokens_seen": 4315904, "step": 10535 }, { "epoch": 12.760290556900726, "grad_norm": 0.000162775773787871, "learning_rate": 1.7444579584085695e-05, "loss": 0.0, "num_input_tokens_seen": 4317952, "step": 10540 }, { "epoch": 12.76634382566586, "grad_norm": 0.00011847238056361675, "learning_rate": 1.7419406500879114e-05, "loss": 0.0, "num_input_tokens_seen": 4319904, "step": 10545 }, { "epoch": 12.772397094430993, "grad_norm": 0.00024359978851862252, "learning_rate": 1.7394241878986648e-05, "loss": 0.0, "num_input_tokens_seen": 4321888, "step": 10550 }, { "epoch": 12.778450363196125, "grad_norm": 0.00022359340800903738, "learning_rate": 1.7369085746496552e-05, "loss": 0.0, "num_input_tokens_seen": 4323936, "step": 10555 }, { "epoch": 12.78450363196126, "grad_norm": 0.0003651288861874491, "learning_rate": 1.734393813148764e-05, "loss": 0.0, "num_input_tokens_seen": 4325920, "step": 10560 }, { "epoch": 12.790556900726392, "grad_norm": 0.0001189343020087108, "learning_rate": 1.7318799062029187e-05, "loss": 0.0, "num_input_tokens_seen": 4327872, "step": 10565 }, { "epoch": 12.796610169491526, "grad_norm": 0.00017395509348716587, "learning_rate": 1.7293668566180948e-05, "loss": 0.0, "num_input_tokens_seen": 4329888, "step": 10570 }, { "epoch": 12.802663438256658, "grad_norm": 0.00014673854457214475, "learning_rate": 1.7268546671993102e-05, "loss": 0.0, "num_input_tokens_seen": 4332096, "step": 10575 }, { "epoch": 12.80871670702179, "grad_norm": 0.0004943571402691305, "learning_rate": 1.7243433407506238e-05, "loss": 0.0, "num_input_tokens_seen": 4334240, "step": 10580 }, { "epoch": 12.814769975786925, "grad_norm": 0.0005349934799596667, "learning_rate": 1.7218328800751288e-05, "loss": 0.0, "num_input_tokens_seen": 4336384, "step": 10585 }, { "epoch": 12.820823244552058, "grad_norm": 0.00018246442778035998, "learning_rate": 1.7193232879749548e-05, "loss": 0.0, "num_input_tokens_seen": 4338400, "step": 10590 }, { "epoch": 12.826876513317192, "grad_norm": 4.965154221281409e-05, "learning_rate": 1.716814567251261e-05, "loss": 0.0, "num_input_tokens_seen": 4340288, "step": 10595 }, { "epoch": 12.832929782082324, "grad_norm": 9.854380914475769e-05, "learning_rate": 1.7143067207042318e-05, "loss": 0.0, "num_input_tokens_seen": 4342432, "step": 10600 }, { "epoch": 12.838983050847457, "grad_norm": 0.00016766360204201192, "learning_rate": 1.7117997511330798e-05, "loss": 0.0, "num_input_tokens_seen": 4344544, "step": 10605 }, { "epoch": 12.845036319612591, "grad_norm": 0.00010691685747588053, "learning_rate": 1.7092936613360354e-05, "loss": 0.0, "num_input_tokens_seen": 4346624, "step": 10610 }, { "epoch": 12.851089588377723, "grad_norm": 8.100535342236981e-05, "learning_rate": 1.7067884541103498e-05, "loss": 0.0, "num_input_tokens_seen": 4348608, "step": 10615 }, { "epoch": 12.857142857142858, "grad_norm": 0.00011146420729346573, "learning_rate": 1.7042841322522853e-05, "loss": 0.0, "num_input_tokens_seen": 4350528, "step": 10620 }, { "epoch": 12.86319612590799, "grad_norm": 5.500898259924725e-05, "learning_rate": 1.7017806985571204e-05, "loss": 0.0, "num_input_tokens_seen": 4352576, "step": 10625 }, { "epoch": 12.869249394673123, "grad_norm": 0.005818668287247419, "learning_rate": 1.6992781558191387e-05, "loss": 0.0, "num_input_tokens_seen": 4354656, "step": 10630 }, { "epoch": 12.875302663438257, "grad_norm": 0.0001795487041817978, "learning_rate": 1.6967765068316326e-05, "loss": 0.0, "num_input_tokens_seen": 4356640, "step": 10635 }, { "epoch": 12.88135593220339, "grad_norm": 0.00046849995851516724, "learning_rate": 1.6942757543868924e-05, "loss": 0.0, "num_input_tokens_seen": 4358720, "step": 10640 }, { "epoch": 12.887409200968523, "grad_norm": 0.0001987216528505087, "learning_rate": 1.691775901276213e-05, "loss": 0.0, "num_input_tokens_seen": 4360736, "step": 10645 }, { "epoch": 12.893462469733656, "grad_norm": 0.00025703271967358887, "learning_rate": 1.6892769502898803e-05, "loss": 0.0, "num_input_tokens_seen": 4362784, "step": 10650 }, { "epoch": 12.899515738498788, "grad_norm": 0.02991967834532261, "learning_rate": 1.6867789042171776e-05, "loss": 0.0, "num_input_tokens_seen": 4364704, "step": 10655 }, { "epoch": 12.905569007263923, "grad_norm": 8.107156463665888e-05, "learning_rate": 1.684281765846376e-05, "loss": 0.0, "num_input_tokens_seen": 4366688, "step": 10660 }, { "epoch": 12.911622276029055, "grad_norm": 0.0045127179473638535, "learning_rate": 1.681785537964734e-05, "loss": 0.0, "num_input_tokens_seen": 4368768, "step": 10665 }, { "epoch": 12.91767554479419, "grad_norm": 0.000920180173125118, "learning_rate": 1.6792902233584933e-05, "loss": 0.0, "num_input_tokens_seen": 4370720, "step": 10670 }, { "epoch": 12.923728813559322, "grad_norm": 5.944539589108899e-05, "learning_rate": 1.6767958248128755e-05, "loss": 0.0, "num_input_tokens_seen": 4372832, "step": 10675 }, { "epoch": 12.929782082324456, "grad_norm": 0.00021957248100079596, "learning_rate": 1.6743023451120832e-05, "loss": 0.0, "num_input_tokens_seen": 4374944, "step": 10680 }, { "epoch": 12.935835351089588, "grad_norm": 5.579952266998589e-05, "learning_rate": 1.6718097870392883e-05, "loss": 0.0, "num_input_tokens_seen": 4376960, "step": 10685 }, { "epoch": 12.941888619854721, "grad_norm": 0.1347380131483078, "learning_rate": 1.6693181533766383e-05, "loss": 0.0, "num_input_tokens_seen": 4379040, "step": 10690 }, { "epoch": 12.947941888619855, "grad_norm": 9.001831494970247e-05, "learning_rate": 1.6668274469052463e-05, "loss": 0.0, "num_input_tokens_seen": 4381216, "step": 10695 }, { "epoch": 12.953995157384988, "grad_norm": 0.00016572400636505336, "learning_rate": 1.6643376704051934e-05, "loss": 0.0, "num_input_tokens_seen": 4383328, "step": 10700 }, { "epoch": 12.960048426150122, "grad_norm": 9.079480514628813e-05, "learning_rate": 1.661848826655518e-05, "loss": 0.0, "num_input_tokens_seen": 4385408, "step": 10705 }, { "epoch": 12.966101694915254, "grad_norm": 6.535009742947295e-05, "learning_rate": 1.659360918434222e-05, "loss": 0.0, "num_input_tokens_seen": 4387392, "step": 10710 }, { "epoch": 12.972154963680387, "grad_norm": 0.00010034612205345184, "learning_rate": 1.65687394851826e-05, "loss": 0.0, "num_input_tokens_seen": 4389408, "step": 10715 }, { "epoch": 12.978208232445521, "grad_norm": 3.270518936915323e-05, "learning_rate": 1.6543879196835422e-05, "loss": 0.0, "num_input_tokens_seen": 4391360, "step": 10720 }, { "epoch": 12.984261501210653, "grad_norm": 0.00013586226850748062, "learning_rate": 1.651902834704924e-05, "loss": 0.0, "num_input_tokens_seen": 4393184, "step": 10725 }, { "epoch": 12.990314769975788, "grad_norm": 9.100128954742104e-05, "learning_rate": 1.6494186963562126e-05, "loss": 0.0, "num_input_tokens_seen": 4395264, "step": 10730 }, { "epoch": 12.99636803874092, "grad_norm": 0.00018722475215326995, "learning_rate": 1.6469355074101538e-05, "loss": 0.0, "num_input_tokens_seen": 4397376, "step": 10735 }, { "epoch": 13.0, "eval_loss": 0.3363990783691406, "eval_runtime": 5.209, "eval_samples_per_second": 70.455, "eval_steps_per_second": 17.662, "num_input_tokens_seen": 4398224, "step": 10738 }, { "epoch": 13.002421307506053, "grad_norm": 0.00012279150541871786, "learning_rate": 1.644453270638438e-05, "loss": 0.0, "num_input_tokens_seen": 4399024, "step": 10740 }, { "epoch": 13.008474576271187, "grad_norm": 0.00011693319538608193, "learning_rate": 1.641971988811688e-05, "loss": 0.0, "num_input_tokens_seen": 4400944, "step": 10745 }, { "epoch": 13.01452784503632, "grad_norm": 4.8570447688689455e-05, "learning_rate": 1.6394916646994653e-05, "loss": 0.0, "num_input_tokens_seen": 4402992, "step": 10750 }, { "epoch": 13.020581113801454, "grad_norm": 4.0440754673909396e-05, "learning_rate": 1.6370123010702604e-05, "loss": 0.0, "num_input_tokens_seen": 4405104, "step": 10755 }, { "epoch": 13.026634382566586, "grad_norm": 2.94198816845892e-05, "learning_rate": 1.63453390069149e-05, "loss": 0.0, "num_input_tokens_seen": 4407248, "step": 10760 }, { "epoch": 13.032687651331718, "grad_norm": 5.0390983233228326e-05, "learning_rate": 1.6320564663294996e-05, "loss": 0.0, "num_input_tokens_seen": 4409264, "step": 10765 }, { "epoch": 13.038740920096853, "grad_norm": 0.00018538377480581403, "learning_rate": 1.629580000749552e-05, "loss": 0.0, "num_input_tokens_seen": 4411408, "step": 10770 }, { "epoch": 13.044794188861985, "grad_norm": 1.858057476056274e-05, "learning_rate": 1.627104506715834e-05, "loss": 0.0, "num_input_tokens_seen": 4413488, "step": 10775 }, { "epoch": 13.05084745762712, "grad_norm": 8.607571362517774e-05, "learning_rate": 1.6246299869914415e-05, "loss": 0.0, "num_input_tokens_seen": 4415536, "step": 10780 }, { "epoch": 13.056900726392252, "grad_norm": 3.529422974679619e-05, "learning_rate": 1.622156444338389e-05, "loss": 0.0, "num_input_tokens_seen": 4417744, "step": 10785 }, { "epoch": 13.062953995157384, "grad_norm": 4.3708645534934476e-05, "learning_rate": 1.6196838815175958e-05, "loss": 0.0, "num_input_tokens_seen": 4419920, "step": 10790 }, { "epoch": 13.069007263922519, "grad_norm": 8.57859558891505e-05, "learning_rate": 1.61721230128889e-05, "loss": 0.0, "num_input_tokens_seen": 4421808, "step": 10795 }, { "epoch": 13.075060532687651, "grad_norm": 6.916273559909314e-05, "learning_rate": 1.6147417064110027e-05, "loss": 0.0, "num_input_tokens_seen": 4423856, "step": 10800 }, { "epoch": 13.081113801452785, "grad_norm": 5.4501455451827496e-05, "learning_rate": 1.6122720996415656e-05, "loss": 0.0, "num_input_tokens_seen": 4425936, "step": 10805 }, { "epoch": 13.087167070217918, "grad_norm": 8.020684617804363e-05, "learning_rate": 1.609803483737105e-05, "loss": 0.0, "num_input_tokens_seen": 4427952, "step": 10810 }, { "epoch": 13.09322033898305, "grad_norm": 2.9297993023646995e-05, "learning_rate": 1.6073358614530443e-05, "loss": 0.0, "num_input_tokens_seen": 4430032, "step": 10815 }, { "epoch": 13.099273607748184, "grad_norm": 2.459686584188603e-05, "learning_rate": 1.6048692355436955e-05, "loss": 0.0, "num_input_tokens_seen": 4431984, "step": 10820 }, { "epoch": 13.105326876513317, "grad_norm": 7.721747533651069e-05, "learning_rate": 1.6024036087622614e-05, "loss": 0.0, "num_input_tokens_seen": 4433968, "step": 10825 }, { "epoch": 13.111380145278451, "grad_norm": 6.336269871098921e-05, "learning_rate": 1.5999389838608254e-05, "loss": 0.0, "num_input_tokens_seen": 4435952, "step": 10830 }, { "epoch": 13.117433414043584, "grad_norm": 4.9149479309562594e-05, "learning_rate": 1.597475363590355e-05, "loss": 0.0, "num_input_tokens_seen": 4438096, "step": 10835 }, { "epoch": 13.123486682808716, "grad_norm": 2.149077772628516e-05, "learning_rate": 1.5950127507006985e-05, "loss": 0.0, "num_input_tokens_seen": 4440080, "step": 10840 }, { "epoch": 13.12953995157385, "grad_norm": 0.000321285828249529, "learning_rate": 1.5925511479405746e-05, "loss": 0.0, "num_input_tokens_seen": 4442256, "step": 10845 }, { "epoch": 13.135593220338983, "grad_norm": 5.47071649634745e-05, "learning_rate": 1.59009055805758e-05, "loss": 0.0, "num_input_tokens_seen": 4444240, "step": 10850 }, { "epoch": 13.141646489104117, "grad_norm": 0.00042202245094813406, "learning_rate": 1.5876309837981764e-05, "loss": 0.0, "num_input_tokens_seen": 4446320, "step": 10855 }, { "epoch": 13.14769975786925, "grad_norm": 5.7755751186050475e-05, "learning_rate": 1.5851724279076948e-05, "loss": 0.0, "num_input_tokens_seen": 4448272, "step": 10860 }, { "epoch": 13.153753026634382, "grad_norm": 2.857872095773928e-05, "learning_rate": 1.5827148931303277e-05, "loss": 0.0, "num_input_tokens_seen": 4450224, "step": 10865 }, { "epoch": 13.159806295399516, "grad_norm": 4.639020698959939e-05, "learning_rate": 1.5802583822091293e-05, "loss": 0.0, "num_input_tokens_seen": 4452240, "step": 10870 }, { "epoch": 13.165859564164649, "grad_norm": 8.398451609537005e-05, "learning_rate": 1.57780289788601e-05, "loss": 0.0, "num_input_tokens_seen": 4454352, "step": 10875 }, { "epoch": 13.171912832929783, "grad_norm": 5.421654714155011e-05, "learning_rate": 1.575348442901735e-05, "loss": 0.0, "num_input_tokens_seen": 4456496, "step": 10880 }, { "epoch": 13.177966101694915, "grad_norm": 5.7779969210969284e-05, "learning_rate": 1.5728950199959194e-05, "loss": 0.0, "num_input_tokens_seen": 4458544, "step": 10885 }, { "epoch": 13.184019370460048, "grad_norm": 0.00168497150298208, "learning_rate": 1.570442631907028e-05, "loss": 0.0, "num_input_tokens_seen": 4460656, "step": 10890 }, { "epoch": 13.190072639225182, "grad_norm": 4.9465968913864344e-05, "learning_rate": 1.567991281372369e-05, "loss": 0.0, "num_input_tokens_seen": 4462832, "step": 10895 }, { "epoch": 13.196125907990314, "grad_norm": 3.730338721652515e-05, "learning_rate": 1.5655409711280942e-05, "loss": 0.0, "num_input_tokens_seen": 4464848, "step": 10900 }, { "epoch": 13.202179176755449, "grad_norm": 5.7318422477692366e-05, "learning_rate": 1.563091703909192e-05, "loss": 0.0, "num_input_tokens_seen": 4466896, "step": 10905 }, { "epoch": 13.208232445520581, "grad_norm": 0.00017905437562149018, "learning_rate": 1.5606434824494884e-05, "loss": 0.0, "num_input_tokens_seen": 4468880, "step": 10910 }, { "epoch": 13.214285714285714, "grad_norm": 2.4371867766603827e-05, "learning_rate": 1.558196309481642e-05, "loss": 0.0, "num_input_tokens_seen": 4470992, "step": 10915 }, { "epoch": 13.220338983050848, "grad_norm": 0.6247068047523499, "learning_rate": 1.5557501877371398e-05, "loss": 0.001, "num_input_tokens_seen": 4472976, "step": 10920 }, { "epoch": 13.22639225181598, "grad_norm": 3.057029243791476e-05, "learning_rate": 1.5533051199462972e-05, "loss": 0.0, "num_input_tokens_seen": 4475024, "step": 10925 }, { "epoch": 13.232445520581114, "grad_norm": 3.7869216612307355e-05, "learning_rate": 1.550861108838251e-05, "loss": 0.0, "num_input_tokens_seen": 4477040, "step": 10930 }, { "epoch": 13.238498789346247, "grad_norm": 9.412808867637068e-05, "learning_rate": 1.5484181571409613e-05, "loss": 0.0, "num_input_tokens_seen": 4479120, "step": 10935 }, { "epoch": 13.24455205811138, "grad_norm": 7.033185829641297e-05, "learning_rate": 1.5459762675812027e-05, "loss": 0.0, "num_input_tokens_seen": 4481104, "step": 10940 }, { "epoch": 13.250605326876514, "grad_norm": 1.876329224614892e-05, "learning_rate": 1.5435354428845682e-05, "loss": 0.0, "num_input_tokens_seen": 4483152, "step": 10945 }, { "epoch": 13.256658595641646, "grad_norm": 6.336582009680569e-05, "learning_rate": 1.541095685775457e-05, "loss": 0.0, "num_input_tokens_seen": 4485264, "step": 10950 }, { "epoch": 13.26271186440678, "grad_norm": 3.06977417494636e-05, "learning_rate": 1.538656998977082e-05, "loss": 0.0, "num_input_tokens_seen": 4487376, "step": 10955 }, { "epoch": 13.268765133171913, "grad_norm": 1.4871243365632836e-05, "learning_rate": 1.536219385211457e-05, "loss": 0.0, "num_input_tokens_seen": 4489328, "step": 10960 }, { "epoch": 13.274818401937045, "grad_norm": 6.444716564146802e-05, "learning_rate": 1.5337828471994023e-05, "loss": 0.0, "num_input_tokens_seen": 4491472, "step": 10965 }, { "epoch": 13.28087167070218, "grad_norm": 8.251299004768953e-05, "learning_rate": 1.531347387660533e-05, "loss": 0.0, "num_input_tokens_seen": 4493520, "step": 10970 }, { "epoch": 13.286924939467312, "grad_norm": 5.100293128634803e-05, "learning_rate": 1.5289130093132632e-05, "loss": 0.0, "num_input_tokens_seen": 4495664, "step": 10975 }, { "epoch": 13.292978208232446, "grad_norm": 6.430667417589575e-05, "learning_rate": 1.5264797148748003e-05, "loss": 0.0, "num_input_tokens_seen": 4497712, "step": 10980 }, { "epoch": 13.299031476997579, "grad_norm": 0.00015150877879932523, "learning_rate": 1.5240475070611415e-05, "loss": 0.0, "num_input_tokens_seen": 4499728, "step": 10985 }, { "epoch": 13.305084745762711, "grad_norm": 3.391830614418723e-05, "learning_rate": 1.5216163885870701e-05, "loss": 0.0, "num_input_tokens_seen": 4501744, "step": 10990 }, { "epoch": 13.311138014527845, "grad_norm": 1.7775073501979932e-05, "learning_rate": 1.5191863621661539e-05, "loss": 0.0002, "num_input_tokens_seen": 4503888, "step": 10995 }, { "epoch": 13.317191283292978, "grad_norm": 4.6474349801428616e-05, "learning_rate": 1.5167574305107435e-05, "loss": 0.0, "num_input_tokens_seen": 4505904, "step": 11000 }, { "epoch": 13.323244552058112, "grad_norm": 3.270995875936933e-05, "learning_rate": 1.5143295963319643e-05, "loss": 0.0, "num_input_tokens_seen": 4507824, "step": 11005 }, { "epoch": 13.329297820823244, "grad_norm": 4.628031456377357e-05, "learning_rate": 1.5119028623397201e-05, "loss": 0.0, "num_input_tokens_seen": 4509904, "step": 11010 }, { "epoch": 13.335351089588377, "grad_norm": 4.057234400534071e-05, "learning_rate": 1.5094772312426842e-05, "loss": 0.0, "num_input_tokens_seen": 4511888, "step": 11015 }, { "epoch": 13.341404358353511, "grad_norm": 8.185389742720872e-05, "learning_rate": 1.5070527057483013e-05, "loss": 0.0, "num_input_tokens_seen": 4513968, "step": 11020 }, { "epoch": 13.347457627118644, "grad_norm": 4.164400161243975e-05, "learning_rate": 1.5046292885627783e-05, "loss": 0.0, "num_input_tokens_seen": 4516080, "step": 11025 }, { "epoch": 13.353510895883778, "grad_norm": 5.704198338207789e-05, "learning_rate": 1.5022069823910894e-05, "loss": 0.0, "num_input_tokens_seen": 4518160, "step": 11030 }, { "epoch": 13.35956416464891, "grad_norm": 4.9805399612523615e-05, "learning_rate": 1.4997857899369653e-05, "loss": 0.0, "num_input_tokens_seen": 4520208, "step": 11035 }, { "epoch": 13.365617433414043, "grad_norm": 3.516272045089863e-05, "learning_rate": 1.4973657139028963e-05, "loss": 0.0502, "num_input_tokens_seen": 4522352, "step": 11040 }, { "epoch": 13.371670702179177, "grad_norm": 2.2276439267443493e-05, "learning_rate": 1.4949467569901232e-05, "loss": 0.0, "num_input_tokens_seen": 4524432, "step": 11045 }, { "epoch": 13.37772397094431, "grad_norm": 7.60092880227603e-05, "learning_rate": 1.4925289218986414e-05, "loss": 0.0, "num_input_tokens_seen": 4526384, "step": 11050 }, { "epoch": 13.383777239709444, "grad_norm": 0.00014220402226783335, "learning_rate": 1.4901122113271909e-05, "loss": 0.0, "num_input_tokens_seen": 4528464, "step": 11055 }, { "epoch": 13.389830508474576, "grad_norm": 2.3242066163220443e-05, "learning_rate": 1.48769662797326e-05, "loss": 0.0, "num_input_tokens_seen": 4530544, "step": 11060 }, { "epoch": 13.39588377723971, "grad_norm": 4.0318529499927536e-05, "learning_rate": 1.485282174533076e-05, "loss": 0.0, "num_input_tokens_seen": 4532752, "step": 11065 }, { "epoch": 13.401937046004843, "grad_norm": 3.734970232471824e-05, "learning_rate": 1.4828688537016044e-05, "loss": 0.0, "num_input_tokens_seen": 4534768, "step": 11070 }, { "epoch": 13.407990314769975, "grad_norm": 7.058586925268173e-05, "learning_rate": 1.4804566681725495e-05, "loss": 0.0, "num_input_tokens_seen": 4536816, "step": 11075 }, { "epoch": 13.41404358353511, "grad_norm": 4.649688708013855e-05, "learning_rate": 1.4780456206383459e-05, "loss": 0.0, "num_input_tokens_seen": 4538768, "step": 11080 }, { "epoch": 13.420096852300242, "grad_norm": 3.333803033456206e-05, "learning_rate": 1.4756357137901604e-05, "loss": 0.0, "num_input_tokens_seen": 4540752, "step": 11085 }, { "epoch": 13.426150121065376, "grad_norm": 8.324473310494795e-05, "learning_rate": 1.4732269503178836e-05, "loss": 0.0, "num_input_tokens_seen": 4542896, "step": 11090 }, { "epoch": 13.432203389830509, "grad_norm": 3.904035474988632e-05, "learning_rate": 1.4708193329101321e-05, "loss": 0.0, "num_input_tokens_seen": 4544944, "step": 11095 }, { "epoch": 13.438256658595641, "grad_norm": 3.383325383765623e-05, "learning_rate": 1.4684128642542425e-05, "loss": 0.0, "num_input_tokens_seen": 4547056, "step": 11100 }, { "epoch": 13.444309927360775, "grad_norm": 6.662757368758321e-05, "learning_rate": 1.46600754703627e-05, "loss": 0.0, "num_input_tokens_seen": 4549040, "step": 11105 }, { "epoch": 13.450363196125908, "grad_norm": 0.00010645513248164207, "learning_rate": 1.4636033839409824e-05, "loss": 0.0, "num_input_tokens_seen": 4550960, "step": 11110 }, { "epoch": 13.456416464891042, "grad_norm": 4.449651169124991e-05, "learning_rate": 1.4612003776518618e-05, "loss": 0.0, "num_input_tokens_seen": 4553008, "step": 11115 }, { "epoch": 13.462469733656174, "grad_norm": 4.3232292227912694e-05, "learning_rate": 1.4587985308510979e-05, "loss": 0.0, "num_input_tokens_seen": 4555088, "step": 11120 }, { "epoch": 13.468523002421307, "grad_norm": 8.055554644670337e-05, "learning_rate": 1.4563978462195865e-05, "loss": 0.0, "num_input_tokens_seen": 4557072, "step": 11125 }, { "epoch": 13.474576271186441, "grad_norm": 4.744488251162693e-05, "learning_rate": 1.4539983264369245e-05, "loss": 0.0, "num_input_tokens_seen": 4559120, "step": 11130 }, { "epoch": 13.480629539951574, "grad_norm": 0.00027560387388803065, "learning_rate": 1.4515999741814137e-05, "loss": 0.0, "num_input_tokens_seen": 4561200, "step": 11135 }, { "epoch": 13.486682808716708, "grad_norm": 3.051044222956989e-05, "learning_rate": 1.4492027921300449e-05, "loss": 0.0, "num_input_tokens_seen": 4563152, "step": 11140 }, { "epoch": 13.49273607748184, "grad_norm": 0.0016843974590301514, "learning_rate": 1.4468067829585108e-05, "loss": 0.0, "num_input_tokens_seen": 4565264, "step": 11145 }, { "epoch": 13.498789346246973, "grad_norm": 0.0002074848598567769, "learning_rate": 1.4444119493411901e-05, "loss": 0.0, "num_input_tokens_seen": 4567152, "step": 11150 }, { "epoch": 13.504842615012107, "grad_norm": 1.5090345186763443e-05, "learning_rate": 1.4420182939511481e-05, "loss": 0.0, "num_input_tokens_seen": 4569200, "step": 11155 }, { "epoch": 13.51089588377724, "grad_norm": 5.592458910541609e-05, "learning_rate": 1.4396258194601403e-05, "loss": 0.0, "num_input_tokens_seen": 4571280, "step": 11160 }, { "epoch": 13.516949152542374, "grad_norm": 3.3521097066113725e-05, "learning_rate": 1.4372345285386003e-05, "loss": 0.0, "num_input_tokens_seen": 4573200, "step": 11165 }, { "epoch": 13.523002421307506, "grad_norm": 4.8877958761295304e-05, "learning_rate": 1.4348444238556408e-05, "loss": 0.0, "num_input_tokens_seen": 4575344, "step": 11170 }, { "epoch": 13.529055690072639, "grad_norm": 4.0982948121381924e-05, "learning_rate": 1.4324555080790523e-05, "loss": 0.0, "num_input_tokens_seen": 4577392, "step": 11175 }, { "epoch": 13.535108958837773, "grad_norm": 7.635049405507743e-05, "learning_rate": 1.430067783875296e-05, "loss": 0.0, "num_input_tokens_seen": 4579376, "step": 11180 }, { "epoch": 13.541162227602905, "grad_norm": 0.00016697877435944974, "learning_rate": 1.4276812539095039e-05, "loss": 0.0, "num_input_tokens_seen": 4581424, "step": 11185 }, { "epoch": 13.54721549636804, "grad_norm": 3.35801814799197e-05, "learning_rate": 1.425295920845478e-05, "loss": 0.0, "num_input_tokens_seen": 4583632, "step": 11190 }, { "epoch": 13.553268765133172, "grad_norm": 1.607202648301609e-05, "learning_rate": 1.4229117873456776e-05, "loss": 0.0, "num_input_tokens_seen": 4585616, "step": 11195 }, { "epoch": 13.559322033898304, "grad_norm": 3.9179321902338415e-05, "learning_rate": 1.420528856071231e-05, "loss": 0.0, "num_input_tokens_seen": 4587600, "step": 11200 }, { "epoch": 13.565375302663439, "grad_norm": 3.907312930095941e-05, "learning_rate": 1.4181471296819171e-05, "loss": 0.0081, "num_input_tokens_seen": 4589680, "step": 11205 }, { "epoch": 13.571428571428571, "grad_norm": 1.388102555210935e-05, "learning_rate": 1.4157666108361767e-05, "loss": 0.0, "num_input_tokens_seen": 4591728, "step": 11210 }, { "epoch": 13.577481840193705, "grad_norm": 8.97541394806467e-06, "learning_rate": 1.4133873021910975e-05, "loss": 0.0, "num_input_tokens_seen": 4593808, "step": 11215 }, { "epoch": 13.583535108958838, "grad_norm": 1.969411641766783e-05, "learning_rate": 1.41100920640242e-05, "loss": 0.0, "num_input_tokens_seen": 4596016, "step": 11220 }, { "epoch": 13.58958837772397, "grad_norm": 2.2135420294944197e-05, "learning_rate": 1.4086323261245282e-05, "loss": 0.0, "num_input_tokens_seen": 4598128, "step": 11225 }, { "epoch": 13.595641646489105, "grad_norm": 1.3184141607780475e-05, "learning_rate": 1.4062566640104513e-05, "loss": 0.0, "num_input_tokens_seen": 4600048, "step": 11230 }, { "epoch": 13.601694915254237, "grad_norm": 2.462893826304935e-05, "learning_rate": 1.4038822227118582e-05, "loss": 0.0, "num_input_tokens_seen": 4602032, "step": 11235 }, { "epoch": 13.607748184019371, "grad_norm": 1.1598602213780396e-05, "learning_rate": 1.4015090048790541e-05, "loss": 0.0, "num_input_tokens_seen": 4604080, "step": 11240 }, { "epoch": 13.613801452784504, "grad_norm": 1.3518622836272698e-05, "learning_rate": 1.3991370131609805e-05, "loss": 0.0, "num_input_tokens_seen": 4606096, "step": 11245 }, { "epoch": 13.619854721549636, "grad_norm": 6.368691174429841e-06, "learning_rate": 1.396766250205208e-05, "loss": 0.0, "num_input_tokens_seen": 4608048, "step": 11250 }, { "epoch": 13.62590799031477, "grad_norm": 1.0074950296257157e-05, "learning_rate": 1.3943967186579404e-05, "loss": 0.0, "num_input_tokens_seen": 4610288, "step": 11255 }, { "epoch": 13.631961259079903, "grad_norm": 1.4005288903717883e-05, "learning_rate": 1.3920284211639995e-05, "loss": 0.0, "num_input_tokens_seen": 4612400, "step": 11260 }, { "epoch": 13.638014527845037, "grad_norm": 8.496597729390487e-06, "learning_rate": 1.3896613603668364e-05, "loss": 0.0, "num_input_tokens_seen": 4614384, "step": 11265 }, { "epoch": 13.64406779661017, "grad_norm": 8.572819751861971e-06, "learning_rate": 1.387295538908519e-05, "loss": 0.0, "num_input_tokens_seen": 4616432, "step": 11270 }, { "epoch": 13.650121065375302, "grad_norm": 9.38794528337894e-06, "learning_rate": 1.3849309594297319e-05, "loss": 0.0, "num_input_tokens_seen": 4618512, "step": 11275 }, { "epoch": 13.656174334140436, "grad_norm": 3.466973430477083e-05, "learning_rate": 1.3825676245697735e-05, "loss": 0.0, "num_input_tokens_seen": 4620528, "step": 11280 }, { "epoch": 13.662227602905569, "grad_norm": 9.249056347471196e-06, "learning_rate": 1.3802055369665534e-05, "loss": 0.0, "num_input_tokens_seen": 4622512, "step": 11285 }, { "epoch": 13.668280871670703, "grad_norm": 2.099675293720793e-05, "learning_rate": 1.3778446992565877e-05, "loss": 0.0, "num_input_tokens_seen": 4624496, "step": 11290 }, { "epoch": 13.674334140435835, "grad_norm": 1.799994242901448e-05, "learning_rate": 1.3754851140750013e-05, "loss": 0.0, "num_input_tokens_seen": 4626512, "step": 11295 }, { "epoch": 13.680387409200968, "grad_norm": 15.490422248840332, "learning_rate": 1.3731267840555151e-05, "loss": 0.1217, "num_input_tokens_seen": 4628528, "step": 11300 }, { "epoch": 13.686440677966102, "grad_norm": 4.196485679131001e-05, "learning_rate": 1.3707697118304539e-05, "loss": 0.0, "num_input_tokens_seen": 4630480, "step": 11305 }, { "epoch": 13.692493946731235, "grad_norm": 5.0598635425558314e-05, "learning_rate": 1.3684139000307375e-05, "loss": 0.0, "num_input_tokens_seen": 4632464, "step": 11310 }, { "epoch": 13.698547215496369, "grad_norm": 6.409043999155983e-05, "learning_rate": 1.3660593512858754e-05, "loss": 0.0, "num_input_tokens_seen": 4634608, "step": 11315 }, { "epoch": 13.704600484261501, "grad_norm": 0.0001324586191913113, "learning_rate": 1.3637060682239725e-05, "loss": 0.0, "num_input_tokens_seen": 4636848, "step": 11320 }, { "epoch": 13.710653753026634, "grad_norm": 0.00014844947145320475, "learning_rate": 1.3613540534717179e-05, "loss": 0.0, "num_input_tokens_seen": 4639056, "step": 11325 }, { "epoch": 13.716707021791768, "grad_norm": 0.00019463387434370816, "learning_rate": 1.3590033096543859e-05, "loss": 0.0, "num_input_tokens_seen": 4641264, "step": 11330 }, { "epoch": 13.7227602905569, "grad_norm": 0.0001682594302110374, "learning_rate": 1.356653839395831e-05, "loss": 0.0, "num_input_tokens_seen": 4643472, "step": 11335 }, { "epoch": 13.728813559322035, "grad_norm": 0.021079745143651962, "learning_rate": 1.3543056453184882e-05, "loss": 0.0, "num_input_tokens_seen": 4645616, "step": 11340 }, { "epoch": 13.734866828087167, "grad_norm": 0.00010112704330822453, "learning_rate": 1.351958730043367e-05, "loss": 0.0, "num_input_tokens_seen": 4647696, "step": 11345 }, { "epoch": 13.7409200968523, "grad_norm": 0.00019347066699992865, "learning_rate": 1.3496130961900488e-05, "loss": 0.0, "num_input_tokens_seen": 4650000, "step": 11350 }, { "epoch": 13.746973365617434, "grad_norm": 0.0001052662919391878, "learning_rate": 1.347268746376685e-05, "loss": 0.0, "num_input_tokens_seen": 4652144, "step": 11355 }, { "epoch": 13.753026634382566, "grad_norm": 0.0003226147382520139, "learning_rate": 1.3449256832199969e-05, "loss": 0.0, "num_input_tokens_seen": 4654192, "step": 11360 }, { "epoch": 13.7590799031477, "grad_norm": 0.0003797482349909842, "learning_rate": 1.3425839093352636e-05, "loss": 0.0, "num_input_tokens_seen": 4656336, "step": 11365 }, { "epoch": 13.765133171912833, "grad_norm": 9.590137051418424e-05, "learning_rate": 1.3402434273363308e-05, "loss": 0.0, "num_input_tokens_seen": 4658416, "step": 11370 }, { "epoch": 13.771186440677965, "grad_norm": 8.390981383854523e-05, "learning_rate": 1.337904239835599e-05, "loss": 0.0, "num_input_tokens_seen": 4660464, "step": 11375 }, { "epoch": 13.7772397094431, "grad_norm": 4.826115764444694e-05, "learning_rate": 1.3355663494440249e-05, "loss": 0.0, "num_input_tokens_seen": 4662480, "step": 11380 }, { "epoch": 13.783292978208232, "grad_norm": 0.0021386395674198866, "learning_rate": 1.3332297587711174e-05, "loss": 0.0, "num_input_tokens_seen": 4664400, "step": 11385 }, { "epoch": 13.789346246973366, "grad_norm": 0.0016877278685569763, "learning_rate": 1.3308944704249343e-05, "loss": 0.0, "num_input_tokens_seen": 4666512, "step": 11390 }, { "epoch": 13.795399515738499, "grad_norm": 0.0002691335685085505, "learning_rate": 1.3285604870120802e-05, "loss": 0.0, "num_input_tokens_seen": 4668496, "step": 11395 }, { "epoch": 13.801452784503631, "grad_norm": 6.459418364102021e-05, "learning_rate": 1.3262278111377028e-05, "loss": 0.0, "num_input_tokens_seen": 4670544, "step": 11400 }, { "epoch": 13.807506053268765, "grad_norm": 4.205571167403832e-05, "learning_rate": 1.3238964454054903e-05, "loss": 0.0, "num_input_tokens_seen": 4672496, "step": 11405 }, { "epoch": 13.813559322033898, "grad_norm": 4.6438661229331046e-05, "learning_rate": 1.321566392417668e-05, "loss": 0.0, "num_input_tokens_seen": 4674512, "step": 11410 }, { "epoch": 13.819612590799032, "grad_norm": 6.079869490349665e-05, "learning_rate": 1.3192376547749997e-05, "loss": 0.0, "num_input_tokens_seen": 4676656, "step": 11415 }, { "epoch": 13.825665859564165, "grad_norm": 0.00017566492897458375, "learning_rate": 1.3169102350767737e-05, "loss": 0.0, "num_input_tokens_seen": 4678672, "step": 11420 }, { "epoch": 13.831719128329297, "grad_norm": 0.0013358438154682517, "learning_rate": 1.314584135920815e-05, "loss": 0.0, "num_input_tokens_seen": 4680720, "step": 11425 }, { "epoch": 13.837772397094431, "grad_norm": 0.0003298347000963986, "learning_rate": 1.31225935990347e-05, "loss": 0.0, "num_input_tokens_seen": 4682736, "step": 11430 }, { "epoch": 13.843825665859564, "grad_norm": 6.346468580886722e-05, "learning_rate": 1.309935909619609e-05, "loss": 0.0, "num_input_tokens_seen": 4684784, "step": 11435 }, { "epoch": 13.849878934624698, "grad_norm": 9.766114089870825e-05, "learning_rate": 1.3076137876626232e-05, "loss": 0.0, "num_input_tokens_seen": 4686768, "step": 11440 }, { "epoch": 13.85593220338983, "grad_norm": 0.00011970099149039015, "learning_rate": 1.3052929966244214e-05, "loss": 0.0, "num_input_tokens_seen": 4688784, "step": 11445 }, { "epoch": 13.861985472154963, "grad_norm": 6.323494017124176e-05, "learning_rate": 1.3029735390954251e-05, "loss": 0.0, "num_input_tokens_seen": 4690864, "step": 11450 }, { "epoch": 13.868038740920097, "grad_norm": 8.256610453827307e-05, "learning_rate": 1.3006554176645713e-05, "loss": 0.0, "num_input_tokens_seen": 4692880, "step": 11455 }, { "epoch": 13.87409200968523, "grad_norm": 5.169608266442083e-05, "learning_rate": 1.2983386349193e-05, "loss": 0.0, "num_input_tokens_seen": 4694800, "step": 11460 }, { "epoch": 13.880145278450364, "grad_norm": 4.9122983909910545e-05, "learning_rate": 1.2960231934455605e-05, "loss": 0.0, "num_input_tokens_seen": 4697008, "step": 11465 }, { "epoch": 13.886198547215496, "grad_norm": 0.00019492124556563795, "learning_rate": 1.293709095827807e-05, "loss": 0.0, "num_input_tokens_seen": 4699024, "step": 11470 }, { "epoch": 13.892251815980629, "grad_norm": 5.7898305385606363e-05, "learning_rate": 1.2913963446489875e-05, "loss": 0.0, "num_input_tokens_seen": 4701072, "step": 11475 }, { "epoch": 13.898305084745763, "grad_norm": 0.0002651372633408755, "learning_rate": 1.2890849424905546e-05, "loss": 0.0, "num_input_tokens_seen": 4703152, "step": 11480 }, { "epoch": 13.904358353510895, "grad_norm": 0.0002349119895370677, "learning_rate": 1.2867748919324474e-05, "loss": 0.0407, "num_input_tokens_seen": 4705328, "step": 11485 }, { "epoch": 13.91041162227603, "grad_norm": 0.0001445374364266172, "learning_rate": 1.2844661955531036e-05, "loss": 0.0, "num_input_tokens_seen": 4707312, "step": 11490 }, { "epoch": 13.916464891041162, "grad_norm": 0.00011241265019634739, "learning_rate": 1.282158855929445e-05, "loss": 0.0, "num_input_tokens_seen": 4709424, "step": 11495 }, { "epoch": 13.922518159806295, "grad_norm": 0.0004859553591813892, "learning_rate": 1.27985287563688e-05, "loss": 0.0002, "num_input_tokens_seen": 4711408, "step": 11500 }, { "epoch": 13.928571428571429, "grad_norm": 42.41106414794922, "learning_rate": 1.2775482572492997e-05, "loss": 0.0014, "num_input_tokens_seen": 4713488, "step": 11505 }, { "epoch": 13.934624697336561, "grad_norm": 0.0004832239937968552, "learning_rate": 1.2752450033390756e-05, "loss": 0.0321, "num_input_tokens_seen": 4715440, "step": 11510 }, { "epoch": 13.940677966101696, "grad_norm": 0.0007737406995147467, "learning_rate": 1.272943116477055e-05, "loss": 0.0, "num_input_tokens_seen": 4717584, "step": 11515 }, { "epoch": 13.946731234866828, "grad_norm": 0.015233997255563736, "learning_rate": 1.270642599232563e-05, "loss": 0.0, "num_input_tokens_seen": 4719664, "step": 11520 }, { "epoch": 13.95278450363196, "grad_norm": 0.0007845546351745725, "learning_rate": 1.268343454173389e-05, "loss": 0.0, "num_input_tokens_seen": 4721744, "step": 11525 }, { "epoch": 13.958837772397095, "grad_norm": 0.0011630355147644877, "learning_rate": 1.266045683865798e-05, "loss": 0.0, "num_input_tokens_seen": 4723824, "step": 11530 }, { "epoch": 13.964891041162227, "grad_norm": 0.0010557740461081266, "learning_rate": 1.2637492908745163e-05, "loss": 0.0, "num_input_tokens_seen": 4725872, "step": 11535 }, { "epoch": 13.970944309927361, "grad_norm": 0.00308576924726367, "learning_rate": 1.2614542777627348e-05, "loss": 0.0, "num_input_tokens_seen": 4727888, "step": 11540 }, { "epoch": 13.976997578692494, "grad_norm": 0.0046475850977003574, "learning_rate": 1.2591606470921024e-05, "loss": 0.0, "num_input_tokens_seen": 4729808, "step": 11545 }, { "epoch": 13.983050847457626, "grad_norm": 0.00045791128650307655, "learning_rate": 1.2568684014227267e-05, "loss": 0.0, "num_input_tokens_seen": 4731696, "step": 11550 }, { "epoch": 13.98910411622276, "grad_norm": 0.00047566572902724147, "learning_rate": 1.254577543313168e-05, "loss": 0.0, "num_input_tokens_seen": 4733808, "step": 11555 }, { "epoch": 13.995157384987893, "grad_norm": 0.004428526386618614, "learning_rate": 1.252288075320439e-05, "loss": 0.0, "num_input_tokens_seen": 4735856, "step": 11560 }, { "epoch": 14.0, "eval_loss": 0.28447118401527405, "eval_runtime": 4.9581, "eval_samples_per_second": 74.021, "eval_steps_per_second": 18.556, "num_input_tokens_seen": 4737184, "step": 11564 }, { "epoch": 14.001210653753027, "grad_norm": 0.00014890398597344756, "learning_rate": 1.2500000000000006e-05, "loss": 0.0, "num_input_tokens_seen": 4737568, "step": 11565 }, { "epoch": 14.00726392251816, "grad_norm": 0.0002713954891078174, "learning_rate": 1.2477133199057572e-05, "loss": 0.0, "num_input_tokens_seen": 4739552, "step": 11570 }, { "epoch": 14.013317191283292, "grad_norm": 0.0049999612383544445, "learning_rate": 1.2454280375900609e-05, "loss": 0.0, "num_input_tokens_seen": 4741536, "step": 11575 }, { "epoch": 14.019370460048426, "grad_norm": 7.00877426424995e-05, "learning_rate": 1.2431441556036963e-05, "loss": 0.0, "num_input_tokens_seen": 4743424, "step": 11580 }, { "epoch": 14.025423728813559, "grad_norm": 0.00024999980814754963, "learning_rate": 1.2408616764958916e-05, "loss": 0.0, "num_input_tokens_seen": 4745408, "step": 11585 }, { "epoch": 14.031476997578693, "grad_norm": 0.0009397369576618075, "learning_rate": 1.2385806028143063e-05, "loss": 0.0, "num_input_tokens_seen": 4747392, "step": 11590 }, { "epoch": 14.037530266343826, "grad_norm": 0.0003313389897812158, "learning_rate": 1.2363009371050307e-05, "loss": 0.0, "num_input_tokens_seen": 4749344, "step": 11595 }, { "epoch": 14.043583535108958, "grad_norm": 0.0002767942496575415, "learning_rate": 1.2340226819125845e-05, "loss": 0.0003, "num_input_tokens_seen": 4751360, "step": 11600 }, { "epoch": 14.049636803874092, "grad_norm": 0.00019489662372507155, "learning_rate": 1.2317458397799129e-05, "loss": 0.0, "num_input_tokens_seen": 4753280, "step": 11605 }, { "epoch": 14.055690072639225, "grad_norm": 0.00011917779193026945, "learning_rate": 1.2294704132483826e-05, "loss": 0.0, "num_input_tokens_seen": 4755360, "step": 11610 }, { "epoch": 14.061743341404359, "grad_norm": 0.00044029977289028466, "learning_rate": 1.2271964048577841e-05, "loss": 0.0, "num_input_tokens_seen": 4757312, "step": 11615 }, { "epoch": 14.067796610169491, "grad_norm": 0.0002238813613075763, "learning_rate": 1.2249238171463197e-05, "loss": 0.0, "num_input_tokens_seen": 4759296, "step": 11620 }, { "epoch": 14.073849878934624, "grad_norm": 8.193244138965383e-05, "learning_rate": 1.2226526526506093e-05, "loss": 0.0, "num_input_tokens_seen": 4761376, "step": 11625 }, { "epoch": 14.079903147699758, "grad_norm": 0.00048150410293601453, "learning_rate": 1.2203829139056836e-05, "loss": 0.0, "num_input_tokens_seen": 4763456, "step": 11630 }, { "epoch": 14.08595641646489, "grad_norm": 0.00026618613628670573, "learning_rate": 1.218114603444981e-05, "loss": 0.0, "num_input_tokens_seen": 4765664, "step": 11635 }, { "epoch": 14.092009685230025, "grad_norm": 9.644337114877999e-05, "learning_rate": 1.2158477238003484e-05, "loss": 0.0, "num_input_tokens_seen": 4767712, "step": 11640 }, { "epoch": 14.098062953995157, "grad_norm": 0.0009699287475086749, "learning_rate": 1.2135822775020309e-05, "loss": 0.0, "num_input_tokens_seen": 4769792, "step": 11645 }, { "epoch": 14.104116222760291, "grad_norm": 6.225141260074452e-05, "learning_rate": 1.2113182670786787e-05, "loss": 0.0, "num_input_tokens_seen": 4771872, "step": 11650 }, { "epoch": 14.110169491525424, "grad_norm": 5.256848453427665e-05, "learning_rate": 1.2090556950573367e-05, "loss": 0.0, "num_input_tokens_seen": 4773856, "step": 11655 }, { "epoch": 14.116222760290556, "grad_norm": 4.446886305231601e-05, "learning_rate": 1.2067945639634445e-05, "loss": 0.0, "num_input_tokens_seen": 4775904, "step": 11660 }, { "epoch": 14.12227602905569, "grad_norm": 0.00010329565702704713, "learning_rate": 1.2045348763208333e-05, "loss": 0.0, "num_input_tokens_seen": 4777952, "step": 11665 }, { "epoch": 14.128329297820823, "grad_norm": 8.39427812024951e-05, "learning_rate": 1.202276634651724e-05, "loss": 0.0, "num_input_tokens_seen": 4780064, "step": 11670 }, { "epoch": 14.134382566585957, "grad_norm": 0.0001273017842322588, "learning_rate": 1.2000198414767211e-05, "loss": 0.0, "num_input_tokens_seen": 4782080, "step": 11675 }, { "epoch": 14.14043583535109, "grad_norm": 7.789132359903306e-05, "learning_rate": 1.197764499314818e-05, "loss": 0.0, "num_input_tokens_seen": 4784128, "step": 11680 }, { "epoch": 14.146489104116222, "grad_norm": 6.959127495065331e-05, "learning_rate": 1.1955106106833802e-05, "loss": 0.0, "num_input_tokens_seen": 4786144, "step": 11685 }, { "epoch": 14.152542372881356, "grad_norm": 5.1543338486226276e-05, "learning_rate": 1.1932581780981584e-05, "loss": 0.0, "num_input_tokens_seen": 4788256, "step": 11690 }, { "epoch": 14.158595641646489, "grad_norm": 5.450949174701236e-05, "learning_rate": 1.1910072040732738e-05, "loss": 0.0, "num_input_tokens_seen": 4790272, "step": 11695 }, { "epoch": 14.164648910411623, "grad_norm": 3.350571932969615e-05, "learning_rate": 1.188757691121221e-05, "loss": 0.0, "num_input_tokens_seen": 4792256, "step": 11700 }, { "epoch": 14.170702179176756, "grad_norm": 5.205437264521606e-05, "learning_rate": 1.1865096417528635e-05, "loss": 0.0, "num_input_tokens_seen": 4794272, "step": 11705 }, { "epoch": 14.176755447941888, "grad_norm": 7.011940761003643e-05, "learning_rate": 1.184263058477431e-05, "loss": 0.0, "num_input_tokens_seen": 4796352, "step": 11710 }, { "epoch": 14.182808716707022, "grad_norm": 0.00010119129001395777, "learning_rate": 1.1820179438025172e-05, "loss": 0.0, "num_input_tokens_seen": 4798272, "step": 11715 }, { "epoch": 14.188861985472155, "grad_norm": 0.00010952782031381503, "learning_rate": 1.179774300234076e-05, "loss": 0.0, "num_input_tokens_seen": 4800544, "step": 11720 }, { "epoch": 14.194915254237289, "grad_norm": 0.00010680587001843378, "learning_rate": 1.17753213027642e-05, "loss": 0.0, "num_input_tokens_seen": 4802688, "step": 11725 }, { "epoch": 14.200968523002421, "grad_norm": 2.2121261281426996e-05, "learning_rate": 1.1752914364322156e-05, "loss": 0.0, "num_input_tokens_seen": 4804704, "step": 11730 }, { "epoch": 14.207021791767554, "grad_norm": 0.0001276799157494679, "learning_rate": 1.1730522212024853e-05, "loss": 0.0, "num_input_tokens_seen": 4806816, "step": 11735 }, { "epoch": 14.213075060532688, "grad_norm": 3.6066805478185415e-05, "learning_rate": 1.1708144870865945e-05, "loss": 0.0, "num_input_tokens_seen": 4808800, "step": 11740 }, { "epoch": 14.21912832929782, "grad_norm": 0.0007090646540746093, "learning_rate": 1.1685782365822629e-05, "loss": 0.0, "num_input_tokens_seen": 4810944, "step": 11745 }, { "epoch": 14.225181598062955, "grad_norm": 0.0003964231873396784, "learning_rate": 1.166343472185549e-05, "loss": 0.0, "num_input_tokens_seen": 4812928, "step": 11750 }, { "epoch": 14.231234866828087, "grad_norm": 5.877452713320963e-05, "learning_rate": 1.164110196390855e-05, "loss": 0.0, "num_input_tokens_seen": 4814944, "step": 11755 }, { "epoch": 14.23728813559322, "grad_norm": 4.348082802607678e-05, "learning_rate": 1.1618784116909204e-05, "loss": 0.0, "num_input_tokens_seen": 4816960, "step": 11760 }, { "epoch": 14.243341404358354, "grad_norm": 4.755189365823753e-05, "learning_rate": 1.1596481205768206e-05, "loss": 0.0, "num_input_tokens_seen": 4819040, "step": 11765 }, { "epoch": 14.249394673123486, "grad_norm": 0.0005508217145688832, "learning_rate": 1.1574193255379645e-05, "loss": 0.0, "num_input_tokens_seen": 4821152, "step": 11770 }, { "epoch": 14.25544794188862, "grad_norm": 1.9426073777140118e-05, "learning_rate": 1.1551920290620904e-05, "loss": 0.0, "num_input_tokens_seen": 4823104, "step": 11775 }, { "epoch": 14.261501210653753, "grad_norm": 0.00010215555812465027, "learning_rate": 1.1529662336352645e-05, "loss": 0.0, "num_input_tokens_seen": 4825088, "step": 11780 }, { "epoch": 14.267554479418886, "grad_norm": 7.703419396420941e-05, "learning_rate": 1.150741941741877e-05, "loss": 0.0, "num_input_tokens_seen": 4827040, "step": 11785 }, { "epoch": 14.27360774818402, "grad_norm": 3.432413359405473e-05, "learning_rate": 1.1485191558646403e-05, "loss": 0.0, "num_input_tokens_seen": 4829088, "step": 11790 }, { "epoch": 14.279661016949152, "grad_norm": 3.0417153539019637e-05, "learning_rate": 1.1462978784845843e-05, "loss": 0.0, "num_input_tokens_seen": 4831072, "step": 11795 }, { "epoch": 14.285714285714286, "grad_norm": 1.95279953913996e-05, "learning_rate": 1.14407811208106e-05, "loss": 0.0099, "num_input_tokens_seen": 4833088, "step": 11800 }, { "epoch": 14.291767554479419, "grad_norm": 2.860104177671019e-05, "learning_rate": 1.1418598591317242e-05, "loss": 0.0, "num_input_tokens_seen": 4835200, "step": 11805 }, { "epoch": 14.297820823244551, "grad_norm": 2.137728370144032e-05, "learning_rate": 1.1396431221125516e-05, "loss": 0.0, "num_input_tokens_seen": 4837184, "step": 11810 }, { "epoch": 14.303874092009686, "grad_norm": 1.0368973562435713e-05, "learning_rate": 1.137427903497821e-05, "loss": 0.0, "num_input_tokens_seen": 4839200, "step": 11815 }, { "epoch": 14.309927360774818, "grad_norm": 1.0969745744660031e-05, "learning_rate": 1.135214205760117e-05, "loss": 0.0, "num_input_tokens_seen": 4841248, "step": 11820 }, { "epoch": 14.315980629539952, "grad_norm": 1.2095387319277506e-05, "learning_rate": 1.1330020313703268e-05, "loss": 0.0, "num_input_tokens_seen": 4843264, "step": 11825 }, { "epoch": 14.322033898305085, "grad_norm": 6.655584911641199e-06, "learning_rate": 1.1307913827976374e-05, "loss": 0.0081, "num_input_tokens_seen": 4845280, "step": 11830 }, { "epoch": 14.328087167070217, "grad_norm": 1.6355616025975905e-05, "learning_rate": 1.128582262509532e-05, "loss": 0.0, "num_input_tokens_seen": 4847424, "step": 11835 }, { "epoch": 14.334140435835351, "grad_norm": 1.893915941764135e-05, "learning_rate": 1.1263746729717911e-05, "loss": 0.0, "num_input_tokens_seen": 4849376, "step": 11840 }, { "epoch": 14.340193704600484, "grad_norm": 1.2455249816412106e-05, "learning_rate": 1.1241686166484805e-05, "loss": 0.0, "num_input_tokens_seen": 4851360, "step": 11845 }, { "epoch": 14.346246973365618, "grad_norm": 1.1454673767730128e-05, "learning_rate": 1.1219640960019608e-05, "loss": 0.0, "num_input_tokens_seen": 4853344, "step": 11850 }, { "epoch": 14.35230024213075, "grad_norm": 1.3988619684823789e-05, "learning_rate": 1.1197611134928765e-05, "loss": 0.0, "num_input_tokens_seen": 4855264, "step": 11855 }, { "epoch": 14.358353510895883, "grad_norm": 7.798376464052126e-05, "learning_rate": 1.1175596715801515e-05, "loss": 0.0, "num_input_tokens_seen": 4857248, "step": 11860 }, { "epoch": 14.364406779661017, "grad_norm": 2.0776216842932627e-05, "learning_rate": 1.115359772720996e-05, "loss": 0.0, "num_input_tokens_seen": 4859360, "step": 11865 }, { "epoch": 14.37046004842615, "grad_norm": 1.1831957635877188e-05, "learning_rate": 1.1131614193708948e-05, "loss": 0.0, "num_input_tokens_seen": 4861536, "step": 11870 }, { "epoch": 14.376513317191284, "grad_norm": 8.412060196860693e-06, "learning_rate": 1.1109646139836075e-05, "loss": 0.0, "num_input_tokens_seen": 4863520, "step": 11875 }, { "epoch": 14.382566585956416, "grad_norm": 1.782698200258892e-05, "learning_rate": 1.1087693590111667e-05, "loss": 0.0, "num_input_tokens_seen": 4865504, "step": 11880 }, { "epoch": 14.388619854721549, "grad_norm": 9.932583452609833e-06, "learning_rate": 1.1065756569038743e-05, "loss": 0.0, "num_input_tokens_seen": 4867584, "step": 11885 }, { "epoch": 14.394673123486683, "grad_norm": 1.1869452464452479e-05, "learning_rate": 1.1043835101102979e-05, "loss": 0.0, "num_input_tokens_seen": 4869600, "step": 11890 }, { "epoch": 14.400726392251816, "grad_norm": 7.709569217695389e-06, "learning_rate": 1.1021929210772724e-05, "loss": 0.0, "num_input_tokens_seen": 4871776, "step": 11895 }, { "epoch": 14.40677966101695, "grad_norm": 1.5092405192262959e-05, "learning_rate": 1.1000038922498884e-05, "loss": 0.0, "num_input_tokens_seen": 4873824, "step": 11900 }, { "epoch": 14.412832929782082, "grad_norm": 1.2879281712230295e-05, "learning_rate": 1.0978164260715016e-05, "loss": 0.0, "num_input_tokens_seen": 4876096, "step": 11905 }, { "epoch": 14.418886198547215, "grad_norm": 7.974384971021209e-06, "learning_rate": 1.0956305249837163e-05, "loss": 0.0, "num_input_tokens_seen": 4878048, "step": 11910 }, { "epoch": 14.424939467312349, "grad_norm": 1.5388310202979483e-05, "learning_rate": 1.0934461914263967e-05, "loss": 0.0, "num_input_tokens_seen": 4880096, "step": 11915 }, { "epoch": 14.430992736077481, "grad_norm": 2.3034510377328843e-05, "learning_rate": 1.091263427837653e-05, "loss": 0.0, "num_input_tokens_seen": 4882144, "step": 11920 }, { "epoch": 14.437046004842616, "grad_norm": 1.2723337022180203e-05, "learning_rate": 1.0890822366538447e-05, "loss": 0.0, "num_input_tokens_seen": 4884224, "step": 11925 }, { "epoch": 14.443099273607748, "grad_norm": 1.7344975276500918e-05, "learning_rate": 1.0869026203095758e-05, "loss": 0.0, "num_input_tokens_seen": 4886624, "step": 11930 }, { "epoch": 14.44915254237288, "grad_norm": 7.550234386144439e-06, "learning_rate": 1.0847245812376924e-05, "loss": 0.0, "num_input_tokens_seen": 4888672, "step": 11935 }, { "epoch": 14.455205811138015, "grad_norm": 8.015880666789599e-06, "learning_rate": 1.0825481218692807e-05, "loss": 0.0, "num_input_tokens_seen": 4890656, "step": 11940 }, { "epoch": 14.461259079903147, "grad_norm": 1.7686506907921284e-05, "learning_rate": 1.0803732446336626e-05, "loss": 0.0, "num_input_tokens_seen": 4892800, "step": 11945 }, { "epoch": 14.467312348668282, "grad_norm": 1.0591853424557485e-05, "learning_rate": 1.0781999519583949e-05, "loss": 0.0, "num_input_tokens_seen": 4894880, "step": 11950 }, { "epoch": 14.473365617433414, "grad_norm": 7.995764462975785e-06, "learning_rate": 1.0760282462692647e-05, "loss": 0.0, "num_input_tokens_seen": 4897120, "step": 11955 }, { "epoch": 14.479418886198546, "grad_norm": 7.156783340178663e-06, "learning_rate": 1.0738581299902912e-05, "loss": 0.0, "num_input_tokens_seen": 4899008, "step": 11960 }, { "epoch": 14.48547215496368, "grad_norm": 1.8213147995993495e-05, "learning_rate": 1.071689605543713e-05, "loss": 0.0, "num_input_tokens_seen": 4901184, "step": 11965 }, { "epoch": 14.491525423728813, "grad_norm": 2.8993912565056235e-05, "learning_rate": 1.069522675349999e-05, "loss": 0.0, "num_input_tokens_seen": 4903232, "step": 11970 }, { "epoch": 14.497578692493947, "grad_norm": 1.0172986549150664e-05, "learning_rate": 1.067357341827834e-05, "loss": 0.0, "num_input_tokens_seen": 4905152, "step": 11975 }, { "epoch": 14.50363196125908, "grad_norm": 7.989679943420924e-06, "learning_rate": 1.0651936073941223e-05, "loss": 0.0, "num_input_tokens_seen": 4907360, "step": 11980 }, { "epoch": 14.509685230024212, "grad_norm": 1.0588154509605374e-05, "learning_rate": 1.063031474463983e-05, "loss": 0.0, "num_input_tokens_seen": 4909600, "step": 11985 }, { "epoch": 14.515738498789347, "grad_norm": 1.1232543329242617e-05, "learning_rate": 1.0608709454507478e-05, "loss": 0.0, "num_input_tokens_seen": 4911712, "step": 11990 }, { "epoch": 14.521791767554479, "grad_norm": 1.1412678759370465e-05, "learning_rate": 1.0587120227659571e-05, "loss": 0.0, "num_input_tokens_seen": 4913632, "step": 11995 }, { "epoch": 14.527845036319613, "grad_norm": 0.00025339346029795706, "learning_rate": 1.056554708819362e-05, "loss": 0.0, "num_input_tokens_seen": 4915648, "step": 12000 }, { "epoch": 14.533898305084746, "grad_norm": 1.019220599118853e-05, "learning_rate": 1.0543990060189116e-05, "loss": 0.0269, "num_input_tokens_seen": 4917728, "step": 12005 }, { "epoch": 14.539951573849878, "grad_norm": 1.8296415873919614e-05, "learning_rate": 1.0522449167707631e-05, "loss": 0.0, "num_input_tokens_seen": 4919680, "step": 12010 }, { "epoch": 14.546004842615012, "grad_norm": 1.2781054465449415e-05, "learning_rate": 1.0500924434792697e-05, "loss": 0.0, "num_input_tokens_seen": 4921728, "step": 12015 }, { "epoch": 14.552058111380145, "grad_norm": 6.428051619877806e-06, "learning_rate": 1.0479415885469787e-05, "loss": 0.0, "num_input_tokens_seen": 4923744, "step": 12020 }, { "epoch": 14.558111380145279, "grad_norm": 7.335190275625791e-06, "learning_rate": 1.0457923543746368e-05, "loss": 0.0379, "num_input_tokens_seen": 4925856, "step": 12025 }, { "epoch": 14.564164648910412, "grad_norm": 1.2202448488096707e-05, "learning_rate": 1.0436447433611743e-05, "loss": 0.0, "num_input_tokens_seen": 4927808, "step": 12030 }, { "epoch": 14.570217917675544, "grad_norm": 1.2638628504646476e-05, "learning_rate": 1.0414987579037171e-05, "loss": 0.0, "num_input_tokens_seen": 4929856, "step": 12035 }, { "epoch": 14.576271186440678, "grad_norm": 3.320376345072873e-05, "learning_rate": 1.0393544003975722e-05, "loss": 0.0, "num_input_tokens_seen": 4931776, "step": 12040 }, { "epoch": 14.58232445520581, "grad_norm": 1.4673746591142844e-05, "learning_rate": 1.0372116732362305e-05, "loss": 0.0, "num_input_tokens_seen": 4933824, "step": 12045 }, { "epoch": 14.588377723970945, "grad_norm": 6.90505248712725e-06, "learning_rate": 1.0350705788113635e-05, "loss": 0.0, "num_input_tokens_seen": 4935840, "step": 12050 }, { "epoch": 14.594430992736077, "grad_norm": 9.841667633736506e-06, "learning_rate": 1.0329311195128195e-05, "loss": 0.0, "num_input_tokens_seen": 4937888, "step": 12055 }, { "epoch": 14.600484261501212, "grad_norm": 1.5838566469028592e-05, "learning_rate": 1.0307932977286217e-05, "loss": 0.0, "num_input_tokens_seen": 4939936, "step": 12060 }, { "epoch": 14.606537530266344, "grad_norm": 1.4648855540144723e-05, "learning_rate": 1.0286571158449688e-05, "loss": 0.0, "num_input_tokens_seen": 4942080, "step": 12065 }, { "epoch": 14.612590799031477, "grad_norm": 2.3159074771683663e-05, "learning_rate": 1.0265225762462225e-05, "loss": 0.0, "num_input_tokens_seen": 4944160, "step": 12070 }, { "epoch": 14.61864406779661, "grad_norm": 1.867055834736675e-05, "learning_rate": 1.024389681314918e-05, "loss": 0.0, "num_input_tokens_seen": 4946272, "step": 12075 }, { "epoch": 14.624697336561743, "grad_norm": 9.279727237299085e-06, "learning_rate": 1.0222584334317515e-05, "loss": 0.0, "num_input_tokens_seen": 4948288, "step": 12080 }, { "epoch": 14.630750605326877, "grad_norm": 2.6701323804445565e-05, "learning_rate": 1.0201288349755808e-05, "loss": 0.0, "num_input_tokens_seen": 4950368, "step": 12085 }, { "epoch": 14.63680387409201, "grad_norm": 9.284267434850335e-06, "learning_rate": 1.0180008883234232e-05, "loss": 0.0, "num_input_tokens_seen": 4952384, "step": 12090 }, { "epoch": 14.642857142857142, "grad_norm": 1.1937876479350962e-05, "learning_rate": 1.0158745958504526e-05, "loss": 0.0, "num_input_tokens_seen": 4954432, "step": 12095 }, { "epoch": 14.648910411622277, "grad_norm": 1.130078817368485e-05, "learning_rate": 1.013749959929996e-05, "loss": 0.0, "num_input_tokens_seen": 4956384, "step": 12100 }, { "epoch": 14.654963680387409, "grad_norm": 1.6467593013658188e-05, "learning_rate": 1.0116269829335316e-05, "loss": 0.0, "num_input_tokens_seen": 4958304, "step": 12105 }, { "epoch": 14.661016949152543, "grad_norm": 9.038081770995632e-05, "learning_rate": 1.0095056672306854e-05, "loss": 0.0, "num_input_tokens_seen": 4960352, "step": 12110 }, { "epoch": 14.667070217917676, "grad_norm": 8.919065294321626e-06, "learning_rate": 1.0073860151892292e-05, "loss": 0.0, "num_input_tokens_seen": 4962336, "step": 12115 }, { "epoch": 14.673123486682808, "grad_norm": 1.1350082786520943e-05, "learning_rate": 1.005268029175081e-05, "loss": 0.0, "num_input_tokens_seen": 4964352, "step": 12120 }, { "epoch": 14.679176755447942, "grad_norm": 2.2354384782374837e-05, "learning_rate": 1.0031517115522926e-05, "loss": 0.0, "num_input_tokens_seen": 4966560, "step": 12125 }, { "epoch": 14.685230024213075, "grad_norm": 1.0888235010497738e-05, "learning_rate": 1.0010370646830603e-05, "loss": 0.0, "num_input_tokens_seen": 4968640, "step": 12130 }, { "epoch": 14.69128329297821, "grad_norm": 3.012302659044508e-05, "learning_rate": 9.989240909277115e-06, "loss": 0.0, "num_input_tokens_seen": 4970656, "step": 12135 }, { "epoch": 14.697336561743342, "grad_norm": 1.0408724847366102e-05, "learning_rate": 9.968127926447074e-06, "loss": 0.0, "num_input_tokens_seen": 4972544, "step": 12140 }, { "epoch": 14.703389830508474, "grad_norm": 3.620460847741924e-05, "learning_rate": 9.947031721906392e-06, "loss": 0.0, "num_input_tokens_seen": 4974592, "step": 12145 }, { "epoch": 14.709443099273608, "grad_norm": 7.205284418887459e-06, "learning_rate": 9.925952319202247e-06, "loss": 0.0, "num_input_tokens_seen": 4976512, "step": 12150 }, { "epoch": 14.71549636803874, "grad_norm": 1.5660401913919486e-05, "learning_rate": 9.90488974186306e-06, "loss": 0.0, "num_input_tokens_seen": 4978496, "step": 12155 }, { "epoch": 14.721549636803875, "grad_norm": 0.00013802200555801392, "learning_rate": 9.883844013398508e-06, "loss": 0.0, "num_input_tokens_seen": 4980512, "step": 12160 }, { "epoch": 14.727602905569007, "grad_norm": 1.7857231796369888e-05, "learning_rate": 9.862815157299391e-06, "loss": 0.0, "num_input_tokens_seen": 4982592, "step": 12165 }, { "epoch": 14.73365617433414, "grad_norm": 2.138238778570667e-05, "learning_rate": 9.841803197037756e-06, "loss": 0.0, "num_input_tokens_seen": 4984768, "step": 12170 }, { "epoch": 14.739709443099274, "grad_norm": 0.00017354545707348734, "learning_rate": 9.820808156066726e-06, "loss": 0.0, "num_input_tokens_seen": 4986784, "step": 12175 }, { "epoch": 14.745762711864407, "grad_norm": 3.2174979423871264e-05, "learning_rate": 9.799830057820567e-06, "loss": 0.0, "num_input_tokens_seen": 4988736, "step": 12180 }, { "epoch": 14.75181598062954, "grad_norm": 1.2030813195451628e-05, "learning_rate": 9.778868925714657e-06, "loss": 0.0, "num_input_tokens_seen": 4990880, "step": 12185 }, { "epoch": 14.757869249394673, "grad_norm": 1.3090888387523592e-05, "learning_rate": 9.757924783145381e-06, "loss": 0.0, "num_input_tokens_seen": 4992896, "step": 12190 }, { "epoch": 14.763922518159806, "grad_norm": 8.530388186045457e-06, "learning_rate": 9.736997653490215e-06, "loss": 0.0, "num_input_tokens_seen": 4994944, "step": 12195 }, { "epoch": 14.76997578692494, "grad_norm": 1.5525502021773718e-05, "learning_rate": 9.716087560107617e-06, "loss": 0.001, "num_input_tokens_seen": 4996928, "step": 12200 }, { "epoch": 14.776029055690072, "grad_norm": 1.5343421182478778e-05, "learning_rate": 9.69519452633703e-06, "loss": 0.0, "num_input_tokens_seen": 4998944, "step": 12205 }, { "epoch": 14.782082324455207, "grad_norm": 1.3042565115028992e-05, "learning_rate": 9.674318575498868e-06, "loss": 0.0, "num_input_tokens_seen": 5001024, "step": 12210 }, { "epoch": 14.788135593220339, "grad_norm": 1.4527621715387795e-05, "learning_rate": 9.653459730894462e-06, "loss": 0.0, "num_input_tokens_seen": 5003040, "step": 12215 }, { "epoch": 14.794188861985472, "grad_norm": 1.3048624168732204e-05, "learning_rate": 9.632618015806052e-06, "loss": 0.0, "num_input_tokens_seen": 5005152, "step": 12220 }, { "epoch": 14.800242130750606, "grad_norm": 1.1899536730197724e-05, "learning_rate": 9.611793453496792e-06, "loss": 0.0, "num_input_tokens_seen": 5007168, "step": 12225 }, { "epoch": 14.806295399515738, "grad_norm": 8.475319191347808e-06, "learning_rate": 9.590986067210623e-06, "loss": 0.0, "num_input_tokens_seen": 5009152, "step": 12230 }, { "epoch": 14.812348668280872, "grad_norm": 1.198192057927372e-05, "learning_rate": 9.570195880172381e-06, "loss": 0.0, "num_input_tokens_seen": 5011200, "step": 12235 }, { "epoch": 14.818401937046005, "grad_norm": 8.599819921073504e-06, "learning_rate": 9.549422915587666e-06, "loss": 0.0, "num_input_tokens_seen": 5013472, "step": 12240 }, { "epoch": 14.824455205811137, "grad_norm": 8.38319283502642e-06, "learning_rate": 9.528667196642868e-06, "loss": 0.0, "num_input_tokens_seen": 5015360, "step": 12245 }, { "epoch": 14.830508474576272, "grad_norm": 1.649592559260782e-05, "learning_rate": 9.507928746505126e-06, "loss": 0.0, "num_input_tokens_seen": 5017344, "step": 12250 }, { "epoch": 14.836561743341404, "grad_norm": 1.1803747838712297e-05, "learning_rate": 9.487207588322302e-06, "loss": 0.0, "num_input_tokens_seen": 5019264, "step": 12255 }, { "epoch": 14.842615012106538, "grad_norm": 2.8614258553716354e-05, "learning_rate": 9.46650374522296e-06, "loss": 0.0089, "num_input_tokens_seen": 5021312, "step": 12260 }, { "epoch": 14.84866828087167, "grad_norm": 6.4376899899798445e-06, "learning_rate": 9.445817240316332e-06, "loss": 0.0, "num_input_tokens_seen": 5023360, "step": 12265 }, { "epoch": 14.854721549636803, "grad_norm": 7.718420420133043e-06, "learning_rate": 9.425148096692301e-06, "loss": 0.0, "num_input_tokens_seen": 5025472, "step": 12270 }, { "epoch": 14.860774818401937, "grad_norm": 9.180852430290543e-06, "learning_rate": 9.404496337421365e-06, "loss": 0.0, "num_input_tokens_seen": 5027744, "step": 12275 }, { "epoch": 14.86682808716707, "grad_norm": 0.00018219415505882353, "learning_rate": 9.38386198555465e-06, "loss": 0.0, "num_input_tokens_seen": 5029728, "step": 12280 }, { "epoch": 14.872881355932204, "grad_norm": 4.9224592657992616e-05, "learning_rate": 9.363245064123791e-06, "loss": 0.0, "num_input_tokens_seen": 5031840, "step": 12285 }, { "epoch": 14.878934624697337, "grad_norm": 2.4178139938157983e-05, "learning_rate": 9.34264559614103e-06, "loss": 0.0, "num_input_tokens_seen": 5033952, "step": 12290 }, { "epoch": 14.884987893462469, "grad_norm": 6.84483238728717e-05, "learning_rate": 9.322063604599093e-06, "loss": 0.0, "num_input_tokens_seen": 5036000, "step": 12295 }, { "epoch": 14.891041162227603, "grad_norm": 1.627679739613086e-05, "learning_rate": 9.301499112471204e-06, "loss": 0.0, "num_input_tokens_seen": 5038208, "step": 12300 }, { "epoch": 14.897094430992736, "grad_norm": 1.575334681547247e-05, "learning_rate": 9.28095214271106e-06, "loss": 0.0, "num_input_tokens_seen": 5040384, "step": 12305 }, { "epoch": 14.90314769975787, "grad_norm": 8.244348464359064e-06, "learning_rate": 9.260422718252798e-06, "loss": 0.0, "num_input_tokens_seen": 5042400, "step": 12310 }, { "epoch": 14.909200968523002, "grad_norm": 1.0988011126755737e-05, "learning_rate": 9.23991086201097e-06, "loss": 0.0, "num_input_tokens_seen": 5044608, "step": 12315 }, { "epoch": 14.915254237288135, "grad_norm": 0.00030694884480908513, "learning_rate": 9.219416596880517e-06, "loss": 0.0, "num_input_tokens_seen": 5046720, "step": 12320 }, { "epoch": 14.92130750605327, "grad_norm": 0.00024208470131270587, "learning_rate": 9.19893994573674e-06, "loss": 0.0, "num_input_tokens_seen": 5048672, "step": 12325 }, { "epoch": 14.927360774818402, "grad_norm": 1.5841620552237146e-05, "learning_rate": 9.178480931435315e-06, "loss": 0.0, "num_input_tokens_seen": 5050784, "step": 12330 }, { "epoch": 14.933414043583536, "grad_norm": 8.239527232944965e-06, "learning_rate": 9.158039576812177e-06, "loss": 0.0, "num_input_tokens_seen": 5052896, "step": 12335 }, { "epoch": 14.939467312348668, "grad_norm": 3.2268457289319485e-05, "learning_rate": 9.137615904683575e-06, "loss": 0.0, "num_input_tokens_seen": 5054944, "step": 12340 }, { "epoch": 14.9455205811138, "grad_norm": 1.3447290257317945e-05, "learning_rate": 9.117209937846053e-06, "loss": 0.0, "num_input_tokens_seen": 5057184, "step": 12345 }, { "epoch": 14.951573849878935, "grad_norm": 4.8036374209914356e-05, "learning_rate": 9.096821699076322e-06, "loss": 0.0, "num_input_tokens_seen": 5059296, "step": 12350 }, { "epoch": 14.957627118644067, "grad_norm": 5.58854517294094e-05, "learning_rate": 9.07645121113138e-06, "loss": 0.0, "num_input_tokens_seen": 5061184, "step": 12355 }, { "epoch": 14.963680387409202, "grad_norm": 1.1417174391681328e-05, "learning_rate": 9.056098496748358e-06, "loss": 0.0, "num_input_tokens_seen": 5063232, "step": 12360 }, { "epoch": 14.969733656174334, "grad_norm": 9.269971997127868e-06, "learning_rate": 9.035763578644579e-06, "loss": 0.0, "num_input_tokens_seen": 5065280, "step": 12365 }, { "epoch": 14.975786924939467, "grad_norm": 1.028814585879445e-05, "learning_rate": 9.015446479517487e-06, "loss": 0.0, "num_input_tokens_seen": 5067328, "step": 12370 }, { "epoch": 14.9818401937046, "grad_norm": 8.317610081576277e-06, "learning_rate": 8.995147222044639e-06, "loss": 0.0001, "num_input_tokens_seen": 5069408, "step": 12375 }, { "epoch": 14.987893462469733, "grad_norm": 1.0695494893298019e-05, "learning_rate": 8.974865828883674e-06, "loss": 0.0, "num_input_tokens_seen": 5071392, "step": 12380 }, { "epoch": 14.993946731234868, "grad_norm": 1.7298882085015066e-05, "learning_rate": 8.95460232267232e-06, "loss": 0.0, "num_input_tokens_seen": 5073376, "step": 12385 }, { "epoch": 15.0, "grad_norm": 6.670272796327481e-06, "learning_rate": 8.934356726028287e-06, "loss": 0.0, "num_input_tokens_seen": 5075136, "step": 12390 }, { "epoch": 15.0, "eval_loss": 0.4197850525379181, "eval_runtime": 4.9604, "eval_samples_per_second": 73.987, "eval_steps_per_second": 18.547, "num_input_tokens_seen": 5075136, "step": 12390 }, { "epoch": 15.006053268765132, "grad_norm": 0.0002114462695317343, "learning_rate": 8.914129061549345e-06, "loss": 0.0, "num_input_tokens_seen": 5077344, "step": 12395 }, { "epoch": 15.012106537530267, "grad_norm": 9.055672308022622e-06, "learning_rate": 8.893919351813224e-06, "loss": 0.0, "num_input_tokens_seen": 5079360, "step": 12400 }, { "epoch": 15.0181598062954, "grad_norm": 1.2761925063387025e-05, "learning_rate": 8.873727619377611e-06, "loss": 0.0, "num_input_tokens_seen": 5081504, "step": 12405 }, { "epoch": 15.024213075060533, "grad_norm": 7.685984019190073e-05, "learning_rate": 8.853553886780139e-06, "loss": 0.0, "num_input_tokens_seen": 5083616, "step": 12410 }, { "epoch": 15.030266343825666, "grad_norm": 6.0020315686415415e-06, "learning_rate": 8.833398176538343e-06, "loss": 0.0, "num_input_tokens_seen": 5085632, "step": 12415 }, { "epoch": 15.036319612590798, "grad_norm": 1.1747141797968652e-05, "learning_rate": 8.81326051114964e-06, "loss": 0.0, "num_input_tokens_seen": 5087616, "step": 12420 }, { "epoch": 15.042372881355933, "grad_norm": 1.4380860193341505e-05, "learning_rate": 8.793140913091314e-06, "loss": 0.0, "num_input_tokens_seen": 5089472, "step": 12425 }, { "epoch": 15.048426150121065, "grad_norm": 8.08732511359267e-06, "learning_rate": 8.773039404820472e-06, "loss": 0.0, "num_input_tokens_seen": 5091648, "step": 12430 }, { "epoch": 15.0544794188862, "grad_norm": 6.226911864359863e-06, "learning_rate": 8.752956008774027e-06, "loss": 0.0, "num_input_tokens_seen": 5093728, "step": 12435 }, { "epoch": 15.060532687651332, "grad_norm": 2.342499647056684e-05, "learning_rate": 8.732890747368711e-06, "loss": 0.0, "num_input_tokens_seen": 5095712, "step": 12440 }, { "epoch": 15.066585956416464, "grad_norm": 1.4624642972194124e-05, "learning_rate": 8.71284364300095e-06, "loss": 0.0, "num_input_tokens_seen": 5097856, "step": 12445 }, { "epoch": 15.072639225181598, "grad_norm": 1.2014496860501822e-05, "learning_rate": 8.69281471804698e-06, "loss": 0.0, "num_input_tokens_seen": 5099744, "step": 12450 }, { "epoch": 15.07869249394673, "grad_norm": 7.963571079017129e-06, "learning_rate": 8.672803994862662e-06, "loss": 0.0, "num_input_tokens_seen": 5101856, "step": 12455 }, { "epoch": 15.084745762711865, "grad_norm": 6.673443749605212e-06, "learning_rate": 8.65281149578362e-06, "loss": 0.0, "num_input_tokens_seen": 5103872, "step": 12460 }, { "epoch": 15.090799031476998, "grad_norm": 9.678989044914488e-06, "learning_rate": 8.632837243125082e-06, "loss": 0.0, "num_input_tokens_seen": 5105888, "step": 12465 }, { "epoch": 15.09685230024213, "grad_norm": 6.638567356276326e-06, "learning_rate": 8.612881259181938e-06, "loss": 0.0, "num_input_tokens_seen": 5107808, "step": 12470 }, { "epoch": 15.102905569007264, "grad_norm": 9.224901077686809e-06, "learning_rate": 8.59294356622867e-06, "loss": 0.0, "num_input_tokens_seen": 5109888, "step": 12475 }, { "epoch": 15.108958837772397, "grad_norm": 6.592224963242188e-05, "learning_rate": 8.573024186519357e-06, "loss": 0.0, "num_input_tokens_seen": 5112000, "step": 12480 }, { "epoch": 15.115012106537531, "grad_norm": 1.2516696187958587e-05, "learning_rate": 8.553123142287617e-06, "loss": 0.0, "num_input_tokens_seen": 5113984, "step": 12485 }, { "epoch": 15.121065375302663, "grad_norm": 2.169524123019073e-05, "learning_rate": 8.533240455746647e-06, "loss": 0.0, "num_input_tokens_seen": 5116000, "step": 12490 }, { "epoch": 15.127118644067796, "grad_norm": 4.4077754864702e-05, "learning_rate": 8.513376149089095e-06, "loss": 0.0, "num_input_tokens_seen": 5118144, "step": 12495 }, { "epoch": 15.13317191283293, "grad_norm": 2.1117530195624568e-05, "learning_rate": 8.493530244487122e-06, "loss": 0.0, "num_input_tokens_seen": 5120288, "step": 12500 }, { "epoch": 15.139225181598063, "grad_norm": 0.0001524465624243021, "learning_rate": 8.473702764092376e-06, "loss": 0.0, "num_input_tokens_seen": 5122304, "step": 12505 }, { "epoch": 15.145278450363197, "grad_norm": 1.1715364962583408e-05, "learning_rate": 8.453893730035877e-06, "loss": 0.0, "num_input_tokens_seen": 5124448, "step": 12510 }, { "epoch": 15.15133171912833, "grad_norm": 9.781627340998966e-06, "learning_rate": 8.434103164428117e-06, "loss": 0.0, "num_input_tokens_seen": 5126784, "step": 12515 }, { "epoch": 15.157384987893462, "grad_norm": 7.166084105847403e-05, "learning_rate": 8.414331089358943e-06, "loss": 0.0, "num_input_tokens_seen": 5128704, "step": 12520 }, { "epoch": 15.163438256658596, "grad_norm": 7.5618427217705175e-06, "learning_rate": 8.394577526897564e-06, "loss": 0.0, "num_input_tokens_seen": 5130880, "step": 12525 }, { "epoch": 15.169491525423728, "grad_norm": 9.990042599383742e-06, "learning_rate": 8.374842499092534e-06, "loss": 0.0, "num_input_tokens_seen": 5133024, "step": 12530 }, { "epoch": 15.175544794188863, "grad_norm": 1.9326222172821872e-05, "learning_rate": 8.355126027971713e-06, "loss": 0.0, "num_input_tokens_seen": 5135040, "step": 12535 }, { "epoch": 15.181598062953995, "grad_norm": 7.214182005554903e-06, "learning_rate": 8.335428135542244e-06, "loss": 0.0, "num_input_tokens_seen": 5137152, "step": 12540 }, { "epoch": 15.187651331719128, "grad_norm": 7.525648470618762e-06, "learning_rate": 8.315748843790563e-06, "loss": 0.0, "num_input_tokens_seen": 5139168, "step": 12545 }, { "epoch": 15.193704600484262, "grad_norm": 7.5248472057865e-06, "learning_rate": 8.29608817468229e-06, "loss": 0.0, "num_input_tokens_seen": 5141216, "step": 12550 }, { "epoch": 15.199757869249394, "grad_norm": 1.374479143123608e-05, "learning_rate": 8.276446150162313e-06, "loss": 0.0, "num_input_tokens_seen": 5143200, "step": 12555 }, { "epoch": 15.205811138014528, "grad_norm": 1.3974835383123718e-05, "learning_rate": 8.256822792154672e-06, "loss": 0.0, "num_input_tokens_seen": 5145216, "step": 12560 }, { "epoch": 15.211864406779661, "grad_norm": 9.275755473936442e-06, "learning_rate": 8.237218122562595e-06, "loss": 0.0, "num_input_tokens_seen": 5147232, "step": 12565 }, { "epoch": 15.217917675544793, "grad_norm": 1.2352385056146886e-05, "learning_rate": 8.217632163268435e-06, "loss": 0.0, "num_input_tokens_seen": 5149280, "step": 12570 }, { "epoch": 15.223970944309928, "grad_norm": 9.558570127410349e-06, "learning_rate": 8.198064936133668e-06, "loss": 0.0, "num_input_tokens_seen": 5151392, "step": 12575 }, { "epoch": 15.23002421307506, "grad_norm": 1.1450936654000543e-05, "learning_rate": 8.178516462998861e-06, "loss": 0.0, "num_input_tokens_seen": 5153344, "step": 12580 }, { "epoch": 15.236077481840194, "grad_norm": 2.5277195163653232e-05, "learning_rate": 8.158986765683647e-06, "loss": 0.0, "num_input_tokens_seen": 5155424, "step": 12585 }, { "epoch": 15.242130750605327, "grad_norm": 1.2380896805552766e-05, "learning_rate": 8.139475865986697e-06, "loss": 0.0, "num_input_tokens_seen": 5157504, "step": 12590 }, { "epoch": 15.24818401937046, "grad_norm": 7.669922524655703e-06, "learning_rate": 8.119983785685717e-06, "loss": 0.0, "num_input_tokens_seen": 5159520, "step": 12595 }, { "epoch": 15.254237288135593, "grad_norm": 9.497331120655872e-06, "learning_rate": 8.100510546537388e-06, "loss": 0.0, "num_input_tokens_seen": 5161536, "step": 12600 }, { "epoch": 15.260290556900726, "grad_norm": 1.1453164916019887e-05, "learning_rate": 8.081056170277362e-06, "loss": 0.0, "num_input_tokens_seen": 5163520, "step": 12605 }, { "epoch": 15.26634382566586, "grad_norm": 8.04883075034013e-06, "learning_rate": 8.061620678620265e-06, "loss": 0.0, "num_input_tokens_seen": 5165536, "step": 12610 }, { "epoch": 15.272397094430993, "grad_norm": 2.7185775252291933e-05, "learning_rate": 8.042204093259598e-06, "loss": 0.0, "num_input_tokens_seen": 5167360, "step": 12615 }, { "epoch": 15.278450363196125, "grad_norm": 8.070112926361617e-06, "learning_rate": 8.0228064358678e-06, "loss": 0.0, "num_input_tokens_seen": 5169344, "step": 12620 }, { "epoch": 15.28450363196126, "grad_norm": 6.2562371567764785e-06, "learning_rate": 8.003427728096163e-06, "loss": 0.0, "num_input_tokens_seen": 5171424, "step": 12625 }, { "epoch": 15.290556900726392, "grad_norm": 1.1295711374259554e-05, "learning_rate": 7.98406799157483e-06, "loss": 0.0, "num_input_tokens_seen": 5173440, "step": 12630 }, { "epoch": 15.296610169491526, "grad_norm": 0.00029473844915628433, "learning_rate": 7.96472724791277e-06, "loss": 0.0, "num_input_tokens_seen": 5175360, "step": 12635 }, { "epoch": 15.302663438256658, "grad_norm": 2.662416773091536e-05, "learning_rate": 7.945405518697758e-06, "loss": 0.0, "num_input_tokens_seen": 5177376, "step": 12640 }, { "epoch": 15.30871670702179, "grad_norm": 2.673429116839543e-05, "learning_rate": 7.926102825496324e-06, "loss": 0.0, "num_input_tokens_seen": 5179360, "step": 12645 }, { "epoch": 15.314769975786925, "grad_norm": 0.0002746554382611066, "learning_rate": 7.906819189853793e-06, "loss": 0.0, "num_input_tokens_seen": 5181408, "step": 12650 }, { "epoch": 15.320823244552058, "grad_norm": 8.21388493932318e-06, "learning_rate": 7.88755463329417e-06, "loss": 0.0, "num_input_tokens_seen": 5183360, "step": 12655 }, { "epoch": 15.326876513317192, "grad_norm": 6.162277259136317e-06, "learning_rate": 7.868309177320188e-06, "loss": 0.0, "num_input_tokens_seen": 5185600, "step": 12660 }, { "epoch": 15.332929782082324, "grad_norm": 8.687564331921749e-06, "learning_rate": 7.849082843413277e-06, "loss": 0.0, "num_input_tokens_seen": 5187776, "step": 12665 }, { "epoch": 15.338983050847457, "grad_norm": 2.4092081730486825e-05, "learning_rate": 7.829875653033477e-06, "loss": 0.0, "num_input_tokens_seen": 5189824, "step": 12670 }, { "epoch": 15.345036319612591, "grad_norm": 3.791119161178358e-05, "learning_rate": 7.810687627619509e-06, "loss": 0.0, "num_input_tokens_seen": 5191968, "step": 12675 }, { "epoch": 15.351089588377723, "grad_norm": 7.78353296482237e-06, "learning_rate": 7.791518788588678e-06, "loss": 0.0, "num_input_tokens_seen": 5193888, "step": 12680 }, { "epoch": 15.357142857142858, "grad_norm": 0.00016570801381021738, "learning_rate": 7.772369157336874e-06, "loss": 0.0, "num_input_tokens_seen": 5195904, "step": 12685 }, { "epoch": 15.36319612590799, "grad_norm": 1.3002613741264213e-05, "learning_rate": 7.753238755238548e-06, "loss": 0.0, "num_input_tokens_seen": 5198016, "step": 12690 }, { "epoch": 15.369249394673124, "grad_norm": 0.0003610371786635369, "learning_rate": 7.734127603646697e-06, "loss": 0.0, "num_input_tokens_seen": 5200224, "step": 12695 }, { "epoch": 15.375302663438257, "grad_norm": 1.4729061149409972e-05, "learning_rate": 7.71503572389281e-06, "loss": 0.0, "num_input_tokens_seen": 5202304, "step": 12700 }, { "epoch": 15.38135593220339, "grad_norm": 0.0007073761662468314, "learning_rate": 7.69596313728691e-06, "loss": 0.0, "num_input_tokens_seen": 5204384, "step": 12705 }, { "epoch": 15.387409200968523, "grad_norm": 1.3574450349551626e-05, "learning_rate": 7.676909865117418e-06, "loss": 0.0, "num_input_tokens_seen": 5206528, "step": 12710 }, { "epoch": 15.393462469733656, "grad_norm": 9.627931831346359e-06, "learning_rate": 7.657875928651262e-06, "loss": 0.0, "num_input_tokens_seen": 5208640, "step": 12715 }, { "epoch": 15.39951573849879, "grad_norm": 1.3539064639189746e-05, "learning_rate": 7.638861349133744e-06, "loss": 0.0, "num_input_tokens_seen": 5210656, "step": 12720 }, { "epoch": 15.405569007263923, "grad_norm": 7.806549547240138e-06, "learning_rate": 7.619866147788585e-06, "loss": 0.0, "num_input_tokens_seen": 5212736, "step": 12725 }, { "epoch": 15.411622276029055, "grad_norm": 1.1836137673526537e-05, "learning_rate": 7.600890345817868e-06, "loss": 0.0, "num_input_tokens_seen": 5214784, "step": 12730 }, { "epoch": 15.41767554479419, "grad_norm": 7.541374998254469e-06, "learning_rate": 7.5819339644019995e-06, "loss": 0.0, "num_input_tokens_seen": 5216896, "step": 12735 }, { "epoch": 15.423728813559322, "grad_norm": 1.141117809311254e-05, "learning_rate": 7.5629970246997556e-06, "loss": 0.0, "num_input_tokens_seen": 5219136, "step": 12740 }, { "epoch": 15.429782082324456, "grad_norm": 8.432549293502234e-06, "learning_rate": 7.5440795478481815e-06, "loss": 0.0, "num_input_tokens_seen": 5221184, "step": 12745 }, { "epoch": 15.435835351089588, "grad_norm": 6.482813205366256e-06, "learning_rate": 7.525181554962604e-06, "loss": 0.0, "num_input_tokens_seen": 5223328, "step": 12750 }, { "epoch": 15.441888619854721, "grad_norm": 6.35360356682213e-06, "learning_rate": 7.5063030671366025e-06, "loss": 0.0, "num_input_tokens_seen": 5225568, "step": 12755 }, { "epoch": 15.447941888619855, "grad_norm": 8.475969480059575e-06, "learning_rate": 7.487444105441982e-06, "loss": 0.0, "num_input_tokens_seen": 5227552, "step": 12760 }, { "epoch": 15.453995157384988, "grad_norm": 1.0734044735727366e-05, "learning_rate": 7.4686046909287545e-06, "loss": 0.0, "num_input_tokens_seen": 5229472, "step": 12765 }, { "epoch": 15.460048426150122, "grad_norm": 3.6358764191390947e-05, "learning_rate": 7.449784844625138e-06, "loss": 0.0, "num_input_tokens_seen": 5231616, "step": 12770 }, { "epoch": 15.466101694915254, "grad_norm": 1.9338198399054818e-05, "learning_rate": 7.4309845875374515e-06, "loss": 0.0, "num_input_tokens_seen": 5233632, "step": 12775 }, { "epoch": 15.472154963680387, "grad_norm": 8.283516763185617e-06, "learning_rate": 7.4122039406502116e-06, "loss": 0.0, "num_input_tokens_seen": 5235616, "step": 12780 }, { "epoch": 15.478208232445521, "grad_norm": 6.63877472106833e-06, "learning_rate": 7.393442924926006e-06, "loss": 0.0, "num_input_tokens_seen": 5237952, "step": 12785 }, { "epoch": 15.484261501210653, "grad_norm": 6.3577358559996355e-06, "learning_rate": 7.374701561305527e-06, "loss": 0.0, "num_input_tokens_seen": 5240064, "step": 12790 }, { "epoch": 15.490314769975788, "grad_norm": 9.391490493726451e-06, "learning_rate": 7.3559798707075304e-06, "loss": 0.0, "num_input_tokens_seen": 5242048, "step": 12795 }, { "epoch": 15.49636803874092, "grad_norm": 1.0401681720395572e-05, "learning_rate": 7.337277874028806e-06, "loss": 0.0, "num_input_tokens_seen": 5244128, "step": 12800 }, { "epoch": 15.502421307506053, "grad_norm": 9.690646948001813e-06, "learning_rate": 7.318595592144167e-06, "loss": 0.0, "num_input_tokens_seen": 5246112, "step": 12805 }, { "epoch": 15.508474576271187, "grad_norm": 8.027927833609283e-06, "learning_rate": 7.299933045906421e-06, "loss": 0.0, "num_input_tokens_seen": 5248128, "step": 12810 }, { "epoch": 15.51452784503632, "grad_norm": 7.309138254640857e-06, "learning_rate": 7.281290256146347e-06, "loss": 0.0, "num_input_tokens_seen": 5250112, "step": 12815 }, { "epoch": 15.520581113801454, "grad_norm": 9.503344699623995e-06, "learning_rate": 7.262667243672666e-06, "loss": 0.0, "num_input_tokens_seen": 5252320, "step": 12820 }, { "epoch": 15.526634382566586, "grad_norm": 9.23323295864975e-06, "learning_rate": 7.244064029272049e-06, "loss": 0.0, "num_input_tokens_seen": 5254400, "step": 12825 }, { "epoch": 15.532687651331718, "grad_norm": 6.521044269902632e-06, "learning_rate": 7.2254806337090225e-06, "loss": 0.0, "num_input_tokens_seen": 5256352, "step": 12830 }, { "epoch": 15.538740920096853, "grad_norm": 2.7912283258046955e-05, "learning_rate": 7.2069170777260415e-06, "loss": 0.0, "num_input_tokens_seen": 5258464, "step": 12835 }, { "epoch": 15.544794188861985, "grad_norm": 1.8541906683822162e-05, "learning_rate": 7.188373382043384e-06, "loss": 0.0, "num_input_tokens_seen": 5260416, "step": 12840 }, { "epoch": 15.55084745762712, "grad_norm": 9.35377011046512e-06, "learning_rate": 7.169849567359171e-06, "loss": 0.0, "num_input_tokens_seen": 5262496, "step": 12845 }, { "epoch": 15.556900726392252, "grad_norm": 9.147295713773929e-06, "learning_rate": 7.151345654349331e-06, "loss": 0.0, "num_input_tokens_seen": 5264576, "step": 12850 }, { "epoch": 15.562953995157384, "grad_norm": 9.879800927592441e-06, "learning_rate": 7.132861663667581e-06, "loss": 0.0, "num_input_tokens_seen": 5266752, "step": 12855 }, { "epoch": 15.569007263922519, "grad_norm": 2.8230377211002633e-05, "learning_rate": 7.1143976159453925e-06, "loss": 0.0, "num_input_tokens_seen": 5268672, "step": 12860 }, { "epoch": 15.575060532687651, "grad_norm": 3.455306068644859e-05, "learning_rate": 7.095953531792002e-06, "loss": 0.0, "num_input_tokens_seen": 5270656, "step": 12865 }, { "epoch": 15.581113801452785, "grad_norm": 2.154414505639579e-05, "learning_rate": 7.077529431794319e-06, "loss": 0.0, "num_input_tokens_seen": 5272672, "step": 12870 }, { "epoch": 15.587167070217918, "grad_norm": 1.3208268683229107e-05, "learning_rate": 7.059125336517003e-06, "loss": 0.0, "num_input_tokens_seen": 5274752, "step": 12875 }, { "epoch": 15.59322033898305, "grad_norm": 1.6370127923437394e-05, "learning_rate": 7.040741266502321e-06, "loss": 0.0, "num_input_tokens_seen": 5276832, "step": 12880 }, { "epoch": 15.599273607748184, "grad_norm": 5.74683372178697e-06, "learning_rate": 7.02237724227025e-06, "loss": 0.0, "num_input_tokens_seen": 5278944, "step": 12885 }, { "epoch": 15.605326876513317, "grad_norm": 1.016039004753111e-05, "learning_rate": 7.004033284318359e-06, "loss": 0.0, "num_input_tokens_seen": 5281024, "step": 12890 }, { "epoch": 15.611380145278451, "grad_norm": 7.946347977849655e-06, "learning_rate": 6.985709413121805e-06, "loss": 0.0, "num_input_tokens_seen": 5283168, "step": 12895 }, { "epoch": 15.617433414043584, "grad_norm": 9.31619342736667e-06, "learning_rate": 6.967405649133365e-06, "loss": 0.0, "num_input_tokens_seen": 5285248, "step": 12900 }, { "epoch": 15.623486682808716, "grad_norm": 1.1748663382604718e-05, "learning_rate": 6.949122012783349e-06, "loss": 0.0, "num_input_tokens_seen": 5287264, "step": 12905 }, { "epoch": 15.62953995157385, "grad_norm": 1.1222904504393227e-05, "learning_rate": 6.930858524479597e-06, "loss": 0.0, "num_input_tokens_seen": 5289344, "step": 12910 }, { "epoch": 15.635593220338983, "grad_norm": 5.7175498113792855e-06, "learning_rate": 6.912615204607467e-06, "loss": 0.0, "num_input_tokens_seen": 5291328, "step": 12915 }, { "epoch": 15.641646489104117, "grad_norm": 5.840245194121962e-06, "learning_rate": 6.894392073529812e-06, "loss": 0.0, "num_input_tokens_seen": 5293440, "step": 12920 }, { "epoch": 15.64769975786925, "grad_norm": 2.9027040000073612e-05, "learning_rate": 6.876189151586932e-06, "loss": 0.0, "num_input_tokens_seen": 5295392, "step": 12925 }, { "epoch": 15.653753026634382, "grad_norm": 6.738963747920934e-06, "learning_rate": 6.8580064590966025e-06, "loss": 0.0, "num_input_tokens_seen": 5297472, "step": 12930 }, { "epoch": 15.659806295399516, "grad_norm": 8.291229278256651e-06, "learning_rate": 6.839844016353971e-06, "loss": 0.0, "num_input_tokens_seen": 5299520, "step": 12935 }, { "epoch": 15.665859564164649, "grad_norm": 5.540269739867654e-06, "learning_rate": 6.821701843631634e-06, "loss": 0.0, "num_input_tokens_seen": 5301568, "step": 12940 }, { "epoch": 15.671912832929783, "grad_norm": 1.539275217510294e-05, "learning_rate": 6.8035799611795295e-06, "loss": 0.0, "num_input_tokens_seen": 5303616, "step": 12945 }, { "epoch": 15.677966101694915, "grad_norm": 9.585964107827749e-06, "learning_rate": 6.785478389224956e-06, "loss": 0.0, "num_input_tokens_seen": 5305632, "step": 12950 }, { "epoch": 15.684019370460048, "grad_norm": 3.654522151919082e-05, "learning_rate": 6.7673971479725485e-06, "loss": 0.0, "num_input_tokens_seen": 5307776, "step": 12955 }, { "epoch": 15.690072639225182, "grad_norm": 5.796573987026932e-06, "learning_rate": 6.7493362576042416e-06, "loss": 0.0, "num_input_tokens_seen": 5309856, "step": 12960 }, { "epoch": 15.696125907990314, "grad_norm": 1.504241481598001e-05, "learning_rate": 6.7312957382792556e-06, "loss": 0.0, "num_input_tokens_seen": 5311776, "step": 12965 }, { "epoch": 15.702179176755449, "grad_norm": 1.0075180398416705e-05, "learning_rate": 6.713275610134076e-06, "loss": 0.0, "num_input_tokens_seen": 5313856, "step": 12970 }, { "epoch": 15.708232445520581, "grad_norm": 1.1450566489656921e-05, "learning_rate": 6.695275893282427e-06, "loss": 0.0, "num_input_tokens_seen": 5315840, "step": 12975 }, { "epoch": 15.714285714285714, "grad_norm": 8.288837307190988e-06, "learning_rate": 6.677296607815242e-06, "loss": 0.0, "num_input_tokens_seen": 5317888, "step": 12980 }, { "epoch": 15.720338983050848, "grad_norm": 9.281151506002061e-06, "learning_rate": 6.659337773800678e-06, "loss": 0.0, "num_input_tokens_seen": 5319872, "step": 12985 }, { "epoch": 15.72639225181598, "grad_norm": 9.424791642231867e-06, "learning_rate": 6.641399411284016e-06, "loss": 0.0, "num_input_tokens_seen": 5321952, "step": 12990 }, { "epoch": 15.732445520581114, "grad_norm": 1.1279516911599785e-05, "learning_rate": 6.623481540287746e-06, "loss": 0.0, "num_input_tokens_seen": 5323936, "step": 12995 }, { "epoch": 15.738498789346247, "grad_norm": 9.948847946361639e-06, "learning_rate": 6.605584180811419e-06, "loss": 0.0, "num_input_tokens_seen": 5326048, "step": 13000 }, { "epoch": 15.74455205811138, "grad_norm": 5.661354953190312e-05, "learning_rate": 6.587707352831754e-06, "loss": 0.0, "num_input_tokens_seen": 5328032, "step": 13005 }, { "epoch": 15.750605326876514, "grad_norm": 9.142162525677122e-06, "learning_rate": 6.5698510763025165e-06, "loss": 0.0, "num_input_tokens_seen": 5330016, "step": 13010 }, { "epoch": 15.756658595641646, "grad_norm": 6.765746093151392e-06, "learning_rate": 6.55201537115454e-06, "loss": 0.0, "num_input_tokens_seen": 5332032, "step": 13015 }, { "epoch": 15.76271186440678, "grad_norm": 2.245779432996642e-05, "learning_rate": 6.534200257295706e-06, "loss": 0.0, "num_input_tokens_seen": 5334080, "step": 13020 }, { "epoch": 15.768765133171913, "grad_norm": 1.01217574410839e-05, "learning_rate": 6.516405754610899e-06, "loss": 0.0, "num_input_tokens_seen": 5336128, "step": 13025 }, { "epoch": 15.774818401937045, "grad_norm": 6.1265209296834655e-06, "learning_rate": 6.4986318829619975e-06, "loss": 0.0, "num_input_tokens_seen": 5338144, "step": 13030 }, { "epoch": 15.78087167070218, "grad_norm": 7.174251550168265e-06, "learning_rate": 6.480878662187884e-06, "loss": 0.0, "num_input_tokens_seen": 5340192, "step": 13035 }, { "epoch": 15.786924939467312, "grad_norm": 5.8357550187793095e-06, "learning_rate": 6.463146112104332e-06, "loss": 0.0025, "num_input_tokens_seen": 5342144, "step": 13040 }, { "epoch": 15.792978208232446, "grad_norm": 9.090523235499859e-06, "learning_rate": 6.445434252504101e-06, "loss": 0.0, "num_input_tokens_seen": 5344160, "step": 13045 }, { "epoch": 15.799031476997579, "grad_norm": 7.2475213528377935e-06, "learning_rate": 6.42774310315683e-06, "loss": 0.0, "num_input_tokens_seen": 5346240, "step": 13050 }, { "epoch": 15.805084745762711, "grad_norm": 7.4687109190563206e-06, "learning_rate": 6.410072683809021e-06, "loss": 0.0, "num_input_tokens_seen": 5348320, "step": 13055 }, { "epoch": 15.811138014527845, "grad_norm": 1.3044601473666262e-05, "learning_rate": 6.392423014184082e-06, "loss": 0.0, "num_input_tokens_seen": 5350304, "step": 13060 }, { "epoch": 15.817191283292978, "grad_norm": 8.444573722954374e-06, "learning_rate": 6.374794113982233e-06, "loss": 0.0, "num_input_tokens_seen": 5352384, "step": 13065 }, { "epoch": 15.823244552058112, "grad_norm": 1.0327997188142035e-05, "learning_rate": 6.357186002880513e-06, "loss": 0.0, "num_input_tokens_seen": 5354464, "step": 13070 }, { "epoch": 15.829297820823244, "grad_norm": 9.58102918957593e-06, "learning_rate": 6.339598700532762e-06, "loss": 0.0, "num_input_tokens_seen": 5356352, "step": 13075 }, { "epoch": 15.835351089588377, "grad_norm": 6.526124252559384e-06, "learning_rate": 6.322032226569591e-06, "loss": 0.0, "num_input_tokens_seen": 5358368, "step": 13080 }, { "epoch": 15.841404358353511, "grad_norm": 6.502098131022649e-06, "learning_rate": 6.3044866005983585e-06, "loss": 0.0, "num_input_tokens_seen": 5360416, "step": 13085 }, { "epoch": 15.847457627118644, "grad_norm": 1.5479168723686598e-05, "learning_rate": 6.286961842203179e-06, "loss": 0.0, "num_input_tokens_seen": 5362464, "step": 13090 }, { "epoch": 15.853510895883778, "grad_norm": 2.3206290279631503e-05, "learning_rate": 6.2694579709448226e-06, "loss": 0.0, "num_input_tokens_seen": 5364320, "step": 13095 }, { "epoch": 15.85956416464891, "grad_norm": 4.806689958058996e-06, "learning_rate": 6.251975006360802e-06, "loss": 0.0, "num_input_tokens_seen": 5366400, "step": 13100 }, { "epoch": 15.865617433414045, "grad_norm": 5.7845027185976505e-06, "learning_rate": 6.234512967965261e-06, "loss": 0.0, "num_input_tokens_seen": 5368352, "step": 13105 }, { "epoch": 15.871670702179177, "grad_norm": 6.8567428570531774e-06, "learning_rate": 6.2170718752489945e-06, "loss": 0.0, "num_input_tokens_seen": 5370368, "step": 13110 }, { "epoch": 15.87772397094431, "grad_norm": 1.0618714441079646e-05, "learning_rate": 6.199651747679419e-06, "loss": 0.0, "num_input_tokens_seen": 5372384, "step": 13115 }, { "epoch": 15.883777239709444, "grad_norm": 6.896547802170971e-06, "learning_rate": 6.182252604700548e-06, "loss": 0.0, "num_input_tokens_seen": 5374304, "step": 13120 }, { "epoch": 15.889830508474576, "grad_norm": 7.5253865361446515e-06, "learning_rate": 6.164874465732972e-06, "loss": 0.0, "num_input_tokens_seen": 5376416, "step": 13125 }, { "epoch": 15.89588377723971, "grad_norm": 5.729562872147653e-06, "learning_rate": 6.147517350173843e-06, "loss": 0.0, "num_input_tokens_seen": 5378688, "step": 13130 }, { "epoch": 15.901937046004843, "grad_norm": 5.331426564225694e-06, "learning_rate": 6.130181277396837e-06, "loss": 0.0, "num_input_tokens_seen": 5380704, "step": 13135 }, { "epoch": 15.907990314769975, "grad_norm": 5.226641405897681e-06, "learning_rate": 6.112866266752154e-06, "loss": 0.0, "num_input_tokens_seen": 5382720, "step": 13140 }, { "epoch": 15.91404358353511, "grad_norm": 7.851177542761434e-06, "learning_rate": 6.095572337566474e-06, "loss": 0.0, "num_input_tokens_seen": 5384672, "step": 13145 }, { "epoch": 15.920096852300242, "grad_norm": 8.42815097712446e-06, "learning_rate": 6.078299509142948e-06, "loss": 0.0, "num_input_tokens_seen": 5386784, "step": 13150 }, { "epoch": 15.926150121065376, "grad_norm": 1.0261698662361596e-05, "learning_rate": 6.061047800761202e-06, "loss": 0.0, "num_input_tokens_seen": 5388736, "step": 13155 }, { "epoch": 15.932203389830509, "grad_norm": 6.545801170432242e-06, "learning_rate": 6.043817231677229e-06, "loss": 0.0, "num_input_tokens_seen": 5390784, "step": 13160 }, { "epoch": 15.938256658595641, "grad_norm": 1.0542059499130119e-05, "learning_rate": 6.026607821123487e-06, "loss": 0.0, "num_input_tokens_seen": 5392864, "step": 13165 }, { "epoch": 15.944309927360775, "grad_norm": 5.800350663776044e-06, "learning_rate": 6.0094195883087875e-06, "loss": 0.0, "num_input_tokens_seen": 5394880, "step": 13170 }, { "epoch": 15.950363196125908, "grad_norm": 6.046329417586094e-06, "learning_rate": 5.992252552418304e-06, "loss": 0.0, "num_input_tokens_seen": 5396992, "step": 13175 }, { "epoch": 15.956416464891042, "grad_norm": 6.3688507907500025e-06, "learning_rate": 5.9751067326135585e-06, "loss": 0.0, "num_input_tokens_seen": 5399008, "step": 13180 }, { "epoch": 15.962469733656174, "grad_norm": 9.709984624350909e-06, "learning_rate": 5.957982148032388e-06, "loss": 0.0, "num_input_tokens_seen": 5401088, "step": 13185 }, { "epoch": 15.968523002421307, "grad_norm": 2.652407238201704e-05, "learning_rate": 5.940878817788917e-06, "loss": 0.0, "num_input_tokens_seen": 5403040, "step": 13190 }, { "epoch": 15.974576271186441, "grad_norm": 1.6512709407834336e-05, "learning_rate": 5.923796760973582e-06, "loss": 0.0, "num_input_tokens_seen": 5405216, "step": 13195 }, { "epoch": 15.980629539951574, "grad_norm": 7.177910447353497e-06, "learning_rate": 5.906735996653031e-06, "loss": 0.0, "num_input_tokens_seen": 5407264, "step": 13200 }, { "epoch": 15.986682808716708, "grad_norm": 7.329317668336444e-06, "learning_rate": 5.889696543870157e-06, "loss": 0.0, "num_input_tokens_seen": 5409152, "step": 13205 }, { "epoch": 15.99273607748184, "grad_norm": 0.0011195375118404627, "learning_rate": 5.872678421644101e-06, "loss": 0.0, "num_input_tokens_seen": 5411168, "step": 13210 }, { "epoch": 15.998789346246973, "grad_norm": 4.780383733304916e-06, "learning_rate": 5.855681648970132e-06, "loss": 0.0, "num_input_tokens_seen": 5413280, "step": 13215 }, { "epoch": 16.0, "eval_loss": 0.4235284924507141, "eval_runtime": 4.9682, "eval_samples_per_second": 73.869, "eval_steps_per_second": 18.518, "num_input_tokens_seen": 5413384, "step": 13216 }, { "epoch": 16.004842615012105, "grad_norm": 3.37673191097565e-05, "learning_rate": 5.8387062448197524e-06, "loss": 0.0, "num_input_tokens_seen": 5415080, "step": 13220 }, { "epoch": 16.01089588377724, "grad_norm": 7.766646376694553e-06, "learning_rate": 5.82175222814057e-06, "loss": 0.0, "num_input_tokens_seen": 5417192, "step": 13225 }, { "epoch": 16.016949152542374, "grad_norm": 1.0339836080675013e-05, "learning_rate": 5.804819617856344e-06, "loss": 0.0, "num_input_tokens_seen": 5419304, "step": 13230 }, { "epoch": 16.023002421307506, "grad_norm": 5.031938144384185e-06, "learning_rate": 5.787908432866923e-06, "loss": 0.0, "num_input_tokens_seen": 5421480, "step": 13235 }, { "epoch": 16.02905569007264, "grad_norm": 8.579366294725332e-06, "learning_rate": 5.771018692048258e-06, "loss": 0.0, "num_input_tokens_seen": 5423624, "step": 13240 }, { "epoch": 16.03510895883777, "grad_norm": 0.0018248963169753551, "learning_rate": 5.75415041425234e-06, "loss": 0.0, "num_input_tokens_seen": 5425832, "step": 13245 }, { "epoch": 16.041162227602907, "grad_norm": 1.1424161129980348e-05, "learning_rate": 5.73730361830725e-06, "loss": 0.0, "num_input_tokens_seen": 5428008, "step": 13250 }, { "epoch": 16.04721549636804, "grad_norm": 5.462104127218481e-06, "learning_rate": 5.720478323017025e-06, "loss": 0.0, "num_input_tokens_seen": 5430056, "step": 13255 }, { "epoch": 16.053268765133172, "grad_norm": 8.784055353316944e-06, "learning_rate": 5.703674547161764e-06, "loss": 0.0, "num_input_tokens_seen": 5432136, "step": 13260 }, { "epoch": 16.059322033898304, "grad_norm": 1.0200512406299822e-05, "learning_rate": 5.686892309497513e-06, "loss": 0.0, "num_input_tokens_seen": 5434280, "step": 13265 }, { "epoch": 16.065375302663437, "grad_norm": 7.056105459923856e-06, "learning_rate": 5.6701316287562896e-06, "loss": 0.0, "num_input_tokens_seen": 5436296, "step": 13270 }, { "epoch": 16.071428571428573, "grad_norm": 5.733665602747351e-06, "learning_rate": 5.653392523646042e-06, "loss": 0.0, "num_input_tokens_seen": 5438248, "step": 13275 }, { "epoch": 16.077481840193705, "grad_norm": 5.7780644056038e-06, "learning_rate": 5.636675012850642e-06, "loss": 0.0, "num_input_tokens_seen": 5440264, "step": 13280 }, { "epoch": 16.083535108958838, "grad_norm": 7.978341272973921e-06, "learning_rate": 5.619979115029858e-06, "loss": 0.0, "num_input_tokens_seen": 5442280, "step": 13285 }, { "epoch": 16.08958837772397, "grad_norm": 7.131504844437586e-06, "learning_rate": 5.6033048488193324e-06, "loss": 0.0, "num_input_tokens_seen": 5444296, "step": 13290 }, { "epoch": 16.095641646489103, "grad_norm": 8.238333066401538e-06, "learning_rate": 5.586652232830564e-06, "loss": 0.0, "num_input_tokens_seen": 5446344, "step": 13295 }, { "epoch": 16.10169491525424, "grad_norm": 6.293022579484386e-06, "learning_rate": 5.5700212856508864e-06, "loss": 0.0, "num_input_tokens_seen": 5448424, "step": 13300 }, { "epoch": 16.10774818401937, "grad_norm": 1.0344106158299837e-05, "learning_rate": 5.553412025843443e-06, "loss": 0.0, "num_input_tokens_seen": 5450376, "step": 13305 }, { "epoch": 16.113801452784504, "grad_norm": 8.719829565961845e-06, "learning_rate": 5.536824471947169e-06, "loss": 0.0, "num_input_tokens_seen": 5452296, "step": 13310 }, { "epoch": 16.119854721549636, "grad_norm": 7.155702405725606e-06, "learning_rate": 5.520258642476797e-06, "loss": 0.0, "num_input_tokens_seen": 5454408, "step": 13315 }, { "epoch": 16.12590799031477, "grad_norm": 1.6467241948703304e-05, "learning_rate": 5.503714555922762e-06, "loss": 0.0, "num_input_tokens_seen": 5456328, "step": 13320 }, { "epoch": 16.131961259079905, "grad_norm": 5.468363724503433e-06, "learning_rate": 5.487192230751278e-06, "loss": 0.0, "num_input_tokens_seen": 5458344, "step": 13325 }, { "epoch": 16.138014527845037, "grad_norm": 6.25990560365608e-06, "learning_rate": 5.470691685404244e-06, "loss": 0.0, "num_input_tokens_seen": 5460424, "step": 13330 }, { "epoch": 16.14406779661017, "grad_norm": 7.108035333658336e-06, "learning_rate": 5.454212938299255e-06, "loss": 0.0, "num_input_tokens_seen": 5462504, "step": 13335 }, { "epoch": 16.150121065375302, "grad_norm": 1.4638817447121255e-05, "learning_rate": 5.437756007829576e-06, "loss": 0.0, "num_input_tokens_seen": 5464616, "step": 13340 }, { "epoch": 16.156174334140434, "grad_norm": 1.009204970614519e-05, "learning_rate": 5.421320912364117e-06, "loss": 0.0, "num_input_tokens_seen": 5466568, "step": 13345 }, { "epoch": 16.16222760290557, "grad_norm": 2.341290564800147e-05, "learning_rate": 5.404907670247411e-06, "loss": 0.0, "num_input_tokens_seen": 5468552, "step": 13350 }, { "epoch": 16.168280871670703, "grad_norm": 3.633502637967467e-05, "learning_rate": 5.388516299799629e-06, "loss": 0.0, "num_input_tokens_seen": 5470632, "step": 13355 }, { "epoch": 16.174334140435835, "grad_norm": 5.8311925386078656e-05, "learning_rate": 5.372146819316484e-06, "loss": 0.0, "num_input_tokens_seen": 5472680, "step": 13360 }, { "epoch": 16.180387409200968, "grad_norm": 5.3855133046454284e-06, "learning_rate": 5.355799247069282e-06, "loss": 0.0, "num_input_tokens_seen": 5474728, "step": 13365 }, { "epoch": 16.1864406779661, "grad_norm": 6.759319603588665e-06, "learning_rate": 5.339473601304889e-06, "loss": 0.0, "num_input_tokens_seen": 5476712, "step": 13370 }, { "epoch": 16.192493946731236, "grad_norm": 7.718527740507852e-06, "learning_rate": 5.323169900245653e-06, "loss": 0.0, "num_input_tokens_seen": 5478632, "step": 13375 }, { "epoch": 16.19854721549637, "grad_norm": 9.544852218823507e-06, "learning_rate": 5.306888162089476e-06, "loss": 0.0, "num_input_tokens_seen": 5480744, "step": 13380 }, { "epoch": 16.2046004842615, "grad_norm": 5.453157427837141e-06, "learning_rate": 5.290628405009717e-06, "loss": 0.0, "num_input_tokens_seen": 5482728, "step": 13385 }, { "epoch": 16.210653753026634, "grad_norm": 2.174989458580967e-05, "learning_rate": 5.2743906471552056e-06, "loss": 0.0, "num_input_tokens_seen": 5484776, "step": 13390 }, { "epoch": 16.216707021791766, "grad_norm": 6.926201422174927e-06, "learning_rate": 5.258174906650223e-06, "loss": 0.0, "num_input_tokens_seen": 5486728, "step": 13395 }, { "epoch": 16.222760290556902, "grad_norm": 4.83142321172636e-06, "learning_rate": 5.2419812015944645e-06, "loss": 0.0, "num_input_tokens_seen": 5488872, "step": 13400 }, { "epoch": 16.228813559322035, "grad_norm": 6.201669748406857e-06, "learning_rate": 5.2258095500630296e-06, "loss": 0.0, "num_input_tokens_seen": 5491016, "step": 13405 }, { "epoch": 16.234866828087167, "grad_norm": 8.019224878808018e-06, "learning_rate": 5.2096599701064285e-06, "loss": 0.0, "num_input_tokens_seen": 5493000, "step": 13410 }, { "epoch": 16.2409200968523, "grad_norm": 6.025636139384005e-06, "learning_rate": 5.19353247975049e-06, "loss": 0.0, "num_input_tokens_seen": 5494984, "step": 13415 }, { "epoch": 16.246973365617432, "grad_norm": 4.966358119418146e-06, "learning_rate": 5.1774270969964375e-06, "loss": 0.0, "num_input_tokens_seen": 5496840, "step": 13420 }, { "epoch": 16.253026634382568, "grad_norm": 5.854138635186246e-06, "learning_rate": 5.161343839820762e-06, "loss": 0.0, "num_input_tokens_seen": 5498760, "step": 13425 }, { "epoch": 16.2590799031477, "grad_norm": 1.5840752894291654e-05, "learning_rate": 5.145282726175315e-06, "loss": 0.0, "num_input_tokens_seen": 5500936, "step": 13430 }, { "epoch": 16.265133171912833, "grad_norm": 2.1881915017729625e-05, "learning_rate": 5.129243773987194e-06, "loss": 0.0, "num_input_tokens_seen": 5502888, "step": 13435 }, { "epoch": 16.271186440677965, "grad_norm": 1.1100717529188842e-05, "learning_rate": 5.113227001158774e-06, "loss": 0.0, "num_input_tokens_seen": 5504936, "step": 13440 }, { "epoch": 16.277239709443098, "grad_norm": 1.062103001459036e-05, "learning_rate": 5.097232425567674e-06, "loss": 0.0, "num_input_tokens_seen": 5507016, "step": 13445 }, { "epoch": 16.283292978208234, "grad_norm": 1.947067903529387e-05, "learning_rate": 5.081260065066731e-06, "loss": 0.0, "num_input_tokens_seen": 5508968, "step": 13450 }, { "epoch": 16.289346246973366, "grad_norm": 9.35500611376483e-06, "learning_rate": 5.065309937483992e-06, "loss": 0.0, "num_input_tokens_seen": 5510984, "step": 13455 }, { "epoch": 16.2953995157385, "grad_norm": 1.0075079444504809e-05, "learning_rate": 5.0493820606226844e-06, "loss": 0.0, "num_input_tokens_seen": 5512968, "step": 13460 }, { "epoch": 16.30145278450363, "grad_norm": 8.100805644062348e-06, "learning_rate": 5.033476452261202e-06, "loss": 0.0, "num_input_tokens_seen": 5515048, "step": 13465 }, { "epoch": 16.307506053268764, "grad_norm": 7.433553037117235e-06, "learning_rate": 5.017593130153075e-06, "loss": 0.0, "num_input_tokens_seen": 5517032, "step": 13470 }, { "epoch": 16.3135593220339, "grad_norm": 7.668460966669954e-06, "learning_rate": 5.001732112026983e-06, "loss": 0.0, "num_input_tokens_seen": 5519208, "step": 13475 }, { "epoch": 16.319612590799032, "grad_norm": 7.575612016808009e-06, "learning_rate": 4.985893415586671e-06, "loss": 0.0, "num_input_tokens_seen": 5521256, "step": 13480 }, { "epoch": 16.325665859564165, "grad_norm": 5.0775079216691665e-06, "learning_rate": 4.970077058511003e-06, "loss": 0.0, "num_input_tokens_seen": 5523304, "step": 13485 }, { "epoch": 16.331719128329297, "grad_norm": 2.058415520878043e-05, "learning_rate": 4.954283058453896e-06, "loss": 0.0, "num_input_tokens_seen": 5525448, "step": 13490 }, { "epoch": 16.33777239709443, "grad_norm": 6.30821932645631e-06, "learning_rate": 4.938511433044307e-06, "loss": 0.0, "num_input_tokens_seen": 5527656, "step": 13495 }, { "epoch": 16.343825665859566, "grad_norm": 1.1574021577835083, "learning_rate": 4.922762199886227e-06, "loss": 0.0001, "num_input_tokens_seen": 5529640, "step": 13500 }, { "epoch": 16.349878934624698, "grad_norm": 6.0729278629878536e-05, "learning_rate": 4.907035376558647e-06, "loss": 0.0, "num_input_tokens_seen": 5531720, "step": 13505 }, { "epoch": 16.35593220338983, "grad_norm": 1.841576704464387e-05, "learning_rate": 4.891330980615547e-06, "loss": 0.0, "num_input_tokens_seen": 5533864, "step": 13510 }, { "epoch": 16.361985472154963, "grad_norm": 5.160369710210944e-06, "learning_rate": 4.875649029585888e-06, "loss": 0.0, "num_input_tokens_seen": 5535944, "step": 13515 }, { "epoch": 16.368038740920095, "grad_norm": 4.436426388565451e-05, "learning_rate": 4.859989540973547e-06, "loss": 0.0, "num_input_tokens_seen": 5537992, "step": 13520 }, { "epoch": 16.37409200968523, "grad_norm": 6.054846835468197e-06, "learning_rate": 4.844352532257351e-06, "loss": 0.0, "num_input_tokens_seen": 5540104, "step": 13525 }, { "epoch": 16.380145278450364, "grad_norm": 5.969058292976115e-06, "learning_rate": 4.828738020891047e-06, "loss": 0.0, "num_input_tokens_seen": 5542120, "step": 13530 }, { "epoch": 16.386198547215496, "grad_norm": 5.711440280720126e-06, "learning_rate": 4.8131460243032274e-06, "loss": 0.0, "num_input_tokens_seen": 5544168, "step": 13535 }, { "epoch": 16.39225181598063, "grad_norm": 1.3357346688280813e-05, "learning_rate": 4.797576559897407e-06, "loss": 0.0, "num_input_tokens_seen": 5546344, "step": 13540 }, { "epoch": 16.39830508474576, "grad_norm": 5.368493475543801e-06, "learning_rate": 4.782029645051916e-06, "loss": 0.0, "num_input_tokens_seen": 5548488, "step": 13545 }, { "epoch": 16.404358353510897, "grad_norm": 6.821068382123485e-06, "learning_rate": 4.766505297119922e-06, "loss": 0.0, "num_input_tokens_seen": 5550408, "step": 13550 }, { "epoch": 16.41041162227603, "grad_norm": 5.478823823068524e-06, "learning_rate": 4.751003533429413e-06, "loss": 0.0, "num_input_tokens_seen": 5552456, "step": 13555 }, { "epoch": 16.416464891041162, "grad_norm": 7.251175702549517e-06, "learning_rate": 4.7355243712831566e-06, "loss": 0.0, "num_input_tokens_seen": 5554504, "step": 13560 }, { "epoch": 16.422518159806295, "grad_norm": 1.2261729352758266e-05, "learning_rate": 4.720067827958702e-06, "loss": 0.0, "num_input_tokens_seen": 5556456, "step": 13565 }, { "epoch": 16.428571428571427, "grad_norm": 4.997537416784326e-06, "learning_rate": 4.704633920708346e-06, "loss": 0.0, "num_input_tokens_seen": 5558536, "step": 13570 }, { "epoch": 16.434624697336563, "grad_norm": 8.952002644946333e-06, "learning_rate": 4.689222666759119e-06, "loss": 0.0, "num_input_tokens_seen": 5560584, "step": 13575 }, { "epoch": 16.440677966101696, "grad_norm": 2.8312169888522476e-05, "learning_rate": 4.6738340833127865e-06, "loss": 0.0, "num_input_tokens_seen": 5562696, "step": 13580 }, { "epoch": 16.446731234866828, "grad_norm": 5.3988587751518935e-06, "learning_rate": 4.658468187545764e-06, "loss": 0.0, "num_input_tokens_seen": 5564680, "step": 13585 }, { "epoch": 16.45278450363196, "grad_norm": 5.472230895975372e-06, "learning_rate": 4.643124996609196e-06, "loss": 0.0, "num_input_tokens_seen": 5566696, "step": 13590 }, { "epoch": 16.458837772397093, "grad_norm": 5.20343837706605e-06, "learning_rate": 4.627804527628857e-06, "loss": 0.0, "num_input_tokens_seen": 5568680, "step": 13595 }, { "epoch": 16.46489104116223, "grad_norm": 1.2006261385977268e-05, "learning_rate": 4.612506797705143e-06, "loss": 0.0, "num_input_tokens_seen": 5570728, "step": 13600 }, { "epoch": 16.47094430992736, "grad_norm": 7.174311122071231e-06, "learning_rate": 4.597231823913111e-06, "loss": 0.0, "num_input_tokens_seen": 5572872, "step": 13605 }, { "epoch": 16.476997578692494, "grad_norm": 5.165100901649566e-06, "learning_rate": 4.581979623302388e-06, "loss": 0.0, "num_input_tokens_seen": 5574952, "step": 13610 }, { "epoch": 16.483050847457626, "grad_norm": 1.8415608792565763e-05, "learning_rate": 4.566750212897186e-06, "loss": 0.0, "num_input_tokens_seen": 5577064, "step": 13615 }, { "epoch": 16.48910411622276, "grad_norm": 2.625380329845939e-05, "learning_rate": 4.55154360969629e-06, "loss": 0.0, "num_input_tokens_seen": 5578920, "step": 13620 }, { "epoch": 16.495157384987895, "grad_norm": 3.686717536766082e-05, "learning_rate": 4.536359830673015e-06, "loss": 0.0, "num_input_tokens_seen": 5580904, "step": 13625 }, { "epoch": 16.501210653753027, "grad_norm": 5.768627488578204e-06, "learning_rate": 4.521198892775203e-06, "loss": 0.0, "num_input_tokens_seen": 5582888, "step": 13630 }, { "epoch": 16.50726392251816, "grad_norm": 9.33439423533855e-06, "learning_rate": 4.506060812925222e-06, "loss": 0.0, "num_input_tokens_seen": 5585032, "step": 13635 }, { "epoch": 16.513317191283292, "grad_norm": 5.904744739382295e-06, "learning_rate": 4.490945608019884e-06, "loss": 0.0, "num_input_tokens_seen": 5587016, "step": 13640 }, { "epoch": 16.519370460048425, "grad_norm": 1.4080187611398287e-05, "learning_rate": 4.47585329493051e-06, "loss": 0.0, "num_input_tokens_seen": 5589064, "step": 13645 }, { "epoch": 16.52542372881356, "grad_norm": 9.382079042552505e-06, "learning_rate": 4.460783890502848e-06, "loss": 0.0, "num_input_tokens_seen": 5591144, "step": 13650 }, { "epoch": 16.531476997578693, "grad_norm": 1.2274272194190416e-05, "learning_rate": 4.44573741155708e-06, "loss": 0.0, "num_input_tokens_seen": 5593288, "step": 13655 }, { "epoch": 16.537530266343826, "grad_norm": 5.935361514275428e-06, "learning_rate": 4.430713874887799e-06, "loss": 0.0, "num_input_tokens_seen": 5595304, "step": 13660 }, { "epoch": 16.543583535108958, "grad_norm": 9.16456338018179e-05, "learning_rate": 4.415713297263987e-06, "loss": 0.0, "num_input_tokens_seen": 5597544, "step": 13665 }, { "epoch": 16.54963680387409, "grad_norm": 6.5258991526206955e-06, "learning_rate": 4.400735695429003e-06, "loss": 0.0, "num_input_tokens_seen": 5599528, "step": 13670 }, { "epoch": 16.555690072639226, "grad_norm": 2.071857306873426e-05, "learning_rate": 4.3857810861005765e-06, "loss": 0.0, "num_input_tokens_seen": 5601544, "step": 13675 }, { "epoch": 16.56174334140436, "grad_norm": 5.912371761951363e-06, "learning_rate": 4.370849485970741e-06, "loss": 0.0, "num_input_tokens_seen": 5603592, "step": 13680 }, { "epoch": 16.56779661016949, "grad_norm": 6.7651449171535205e-06, "learning_rate": 4.3559409117058605e-06, "loss": 0.0, "num_input_tokens_seen": 5605640, "step": 13685 }, { "epoch": 16.573849878934624, "grad_norm": 4.919254934065975e-06, "learning_rate": 4.3410553799466235e-06, "loss": 0.0, "num_input_tokens_seen": 5607816, "step": 13690 }, { "epoch": 16.579903147699756, "grad_norm": 7.257205288624391e-05, "learning_rate": 4.326192907307949e-06, "loss": 0.0, "num_input_tokens_seen": 5609832, "step": 13695 }, { "epoch": 16.585956416464892, "grad_norm": 4.8374413381679915e-06, "learning_rate": 4.3113535103790736e-06, "loss": 0.0, "num_input_tokens_seen": 5611944, "step": 13700 }, { "epoch": 16.592009685230025, "grad_norm": 1.0124210348294582e-05, "learning_rate": 4.296537205723425e-06, "loss": 0.0, "num_input_tokens_seen": 5613928, "step": 13705 }, { "epoch": 16.598062953995157, "grad_norm": 6.575006864295574e-06, "learning_rate": 4.281744009878699e-06, "loss": 0.0, "num_input_tokens_seen": 5616104, "step": 13710 }, { "epoch": 16.60411622276029, "grad_norm": 6.3488482737739105e-06, "learning_rate": 4.266973939356769e-06, "loss": 0.0, "num_input_tokens_seen": 5618024, "step": 13715 }, { "epoch": 16.610169491525422, "grad_norm": 5.522345418285113e-06, "learning_rate": 4.252227010643711e-06, "loss": 0.0, "num_input_tokens_seen": 5620072, "step": 13720 }, { "epoch": 16.616222760290558, "grad_norm": 4.600882675731555e-06, "learning_rate": 4.237503240199764e-06, "loss": 0.0, "num_input_tokens_seen": 5622152, "step": 13725 }, { "epoch": 16.62227602905569, "grad_norm": 8.861116839398164e-06, "learning_rate": 4.2228026444593226e-06, "loss": 0.0, "num_input_tokens_seen": 5624264, "step": 13730 }, { "epoch": 16.628329297820823, "grad_norm": 0.0004186647420283407, "learning_rate": 4.208125239830901e-06, "loss": 0.0, "num_input_tokens_seen": 5626440, "step": 13735 }, { "epoch": 16.634382566585955, "grad_norm": 6.12945223110728e-06, "learning_rate": 4.1934710426971585e-06, "loss": 0.0, "num_input_tokens_seen": 5628584, "step": 13740 }, { "epoch": 16.640435835351088, "grad_norm": 1.3995774679642636e-05, "learning_rate": 4.178840069414811e-06, "loss": 0.0, "num_input_tokens_seen": 5630664, "step": 13745 }, { "epoch": 16.646489104116224, "grad_norm": 8.654594239487778e-06, "learning_rate": 4.164232336314686e-06, "loss": 0.0, "num_input_tokens_seen": 5632712, "step": 13750 }, { "epoch": 16.652542372881356, "grad_norm": 6.233088697626954e-06, "learning_rate": 4.149647859701655e-06, "loss": 0.0, "num_input_tokens_seen": 5634856, "step": 13755 }, { "epoch": 16.65859564164649, "grad_norm": 5.068308837508084e-06, "learning_rate": 4.135086655854622e-06, "loss": 0.0, "num_input_tokens_seen": 5636840, "step": 13760 }, { "epoch": 16.66464891041162, "grad_norm": 4.397660813992843e-05, "learning_rate": 4.1205487410265395e-06, "loss": 0.0, "num_input_tokens_seen": 5638952, "step": 13765 }, { "epoch": 16.670702179176754, "grad_norm": 7.532852032454684e-05, "learning_rate": 4.106034131444342e-06, "loss": 0.0, "num_input_tokens_seen": 5640936, "step": 13770 }, { "epoch": 16.67675544794189, "grad_norm": 5.0134508455812465e-06, "learning_rate": 4.091542843308968e-06, "loss": 0.0, "num_input_tokens_seen": 5642856, "step": 13775 }, { "epoch": 16.682808716707022, "grad_norm": 5.089025307825068e-06, "learning_rate": 4.07707489279531e-06, "loss": 0.0, "num_input_tokens_seen": 5645000, "step": 13780 }, { "epoch": 16.688861985472155, "grad_norm": 4.752615495817736e-06, "learning_rate": 4.062630296052223e-06, "loss": 0.0, "num_input_tokens_seen": 5647048, "step": 13785 }, { "epoch": 16.694915254237287, "grad_norm": 5.199748557060957e-06, "learning_rate": 4.048209069202483e-06, "loss": 0.0, "num_input_tokens_seen": 5649256, "step": 13790 }, { "epoch": 16.70096852300242, "grad_norm": 5.3297671911423095e-06, "learning_rate": 4.033811228342807e-06, "loss": 0.0, "num_input_tokens_seen": 5651304, "step": 13795 }, { "epoch": 16.707021791767556, "grad_norm": 3.536291842465289e-05, "learning_rate": 4.0194367895437676e-06, "loss": 0.0, "num_input_tokens_seen": 5653512, "step": 13800 }, { "epoch": 16.713075060532688, "grad_norm": 2.3053267796058208e-05, "learning_rate": 4.005085768849856e-06, "loss": 0.0, "num_input_tokens_seen": 5655624, "step": 13805 }, { "epoch": 16.71912832929782, "grad_norm": 6.199427389219636e-06, "learning_rate": 3.990758182279406e-06, "loss": 0.0, "num_input_tokens_seen": 5657896, "step": 13810 }, { "epoch": 16.725181598062953, "grad_norm": 7.450661996699637e-06, "learning_rate": 3.976454045824593e-06, "loss": 0.0, "num_input_tokens_seen": 5660008, "step": 13815 }, { "epoch": 16.731234866828085, "grad_norm": 4.910083134745946e-06, "learning_rate": 3.962173375451425e-06, "loss": 0.0, "num_input_tokens_seen": 5661992, "step": 13820 }, { "epoch": 16.73728813559322, "grad_norm": 1.0168583685299382e-05, "learning_rate": 3.94791618709971e-06, "loss": 0.0, "num_input_tokens_seen": 5663944, "step": 13825 }, { "epoch": 16.743341404358354, "grad_norm": 1.2086927199561615e-05, "learning_rate": 3.9336824966830475e-06, "loss": 0.0, "num_input_tokens_seen": 5666024, "step": 13830 }, { "epoch": 16.749394673123486, "grad_norm": 6.289364137046505e-06, "learning_rate": 3.919472320088829e-06, "loss": 0.0, "num_input_tokens_seen": 5668040, "step": 13835 }, { "epoch": 16.75544794188862, "grad_norm": 5.209441496845102e-06, "learning_rate": 3.905285673178164e-06, "loss": 0.0, "num_input_tokens_seen": 5670184, "step": 13840 }, { "epoch": 16.76150121065375, "grad_norm": 4.817769422515994e-06, "learning_rate": 3.891122571785924e-06, "loss": 0.0, "num_input_tokens_seen": 5672200, "step": 13845 }, { "epoch": 16.767554479418887, "grad_norm": 6.988956556597259e-06, "learning_rate": 3.876983031720693e-06, "loss": 0.0, "num_input_tokens_seen": 5674280, "step": 13850 }, { "epoch": 16.77360774818402, "grad_norm": 1.2974471246707253e-05, "learning_rate": 3.8628670687647525e-06, "loss": 0.0, "num_input_tokens_seen": 5676360, "step": 13855 }, { "epoch": 16.779661016949152, "grad_norm": 6.7436762947181705e-06, "learning_rate": 3.848774698674087e-06, "loss": 0.0, "num_input_tokens_seen": 5678248, "step": 13860 }, { "epoch": 16.785714285714285, "grad_norm": 1.2654096281039529e-05, "learning_rate": 3.8347059371783095e-06, "loss": 0.0, "num_input_tokens_seen": 5680488, "step": 13865 }, { "epoch": 16.79176755447942, "grad_norm": 6.09731341683073e-06, "learning_rate": 3.820660799980721e-06, "loss": 0.0, "num_input_tokens_seen": 5682440, "step": 13870 }, { "epoch": 16.797820823244553, "grad_norm": 5.101149326947052e-06, "learning_rate": 3.8066393027582276e-06, "loss": 0.0, "num_input_tokens_seen": 5684552, "step": 13875 }, { "epoch": 16.803874092009686, "grad_norm": 4.777199137606658e-06, "learning_rate": 3.7926414611613613e-06, "loss": 0.0, "num_input_tokens_seen": 5686504, "step": 13880 }, { "epoch": 16.809927360774818, "grad_norm": 7.494704732380342e-06, "learning_rate": 3.7786672908142444e-06, "loss": 0.0, "num_input_tokens_seen": 5688488, "step": 13885 }, { "epoch": 16.81598062953995, "grad_norm": 8.016674655664247e-06, "learning_rate": 3.7647168073145806e-06, "loss": 0.0, "num_input_tokens_seen": 5690792, "step": 13890 }, { "epoch": 16.822033898305087, "grad_norm": 1.1048604392271955e-05, "learning_rate": 3.750790026233625e-06, "loss": 0.0, "num_input_tokens_seen": 5692744, "step": 13895 }, { "epoch": 16.82808716707022, "grad_norm": 8.049228199524805e-06, "learning_rate": 3.7368869631162095e-06, "loss": 0.0, "num_input_tokens_seen": 5694792, "step": 13900 }, { "epoch": 16.83414043583535, "grad_norm": 9.77108720690012e-06, "learning_rate": 3.723007633480638e-06, "loss": 0.0, "num_input_tokens_seen": 5696904, "step": 13905 }, { "epoch": 16.840193704600484, "grad_norm": 9.997363122238312e-06, "learning_rate": 3.709152052818776e-06, "loss": 0.0, "num_input_tokens_seen": 5698920, "step": 13910 }, { "epoch": 16.846246973365616, "grad_norm": 3.0075168979237787e-05, "learning_rate": 3.695320236595956e-06, "loss": 0.0, "num_input_tokens_seen": 5700904, "step": 13915 }, { "epoch": 16.852300242130752, "grad_norm": 6.76576519254013e-06, "learning_rate": 3.681512200250972e-06, "loss": 0.0, "num_input_tokens_seen": 5702984, "step": 13920 }, { "epoch": 16.858353510895885, "grad_norm": 8.913545570976567e-06, "learning_rate": 3.66772795919611e-06, "loss": 0.0, "num_input_tokens_seen": 5704936, "step": 13925 }, { "epoch": 16.864406779661017, "grad_norm": 1.7445448975195177e-05, "learning_rate": 3.653967528817068e-06, "loss": 0.0, "num_input_tokens_seen": 5707016, "step": 13930 }, { "epoch": 16.87046004842615, "grad_norm": 4.2603269321261905e-06, "learning_rate": 3.640230924472979e-06, "loss": 0.0, "num_input_tokens_seen": 5709064, "step": 13935 }, { "epoch": 16.876513317191282, "grad_norm": 8.08206186775351e-06, "learning_rate": 3.6265181614963783e-06, "loss": 0.0, "num_input_tokens_seen": 5711016, "step": 13940 }, { "epoch": 16.88256658595642, "grad_norm": 6.404125088010915e-06, "learning_rate": 3.6128292551931924e-06, "loss": 0.0, "num_input_tokens_seen": 5712968, "step": 13945 }, { "epoch": 16.88861985472155, "grad_norm": 5.236939159658505e-06, "learning_rate": 3.599164220842707e-06, "loss": 0.0, "num_input_tokens_seen": 5714984, "step": 13950 }, { "epoch": 16.894673123486683, "grad_norm": 4.780110884894384e-06, "learning_rate": 3.585523073697597e-06, "loss": 0.0, "num_input_tokens_seen": 5717064, "step": 13955 }, { "epoch": 16.900726392251816, "grad_norm": 4.901911324850516e-06, "learning_rate": 3.5719058289838262e-06, "loss": 0.0, "num_input_tokens_seen": 5719272, "step": 13960 }, { "epoch": 16.906779661016948, "grad_norm": 1.842774508986622e-05, "learning_rate": 3.558312501900718e-06, "loss": 0.0, "num_input_tokens_seen": 5721320, "step": 13965 }, { "epoch": 16.912832929782084, "grad_norm": 4.89374651806429e-05, "learning_rate": 3.544743107620882e-06, "loss": 0.0, "num_input_tokens_seen": 5723336, "step": 13970 }, { "epoch": 16.918886198547217, "grad_norm": 1.8757255020318553e-05, "learning_rate": 3.531197661290217e-06, "loss": 0.0, "num_input_tokens_seen": 5725192, "step": 13975 }, { "epoch": 16.92493946731235, "grad_norm": 2.6497718863538466e-05, "learning_rate": 3.5176761780278928e-06, "loss": 0.0, "num_input_tokens_seen": 5727304, "step": 13980 }, { "epoch": 16.93099273607748, "grad_norm": 5.933227839705069e-06, "learning_rate": 3.504178672926331e-06, "loss": 0.0, "num_input_tokens_seen": 5729352, "step": 13985 }, { "epoch": 16.937046004842614, "grad_norm": 6.118673354649218e-06, "learning_rate": 3.4907051610511886e-06, "loss": 0.0, "num_input_tokens_seen": 5731400, "step": 13990 }, { "epoch": 16.94309927360775, "grad_norm": 9.365097866975702e-06, "learning_rate": 3.4772556574413475e-06, "loss": 0.0, "num_input_tokens_seen": 5733448, "step": 13995 }, { "epoch": 16.949152542372882, "grad_norm": 1.0836102774192113e-05, "learning_rate": 3.4638301771088817e-06, "loss": 0.0, "num_input_tokens_seen": 5735432, "step": 14000 }, { "epoch": 16.955205811138015, "grad_norm": 1.3326977750693914e-05, "learning_rate": 3.4504287350390603e-06, "loss": 0.0, "num_input_tokens_seen": 5737544, "step": 14005 }, { "epoch": 16.961259079903147, "grad_norm": 6.387172561517218e-06, "learning_rate": 3.4370513461903177e-06, "loss": 0.0, "num_input_tokens_seen": 5739624, "step": 14010 }, { "epoch": 16.96731234866828, "grad_norm": 5.546640295506222e-06, "learning_rate": 3.4236980254942347e-06, "loss": 0.0, "num_input_tokens_seen": 5741608, "step": 14015 }, { "epoch": 16.973365617433416, "grad_norm": 7.120559985196451e-06, "learning_rate": 3.4103687878555473e-06, "loss": 0.0, "num_input_tokens_seen": 5743592, "step": 14020 }, { "epoch": 16.979418886198548, "grad_norm": 1.4494214156002272e-05, "learning_rate": 3.3970636481520766e-06, "loss": 0.0, "num_input_tokens_seen": 5745384, "step": 14025 }, { "epoch": 16.98547215496368, "grad_norm": 4.20045580540318e-06, "learning_rate": 3.383782621234785e-06, "loss": 0.0, "num_input_tokens_seen": 5747368, "step": 14030 }, { "epoch": 16.991525423728813, "grad_norm": 5.718556167266797e-06, "learning_rate": 3.370525721927695e-06, "loss": 0.0, "num_input_tokens_seen": 5749192, "step": 14035 }, { "epoch": 16.997578692493946, "grad_norm": 1.1010358321073e-05, "learning_rate": 3.3572929650279028e-06, "loss": 0.0, "num_input_tokens_seen": 5751112, "step": 14040 }, { "epoch": 17.0, "eval_loss": 0.46613237261772156, "eval_runtime": 4.9547, "eval_samples_per_second": 74.071, "eval_steps_per_second": 18.568, "num_input_tokens_seen": 5751600, "step": 14042 }, { "epoch": 17.00363196125908, "grad_norm": 3.352160638314672e-05, "learning_rate": 3.344084365305561e-06, "loss": 0.0, "num_input_tokens_seen": 5752848, "step": 14045 }, { "epoch": 17.009685230024214, "grad_norm": 5.492397121997783e-06, "learning_rate": 3.330899937503859e-06, "loss": 0.0, "num_input_tokens_seen": 5755056, "step": 14050 }, { "epoch": 17.015738498789347, "grad_norm": 2.2616208298131824e-05, "learning_rate": 3.317739696338995e-06, "loss": 0.0, "num_input_tokens_seen": 5756976, "step": 14055 }, { "epoch": 17.02179176755448, "grad_norm": 7.3660908128658775e-06, "learning_rate": 3.304603656500202e-06, "loss": 0.0, "num_input_tokens_seen": 5758928, "step": 14060 }, { "epoch": 17.02784503631961, "grad_norm": 5.7747415667108726e-06, "learning_rate": 3.2914918326496486e-06, "loss": 0.0, "num_input_tokens_seen": 5761008, "step": 14065 }, { "epoch": 17.033898305084747, "grad_norm": 2.4840312107698992e-05, "learning_rate": 3.278404239422522e-06, "loss": 0.0, "num_input_tokens_seen": 5763152, "step": 14070 }, { "epoch": 17.03995157384988, "grad_norm": 6.00325665800483e-06, "learning_rate": 3.265340891426946e-06, "loss": 0.0, "num_input_tokens_seen": 5765200, "step": 14075 }, { "epoch": 17.046004842615012, "grad_norm": 6.001612746331375e-06, "learning_rate": 3.2523018032439625e-06, "loss": 0.0, "num_input_tokens_seen": 5767248, "step": 14080 }, { "epoch": 17.052058111380145, "grad_norm": 3.912618194590323e-05, "learning_rate": 3.239286989427573e-06, "loss": 0.0, "num_input_tokens_seen": 5769264, "step": 14085 }, { "epoch": 17.058111380145277, "grad_norm": 9.537845471641049e-06, "learning_rate": 3.2262964645046553e-06, "loss": 0.0, "num_input_tokens_seen": 5771408, "step": 14090 }, { "epoch": 17.064164648910413, "grad_norm": 4.9774594117479865e-06, "learning_rate": 3.213330242974988e-06, "loss": 0.0, "num_input_tokens_seen": 5773488, "step": 14095 }, { "epoch": 17.070217917675546, "grad_norm": 5.739432708651293e-06, "learning_rate": 3.2003883393112194e-06, "loss": 0.0, "num_input_tokens_seen": 5775600, "step": 14100 }, { "epoch": 17.076271186440678, "grad_norm": 1.589114617672749e-05, "learning_rate": 3.187470767958858e-06, "loss": 0.0, "num_input_tokens_seen": 5777712, "step": 14105 }, { "epoch": 17.08232445520581, "grad_norm": 7.572838967462303e-06, "learning_rate": 3.1745775433362433e-06, "loss": 0.0, "num_input_tokens_seen": 5779600, "step": 14110 }, { "epoch": 17.088377723970943, "grad_norm": 0.00042676524026319385, "learning_rate": 3.161708679834566e-06, "loss": 0.0, "num_input_tokens_seen": 5781648, "step": 14115 }, { "epoch": 17.09443099273608, "grad_norm": 3.667983037303202e-05, "learning_rate": 3.1488641918177826e-06, "loss": 0.0, "num_input_tokens_seen": 5783664, "step": 14120 }, { "epoch": 17.10048426150121, "grad_norm": 7.842563718440942e-06, "learning_rate": 3.1360440936226894e-06, "loss": 0.0, "num_input_tokens_seen": 5785712, "step": 14125 }, { "epoch": 17.106537530266344, "grad_norm": 5.22989876117208e-06, "learning_rate": 3.1232483995588117e-06, "loss": 0.0, "num_input_tokens_seen": 5787856, "step": 14130 }, { "epoch": 17.112590799031477, "grad_norm": 6.219146143848775e-06, "learning_rate": 3.1104771239084787e-06, "loss": 0.0, "num_input_tokens_seen": 5789968, "step": 14135 }, { "epoch": 17.11864406779661, "grad_norm": 1.0063869922305457e-05, "learning_rate": 3.097730280926736e-06, "loss": 0.0, "num_input_tokens_seen": 5792112, "step": 14140 }, { "epoch": 17.124697336561745, "grad_norm": 1.302744112763321e-05, "learning_rate": 3.0850078848413704e-06, "loss": 0.0, "num_input_tokens_seen": 5794160, "step": 14145 }, { "epoch": 17.130750605326877, "grad_norm": 4.994799837731989e-06, "learning_rate": 3.072309949852878e-06, "loss": 0.0, "num_input_tokens_seen": 5796240, "step": 14150 }, { "epoch": 17.13680387409201, "grad_norm": 4.891924618277699e-06, "learning_rate": 3.0596364901344486e-06, "loss": 0.0, "num_input_tokens_seen": 5798320, "step": 14155 }, { "epoch": 17.142857142857142, "grad_norm": 2.4674658561707474e-05, "learning_rate": 3.046987519831962e-06, "loss": 0.0, "num_input_tokens_seen": 5800368, "step": 14160 }, { "epoch": 17.148910411622275, "grad_norm": 1.2645058632188011e-05, "learning_rate": 3.0343630530639517e-06, "loss": 0.0, "num_input_tokens_seen": 5802576, "step": 14165 }, { "epoch": 17.15496368038741, "grad_norm": 1.1866854947584216e-05, "learning_rate": 3.0217631039216126e-06, "loss": 0.0, "num_input_tokens_seen": 5804624, "step": 14170 }, { "epoch": 17.161016949152543, "grad_norm": 5.8674208958109375e-06, "learning_rate": 3.009187686468762e-06, "loss": 0.0, "num_input_tokens_seen": 5806672, "step": 14175 }, { "epoch": 17.167070217917676, "grad_norm": 6.438746822823305e-06, "learning_rate": 2.9966368147418515e-06, "loss": 0.0, "num_input_tokens_seen": 5808560, "step": 14180 }, { "epoch": 17.173123486682808, "grad_norm": 1.1317239113850519e-05, "learning_rate": 2.9841105027499123e-06, "loss": 0.0, "num_input_tokens_seen": 5810576, "step": 14185 }, { "epoch": 17.17917675544794, "grad_norm": 4.272588557796553e-05, "learning_rate": 2.971608764474584e-06, "loss": 0.0, "num_input_tokens_seen": 5812656, "step": 14190 }, { "epoch": 17.185230024213077, "grad_norm": 5.832577244291315e-06, "learning_rate": 2.9591316138700697e-06, "loss": 0.0, "num_input_tokens_seen": 5814736, "step": 14195 }, { "epoch": 17.19128329297821, "grad_norm": 5.152377525519114e-06, "learning_rate": 2.9466790648631227e-06, "loss": 0.0, "num_input_tokens_seen": 5816880, "step": 14200 }, { "epoch": 17.19733656174334, "grad_norm": 0.0007226219167932868, "learning_rate": 2.9342511313530415e-06, "loss": 0.0, "num_input_tokens_seen": 5818896, "step": 14205 }, { "epoch": 17.203389830508474, "grad_norm": 7.593265308969421e-06, "learning_rate": 2.921847827211649e-06, "loss": 0.0, "num_input_tokens_seen": 5820944, "step": 14210 }, { "epoch": 17.209443099273606, "grad_norm": 4.502644060266903e-06, "learning_rate": 2.9094691662832766e-06, "loss": 0.0, "num_input_tokens_seen": 5823056, "step": 14215 }, { "epoch": 17.215496368038743, "grad_norm": 5.5113559938035905e-06, "learning_rate": 2.8971151623847587e-06, "loss": 0.0, "num_input_tokens_seen": 5824944, "step": 14220 }, { "epoch": 17.221549636803875, "grad_norm": 5.471664280776167e-06, "learning_rate": 2.884785829305381e-06, "loss": 0.0, "num_input_tokens_seen": 5827024, "step": 14225 }, { "epoch": 17.227602905569007, "grad_norm": 7.518170150433434e-06, "learning_rate": 2.8724811808069267e-06, "loss": 0.0, "num_input_tokens_seen": 5828976, "step": 14230 }, { "epoch": 17.23365617433414, "grad_norm": 2.1798410671181045e-05, "learning_rate": 2.8602012306236104e-06, "loss": 0.0, "num_input_tokens_seen": 5831024, "step": 14235 }, { "epoch": 17.239709443099272, "grad_norm": 5.367363428376848e-06, "learning_rate": 2.8479459924620644e-06, "loss": 0.0, "num_input_tokens_seen": 5833104, "step": 14240 }, { "epoch": 17.24576271186441, "grad_norm": 6.498288712464273e-05, "learning_rate": 2.8357154800013663e-06, "loss": 0.0, "num_input_tokens_seen": 5835280, "step": 14245 }, { "epoch": 17.25181598062954, "grad_norm": 2.0462457541725598e-05, "learning_rate": 2.8235097068929684e-06, "loss": 0.0, "num_input_tokens_seen": 5837424, "step": 14250 }, { "epoch": 17.257869249394673, "grad_norm": 5.1706319936783984e-06, "learning_rate": 2.8113286867607336e-06, "loss": 0.0, "num_input_tokens_seen": 5839440, "step": 14255 }, { "epoch": 17.263922518159806, "grad_norm": 6.318095984170213e-06, "learning_rate": 2.7991724332008807e-06, "loss": 0.0, "num_input_tokens_seen": 5841520, "step": 14260 }, { "epoch": 17.269975786924938, "grad_norm": 4.13737325288821e-06, "learning_rate": 2.7870409597819865e-06, "loss": 0.0, "num_input_tokens_seen": 5843664, "step": 14265 }, { "epoch": 17.276029055690074, "grad_norm": 1.456629343010718e-05, "learning_rate": 2.7749342800449746e-06, "loss": 0.0, "num_input_tokens_seen": 5845744, "step": 14270 }, { "epoch": 17.282082324455207, "grad_norm": 0.0003695138148032129, "learning_rate": 2.762852407503086e-06, "loss": 0.0, "num_input_tokens_seen": 5847760, "step": 14275 }, { "epoch": 17.28813559322034, "grad_norm": 5.872701422049431e-06, "learning_rate": 2.750795355641872e-06, "loss": 0.0, "num_input_tokens_seen": 5849616, "step": 14280 }, { "epoch": 17.29418886198547, "grad_norm": 6.075230885471683e-06, "learning_rate": 2.7387631379192035e-06, "loss": 0.0, "num_input_tokens_seen": 5851696, "step": 14285 }, { "epoch": 17.300242130750604, "grad_norm": 3.294998896308243e-05, "learning_rate": 2.726755767765188e-06, "loss": 0.0, "num_input_tokens_seen": 5853648, "step": 14290 }, { "epoch": 17.30629539951574, "grad_norm": 5.047607828601031e-06, "learning_rate": 2.714773258582243e-06, "loss": 0.0, "num_input_tokens_seen": 5855696, "step": 14295 }, { "epoch": 17.312348668280872, "grad_norm": 4.404371793498285e-06, "learning_rate": 2.7028156237450085e-06, "loss": 0.0, "num_input_tokens_seen": 5857744, "step": 14300 }, { "epoch": 17.318401937046005, "grad_norm": 5.114361556479707e-06, "learning_rate": 2.6908828766003703e-06, "loss": 0.0, "num_input_tokens_seen": 5859920, "step": 14305 }, { "epoch": 17.324455205811137, "grad_norm": 4.140638338867575e-06, "learning_rate": 2.6789750304674317e-06, "loss": 0.0, "num_input_tokens_seen": 5862000, "step": 14310 }, { "epoch": 17.33050847457627, "grad_norm": 2.3171247448772192e-05, "learning_rate": 2.667092098637505e-06, "loss": 0.0, "num_input_tokens_seen": 5863920, "step": 14315 }, { "epoch": 17.336561743341406, "grad_norm": 5.1723823162319604e-06, "learning_rate": 2.6552340943740957e-06, "loss": 0.0, "num_input_tokens_seen": 5865936, "step": 14320 }, { "epoch": 17.34261501210654, "grad_norm": 7.107015335350297e-06, "learning_rate": 2.643401030912876e-06, "loss": 0.0, "num_input_tokens_seen": 5867952, "step": 14325 }, { "epoch": 17.34866828087167, "grad_norm": 7.255977379827527e-06, "learning_rate": 2.6315929214616926e-06, "loss": 0.0, "num_input_tokens_seen": 5869968, "step": 14330 }, { "epoch": 17.354721549636803, "grad_norm": 5.473132205224829e-06, "learning_rate": 2.619809779200522e-06, "loss": 0.0, "num_input_tokens_seen": 5872048, "step": 14335 }, { "epoch": 17.360774818401936, "grad_norm": 1.8516289856052026e-05, "learning_rate": 2.608051617281501e-06, "loss": 0.0, "num_input_tokens_seen": 5874096, "step": 14340 }, { "epoch": 17.36682808716707, "grad_norm": 7.841345905035269e-06, "learning_rate": 2.596318448828844e-06, "loss": 0.0, "num_input_tokens_seen": 5876112, "step": 14345 }, { "epoch": 17.372881355932204, "grad_norm": 5.9589142438198905e-06, "learning_rate": 2.5846102869389073e-06, "loss": 0.0, "num_input_tokens_seen": 5878128, "step": 14350 }, { "epoch": 17.378934624697337, "grad_norm": 1.1241218999202829e-05, "learning_rate": 2.572927144680112e-06, "loss": 0.0, "num_input_tokens_seen": 5880304, "step": 14355 }, { "epoch": 17.38498789346247, "grad_norm": 5.4165261644811835e-06, "learning_rate": 2.5612690350929614e-06, "loss": 0.0, "num_input_tokens_seen": 5882384, "step": 14360 }, { "epoch": 17.3910411622276, "grad_norm": 5.8015252761833835e-06, "learning_rate": 2.5496359711900118e-06, "loss": 0.0, "num_input_tokens_seen": 5884496, "step": 14365 }, { "epoch": 17.397094430992738, "grad_norm": 1.2961439097125549e-05, "learning_rate": 2.538027965955872e-06, "loss": 0.0, "num_input_tokens_seen": 5886640, "step": 14370 }, { "epoch": 17.40314769975787, "grad_norm": 0.00036930551868863404, "learning_rate": 2.526445032347166e-06, "loss": 0.0, "num_input_tokens_seen": 5888496, "step": 14375 }, { "epoch": 17.409200968523002, "grad_norm": 5.839758159709163e-06, "learning_rate": 2.514887183292561e-06, "loss": 0.0, "num_input_tokens_seen": 5890544, "step": 14380 }, { "epoch": 17.415254237288135, "grad_norm": 5.764529305452015e-06, "learning_rate": 2.503354431692689e-06, "loss": 0.0, "num_input_tokens_seen": 5892496, "step": 14385 }, { "epoch": 17.421307506053267, "grad_norm": 6.773770110157784e-06, "learning_rate": 2.49184679042021e-06, "loss": 0.0, "num_input_tokens_seen": 5894512, "step": 14390 }, { "epoch": 17.427360774818403, "grad_norm": 7.90959984442452e-06, "learning_rate": 2.480364272319713e-06, "loss": 0.0, "num_input_tokens_seen": 5896560, "step": 14395 }, { "epoch": 17.433414043583536, "grad_norm": 6.338773346215021e-06, "learning_rate": 2.46890689020777e-06, "loss": 0.0, "num_input_tokens_seen": 5898640, "step": 14400 }, { "epoch": 17.43946731234867, "grad_norm": 8.328603871632367e-05, "learning_rate": 2.4574746568729074e-06, "loss": 0.0, "num_input_tokens_seen": 5900656, "step": 14405 }, { "epoch": 17.4455205811138, "grad_norm": 6.060063242330216e-06, "learning_rate": 2.4460675850755454e-06, "loss": 0.0, "num_input_tokens_seen": 5902768, "step": 14410 }, { "epoch": 17.451573849878933, "grad_norm": 5.240221526037203e-06, "learning_rate": 2.434685687548058e-06, "loss": 0.0, "num_input_tokens_seen": 5904816, "step": 14415 }, { "epoch": 17.45762711864407, "grad_norm": 5.287778094498208e-06, "learning_rate": 2.423328976994693e-06, "loss": 0.0, "num_input_tokens_seen": 5906832, "step": 14420 }, { "epoch": 17.4636803874092, "grad_norm": 5.864750619366532e-06, "learning_rate": 2.411997466091598e-06, "loss": 0.0, "num_input_tokens_seen": 5908816, "step": 14425 }, { "epoch": 17.469733656174334, "grad_norm": 4.997003088647034e-06, "learning_rate": 2.4006911674867877e-06, "loss": 0.0, "num_input_tokens_seen": 5910832, "step": 14430 }, { "epoch": 17.475786924939467, "grad_norm": 4.517314664553851e-06, "learning_rate": 2.3894100938001374e-06, "loss": 0.0, "num_input_tokens_seen": 5912784, "step": 14435 }, { "epoch": 17.4818401937046, "grad_norm": 7.264010037033586e-06, "learning_rate": 2.3781542576233632e-06, "loss": 0.0, "num_input_tokens_seen": 5914832, "step": 14440 }, { "epoch": 17.487893462469735, "grad_norm": 0.006111954804509878, "learning_rate": 2.36692367152003e-06, "loss": 0.0, "num_input_tokens_seen": 5916912, "step": 14445 }, { "epoch": 17.493946731234868, "grad_norm": 5.479711489897454e-06, "learning_rate": 2.355718348025482e-06, "loss": 0.0, "num_input_tokens_seen": 5918896, "step": 14450 }, { "epoch": 17.5, "grad_norm": 5.759713076258777e-06, "learning_rate": 2.344538299646906e-06, "loss": 0.0, "num_input_tokens_seen": 5920912, "step": 14455 }, { "epoch": 17.506053268765132, "grad_norm": 3.964863208238967e-05, "learning_rate": 2.3333835388632513e-06, "loss": 0.0, "num_input_tokens_seen": 5923056, "step": 14460 }, { "epoch": 17.512106537530265, "grad_norm": 1.833251371863298e-05, "learning_rate": 2.3222540781252493e-06, "loss": 0.0, "num_input_tokens_seen": 5925168, "step": 14465 }, { "epoch": 17.5181598062954, "grad_norm": 4.268659267836483e-06, "learning_rate": 2.311149929855397e-06, "loss": 0.0, "num_input_tokens_seen": 5927376, "step": 14470 }, { "epoch": 17.524213075060533, "grad_norm": 5.040976702730404e-06, "learning_rate": 2.3000711064479265e-06, "loss": 0.0, "num_input_tokens_seen": 5929392, "step": 14475 }, { "epoch": 17.530266343825666, "grad_norm": 0.00031603677780367434, "learning_rate": 2.289017620268813e-06, "loss": 0.0, "num_input_tokens_seen": 5931344, "step": 14480 }, { "epoch": 17.5363196125908, "grad_norm": 2.475289693393279e-05, "learning_rate": 2.2779894836557485e-06, "loss": 0.0, "num_input_tokens_seen": 5933520, "step": 14485 }, { "epoch": 17.54237288135593, "grad_norm": 4.218406502332073e-06, "learning_rate": 2.266986708918126e-06, "loss": 0.0, "num_input_tokens_seen": 5935696, "step": 14490 }, { "epoch": 17.548426150121067, "grad_norm": 4.737042218039278e-06, "learning_rate": 2.2560093083370314e-06, "loss": 0.0, "num_input_tokens_seen": 5937648, "step": 14495 }, { "epoch": 17.5544794188862, "grad_norm": 6.4104101511475164e-06, "learning_rate": 2.2450572941652428e-06, "loss": 0.0, "num_input_tokens_seen": 5939632, "step": 14500 }, { "epoch": 17.56053268765133, "grad_norm": 0.0016662953421473503, "learning_rate": 2.2341306786271695e-06, "loss": 0.0, "num_input_tokens_seen": 5941648, "step": 14505 }, { "epoch": 17.566585956416464, "grad_norm": 1.1226276910747401e-05, "learning_rate": 2.2232294739189086e-06, "loss": 0.0, "num_input_tokens_seen": 5943664, "step": 14510 }, { "epoch": 17.572639225181597, "grad_norm": 1.650213380344212e-05, "learning_rate": 2.212353692208172e-06, "loss": 0.0, "num_input_tokens_seen": 5945680, "step": 14515 }, { "epoch": 17.578692493946733, "grad_norm": 6.114630650699837e-06, "learning_rate": 2.2015033456342983e-06, "loss": 0.0, "num_input_tokens_seen": 5947664, "step": 14520 }, { "epoch": 17.584745762711865, "grad_norm": 0.00013296687393449247, "learning_rate": 2.190678446308239e-06, "loss": 0.0, "num_input_tokens_seen": 5949744, "step": 14525 }, { "epoch": 17.590799031476998, "grad_norm": 6.810721970396116e-06, "learning_rate": 2.1798790063125407e-06, "loss": 0.0, "num_input_tokens_seen": 5951728, "step": 14530 }, { "epoch": 17.59685230024213, "grad_norm": 1.0595490493869875e-05, "learning_rate": 2.1691050377013328e-06, "loss": 0.0, "num_input_tokens_seen": 5953808, "step": 14535 }, { "epoch": 17.602905569007262, "grad_norm": 5.673636678693583e-06, "learning_rate": 2.1583565525003126e-06, "loss": 0.0, "num_input_tokens_seen": 5955824, "step": 14540 }, { "epoch": 17.6089588377724, "grad_norm": 1.3226526789367199e-05, "learning_rate": 2.1476335627067355e-06, "loss": 0.0, "num_input_tokens_seen": 5957840, "step": 14545 }, { "epoch": 17.61501210653753, "grad_norm": 5.922338914388092e-06, "learning_rate": 2.1369360802894005e-06, "loss": 0.0, "num_input_tokens_seen": 5959824, "step": 14550 }, { "epoch": 17.621065375302663, "grad_norm": 5.601073553407332e-06, "learning_rate": 2.126264117188631e-06, "loss": 0.0, "num_input_tokens_seen": 5961936, "step": 14555 }, { "epoch": 17.627118644067796, "grad_norm": 5.164901267562527e-06, "learning_rate": 2.1156176853162624e-06, "loss": 0.0, "num_input_tokens_seen": 5964048, "step": 14560 }, { "epoch": 17.63317191283293, "grad_norm": 6.307872808974935e-06, "learning_rate": 2.1049967965556583e-06, "loss": 0.0, "num_input_tokens_seen": 5966032, "step": 14565 }, { "epoch": 17.639225181598064, "grad_norm": 7.401348284474807e-06, "learning_rate": 2.094401462761633e-06, "loss": 0.0, "num_input_tokens_seen": 5968144, "step": 14570 }, { "epoch": 17.645278450363197, "grad_norm": 1.0061715329356957e-05, "learning_rate": 2.0838316957605074e-06, "loss": 0.0, "num_input_tokens_seen": 5970096, "step": 14575 }, { "epoch": 17.65133171912833, "grad_norm": 1.5564579371130094e-05, "learning_rate": 2.0732875073500497e-06, "loss": 0.0, "num_input_tokens_seen": 5972080, "step": 14580 }, { "epoch": 17.65738498789346, "grad_norm": 4.947223715134896e-06, "learning_rate": 2.062768909299484e-06, "loss": 0.0, "num_input_tokens_seen": 5974192, "step": 14585 }, { "epoch": 17.663438256658594, "grad_norm": 4.544495368463686e-06, "learning_rate": 2.052275913349469e-06, "loss": 0.0, "num_input_tokens_seen": 5976272, "step": 14590 }, { "epoch": 17.66949152542373, "grad_norm": 4.693577011494199e-06, "learning_rate": 2.041808531212086e-06, "loss": 0.0, "num_input_tokens_seen": 5978384, "step": 14595 }, { "epoch": 17.675544794188863, "grad_norm": 8.211035492422525e-06, "learning_rate": 2.0313667745708254e-06, "loss": 0.0, "num_input_tokens_seen": 5980464, "step": 14600 }, { "epoch": 17.681598062953995, "grad_norm": 4.468757197173545e-06, "learning_rate": 2.0209506550805863e-06, "loss": 0.0, "num_input_tokens_seen": 5982576, "step": 14605 }, { "epoch": 17.687651331719128, "grad_norm": 4.656430519389687e-06, "learning_rate": 2.010560184367627e-06, "loss": 0.0, "num_input_tokens_seen": 5984560, "step": 14610 }, { "epoch": 17.69370460048426, "grad_norm": 1.749349758028984e-05, "learning_rate": 2.000195374029609e-06, "loss": 0.0, "num_input_tokens_seen": 5986576, "step": 14615 }, { "epoch": 17.699757869249396, "grad_norm": 7.04316471455968e-06, "learning_rate": 1.9898562356355288e-06, "loss": 0.0, "num_input_tokens_seen": 5988592, "step": 14620 }, { "epoch": 17.70581113801453, "grad_norm": 4.546790933090961e-06, "learning_rate": 1.979542780725738e-06, "loss": 0.0, "num_input_tokens_seen": 5990544, "step": 14625 }, { "epoch": 17.71186440677966, "grad_norm": 0.003702257527038455, "learning_rate": 1.9692550208119135e-06, "loss": 0.0, "num_input_tokens_seen": 5992496, "step": 14630 }, { "epoch": 17.717917675544793, "grad_norm": 4.358506885182578e-06, "learning_rate": 1.958992967377066e-06, "loss": 0.0, "num_input_tokens_seen": 5994608, "step": 14635 }, { "epoch": 17.723970944309926, "grad_norm": 4.56554562333622e-06, "learning_rate": 1.948756631875495e-06, "loss": 0.0, "num_input_tokens_seen": 5996752, "step": 14640 }, { "epoch": 17.730024213075062, "grad_norm": 2.1252466467558406e-05, "learning_rate": 1.938546025732807e-06, "loss": 0.0, "num_input_tokens_seen": 5998864, "step": 14645 }, { "epoch": 17.736077481840194, "grad_norm": 4.174635250819847e-06, "learning_rate": 1.92836116034589e-06, "loss": 0.0, "num_input_tokens_seen": 6001072, "step": 14650 }, { "epoch": 17.742130750605327, "grad_norm": 7.6082483246864285e-06, "learning_rate": 1.9182020470828843e-06, "loss": 0.0, "num_input_tokens_seen": 6003120, "step": 14655 }, { "epoch": 17.74818401937046, "grad_norm": 5.2930581659893505e-06, "learning_rate": 1.908068697283219e-06, "loss": 0.0, "num_input_tokens_seen": 6005168, "step": 14660 }, { "epoch": 17.75423728813559, "grad_norm": 5.720949047827162e-06, "learning_rate": 1.8979611222575244e-06, "loss": 0.0, "num_input_tokens_seen": 6007152, "step": 14665 }, { "epoch": 17.760290556900728, "grad_norm": 5.340027655620361e-06, "learning_rate": 1.8878793332877037e-06, "loss": 0.0, "num_input_tokens_seen": 6009264, "step": 14670 }, { "epoch": 17.76634382566586, "grad_norm": 1.9422415789449587e-05, "learning_rate": 1.8778233416268388e-06, "loss": 0.0, "num_input_tokens_seen": 6011248, "step": 14675 }, { "epoch": 17.772397094430993, "grad_norm": 0.00010977911006193608, "learning_rate": 1.867793158499248e-06, "loss": 0.0, "num_input_tokens_seen": 6013360, "step": 14680 }, { "epoch": 17.778450363196125, "grad_norm": 5.814461474074051e-06, "learning_rate": 1.8577887951004263e-06, "loss": 0.0, "num_input_tokens_seen": 6015376, "step": 14685 }, { "epoch": 17.784503631961257, "grad_norm": 7.257460765686119e-06, "learning_rate": 1.8478102625970577e-06, "loss": 0.0, "num_input_tokens_seen": 6017424, "step": 14690 }, { "epoch": 17.790556900726394, "grad_norm": 5.745132511947304e-06, "learning_rate": 1.8378575721269858e-06, "loss": 0.0, "num_input_tokens_seen": 6019440, "step": 14695 }, { "epoch": 17.796610169491526, "grad_norm": 6.239656158868456e-06, "learning_rate": 1.8279307347992158e-06, "loss": 0.0, "num_input_tokens_seen": 6021424, "step": 14700 }, { "epoch": 17.80266343825666, "grad_norm": 5.889959993510274e-06, "learning_rate": 1.8180297616938928e-06, "loss": 0.0, "num_input_tokens_seen": 6023408, "step": 14705 }, { "epoch": 17.80871670702179, "grad_norm": 3.6484707379713655e-05, "learning_rate": 1.8081546638622993e-06, "loss": 0.0, "num_input_tokens_seen": 6025424, "step": 14710 }, { "epoch": 17.814769975786923, "grad_norm": 4.66615301775164e-06, "learning_rate": 1.798305452326826e-06, "loss": 0.0, "num_input_tokens_seen": 6027536, "step": 14715 }, { "epoch": 17.82082324455206, "grad_norm": 5.064895958639681e-06, "learning_rate": 1.7884821380809741e-06, "loss": 0.0, "num_input_tokens_seen": 6029520, "step": 14720 }, { "epoch": 17.826876513317192, "grad_norm": 5.443392183224205e-06, "learning_rate": 1.7786847320893502e-06, "loss": 0.0, "num_input_tokens_seen": 6031472, "step": 14725 }, { "epoch": 17.832929782082324, "grad_norm": 8.176685696525965e-06, "learning_rate": 1.7689132452876179e-06, "loss": 0.0, "num_input_tokens_seen": 6033552, "step": 14730 }, { "epoch": 17.838983050847457, "grad_norm": 4.511962742981268e-06, "learning_rate": 1.7591676885825358e-06, "loss": 0.0, "num_input_tokens_seen": 6035568, "step": 14735 }, { "epoch": 17.84503631961259, "grad_norm": 7.474989160982659e-06, "learning_rate": 1.7494480728519057e-06, "loss": 0.0, "num_input_tokens_seen": 6037616, "step": 14740 }, { "epoch": 17.851089588377725, "grad_norm": 4.749404070025776e-06, "learning_rate": 1.739754408944577e-06, "loss": 0.0, "num_input_tokens_seen": 6039664, "step": 14745 }, { "epoch": 17.857142857142858, "grad_norm": 3.8007676721463213e-06, "learning_rate": 1.7300867076804312e-06, "loss": 0.0, "num_input_tokens_seen": 6041584, "step": 14750 }, { "epoch": 17.86319612590799, "grad_norm": 6.109760761319194e-06, "learning_rate": 1.7204449798503746e-06, "loss": 0.0, "num_input_tokens_seen": 6043600, "step": 14755 }, { "epoch": 17.869249394673123, "grad_norm": 4.1153158235829324e-05, "learning_rate": 1.7108292362163159e-06, "loss": 0.0, "num_input_tokens_seen": 6045776, "step": 14760 }, { "epoch": 17.875302663438255, "grad_norm": 6.8410217863856815e-06, "learning_rate": 1.7012394875111775e-06, "loss": 0.0, "num_input_tokens_seen": 6047888, "step": 14765 }, { "epoch": 17.88135593220339, "grad_norm": 5.113944098411594e-06, "learning_rate": 1.6916757444388365e-06, "loss": 0.0, "num_input_tokens_seen": 6050192, "step": 14770 }, { "epoch": 17.887409200968523, "grad_norm": 9.418968147656415e-06, "learning_rate": 1.682138017674173e-06, "loss": 0.0, "num_input_tokens_seen": 6052304, "step": 14775 }, { "epoch": 17.893462469733656, "grad_norm": 4.017128958366811e-06, "learning_rate": 1.672626317863013e-06, "loss": 0.0, "num_input_tokens_seen": 6054256, "step": 14780 }, { "epoch": 17.89951573849879, "grad_norm": 1.0371774806117173e-05, "learning_rate": 1.6631406556221334e-06, "loss": 0.0, "num_input_tokens_seen": 6056400, "step": 14785 }, { "epoch": 17.90556900726392, "grad_norm": 1.1602520316955633e-05, "learning_rate": 1.6536810415392511e-06, "loss": 0.0, "num_input_tokens_seen": 6058480, "step": 14790 }, { "epoch": 17.911622276029057, "grad_norm": 5.037716164224548e-06, "learning_rate": 1.6442474861730062e-06, "loss": 0.0, "num_input_tokens_seen": 6060560, "step": 14795 }, { "epoch": 17.91767554479419, "grad_norm": 5.021526249038288e-06, "learning_rate": 1.6348400000529563e-06, "loss": 0.0, "num_input_tokens_seen": 6062704, "step": 14800 }, { "epoch": 17.923728813559322, "grad_norm": 6.096875949879177e-06, "learning_rate": 1.6254585936795546e-06, "loss": 0.0, "num_input_tokens_seen": 6064720, "step": 14805 }, { "epoch": 17.929782082324454, "grad_norm": 0.000171928753843531, "learning_rate": 1.6161032775241503e-06, "loss": 0.0, "num_input_tokens_seen": 6066608, "step": 14810 }, { "epoch": 17.935835351089587, "grad_norm": 5.5958803386602085e-06, "learning_rate": 1.6067740620289678e-06, "loss": 0.0, "num_input_tokens_seen": 6068688, "step": 14815 }, { "epoch": 17.941888619854723, "grad_norm": 6.523522642964963e-06, "learning_rate": 1.5974709576071e-06, "loss": 0.0, "num_input_tokens_seen": 6070768, "step": 14820 }, { "epoch": 17.947941888619855, "grad_norm": 1.1913663911400363e-05, "learning_rate": 1.5881939746424906e-06, "loss": 0.0, "num_input_tokens_seen": 6072912, "step": 14825 }, { "epoch": 17.953995157384988, "grad_norm": 7.216044650704134e-06, "learning_rate": 1.5789431234899483e-06, "loss": 0.0, "num_input_tokens_seen": 6075024, "step": 14830 }, { "epoch": 17.96004842615012, "grad_norm": 8.572385013394523e-06, "learning_rate": 1.5697184144750714e-06, "loss": 0.0, "num_input_tokens_seen": 6077040, "step": 14835 }, { "epoch": 17.966101694915253, "grad_norm": 6.370222308760276e-06, "learning_rate": 1.5605198578943269e-06, "loss": 0.0, "num_input_tokens_seen": 6079056, "step": 14840 }, { "epoch": 17.97215496368039, "grad_norm": 5.223396783549106e-06, "learning_rate": 1.5513474640149572e-06, "loss": 0.0, "num_input_tokens_seen": 6081072, "step": 14845 }, { "epoch": 17.97820823244552, "grad_norm": 5.730475550080882e-06, "learning_rate": 1.5422012430750143e-06, "loss": 0.0002, "num_input_tokens_seen": 6083088, "step": 14850 }, { "epoch": 17.984261501210653, "grad_norm": 7.120058853615774e-06, "learning_rate": 1.5330812052833405e-06, "loss": 0.0, "num_input_tokens_seen": 6085232, "step": 14855 }, { "epoch": 17.990314769975786, "grad_norm": 4.9261188905802555e-06, "learning_rate": 1.5239873608195425e-06, "loss": 0.0, "num_input_tokens_seen": 6087344, "step": 14860 }, { "epoch": 17.99636803874092, "grad_norm": 5.776866601081565e-06, "learning_rate": 1.5149197198340015e-06, "loss": 0.0, "num_input_tokens_seen": 6089392, "step": 14865 }, { "epoch": 18.0, "eval_loss": 0.45604705810546875, "eval_runtime": 4.9469, "eval_samples_per_second": 74.188, "eval_steps_per_second": 18.598, "num_input_tokens_seen": 6090384, "step": 14868 }, { "epoch": 18.002421307506054, "grad_norm": 2.2912616259418428e-05, "learning_rate": 1.5058782924478431e-06, "loss": 0.0, "num_input_tokens_seen": 6091120, "step": 14870 }, { "epoch": 18.008474576271187, "grad_norm": 2.2799935322836973e-05, "learning_rate": 1.4968630887529339e-06, "loss": 0.0, "num_input_tokens_seen": 6093232, "step": 14875 }, { "epoch": 18.01452784503632, "grad_norm": 6.33965510132839e-06, "learning_rate": 1.4878741188118744e-06, "loss": 0.0, "num_input_tokens_seen": 6095472, "step": 14880 }, { "epoch": 18.020581113801452, "grad_norm": 5.704115665139398e-06, "learning_rate": 1.478911392657989e-06, "loss": 0.0, "num_input_tokens_seen": 6097456, "step": 14885 }, { "epoch": 18.026634382566584, "grad_norm": 8.313199032272678e-06, "learning_rate": 1.469974920295289e-06, "loss": 0.0, "num_input_tokens_seen": 6099472, "step": 14890 }, { "epoch": 18.03268765133172, "grad_norm": 6.888427833473543e-06, "learning_rate": 1.4610647116985037e-06, "loss": 0.0, "num_input_tokens_seen": 6101456, "step": 14895 }, { "epoch": 18.038740920096853, "grad_norm": 4.5799511099176016e-06, "learning_rate": 1.4521807768130364e-06, "loss": 0.0, "num_input_tokens_seen": 6103472, "step": 14900 }, { "epoch": 18.044794188861985, "grad_norm": 5.367709945858223e-06, "learning_rate": 1.4433231255549655e-06, "loss": 0.0, "num_input_tokens_seen": 6105648, "step": 14905 }, { "epoch": 18.050847457627118, "grad_norm": 6.819249392719939e-06, "learning_rate": 1.4344917678110303e-06, "loss": 0.0, "num_input_tokens_seen": 6107664, "step": 14910 }, { "epoch": 18.05690072639225, "grad_norm": 8.20863260742044e-06, "learning_rate": 1.4256867134386288e-06, "loss": 0.0, "num_input_tokens_seen": 6109648, "step": 14915 }, { "epoch": 18.062953995157386, "grad_norm": 7.957406523928512e-06, "learning_rate": 1.416907972265788e-06, "loss": 0.0, "num_input_tokens_seen": 6111824, "step": 14920 }, { "epoch": 18.06900726392252, "grad_norm": 1.1227474715269636e-05, "learning_rate": 1.408155554091184e-06, "loss": 0.0, "num_input_tokens_seen": 6113776, "step": 14925 }, { "epoch": 18.07506053268765, "grad_norm": 4.463447567104595e-06, "learning_rate": 1.3994294686840853e-06, "loss": 0.0, "num_input_tokens_seen": 6115856, "step": 14930 }, { "epoch": 18.081113801452783, "grad_norm": 4.675592663261341e-06, "learning_rate": 1.3907297257843898e-06, "loss": 0.0, "num_input_tokens_seen": 6117968, "step": 14935 }, { "epoch": 18.087167070217916, "grad_norm": 4.23339452026994e-06, "learning_rate": 1.3820563351025884e-06, "loss": 0.0, "num_input_tokens_seen": 6119952, "step": 14940 }, { "epoch": 18.093220338983052, "grad_norm": 1.6654474165989086e-05, "learning_rate": 1.3734093063197424e-06, "loss": 0.0, "num_input_tokens_seen": 6122000, "step": 14945 }, { "epoch": 18.099273607748184, "grad_norm": 7.3802575570880435e-06, "learning_rate": 1.3647886490875144e-06, "loss": 0.0, "num_input_tokens_seen": 6124048, "step": 14950 }, { "epoch": 18.105326876513317, "grad_norm": 4.701149919128511e-06, "learning_rate": 1.3561943730281052e-06, "loss": 0.0, "num_input_tokens_seen": 6125968, "step": 14955 }, { "epoch": 18.11138014527845, "grad_norm": 4.529350007942412e-06, "learning_rate": 1.3476264877342908e-06, "loss": 0.0, "num_input_tokens_seen": 6127952, "step": 14960 }, { "epoch": 18.11743341404358, "grad_norm": 4.423546670295764e-06, "learning_rate": 1.3390850027693802e-06, "loss": 0.0, "num_input_tokens_seen": 6129968, "step": 14965 }, { "epoch": 18.123486682808718, "grad_norm": 6.350829153234372e-06, "learning_rate": 1.3305699276672134e-06, "loss": 0.0, "num_input_tokens_seen": 6132048, "step": 14970 }, { "epoch": 18.12953995157385, "grad_norm": 6.090076112741372e-06, "learning_rate": 1.3220812719321601e-06, "loss": 0.0, "num_input_tokens_seen": 6134096, "step": 14975 }, { "epoch": 18.135593220338983, "grad_norm": 4.151344455749495e-06, "learning_rate": 1.3136190450390912e-06, "loss": 0.0, "num_input_tokens_seen": 6136208, "step": 14980 }, { "epoch": 18.141646489104115, "grad_norm": 7.424807790812338e-06, "learning_rate": 1.3051832564333815e-06, "loss": 0.0, "num_input_tokens_seen": 6138224, "step": 14985 }, { "epoch": 18.147699757869248, "grad_norm": 1.0753095921245404e-05, "learning_rate": 1.2967739155309077e-06, "loss": 0.0, "num_input_tokens_seen": 6140272, "step": 14990 }, { "epoch": 18.153753026634384, "grad_norm": 7.003319296927657e-06, "learning_rate": 1.2883910317180004e-06, "loss": 0.0, "num_input_tokens_seen": 6142512, "step": 14995 }, { "epoch": 18.159806295399516, "grad_norm": 6.576755822607083e-06, "learning_rate": 1.2800346143514914e-06, "loss": 0.0, "num_input_tokens_seen": 6144432, "step": 15000 }, { "epoch": 18.16585956416465, "grad_norm": 4.494771019381005e-06, "learning_rate": 1.2717046727586447e-06, "loss": 0.0, "num_input_tokens_seen": 6146384, "step": 15005 }, { "epoch": 18.17191283292978, "grad_norm": 9.966833204089198e-06, "learning_rate": 1.2634012162371839e-06, "loss": 0.0, "num_input_tokens_seen": 6148400, "step": 15010 }, { "epoch": 18.177966101694917, "grad_norm": 4.568405347527005e-05, "learning_rate": 1.2551242540552733e-06, "loss": 0.0, "num_input_tokens_seen": 6150352, "step": 15015 }, { "epoch": 18.18401937046005, "grad_norm": 2.0199950085952878e-05, "learning_rate": 1.2468737954514948e-06, "loss": 0.0, "num_input_tokens_seen": 6152560, "step": 15020 }, { "epoch": 18.190072639225182, "grad_norm": 8.905576578399632e-06, "learning_rate": 1.2386498496348541e-06, "loss": 0.0, "num_input_tokens_seen": 6154672, "step": 15025 }, { "epoch": 18.196125907990314, "grad_norm": 7.346299753407948e-06, "learning_rate": 1.2304524257847672e-06, "loss": 0.0, "num_input_tokens_seen": 6156784, "step": 15030 }, { "epoch": 18.202179176755447, "grad_norm": 1.2653949852392543e-05, "learning_rate": 1.2222815330510367e-06, "loss": 0.0, "num_input_tokens_seen": 6158992, "step": 15035 }, { "epoch": 18.208232445520583, "grad_norm": 1.641985545575153e-05, "learning_rate": 1.2141371805538593e-06, "loss": 0.0, "num_input_tokens_seen": 6160976, "step": 15040 }, { "epoch": 18.214285714285715, "grad_norm": 6.399463018169627e-06, "learning_rate": 1.206019377383813e-06, "loss": 0.0, "num_input_tokens_seen": 6163120, "step": 15045 }, { "epoch": 18.220338983050848, "grad_norm": 6.3382581174664665e-06, "learning_rate": 1.197928132601825e-06, "loss": 0.0, "num_input_tokens_seen": 6165168, "step": 15050 }, { "epoch": 18.22639225181598, "grad_norm": 5.378949026635382e-06, "learning_rate": 1.189863455239193e-06, "loss": 0.0, "num_input_tokens_seen": 6167152, "step": 15055 }, { "epoch": 18.232445520581113, "grad_norm": 1.226281347044278e-05, "learning_rate": 1.1818253542975584e-06, "loss": 0.0, "num_input_tokens_seen": 6169136, "step": 15060 }, { "epoch": 18.23849878934625, "grad_norm": 5.201929980103159e-06, "learning_rate": 1.173813838748894e-06, "loss": 0.0, "num_input_tokens_seen": 6171184, "step": 15065 }, { "epoch": 18.24455205811138, "grad_norm": 7.878019459894858e-06, "learning_rate": 1.1658289175354996e-06, "loss": 0.0, "num_input_tokens_seen": 6173168, "step": 15070 }, { "epoch": 18.250605326876514, "grad_norm": 4.865495156991528e-06, "learning_rate": 1.1578705995699961e-06, "loss": 0.0, "num_input_tokens_seen": 6175216, "step": 15075 }, { "epoch": 18.256658595641646, "grad_norm": 7.658445611014031e-06, "learning_rate": 1.1499388937352974e-06, "loss": 0.0, "num_input_tokens_seen": 6177392, "step": 15080 }, { "epoch": 18.26271186440678, "grad_norm": 6.507094894914189e-06, "learning_rate": 1.1420338088846404e-06, "loss": 0.0, "num_input_tokens_seen": 6179408, "step": 15085 }, { "epoch": 18.268765133171915, "grad_norm": 4.901938154944219e-06, "learning_rate": 1.1341553538415095e-06, "loss": 0.0, "num_input_tokens_seen": 6181520, "step": 15090 }, { "epoch": 18.274818401937047, "grad_norm": 4.501100647757994e-06, "learning_rate": 1.1263035373997033e-06, "loss": 0.0, "num_input_tokens_seen": 6183536, "step": 15095 }, { "epoch": 18.28087167070218, "grad_norm": 4.354031716502504e-06, "learning_rate": 1.1184783683232585e-06, "loss": 0.0, "num_input_tokens_seen": 6185552, "step": 15100 }, { "epoch": 18.286924939467312, "grad_norm": 8.862057256919798e-06, "learning_rate": 1.1106798553464804e-06, "loss": 0.0, "num_input_tokens_seen": 6187472, "step": 15105 }, { "epoch": 18.292978208232444, "grad_norm": 4.194789198663784e-06, "learning_rate": 1.1029080071739333e-06, "loss": 0.0, "num_input_tokens_seen": 6189520, "step": 15110 }, { "epoch": 18.29903147699758, "grad_norm": 7.948316124384291e-06, "learning_rate": 1.095162832480387e-06, "loss": 0.0, "num_input_tokens_seen": 6191600, "step": 15115 }, { "epoch": 18.305084745762713, "grad_norm": 1.4693748198624235e-05, "learning_rate": 1.0874443399108702e-06, "loss": 0.0, "num_input_tokens_seen": 6193680, "step": 15120 }, { "epoch": 18.311138014527845, "grad_norm": 1.0692862815631088e-05, "learning_rate": 1.0797525380806168e-06, "loss": 0.0, "num_input_tokens_seen": 6195888, "step": 15125 }, { "epoch": 18.317191283292978, "grad_norm": 6.073034910514252e-06, "learning_rate": 1.072087435575067e-06, "loss": 0.0, "num_input_tokens_seen": 6197968, "step": 15130 }, { "epoch": 18.32324455205811, "grad_norm": 4.440308657649439e-06, "learning_rate": 1.0644490409498637e-06, "loss": 0.0, "num_input_tokens_seen": 6199984, "step": 15135 }, { "epoch": 18.329297820823246, "grad_norm": 9.799176950764377e-06, "learning_rate": 1.0568373627308365e-06, "loss": 0.0, "num_input_tokens_seen": 6201936, "step": 15140 }, { "epoch": 18.33535108958838, "grad_norm": 6.575263796548825e-06, "learning_rate": 1.0492524094139921e-06, "loss": 0.0, "num_input_tokens_seen": 6203952, "step": 15145 }, { "epoch": 18.34140435835351, "grad_norm": 4.430938588484423e-06, "learning_rate": 1.0416941894655224e-06, "loss": 0.0, "num_input_tokens_seen": 6205968, "step": 15150 }, { "epoch": 18.347457627118644, "grad_norm": 8.184981197700836e-06, "learning_rate": 1.0341627113217539e-06, "loss": 0.0, "num_input_tokens_seen": 6208080, "step": 15155 }, { "epoch": 18.353510895883776, "grad_norm": 4.475622517929878e-06, "learning_rate": 1.026657983389187e-06, "loss": 0.0, "num_input_tokens_seen": 6210224, "step": 15160 }, { "epoch": 18.359564164648912, "grad_norm": 6.808937087043887e-06, "learning_rate": 1.0191800140444574e-06, "loss": 0.0, "num_input_tokens_seen": 6212208, "step": 15165 }, { "epoch": 18.365617433414045, "grad_norm": 5.444606813398423e-06, "learning_rate": 1.0117288116343298e-06, "loss": 0.0, "num_input_tokens_seen": 6214192, "step": 15170 }, { "epoch": 18.371670702179177, "grad_norm": 4.928069301968208e-06, "learning_rate": 1.0043043844756934e-06, "loss": 0.0, "num_input_tokens_seen": 6216144, "step": 15175 }, { "epoch": 18.37772397094431, "grad_norm": 9.960959141608328e-05, "learning_rate": 9.96906740855555e-07, "loss": 0.0, "num_input_tokens_seen": 6218256, "step": 15180 }, { "epoch": 18.383777239709442, "grad_norm": 4.123034159420058e-05, "learning_rate": 9.895358890310208e-07, "loss": 0.0, "num_input_tokens_seen": 6220304, "step": 15185 }, { "epoch": 18.389830508474578, "grad_norm": 4.317685579735553e-06, "learning_rate": 9.821918372292959e-07, "loss": 0.0, "num_input_tokens_seen": 6222384, "step": 15190 }, { "epoch": 18.39588377723971, "grad_norm": 5.5450886975449976e-06, "learning_rate": 9.748745936476734e-07, "loss": 0.0, "num_input_tokens_seen": 6224208, "step": 15195 }, { "epoch": 18.401937046004843, "grad_norm": 6.685146217932925e-06, "learning_rate": 9.675841664535167e-07, "loss": 0.0, "num_input_tokens_seen": 6226256, "step": 15200 }, { "epoch": 18.407990314769975, "grad_norm": 4.696415999205783e-06, "learning_rate": 9.6032056378427e-07, "loss": 0.0, "num_input_tokens_seen": 6228400, "step": 15205 }, { "epoch": 18.414043583535108, "grad_norm": 7.400863523798762e-06, "learning_rate": 9.53083793747414e-07, "loss": 0.0, "num_input_tokens_seen": 6230576, "step": 15210 }, { "epoch": 18.420096852300244, "grad_norm": 3.9502874642494135e-06, "learning_rate": 9.458738644205129e-07, "loss": 0.0, "num_input_tokens_seen": 6232688, "step": 15215 }, { "epoch": 18.426150121065376, "grad_norm": 5.35521166966646e-06, "learning_rate": 9.386907838511344e-07, "loss": 0.0, "num_input_tokens_seen": 6234704, "step": 15220 }, { "epoch": 18.43220338983051, "grad_norm": 5.070205588708632e-06, "learning_rate": 9.31534560056907e-07, "loss": 0.0, "num_input_tokens_seen": 6236752, "step": 15225 }, { "epoch": 18.43825665859564, "grad_norm": 4.814105977857253e-06, "learning_rate": 9.244052010254662e-07, "loss": 0.0, "num_input_tokens_seen": 6238832, "step": 15230 }, { "epoch": 18.444309927360774, "grad_norm": 1.3292941730469465e-05, "learning_rate": 9.173027147144714e-07, "loss": 0.0, "num_input_tokens_seen": 6240976, "step": 15235 }, { "epoch": 18.45036319612591, "grad_norm": 4.880787855654489e-06, "learning_rate": 9.102271090515784e-07, "loss": 0.0, "num_input_tokens_seen": 6243024, "step": 15240 }, { "epoch": 18.456416464891042, "grad_norm": 5.1056535994575825e-06, "learning_rate": 9.031783919344478e-07, "loss": 0.0, "num_input_tokens_seen": 6245232, "step": 15245 }, { "epoch": 18.462469733656174, "grad_norm": 7.872795322327875e-06, "learning_rate": 8.961565712307163e-07, "loss": 0.0, "num_input_tokens_seen": 6247280, "step": 15250 }, { "epoch": 18.468523002421307, "grad_norm": 4.089719368494116e-06, "learning_rate": 8.891616547780174e-07, "loss": 0.0, "num_input_tokens_seen": 6249264, "step": 15255 }, { "epoch": 18.47457627118644, "grad_norm": 4.786067165696295e-06, "learning_rate": 8.821936503839334e-07, "loss": 0.0, "num_input_tokens_seen": 6251504, "step": 15260 }, { "epoch": 18.480629539951575, "grad_norm": 5.5305176829278935e-06, "learning_rate": 8.75252565826018e-07, "loss": 0.0, "num_input_tokens_seen": 6253616, "step": 15265 }, { "epoch": 18.486682808716708, "grad_norm": 5.061567208031192e-06, "learning_rate": 8.683384088517904e-07, "loss": 0.0, "num_input_tokens_seen": 6255728, "step": 15270 }, { "epoch": 18.49273607748184, "grad_norm": 1.592504486325197e-05, "learning_rate": 8.614511871786829e-07, "loss": 0.0, "num_input_tokens_seen": 6257840, "step": 15275 }, { "epoch": 18.498789346246973, "grad_norm": 5.092066658107797e-06, "learning_rate": 8.545909084940962e-07, "loss": 0.0, "num_input_tokens_seen": 6259952, "step": 15280 }, { "epoch": 18.504842615012105, "grad_norm": 4.067228474013973e-06, "learning_rate": 8.477575804553356e-07, "loss": 0.0, "num_input_tokens_seen": 6262160, "step": 15285 }, { "epoch": 18.51089588377724, "grad_norm": 1.0950917385343928e-05, "learning_rate": 8.409512106896334e-07, "loss": 0.0, "num_input_tokens_seen": 6264368, "step": 15290 }, { "epoch": 18.516949152542374, "grad_norm": 5.893878551432863e-06, "learning_rate": 8.34171806794129e-07, "loss": 0.0, "num_input_tokens_seen": 6266384, "step": 15295 }, { "epoch": 18.523002421307506, "grad_norm": 9.914739166561048e-06, "learning_rate": 8.27419376335864e-07, "loss": 0.0, "num_input_tokens_seen": 6268336, "step": 15300 }, { "epoch": 18.52905569007264, "grad_norm": 7.591918347316096e-06, "learning_rate": 8.206939268517705e-07, "loss": 0.0, "num_input_tokens_seen": 6270384, "step": 15305 }, { "epoch": 18.53510895883777, "grad_norm": 5.063903245172696e-06, "learning_rate": 8.139954658486771e-07, "loss": 0.0, "num_input_tokens_seen": 6272240, "step": 15310 }, { "epoch": 18.541162227602907, "grad_norm": 5.060672719991999e-06, "learning_rate": 8.07324000803264e-07, "loss": 0.0, "num_input_tokens_seen": 6274224, "step": 15315 }, { "epoch": 18.54721549636804, "grad_norm": 0.0002788496494758874, "learning_rate": 8.006795391621053e-07, "loss": 0.0, "num_input_tokens_seen": 6276240, "step": 15320 }, { "epoch": 18.553268765133172, "grad_norm": 5.7316708989674225e-05, "learning_rate": 7.940620883416155e-07, "loss": 0.0, "num_input_tokens_seen": 6278256, "step": 15325 }, { "epoch": 18.559322033898304, "grad_norm": 7.273780738614732e-06, "learning_rate": 7.874716557280698e-07, "loss": 0.0, "num_input_tokens_seen": 6280336, "step": 15330 }, { "epoch": 18.565375302663437, "grad_norm": 0.00013312447117641568, "learning_rate": 7.809082486775838e-07, "loss": 0.0, "num_input_tokens_seen": 6282288, "step": 15335 }, { "epoch": 18.571428571428573, "grad_norm": 2.174360452045221e-05, "learning_rate": 7.743718745161083e-07, "loss": 0.0, "num_input_tokens_seen": 6284464, "step": 15340 }, { "epoch": 18.577481840193705, "grad_norm": 5.1317801990080625e-06, "learning_rate": 7.678625405394157e-07, "loss": 0.0, "num_input_tokens_seen": 6286544, "step": 15345 }, { "epoch": 18.583535108958838, "grad_norm": 8.143402737914585e-06, "learning_rate": 7.613802540131054e-07, "loss": 0.0, "num_input_tokens_seen": 6288432, "step": 15350 }, { "epoch": 18.58958837772397, "grad_norm": 4.16441798734013e-06, "learning_rate": 7.549250221725784e-07, "loss": 0.0, "num_input_tokens_seen": 6290672, "step": 15355 }, { "epoch": 18.595641646489103, "grad_norm": 6.763746569049545e-06, "learning_rate": 7.484968522230434e-07, "loss": 0.0, "num_input_tokens_seen": 6292688, "step": 15360 }, { "epoch": 18.60169491525424, "grad_norm": 4.4103703658038285e-06, "learning_rate": 7.420957513395027e-07, "loss": 0.0, "num_input_tokens_seen": 6294576, "step": 15365 }, { "epoch": 18.60774818401937, "grad_norm": 5.327112376107834e-06, "learning_rate": 7.357217266667355e-07, "loss": 0.0, "num_input_tokens_seen": 6296592, "step": 15370 }, { "epoch": 18.613801452784504, "grad_norm": 3.11534276988823e-05, "learning_rate": 7.293747853193201e-07, "loss": 0.0, "num_input_tokens_seen": 6298512, "step": 15375 }, { "epoch": 18.619854721549636, "grad_norm": 5.264836090645986e-06, "learning_rate": 7.230549343815813e-07, "loss": 0.0, "num_input_tokens_seen": 6300528, "step": 15380 }, { "epoch": 18.62590799031477, "grad_norm": 5.006917490391061e-06, "learning_rate": 7.16762180907618e-07, "loss": 0.0, "num_input_tokens_seen": 6302480, "step": 15385 }, { "epoch": 18.631961259079905, "grad_norm": 5.206572041061008e-06, "learning_rate": 7.10496531921287e-07, "loss": 0.0, "num_input_tokens_seen": 6304400, "step": 15390 }, { "epoch": 18.638014527845037, "grad_norm": 5.7693146118253935e-06, "learning_rate": 7.042579944161797e-07, "loss": 0.0, "num_input_tokens_seen": 6306448, "step": 15395 }, { "epoch": 18.64406779661017, "grad_norm": 8.146085747284815e-06, "learning_rate": 6.980465753556376e-07, "loss": 0.0, "num_input_tokens_seen": 6308464, "step": 15400 }, { "epoch": 18.650121065375302, "grad_norm": 5.060618150309892e-06, "learning_rate": 6.918622816727255e-07, "loss": 0.0, "num_input_tokens_seen": 6310544, "step": 15405 }, { "epoch": 18.656174334140434, "grad_norm": 5.189661806070944e-06, "learning_rate": 6.85705120270233e-07, "loss": 0.0, "num_input_tokens_seen": 6312592, "step": 15410 }, { "epoch": 18.66222760290557, "grad_norm": 0.00027766937273554504, "learning_rate": 6.795750980206711e-07, "loss": 0.0, "num_input_tokens_seen": 6314576, "step": 15415 }, { "epoch": 18.668280871670703, "grad_norm": 6.5713197727745865e-06, "learning_rate": 6.734722217662526e-07, "loss": 0.0, "num_input_tokens_seen": 6316688, "step": 15420 }, { "epoch": 18.674334140435835, "grad_norm": 4.3718509914469905e-06, "learning_rate": 6.673964983188868e-07, "loss": 0.0, "num_input_tokens_seen": 6318544, "step": 15425 }, { "epoch": 18.680387409200968, "grad_norm": 1.2469618013710715e-05, "learning_rate": 6.613479344601881e-07, "loss": 0.0, "num_input_tokens_seen": 6320816, "step": 15430 }, { "epoch": 18.6864406779661, "grad_norm": 5.616590897261631e-06, "learning_rate": 6.553265369414419e-07, "loss": 0.0, "num_input_tokens_seen": 6322864, "step": 15435 }, { "epoch": 18.692493946731236, "grad_norm": 7.264085525093833e-06, "learning_rate": 6.493323124836193e-07, "loss": 0.0, "num_input_tokens_seen": 6324944, "step": 15440 }, { "epoch": 18.69854721549637, "grad_norm": 5.1147094382031355e-06, "learning_rate": 6.433652677773627e-07, "loss": 0.0, "num_input_tokens_seen": 6327056, "step": 15445 }, { "epoch": 18.7046004842615, "grad_norm": 4.237735083734151e-06, "learning_rate": 6.374254094829723e-07, "loss": 0.0, "num_input_tokens_seen": 6329136, "step": 15450 }, { "epoch": 18.710653753026634, "grad_norm": 0.00016883708303794265, "learning_rate": 6.315127442304003e-07, "loss": 0.0, "num_input_tokens_seen": 6331088, "step": 15455 }, { "epoch": 18.716707021791766, "grad_norm": 5.4114193517307285e-06, "learning_rate": 6.256272786192563e-07, "loss": 0.0, "num_input_tokens_seen": 6333040, "step": 15460 }, { "epoch": 18.722760290556902, "grad_norm": 4.867211828241125e-06, "learning_rate": 6.197690192187827e-07, "loss": 0.0, "num_input_tokens_seen": 6335088, "step": 15465 }, { "epoch": 18.728813559322035, "grad_norm": 5.932403837505262e-06, "learning_rate": 6.139379725678602e-07, "loss": 0.0, "num_input_tokens_seen": 6337200, "step": 15470 }, { "epoch": 18.734866828087167, "grad_norm": 4.748589617520338e-06, "learning_rate": 6.08134145174985e-07, "loss": 0.0, "num_input_tokens_seen": 6339248, "step": 15475 }, { "epoch": 18.7409200968523, "grad_norm": 4.760359843203332e-06, "learning_rate": 6.023575435182865e-07, "loss": 0.0, "num_input_tokens_seen": 6341424, "step": 15480 }, { "epoch": 18.746973365617432, "grad_norm": 4.091436039743712e-06, "learning_rate": 5.966081740454932e-07, "loss": 0.0, "num_input_tokens_seen": 6343408, "step": 15485 }, { "epoch": 18.753026634382568, "grad_norm": 5.388046702137217e-06, "learning_rate": 5.90886043173941e-07, "loss": 0.0, "num_input_tokens_seen": 6345456, "step": 15490 }, { "epoch": 18.7590799031477, "grad_norm": 6.4858659243327565e-06, "learning_rate": 5.851911572905711e-07, "loss": 0.0, "num_input_tokens_seen": 6347408, "step": 15495 }, { "epoch": 18.765133171912833, "grad_norm": 4.8236147449642885e-06, "learning_rate": 5.79523522751893e-07, "loss": 0.0, "num_input_tokens_seen": 6349392, "step": 15500 }, { "epoch": 18.771186440677965, "grad_norm": 5.798787697131047e-06, "learning_rate": 5.738831458840243e-07, "loss": 0.0, "num_input_tokens_seen": 6351408, "step": 15505 }, { "epoch": 18.777239709443098, "grad_norm": 4.4633566176344175e-06, "learning_rate": 5.682700329826401e-07, "loss": 0.0, "num_input_tokens_seen": 6353456, "step": 15510 }, { "epoch": 18.783292978208234, "grad_norm": 4.890620857622707e-06, "learning_rate": 5.626841903129954e-07, "loss": 0.0, "num_input_tokens_seen": 6355440, "step": 15515 }, { "epoch": 18.789346246973366, "grad_norm": 7.454083061020356e-06, "learning_rate": 5.571256241098943e-07, "loss": 0.0, "num_input_tokens_seen": 6357488, "step": 15520 }, { "epoch": 18.7953995157385, "grad_norm": 5.131016678205924e-06, "learning_rate": 5.515943405777102e-07, "loss": 0.0, "num_input_tokens_seen": 6359472, "step": 15525 }, { "epoch": 18.80145278450363, "grad_norm": 4.75515162179363e-06, "learning_rate": 5.460903458903488e-07, "loss": 0.0, "num_input_tokens_seen": 6361520, "step": 15530 }, { "epoch": 18.807506053268764, "grad_norm": 3.889401796186576e-06, "learning_rate": 5.406136461912709e-07, "loss": 0.0, "num_input_tokens_seen": 6363728, "step": 15535 }, { "epoch": 18.8135593220339, "grad_norm": 6.636434591200668e-06, "learning_rate": 5.351642475934587e-07, "loss": 0.0, "num_input_tokens_seen": 6365776, "step": 15540 }, { "epoch": 18.819612590799032, "grad_norm": 5.930378847551765e-06, "learning_rate": 5.29742156179433e-07, "loss": 0.0, "num_input_tokens_seen": 6367568, "step": 15545 }, { "epoch": 18.825665859564165, "grad_norm": 7.549625479441602e-06, "learning_rate": 5.243473780012248e-07, "loss": 0.0, "num_input_tokens_seen": 6369680, "step": 15550 }, { "epoch": 18.831719128329297, "grad_norm": 6.910522643011063e-06, "learning_rate": 5.18979919080384e-07, "loss": 0.0, "num_input_tokens_seen": 6371728, "step": 15555 }, { "epoch": 18.83777239709443, "grad_norm": 1.0409949027234688e-05, "learning_rate": 5.136397854079655e-07, "loss": 0.0, "num_input_tokens_seen": 6373840, "step": 15560 }, { "epoch": 18.843825665859566, "grad_norm": 6.489818588306662e-06, "learning_rate": 5.083269829445236e-07, "loss": 0.0, "num_input_tokens_seen": 6375888, "step": 15565 }, { "epoch": 18.849878934624698, "grad_norm": 4.286043804313522e-06, "learning_rate": 5.030415176201093e-07, "loss": 0.0, "num_input_tokens_seen": 6378064, "step": 15570 }, { "epoch": 18.85593220338983, "grad_norm": 5.755880465585506e-06, "learning_rate": 4.977833953342615e-07, "loss": 0.0, "num_input_tokens_seen": 6380080, "step": 15575 }, { "epoch": 18.861985472154963, "grad_norm": 5.252974460745463e-06, "learning_rate": 4.925526219559912e-07, "loss": 0.0, "num_input_tokens_seen": 6382096, "step": 15580 }, { "epoch": 18.868038740920095, "grad_norm": 5.946533292444656e-06, "learning_rate": 4.873492033237864e-07, "loss": 0.0, "num_input_tokens_seen": 6384048, "step": 15585 }, { "epoch": 18.87409200968523, "grad_norm": 7.412829745589988e-06, "learning_rate": 4.821731452456125e-07, "loss": 0.0, "num_input_tokens_seen": 6386160, "step": 15590 }, { "epoch": 18.880145278450364, "grad_norm": 4.405216714076232e-06, "learning_rate": 4.770244534988754e-07, "loss": 0.0, "num_input_tokens_seen": 6388368, "step": 15595 }, { "epoch": 18.886198547215496, "grad_norm": 6.18290096099372e-06, "learning_rate": 4.7190313383045637e-07, "loss": 0.0, "num_input_tokens_seen": 6390352, "step": 15600 }, { "epoch": 18.89225181598063, "grad_norm": 6.861406745883869e-06, "learning_rate": 4.6680919195667137e-07, "loss": 0.0, "num_input_tokens_seen": 6392400, "step": 15605 }, { "epoch": 18.89830508474576, "grad_norm": 1.0108996320923325e-05, "learning_rate": 4.6174263356328075e-07, "loss": 0.0, "num_input_tokens_seen": 6394608, "step": 15610 }, { "epoch": 18.904358353510897, "grad_norm": 7.1105819188233e-06, "learning_rate": 4.567034643054802e-07, "loss": 0.0, "num_input_tokens_seen": 6396624, "step": 15615 }, { "epoch": 18.91041162227603, "grad_norm": 5.965256605122704e-06, "learning_rate": 4.5169168980789545e-07, "loss": 0.0, "num_input_tokens_seen": 6398768, "step": 15620 }, { "epoch": 18.916464891041162, "grad_norm": 2.7343206966179423e-05, "learning_rate": 4.4670731566457126e-07, "loss": 0.0, "num_input_tokens_seen": 6400688, "step": 15625 }, { "epoch": 18.922518159806295, "grad_norm": 6.8000595092598815e-06, "learning_rate": 4.4175034743897947e-07, "loss": 0.0, "num_input_tokens_seen": 6402672, "step": 15630 }, { "epoch": 18.928571428571427, "grad_norm": 5.337892162060598e-06, "learning_rate": 4.368207906639804e-07, "loss": 0.0, "num_input_tokens_seen": 6404752, "step": 15635 }, { "epoch": 18.934624697336563, "grad_norm": 6.225355173228309e-05, "learning_rate": 4.319186508418671e-07, "loss": 0.0, "num_input_tokens_seen": 6406800, "step": 15640 }, { "epoch": 18.940677966101696, "grad_norm": 8.633550351078156e-06, "learning_rate": 4.270439334442988e-07, "loss": 0.0, "num_input_tokens_seen": 6408880, "step": 15645 }, { "epoch": 18.946731234866828, "grad_norm": 5.224795131653082e-06, "learning_rate": 4.221966439123509e-07, "loss": 0.0, "num_input_tokens_seen": 6410992, "step": 15650 }, { "epoch": 18.95278450363196, "grad_norm": 4.828110832022503e-06, "learning_rate": 4.173767876564788e-07, "loss": 0.0, "num_input_tokens_seen": 6413072, "step": 15655 }, { "epoch": 18.958837772397093, "grad_norm": 6.171998393256217e-06, "learning_rate": 4.1258437005650687e-07, "loss": 0.0, "num_input_tokens_seen": 6415344, "step": 15660 }, { "epoch": 18.96489104116223, "grad_norm": 4.617452759703156e-06, "learning_rate": 4.0781939646164226e-07, "loss": 0.0, "num_input_tokens_seen": 6417392, "step": 15665 }, { "epoch": 18.97094430992736, "grad_norm": 5.15056444783113e-06, "learning_rate": 4.030818721904611e-07, "loss": 0.0, "num_input_tokens_seen": 6419536, "step": 15670 }, { "epoch": 18.976997578692494, "grad_norm": 4.839399480260909e-06, "learning_rate": 3.983718025308947e-07, "loss": 0.0, "num_input_tokens_seen": 6421616, "step": 15675 }, { "epoch": 18.983050847457626, "grad_norm": 6.103675332269631e-06, "learning_rate": 3.9368919274023475e-07, "loss": 0.0, "num_input_tokens_seen": 6423792, "step": 15680 }, { "epoch": 18.98910411622276, "grad_norm": 5.03641422255896e-06, "learning_rate": 3.890340480451199e-07, "loss": 0.0, "num_input_tokens_seen": 6425936, "step": 15685 }, { "epoch": 18.995157384987895, "grad_norm": 4.93055858896696e-06, "learning_rate": 3.8440637364153265e-07, "loss": 0.0, "num_input_tokens_seen": 6428080, "step": 15690 }, { "epoch": 19.0, "eval_loss": 0.4505572021007538, "eval_runtime": 4.9669, "eval_samples_per_second": 73.889, "eval_steps_per_second": 18.523, "num_input_tokens_seen": 6429416, "step": 15694 }, { "epoch": 19.001210653753027, "grad_norm": 6.494582976301899e-06, "learning_rate": 3.7980617469479953e-07, "loss": 0.0, "num_input_tokens_seen": 6429832, "step": 15695 }, { "epoch": 19.00726392251816, "grad_norm": 5.295796654536389e-06, "learning_rate": 3.7523345633957153e-07, "loss": 0.0, "num_input_tokens_seen": 6431848, "step": 15700 }, { "epoch": 19.013317191283292, "grad_norm": 9.128617421083618e-06, "learning_rate": 3.706882236798298e-07, "loss": 0.0, "num_input_tokens_seen": 6433960, "step": 15705 }, { "epoch": 19.019370460048425, "grad_norm": 4.407363121572416e-06, "learning_rate": 3.6617048178887725e-07, "loss": 0.0, "num_input_tokens_seen": 6435944, "step": 15710 }, { "epoch": 19.02542372881356, "grad_norm": 5.440856511995662e-06, "learning_rate": 3.6168023570933297e-07, "loss": 0.0, "num_input_tokens_seen": 6437992, "step": 15715 }, { "epoch": 19.031476997578693, "grad_norm": 1.4490178728010505e-05, "learning_rate": 3.5721749045312114e-07, "loss": 0.0, "num_input_tokens_seen": 6440104, "step": 15720 }, { "epoch": 19.037530266343826, "grad_norm": 5.44957129022805e-06, "learning_rate": 3.5278225100147667e-07, "loss": 0.0, "num_input_tokens_seen": 6442152, "step": 15725 }, { "epoch": 19.043583535108958, "grad_norm": 3.726602517417632e-05, "learning_rate": 3.4837452230492284e-07, "loss": 0.0, "num_input_tokens_seen": 6444200, "step": 15730 }, { "epoch": 19.04963680387409, "grad_norm": 3.3983782486757264e-05, "learning_rate": 3.439943092832909e-07, "loss": 0.0, "num_input_tokens_seen": 6446216, "step": 15735 }, { "epoch": 19.055690072639226, "grad_norm": 6.1028176787658595e-06, "learning_rate": 3.3964161682568663e-07, "loss": 0.0, "num_input_tokens_seen": 6448360, "step": 15740 }, { "epoch": 19.06174334140436, "grad_norm": 5.354836957849329e-06, "learning_rate": 3.353164497904987e-07, "loss": 0.0, "num_input_tokens_seen": 6450440, "step": 15745 }, { "epoch": 19.06779661016949, "grad_norm": 1.3726965335081331e-05, "learning_rate": 3.31018813005407e-07, "loss": 0.0, "num_input_tokens_seen": 6452552, "step": 15750 }, { "epoch": 19.073849878934624, "grad_norm": 5.237910954747349e-06, "learning_rate": 3.267487112673412e-07, "loss": 0.0, "num_input_tokens_seen": 6454568, "step": 15755 }, { "epoch": 19.079903147699756, "grad_norm": 6.2909712141845375e-06, "learning_rate": 3.225061493425108e-07, "loss": 0.0, "num_input_tokens_seen": 6456552, "step": 15760 }, { "epoch": 19.085956416464892, "grad_norm": 4.848665867029922e-06, "learning_rate": 3.1829113196638614e-07, "loss": 0.0, "num_input_tokens_seen": 6458600, "step": 15765 }, { "epoch": 19.092009685230025, "grad_norm": 6.298221705947071e-06, "learning_rate": 3.141036638436845e-07, "loss": 0.0, "num_input_tokens_seen": 6460552, "step": 15770 }, { "epoch": 19.098062953995157, "grad_norm": 5.00290025229333e-06, "learning_rate": 3.099437496483837e-07, "loss": 0.0, "num_input_tokens_seen": 6462600, "step": 15775 }, { "epoch": 19.10411622276029, "grad_norm": 1.8173021089751273e-05, "learning_rate": 3.058113940236945e-07, "loss": 0.0, "num_input_tokens_seen": 6464520, "step": 15780 }, { "epoch": 19.110169491525422, "grad_norm": 5.786363544757478e-06, "learning_rate": 3.017066015820774e-07, "loss": 0.0, "num_input_tokens_seen": 6466440, "step": 15785 }, { "epoch": 19.116222760290558, "grad_norm": 5.353180767997401e-06, "learning_rate": 2.976293769052202e-07, "loss": 0.0, "num_input_tokens_seen": 6468424, "step": 15790 }, { "epoch": 19.12227602905569, "grad_norm": 1.9154413166688755e-05, "learning_rate": 2.9357972454404637e-07, "loss": 0.0, "num_input_tokens_seen": 6470472, "step": 15795 }, { "epoch": 19.128329297820823, "grad_norm": 4.929661827191012e-06, "learning_rate": 2.895576490187041e-07, "loss": 0.0, "num_input_tokens_seen": 6472456, "step": 15800 }, { "epoch": 19.134382566585955, "grad_norm": 3.254773037042469e-05, "learning_rate": 2.8556315481854943e-07, "loss": 0.0, "num_input_tokens_seen": 6474344, "step": 15805 }, { "epoch": 19.140435835351088, "grad_norm": 4.728616204374703e-06, "learning_rate": 2.8159624640216597e-07, "loss": 0.0, "num_input_tokens_seen": 6476168, "step": 15810 }, { "epoch": 19.146489104116224, "grad_norm": 6.348741408146452e-06, "learning_rate": 2.7765692819734236e-07, "loss": 0.0, "num_input_tokens_seen": 6478312, "step": 15815 }, { "epoch": 19.152542372881356, "grad_norm": 9.130230864684563e-06, "learning_rate": 2.737452046010641e-07, "loss": 0.0, "num_input_tokens_seen": 6480360, "step": 15820 }, { "epoch": 19.15859564164649, "grad_norm": 4.489437742449809e-06, "learning_rate": 2.6986107997953035e-07, "loss": 0.0, "num_input_tokens_seen": 6482536, "step": 15825 }, { "epoch": 19.16464891041162, "grad_norm": 7.855765943531878e-06, "learning_rate": 2.660045586681231e-07, "loss": 0.0, "num_input_tokens_seen": 6484584, "step": 15830 }, { "epoch": 19.170702179176754, "grad_norm": 4.94768210046459e-06, "learning_rate": 2.621756449714158e-07, "loss": 0.0, "num_input_tokens_seen": 6486888, "step": 15835 }, { "epoch": 19.17675544794189, "grad_norm": 2.3917215003166348e-05, "learning_rate": 2.5837434316317574e-07, "loss": 0.0, "num_input_tokens_seen": 6488840, "step": 15840 }, { "epoch": 19.182808716707022, "grad_norm": 9.790427611733321e-06, "learning_rate": 2.546006574863369e-07, "loss": 0.0, "num_input_tokens_seen": 6490696, "step": 15845 }, { "epoch": 19.188861985472155, "grad_norm": 4.889395313512068e-06, "learning_rate": 2.508545921530159e-07, "loss": 0.0, "num_input_tokens_seen": 6492808, "step": 15850 }, { "epoch": 19.194915254237287, "grad_norm": 4.282503596186871e-06, "learning_rate": 2.47136151344507e-07, "loss": 0.0, "num_input_tokens_seen": 6494760, "step": 15855 }, { "epoch": 19.20096852300242, "grad_norm": 4.9744990064937156e-06, "learning_rate": 2.43445339211254e-07, "loss": 0.0, "num_input_tokens_seen": 6496840, "step": 15860 }, { "epoch": 19.207021791767556, "grad_norm": 8.774015441304073e-06, "learning_rate": 2.3978215987287554e-07, "loss": 0.0, "num_input_tokens_seen": 6498920, "step": 15865 }, { "epoch": 19.213075060532688, "grad_norm": 4.203878688713303e-06, "learning_rate": 2.361466174181426e-07, "loss": 0.0, "num_input_tokens_seen": 6500968, "step": 15870 }, { "epoch": 19.21912832929782, "grad_norm": 5.410285211837618e-06, "learning_rate": 2.3253871590497856e-07, "loss": 0.0, "num_input_tokens_seen": 6503144, "step": 15875 }, { "epoch": 19.225181598062953, "grad_norm": 5.1170522965549026e-06, "learning_rate": 2.28958459360451e-07, "loss": 0.0, "num_input_tokens_seen": 6505224, "step": 15880 }, { "epoch": 19.231234866828085, "grad_norm": 5.57033945369767e-06, "learning_rate": 2.2540585178078e-07, "loss": 0.0, "num_input_tokens_seen": 6507176, "step": 15885 }, { "epoch": 19.23728813559322, "grad_norm": 1.2719829101115465e-05, "learning_rate": 2.21880897131313e-07, "loss": 0.0, "num_input_tokens_seen": 6509256, "step": 15890 }, { "epoch": 19.243341404358354, "grad_norm": 5.493272510648239e-06, "learning_rate": 2.1838359934653884e-07, "loss": 0.0, "num_input_tokens_seen": 6511144, "step": 15895 }, { "epoch": 19.249394673123486, "grad_norm": 6.716643838444725e-06, "learning_rate": 2.1491396233007665e-07, "loss": 0.0, "num_input_tokens_seen": 6513128, "step": 15900 }, { "epoch": 19.25544794188862, "grad_norm": 5.419781246018829e-06, "learning_rate": 2.1147198995466467e-07, "loss": 0.0, "num_input_tokens_seen": 6515080, "step": 15905 }, { "epoch": 19.26150121065375, "grad_norm": 8.505974619765766e-06, "learning_rate": 2.0805768606217412e-07, "loss": 0.0, "num_input_tokens_seen": 6517192, "step": 15910 }, { "epoch": 19.267554479418887, "grad_norm": 4.352784344519023e-06, "learning_rate": 2.046710544635788e-07, "loss": 0.0, "num_input_tokens_seen": 6519240, "step": 15915 }, { "epoch": 19.27360774818402, "grad_norm": 8.687480658409186e-06, "learning_rate": 2.0131209893897994e-07, "loss": 0.0, "num_input_tokens_seen": 6521256, "step": 15920 }, { "epoch": 19.279661016949152, "grad_norm": 5.621811851597158e-06, "learning_rate": 1.9798082323757016e-07, "loss": 0.0, "num_input_tokens_seen": 6523400, "step": 15925 }, { "epoch": 19.285714285714285, "grad_norm": 5.392955699790036e-06, "learning_rate": 1.94677231077664e-07, "loss": 0.0, "num_input_tokens_seen": 6525416, "step": 15930 }, { "epoch": 19.291767554479417, "grad_norm": 7.614147762069479e-06, "learning_rate": 1.9140132614666463e-07, "loss": 0.0, "num_input_tokens_seen": 6527496, "step": 15935 }, { "epoch": 19.297820823244553, "grad_norm": 5.4572283261222765e-06, "learning_rate": 1.881531121010749e-07, "loss": 0.0, "num_input_tokens_seen": 6529512, "step": 15940 }, { "epoch": 19.303874092009686, "grad_norm": 1.979965963982977e-05, "learning_rate": 1.8493259256649186e-07, "loss": 0.0, "num_input_tokens_seen": 6531688, "step": 15945 }, { "epoch": 19.309927360774818, "grad_norm": 4.609876214090036e-06, "learning_rate": 1.8173977113759288e-07, "loss": 0.0, "num_input_tokens_seen": 6533544, "step": 15950 }, { "epoch": 19.31598062953995, "grad_norm": 7.254253887367668e-06, "learning_rate": 1.7857465137814944e-07, "loss": 0.0, "num_input_tokens_seen": 6535592, "step": 15955 }, { "epoch": 19.322033898305083, "grad_norm": 5.286583473207429e-06, "learning_rate": 1.7543723682100777e-07, "loss": 0.0, "num_input_tokens_seen": 6537768, "step": 15960 }, { "epoch": 19.32808716707022, "grad_norm": 9.036450137500651e-06, "learning_rate": 1.7232753096808607e-07, "loss": 0.0, "num_input_tokens_seen": 6539720, "step": 15965 }, { "epoch": 19.33414043583535, "grad_norm": 4.622641881724121e-06, "learning_rate": 1.6924553729038285e-07, "loss": 0.0, "num_input_tokens_seen": 6541608, "step": 15970 }, { "epoch": 19.340193704600484, "grad_norm": 8.179793439921923e-06, "learning_rate": 1.661912592279602e-07, "loss": 0.0, "num_input_tokens_seen": 6543592, "step": 15975 }, { "epoch": 19.346246973365616, "grad_norm": 5.78137814954971e-06, "learning_rate": 1.6316470018994112e-07, "loss": 0.0, "num_input_tokens_seen": 6545800, "step": 15980 }, { "epoch": 19.352300242130752, "grad_norm": 5.10957670485368e-06, "learning_rate": 1.6016586355452056e-07, "loss": 0.0, "num_input_tokens_seen": 6547912, "step": 15985 }, { "epoch": 19.358353510895885, "grad_norm": 4.328032446210273e-06, "learning_rate": 1.571947526689349e-07, "loss": 0.0, "num_input_tokens_seen": 6550088, "step": 15990 }, { "epoch": 19.364406779661017, "grad_norm": 4.636328412743751e-06, "learning_rate": 1.5425137084948692e-07, "loss": 0.0, "num_input_tokens_seen": 6552200, "step": 15995 }, { "epoch": 19.37046004842615, "grad_norm": 5.35364733877941e-06, "learning_rate": 1.5133572138152364e-07, "loss": 0.0, "num_input_tokens_seen": 6554184, "step": 16000 }, { "epoch": 19.376513317191282, "grad_norm": 1.2720312042802107e-05, "learning_rate": 1.4844780751943345e-07, "loss": 0.0, "num_input_tokens_seen": 6556200, "step": 16005 }, { "epoch": 19.38256658595642, "grad_norm": 6.779689101676922e-06, "learning_rate": 1.4558763248665175e-07, "loss": 0.0, "num_input_tokens_seen": 6558312, "step": 16010 }, { "epoch": 19.38861985472155, "grad_norm": 5.811290066048969e-06, "learning_rate": 1.4275519947565542e-07, "loss": 0.0, "num_input_tokens_seen": 6560456, "step": 16015 }, { "epoch": 19.394673123486683, "grad_norm": 4.813057785213459e-06, "learning_rate": 1.3995051164794604e-07, "loss": 0.0, "num_input_tokens_seen": 6562568, "step": 16020 }, { "epoch": 19.400726392251816, "grad_norm": 7.373742391791893e-06, "learning_rate": 1.3717357213406667e-07, "loss": 0.0, "num_input_tokens_seen": 6564584, "step": 16025 }, { "epoch": 19.406779661016948, "grad_norm": 6.981010301387869e-06, "learning_rate": 1.3442438403358515e-07, "loss": 0.0, "num_input_tokens_seen": 6566568, "step": 16030 }, { "epoch": 19.412832929782084, "grad_norm": 9.092198524740525e-06, "learning_rate": 1.3170295041509128e-07, "loss": 0.0, "num_input_tokens_seen": 6568456, "step": 16035 }, { "epoch": 19.418886198547217, "grad_norm": 4.937672201776877e-06, "learning_rate": 1.290092743161997e-07, "loss": 0.0, "num_input_tokens_seen": 6570440, "step": 16040 }, { "epoch": 19.42493946731235, "grad_norm": 1.4556313544744626e-05, "learning_rate": 1.2634335874353585e-07, "loss": 0.0, "num_input_tokens_seen": 6572392, "step": 16045 }, { "epoch": 19.43099273607748, "grad_norm": 5.187645001569763e-06, "learning_rate": 1.2370520667274733e-07, "loss": 0.0, "num_input_tokens_seen": 6574408, "step": 16050 }, { "epoch": 19.437046004842614, "grad_norm": 4.784993052453501e-06, "learning_rate": 1.2109482104848692e-07, "loss": 0.0, "num_input_tokens_seen": 6576264, "step": 16055 }, { "epoch": 19.44309927360775, "grad_norm": 0.0005908640450797975, "learning_rate": 1.1851220478442115e-07, "loss": 0.0, "num_input_tokens_seen": 6578280, "step": 16060 }, { "epoch": 19.449152542372882, "grad_norm": 5.8853929658653215e-06, "learning_rate": 1.1595736076321362e-07, "loss": 0.0, "num_input_tokens_seen": 6580328, "step": 16065 }, { "epoch": 19.455205811138015, "grad_norm": 5.7752868087845854e-06, "learning_rate": 1.134302918365332e-07, "loss": 0.0, "num_input_tokens_seen": 6582376, "step": 16070 }, { "epoch": 19.461259079903147, "grad_norm": 4.1755224629014265e-06, "learning_rate": 1.1093100082504581e-07, "loss": 0.0, "num_input_tokens_seen": 6584552, "step": 16075 }, { "epoch": 19.46731234866828, "grad_norm": 5.330646672518924e-06, "learning_rate": 1.0845949051841441e-07, "loss": 0.0, "num_input_tokens_seen": 6586504, "step": 16080 }, { "epoch": 19.473365617433416, "grad_norm": 6.025893526384607e-06, "learning_rate": 1.0601576367529065e-07, "loss": 0.0, "num_input_tokens_seen": 6588648, "step": 16085 }, { "epoch": 19.479418886198548, "grad_norm": 7.217055554065155e-06, "learning_rate": 1.0359982302331484e-07, "loss": 0.0, "num_input_tokens_seen": 6590760, "step": 16090 }, { "epoch": 19.48547215496368, "grad_norm": 8.869212251738645e-06, "learning_rate": 1.0121167125911601e-07, "loss": 0.0, "num_input_tokens_seen": 6593000, "step": 16095 }, { "epoch": 19.491525423728813, "grad_norm": 8.384687134821434e-06, "learning_rate": 9.885131104830358e-08, "loss": 0.0, "num_input_tokens_seen": 6595176, "step": 16100 }, { "epoch": 19.497578692493946, "grad_norm": 5.0137832658947445e-06, "learning_rate": 9.651874502546454e-08, "loss": 0.0, "num_input_tokens_seen": 6597224, "step": 16105 }, { "epoch": 19.50363196125908, "grad_norm": 6.202753866091371e-06, "learning_rate": 9.421397579416625e-08, "loss": 0.0, "num_input_tokens_seen": 6599304, "step": 16110 }, { "epoch": 19.509685230024214, "grad_norm": 4.872733825322939e-06, "learning_rate": 9.193700592694532e-08, "loss": 0.0, "num_input_tokens_seen": 6601288, "step": 16115 }, { "epoch": 19.515738498789347, "grad_norm": 6.937243597349152e-06, "learning_rate": 8.9687837965316e-08, "loss": 0.0, "num_input_tokens_seen": 6603368, "step": 16120 }, { "epoch": 19.52179176755448, "grad_norm": 1.0882527931244113e-05, "learning_rate": 8.74664744197562e-08, "loss": 0.0, "num_input_tokens_seen": 6605352, "step": 16125 }, { "epoch": 19.52784503631961, "grad_norm": 4.174107743892819e-06, "learning_rate": 8.527291776970759e-08, "loss": 0.0, "num_input_tokens_seen": 6607240, "step": 16130 }, { "epoch": 19.533898305084747, "grad_norm": 8.441690624749754e-06, "learning_rate": 8.310717046358108e-08, "loss": 0.0, "num_input_tokens_seen": 6609256, "step": 16135 }, { "epoch": 19.53995157384988, "grad_norm": 1.3124134056852199e-05, "learning_rate": 8.096923491873465e-08, "loss": 0.0, "num_input_tokens_seen": 6611528, "step": 16140 }, { "epoch": 19.546004842615012, "grad_norm": 6.4034647948574275e-06, "learning_rate": 7.885911352149832e-08, "loss": 0.0, "num_input_tokens_seen": 6613576, "step": 16145 }, { "epoch": 19.552058111380145, "grad_norm": 5.610196240013465e-06, "learning_rate": 7.677680862714365e-08, "loss": 0.0, "num_input_tokens_seen": 6615656, "step": 16150 }, { "epoch": 19.558111380145277, "grad_norm": 1.2455478099582251e-05, "learning_rate": 7.472232255990585e-08, "loss": 0.0, "num_input_tokens_seen": 6617640, "step": 16155 }, { "epoch": 19.564164648910413, "grad_norm": 5.783362212241627e-06, "learning_rate": 7.269565761295893e-08, "loss": 0.0, "num_input_tokens_seen": 6619848, "step": 16160 }, { "epoch": 19.570217917675546, "grad_norm": 6.2237022575573064e-06, "learning_rate": 7.069681604842949e-08, "loss": 0.0, "num_input_tokens_seen": 6622024, "step": 16165 }, { "epoch": 19.576271186440678, "grad_norm": 7.517027825088007e-06, "learning_rate": 6.872580009738283e-08, "loss": 0.0, "num_input_tokens_seen": 6624072, "step": 16170 }, { "epoch": 19.58232445520581, "grad_norm": 4.344264652900165e-06, "learning_rate": 6.678261195983693e-08, "loss": 0.0, "num_input_tokens_seen": 6626120, "step": 16175 }, { "epoch": 19.588377723970943, "grad_norm": 1.1701344192260876e-05, "learning_rate": 6.486725380473457e-08, "loss": 0.0, "num_input_tokens_seen": 6628168, "step": 16180 }, { "epoch": 19.59443099273608, "grad_norm": 1.1258144695602823e-05, "learning_rate": 6.297972776996286e-08, "loss": 0.0, "num_input_tokens_seen": 6630376, "step": 16185 }, { "epoch": 19.60048426150121, "grad_norm": 6.270590802159859e-06, "learning_rate": 6.112003596234484e-08, "loss": 0.0, "num_input_tokens_seen": 6632520, "step": 16190 }, { "epoch": 19.606537530266344, "grad_norm": 6.019563898007618e-06, "learning_rate": 5.9288180457633954e-08, "loss": 0.0, "num_input_tokens_seen": 6634568, "step": 16195 }, { "epoch": 19.612590799031477, "grad_norm": 1.8741631720331497e-05, "learning_rate": 5.7484163300508545e-08, "loss": 0.0, "num_input_tokens_seen": 6636520, "step": 16200 }, { "epoch": 19.61864406779661, "grad_norm": 8.177896233974025e-06, "learning_rate": 5.570798650458009e-08, "loss": 0.0, "num_input_tokens_seen": 6638408, "step": 16205 }, { "epoch": 19.624697336561745, "grad_norm": 5.183936536923284e-06, "learning_rate": 5.3959652052384954e-08, "loss": 0.0, "num_input_tokens_seen": 6640616, "step": 16210 }, { "epoch": 19.630750605326877, "grad_norm": 8.105527740553953e-06, "learning_rate": 5.2239161895378806e-08, "loss": 0.0, "num_input_tokens_seen": 6642696, "step": 16215 }, { "epoch": 19.63680387409201, "grad_norm": 1.956855339813046e-05, "learning_rate": 5.054651795393939e-08, "loss": 0.0, "num_input_tokens_seen": 6644744, "step": 16220 }, { "epoch": 19.642857142857142, "grad_norm": 4.6271661631180905e-06, "learning_rate": 4.888172211736375e-08, "loss": 0.0, "num_input_tokens_seen": 6646888, "step": 16225 }, { "epoch": 19.648910411622275, "grad_norm": 4.70263194074505e-06, "learning_rate": 4.724477624386825e-08, "loss": 0.0, "num_input_tokens_seen": 6649128, "step": 16230 }, { "epoch": 19.65496368038741, "grad_norm": 1.5427345715579577e-05, "learning_rate": 4.563568216057745e-08, "loss": 0.0, "num_input_tokens_seen": 6651272, "step": 16235 }, { "epoch": 19.661016949152543, "grad_norm": 1.5486481061088853e-05, "learning_rate": 4.405444166353523e-08, "loss": 0.0, "num_input_tokens_seen": 6653256, "step": 16240 }, { "epoch": 19.667070217917676, "grad_norm": 6.531493454531301e-06, "learning_rate": 4.25010565176881e-08, "loss": 0.0, "num_input_tokens_seen": 6655240, "step": 16245 }, { "epoch": 19.673123486682808, "grad_norm": 1.249292108695954e-05, "learning_rate": 4.097552845689634e-08, "loss": 0.0, "num_input_tokens_seen": 6657160, "step": 16250 }, { "epoch": 19.67917675544794, "grad_norm": 5.097629582451191e-06, "learning_rate": 3.9477859183925657e-08, "loss": 0.0, "num_input_tokens_seen": 6659144, "step": 16255 }, { "epoch": 19.685230024213077, "grad_norm": 5.758065526606515e-06, "learning_rate": 3.8008050370444415e-08, "loss": 0.0, "num_input_tokens_seen": 6661160, "step": 16260 }, { "epoch": 19.69128329297821, "grad_norm": 6.915808171470417e-06, "learning_rate": 3.656610365702917e-08, "loss": 0.0, "num_input_tokens_seen": 6663272, "step": 16265 }, { "epoch": 19.69733656174334, "grad_norm": 2.971640606119763e-05, "learning_rate": 3.515202065314804e-08, "loss": 0.0, "num_input_tokens_seen": 6665288, "step": 16270 }, { "epoch": 19.703389830508474, "grad_norm": 4.307955350668635e-06, "learning_rate": 3.3765802937177346e-08, "loss": 0.0, "num_input_tokens_seen": 6667336, "step": 16275 }, { "epoch": 19.709443099273606, "grad_norm": 6.684949767077342e-06, "learning_rate": 3.240745205638773e-08, "loss": 0.0, "num_input_tokens_seen": 6669288, "step": 16280 }, { "epoch": 19.715496368038743, "grad_norm": 8.820302355161402e-06, "learning_rate": 3.107696952694139e-08, "loss": 0.0, "num_input_tokens_seen": 6671368, "step": 16285 }, { "epoch": 19.721549636803875, "grad_norm": 6.14823284195154e-06, "learning_rate": 2.977435683389762e-08, "loss": 0.0, "num_input_tokens_seen": 6673448, "step": 16290 }, { "epoch": 19.727602905569007, "grad_norm": 4.744976195070194e-06, "learning_rate": 2.8499615431212824e-08, "loss": 0.0, "num_input_tokens_seen": 6675432, "step": 16295 }, { "epoch": 19.73365617433414, "grad_norm": 4.928138878312893e-06, "learning_rate": 2.725274674172107e-08, "loss": 0.0, "num_input_tokens_seen": 6677384, "step": 16300 }, { "epoch": 19.739709443099272, "grad_norm": 5.6284434322151355e-06, "learning_rate": 2.6033752157161862e-08, "loss": 0.0, "num_input_tokens_seen": 6679400, "step": 16305 }, { "epoch": 19.74576271186441, "grad_norm": 4.456392616702942e-06, "learning_rate": 2.4842633038146822e-08, "loss": 0.0, "num_input_tokens_seen": 6681512, "step": 16310 }, { "epoch": 19.75181598062954, "grad_norm": 6.978987585171126e-06, "learning_rate": 2.367939071418468e-08, "loss": 0.0, "num_input_tokens_seen": 6683624, "step": 16315 }, { "epoch": 19.757869249394673, "grad_norm": 4.749705567519413e-06, "learning_rate": 2.2544026483664606e-08, "loss": 0.0, "num_input_tokens_seen": 6685672, "step": 16320 }, { "epoch": 19.763922518159806, "grad_norm": 9.760864486452192e-06, "learning_rate": 2.1436541613853444e-08, "loss": 0.0, "num_input_tokens_seen": 6687816, "step": 16325 }, { "epoch": 19.769975786924938, "grad_norm": 1.7190963262692094e-05, "learning_rate": 2.03569373409096e-08, "loss": 0.0, "num_input_tokens_seen": 6689768, "step": 16330 }, { "epoch": 19.776029055690074, "grad_norm": 1.7399221178493463e-05, "learning_rate": 1.930521486986636e-08, "loss": 0.0, "num_input_tokens_seen": 6691848, "step": 16335 }, { "epoch": 19.782082324455207, "grad_norm": 5.301436431182083e-06, "learning_rate": 1.8281375374634702e-08, "loss": 0.0, "num_input_tokens_seen": 6693896, "step": 16340 }, { "epoch": 19.78813559322034, "grad_norm": 6.021677108947188e-06, "learning_rate": 1.7285419998006035e-08, "loss": 0.0, "num_input_tokens_seen": 6696104, "step": 16345 }, { "epoch": 19.79418886198547, "grad_norm": 4.095191343367333e-06, "learning_rate": 1.6317349851646678e-08, "loss": 0.0, "num_input_tokens_seen": 6698088, "step": 16350 }, { "epoch": 19.800242130750604, "grad_norm": 5.1784863899229094e-06, "learning_rate": 1.5377166016097844e-08, "loss": 0.0, "num_input_tokens_seen": 6700168, "step": 16355 }, { "epoch": 19.80629539951574, "grad_norm": 7.396940873150015e-06, "learning_rate": 1.4464869540772863e-08, "loss": 0.0, "num_input_tokens_seen": 6702280, "step": 16360 }, { "epoch": 19.812348668280872, "grad_norm": 8.388045898755081e-06, "learning_rate": 1.3580461443962743e-08, "loss": 0.0, "num_input_tokens_seen": 6704360, "step": 16365 }, { "epoch": 19.818401937046005, "grad_norm": 6.637537353526568e-06, "learning_rate": 1.2723942712825065e-08, "loss": 0.0, "num_input_tokens_seen": 6706408, "step": 16370 }, { "epoch": 19.824455205811137, "grad_norm": 5.6377439250354655e-06, "learning_rate": 1.1895314303389526e-08, "loss": 0.0, "num_input_tokens_seen": 6708328, "step": 16375 }, { "epoch": 19.83050847457627, "grad_norm": 4.426029136084253e-06, "learning_rate": 1.109457714055795e-08, "loss": 0.0, "num_input_tokens_seen": 6710312, "step": 16380 }, { "epoch": 19.836561743341406, "grad_norm": 6.341003881971119e-06, "learning_rate": 1.0321732118095951e-08, "loss": 0.0, "num_input_tokens_seen": 6712360, "step": 16385 }, { "epoch": 19.84261501210654, "grad_norm": 7.6996229836368e-06, "learning_rate": 9.576780098638494e-09, "loss": 0.0, "num_input_tokens_seen": 6714344, "step": 16390 }, { "epoch": 19.84866828087167, "grad_norm": 5.592393335973611e-06, "learning_rate": 8.859721913684339e-09, "loss": 0.0, "num_input_tokens_seen": 6716616, "step": 16395 }, { "epoch": 19.854721549636803, "grad_norm": 5.735570994147565e-06, "learning_rate": 8.170558363607139e-09, "loss": 0.0, "num_input_tokens_seen": 6718728, "step": 16400 }, { "epoch": 19.860774818401936, "grad_norm": 4.4547173274622764e-06, "learning_rate": 7.50929021763047e-09, "loss": 0.0, "num_input_tokens_seen": 6720712, "step": 16405 }, { "epoch": 19.86682808716707, "grad_norm": 4.528930276137544e-06, "learning_rate": 6.8759182138528055e-09, "loss": 0.0, "num_input_tokens_seen": 6722632, "step": 16410 }, { "epoch": 19.872881355932204, "grad_norm": 6.558584573213011e-06, "learning_rate": 6.2704430592336326e-09, "loss": 0.0, "num_input_tokens_seen": 6724648, "step": 16415 }, { "epoch": 19.878934624697337, "grad_norm": 8.023318514460698e-05, "learning_rate": 5.692865429590688e-09, "loss": 0.0, "num_input_tokens_seen": 6726664, "step": 16420 }, { "epoch": 19.88498789346247, "grad_norm": 4.4898115447722375e-06, "learning_rate": 5.143185969602726e-09, "loss": 0.0, "num_input_tokens_seen": 6728552, "step": 16425 }, { "epoch": 19.8910411622276, "grad_norm": 4.802830062544672e-06, "learning_rate": 4.6214052928150734e-09, "loss": 0.0, "num_input_tokens_seen": 6730536, "step": 16430 }, { "epoch": 19.897094430992738, "grad_norm": 5.6370708989561535e-06, "learning_rate": 4.127523981631298e-09, "loss": 0.0, "num_input_tokens_seen": 6732552, "step": 16435 }, { "epoch": 19.90314769975787, "grad_norm": 4.691522462962894e-06, "learning_rate": 3.661542587304889e-09, "loss": 0.0, "num_input_tokens_seen": 6734536, "step": 16440 }, { "epoch": 19.909200968523002, "grad_norm": 4.8551087274972815e-06, "learning_rate": 3.2234616299642306e-09, "loss": 0.0, "num_input_tokens_seen": 6736680, "step": 16445 }, { "epoch": 19.915254237288135, "grad_norm": 8.555292879464105e-06, "learning_rate": 2.813281598579298e-09, "loss": 0.0, "num_input_tokens_seen": 6738728, "step": 16450 }, { "epoch": 19.921307506053267, "grad_norm": 4.3292425289109815e-06, "learning_rate": 2.431002950989414e-09, "loss": 0.0, "num_input_tokens_seen": 6740712, "step": 16455 }, { "epoch": 19.927360774818403, "grad_norm": 5.688560577254975e-06, "learning_rate": 2.076626113886593e-09, "loss": 0.0, "num_input_tokens_seen": 6742664, "step": 16460 }, { "epoch": 19.933414043583536, "grad_norm": 7.110482783900807e-06, "learning_rate": 1.7501514828183185e-09, "loss": 0.0, "num_input_tokens_seen": 6744744, "step": 16465 }, { "epoch": 19.93946731234867, "grad_norm": 1.4303933312476147e-05, "learning_rate": 1.4515794221875434e-09, "loss": 0.0, "num_input_tokens_seen": 6746728, "step": 16470 }, { "epoch": 19.9455205811138, "grad_norm": 2.7694333766703494e-05, "learning_rate": 1.1809102652610148e-09, "loss": 0.0, "num_input_tokens_seen": 6748776, "step": 16475 }, { "epoch": 19.951573849878933, "grad_norm": 2.700046206882689e-05, "learning_rate": 9.381443141470714e-10, "loss": 0.0, "num_input_tokens_seen": 6750888, "step": 16480 }, { "epoch": 19.95762711864407, "grad_norm": 6.077437774365535e-06, "learning_rate": 7.23281839820622e-10, "loss": 0.0, "num_input_tokens_seen": 6752936, "step": 16485 }, { "epoch": 19.9636803874092, "grad_norm": 9.02688771020621e-06, "learning_rate": 5.363230821064935e-10, "loss": 0.0, "num_input_tokens_seen": 6754984, "step": 16490 }, { "epoch": 19.969733656174334, "grad_norm": 8.423044164374005e-06, "learning_rate": 3.772682496849811e-10, "loss": 0.0, "num_input_tokens_seen": 6757128, "step": 16495 }, { "epoch": 19.975786924939467, "grad_norm": 6.105526153987739e-06, "learning_rate": 2.4611752008907307e-10, "loss": 0.0, "num_input_tokens_seen": 6759080, "step": 16500 }, { "epoch": 19.9818401937046, "grad_norm": 4.078896836290369e-06, "learning_rate": 1.4287103970722638e-10, "loss": 0.0, "num_input_tokens_seen": 6761128, "step": 16505 }, { "epoch": 19.987893462469735, "grad_norm": 2.2346766854752786e-05, "learning_rate": 6.752892378059095e-11, "loss": 0.0, "num_input_tokens_seen": 6763240, "step": 16510 }, { "epoch": 19.993946731234868, "grad_norm": 5.548779427044792e-06, "learning_rate": 2.0091256403009794e-11, "loss": 0.0, "num_input_tokens_seen": 6765288, "step": 16515 }, { "epoch": 20.0, "grad_norm": 3.7344227166613564e-05, "learning_rate": 5.58090529345634e-13, "loss": 0.0, "num_input_tokens_seen": 6767120, "step": 16520 }, { "epoch": 20.0, "eval_loss": 0.4480169117450714, "eval_runtime": 4.9649, "eval_samples_per_second": 73.919, "eval_steps_per_second": 18.53, "num_input_tokens_seen": 6767120, "step": 16520 }, { "epoch": 20.0, "num_input_tokens_seen": 6767120, "step": 16520, "total_flos": 3.053164056064819e+17, "train_loss": 0.0281026716994063, "train_runtime": 3308.452, "train_samples_per_second": 19.955, "train_steps_per_second": 4.993 } ], "logging_steps": 5, "max_steps": 16520, "num_input_tokens_seen": 6767120, "num_train_epochs": 20, "save_steps": 826, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.053164056064819e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }