{ "best_global_step": 7544, "best_metric": 0.31705325841903687, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_boolq_123_1762583754/checkpoint-7544", "epoch": 20.0, "eval_steps": 3772, "global_step": 37720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002651113467656416, "grad_norm": 180.79800415039062, "learning_rate": 1.0604453870625664e-08, "loss": 12.6224, "num_input_tokens_seen": 5024, "step": 5 }, { "epoch": 0.005302226935312832, "grad_norm": 154.1978302001953, "learning_rate": 2.3860021208907746e-08, "loss": 12.7637, "num_input_tokens_seen": 10336, "step": 10 }, { "epoch": 0.007953340402969246, "grad_norm": 164.89622497558594, "learning_rate": 3.711558854718982e-08, "loss": 12.4943, "num_input_tokens_seen": 14464, "step": 15 }, { "epoch": 0.010604453870625663, "grad_norm": 172.47335815429688, "learning_rate": 5.0371155885471906e-08, "loss": 12.3682, "num_input_tokens_seen": 19104, "step": 20 }, { "epoch": 0.013255567338282079, "grad_norm": 167.67042541503906, "learning_rate": 6.362672322375399e-08, "loss": 12.5269, "num_input_tokens_seen": 24288, "step": 25 }, { "epoch": 0.015906680805938492, "grad_norm": 169.2733154296875, "learning_rate": 7.688229056203607e-08, "loss": 12.3949, "num_input_tokens_seen": 29888, "step": 30 }, { "epoch": 0.01855779427359491, "grad_norm": 205.05348205566406, "learning_rate": 9.013785790031814e-08, "loss": 12.4289, "num_input_tokens_seen": 33952, "step": 35 }, { "epoch": 0.021208907741251327, "grad_norm": 158.97837829589844, "learning_rate": 1.0339342523860022e-07, "loss": 12.3883, "num_input_tokens_seen": 38688, "step": 40 }, { "epoch": 0.02386002120890774, "grad_norm": 176.1273193359375, "learning_rate": 1.1664899257688231e-07, "loss": 11.8626, "num_input_tokens_seen": 43296, "step": 45 }, { "epoch": 0.026511134676564158, "grad_norm": 167.12315368652344, "learning_rate": 1.2990455991516437e-07, "loss": 11.8001, "num_input_tokens_seen": 47584, "step": 50 }, { "epoch": 0.02916224814422057, "grad_norm": 158.29217529296875, "learning_rate": 1.4316012725344645e-07, "loss": 12.0561, "num_input_tokens_seen": 53952, "step": 55 }, { "epoch": 0.031813361611876985, "grad_norm": 161.88644409179688, "learning_rate": 1.5641569459172855e-07, "loss": 11.7005, "num_input_tokens_seen": 59712, "step": 60 }, { "epoch": 0.0344644750795334, "grad_norm": 175.29933166503906, "learning_rate": 1.6967126193001063e-07, "loss": 11.1751, "num_input_tokens_seen": 63616, "step": 65 }, { "epoch": 0.03711558854718982, "grad_norm": 177.03573608398438, "learning_rate": 1.8292682926829268e-07, "loss": 10.7579, "num_input_tokens_seen": 67232, "step": 70 }, { "epoch": 0.039766702014846236, "grad_norm": 147.53671264648438, "learning_rate": 1.9618239660657478e-07, "loss": 10.8822, "num_input_tokens_seen": 72736, "step": 75 }, { "epoch": 0.042417815482502653, "grad_norm": 157.15631103515625, "learning_rate": 2.0943796394485686e-07, "loss": 10.955, "num_input_tokens_seen": 80672, "step": 80 }, { "epoch": 0.045068928950159064, "grad_norm": 153.48460388183594, "learning_rate": 2.2269353128313894e-07, "loss": 10.5352, "num_input_tokens_seen": 84768, "step": 85 }, { "epoch": 0.04772004241781548, "grad_norm": 146.26751708984375, "learning_rate": 2.35949098621421e-07, "loss": 10.6717, "num_input_tokens_seen": 89888, "step": 90 }, { "epoch": 0.0503711558854719, "grad_norm": 161.37222290039062, "learning_rate": 2.4920466595970306e-07, "loss": 10.0814, "num_input_tokens_seen": 94208, "step": 95 }, { "epoch": 0.053022269353128315, "grad_norm": 154.37197875976562, "learning_rate": 2.6246023329798517e-07, "loss": 9.4508, "num_input_tokens_seen": 98464, "step": 100 }, { "epoch": 0.05567338282078473, "grad_norm": 135.10386657714844, "learning_rate": 2.7571580063626727e-07, "loss": 9.3543, "num_input_tokens_seen": 103776, "step": 105 }, { "epoch": 0.05832449628844114, "grad_norm": 133.27830505371094, "learning_rate": 2.8897136797454937e-07, "loss": 8.7759, "num_input_tokens_seen": 108096, "step": 110 }, { "epoch": 0.06097560975609756, "grad_norm": 128.38609313964844, "learning_rate": 3.022269353128314e-07, "loss": 8.7709, "num_input_tokens_seen": 113792, "step": 115 }, { "epoch": 0.06362672322375397, "grad_norm": 128.76727294921875, "learning_rate": 3.154825026511135e-07, "loss": 8.5273, "num_input_tokens_seen": 118368, "step": 120 }, { "epoch": 0.0662778366914104, "grad_norm": 129.35317993164062, "learning_rate": 3.287380699893956e-07, "loss": 7.8441, "num_input_tokens_seen": 123712, "step": 125 }, { "epoch": 0.0689289501590668, "grad_norm": 122.70599365234375, "learning_rate": 3.419936373276776e-07, "loss": 7.6647, "num_input_tokens_seen": 128256, "step": 130 }, { "epoch": 0.07158006362672323, "grad_norm": 121.6593017578125, "learning_rate": 3.5524920466595973e-07, "loss": 7.8168, "num_input_tokens_seen": 133152, "step": 135 }, { "epoch": 0.07423117709437964, "grad_norm": 122.41737365722656, "learning_rate": 3.685047720042418e-07, "loss": 7.0573, "num_input_tokens_seen": 138432, "step": 140 }, { "epoch": 0.07688229056203605, "grad_norm": 114.93856048583984, "learning_rate": 3.8176033934252394e-07, "loss": 7.0662, "num_input_tokens_seen": 144192, "step": 145 }, { "epoch": 0.07953340402969247, "grad_norm": 104.61894226074219, "learning_rate": 3.95015906680806e-07, "loss": 6.1379, "num_input_tokens_seen": 149120, "step": 150 }, { "epoch": 0.08218451749734888, "grad_norm": 101.94345092773438, "learning_rate": 4.0827147401908804e-07, "loss": 5.8472, "num_input_tokens_seen": 154592, "step": 155 }, { "epoch": 0.08483563096500531, "grad_norm": 94.36854553222656, "learning_rate": 4.2152704135737014e-07, "loss": 5.505, "num_input_tokens_seen": 159584, "step": 160 }, { "epoch": 0.08748674443266172, "grad_norm": 93.81121063232422, "learning_rate": 4.347826086956522e-07, "loss": 5.0439, "num_input_tokens_seen": 165088, "step": 165 }, { "epoch": 0.09013785790031813, "grad_norm": 93.01712799072266, "learning_rate": 4.4803817603393434e-07, "loss": 4.7697, "num_input_tokens_seen": 170080, "step": 170 }, { "epoch": 0.09278897136797455, "grad_norm": 81.1533432006836, "learning_rate": 4.612937433722164e-07, "loss": 4.3121, "num_input_tokens_seen": 174240, "step": 175 }, { "epoch": 0.09544008483563096, "grad_norm": 81.68360137939453, "learning_rate": 4.7454931071049845e-07, "loss": 4.037, "num_input_tokens_seen": 178880, "step": 180 }, { "epoch": 0.09809119830328739, "grad_norm": 86.886474609375, "learning_rate": 4.878048780487805e-07, "loss": 3.7875, "num_input_tokens_seen": 183648, "step": 185 }, { "epoch": 0.1007423117709438, "grad_norm": 69.02825164794922, "learning_rate": 5.010604453870625e-07, "loss": 3.5064, "num_input_tokens_seen": 188608, "step": 190 }, { "epoch": 0.1033934252386002, "grad_norm": 68.95243835449219, "learning_rate": 5.143160127253448e-07, "loss": 3.2013, "num_input_tokens_seen": 193888, "step": 195 }, { "epoch": 0.10604453870625663, "grad_norm": 57.51818084716797, "learning_rate": 5.275715800636268e-07, "loss": 2.7703, "num_input_tokens_seen": 197920, "step": 200 }, { "epoch": 0.10869565217391304, "grad_norm": 55.455379486083984, "learning_rate": 5.408271474019089e-07, "loss": 2.7321, "num_input_tokens_seen": 203072, "step": 205 }, { "epoch": 0.11134676564156946, "grad_norm": 46.082435607910156, "learning_rate": 5.54082714740191e-07, "loss": 2.4021, "num_input_tokens_seen": 208160, "step": 210 }, { "epoch": 0.11399787910922587, "grad_norm": 45.07759475708008, "learning_rate": 5.67338282078473e-07, "loss": 2.1364, "num_input_tokens_seen": 212928, "step": 215 }, { "epoch": 0.11664899257688228, "grad_norm": 51.84790802001953, "learning_rate": 5.805938494167552e-07, "loss": 1.952, "num_input_tokens_seen": 217536, "step": 220 }, { "epoch": 0.11930010604453871, "grad_norm": 46.098472595214844, "learning_rate": 5.938494167550372e-07, "loss": 1.6522, "num_input_tokens_seen": 223712, "step": 225 }, { "epoch": 0.12195121951219512, "grad_norm": 55.24103927612305, "learning_rate": 6.071049840933193e-07, "loss": 1.5155, "num_input_tokens_seen": 228480, "step": 230 }, { "epoch": 0.12460233297985154, "grad_norm": 49.926719665527344, "learning_rate": 6.203605514316014e-07, "loss": 1.1811, "num_input_tokens_seen": 233408, "step": 235 }, { "epoch": 0.12725344644750794, "grad_norm": 34.35500717163086, "learning_rate": 6.336161187698835e-07, "loss": 1.0524, "num_input_tokens_seen": 238240, "step": 240 }, { "epoch": 0.12990455991516436, "grad_norm": 34.7097053527832, "learning_rate": 6.468716861081655e-07, "loss": 0.8176, "num_input_tokens_seen": 243136, "step": 245 }, { "epoch": 0.1325556733828208, "grad_norm": 29.47632598876953, "learning_rate": 6.601272534464476e-07, "loss": 0.6796, "num_input_tokens_seen": 248160, "step": 250 }, { "epoch": 0.1352067868504772, "grad_norm": 17.037107467651367, "learning_rate": 6.733828207847297e-07, "loss": 0.5929, "num_input_tokens_seen": 252864, "step": 255 }, { "epoch": 0.1378579003181336, "grad_norm": 42.81969451904297, "learning_rate": 6.866383881230117e-07, "loss": 0.6364, "num_input_tokens_seen": 258912, "step": 260 }, { "epoch": 0.14050901378579003, "grad_norm": 25.878816604614258, "learning_rate": 6.998939554612938e-07, "loss": 0.5275, "num_input_tokens_seen": 264096, "step": 265 }, { "epoch": 0.14316012725344646, "grad_norm": 82.19541931152344, "learning_rate": 7.131495227995759e-07, "loss": 0.441, "num_input_tokens_seen": 268928, "step": 270 }, { "epoch": 0.14581124072110285, "grad_norm": 25.83802032470703, "learning_rate": 7.264050901378579e-07, "loss": 0.4237, "num_input_tokens_seen": 274592, "step": 275 }, { "epoch": 0.14846235418875928, "grad_norm": 37.13822555541992, "learning_rate": 7.3966065747614e-07, "loss": 0.3905, "num_input_tokens_seen": 279552, "step": 280 }, { "epoch": 0.1511134676564157, "grad_norm": 52.20307922363281, "learning_rate": 7.529162248144222e-07, "loss": 0.4759, "num_input_tokens_seen": 284192, "step": 285 }, { "epoch": 0.1537645811240721, "grad_norm": 32.847225189208984, "learning_rate": 7.661717921527043e-07, "loss": 0.3889, "num_input_tokens_seen": 289664, "step": 290 }, { "epoch": 0.15641569459172852, "grad_norm": 42.1656379699707, "learning_rate": 7.794273594909863e-07, "loss": 0.3698, "num_input_tokens_seen": 293600, "step": 295 }, { "epoch": 0.15906680805938495, "grad_norm": 91.29390716552734, "learning_rate": 7.926829268292684e-07, "loss": 0.4197, "num_input_tokens_seen": 298240, "step": 300 }, { "epoch": 0.16171792152704137, "grad_norm": 45.77964401245117, "learning_rate": 8.059384941675505e-07, "loss": 0.4195, "num_input_tokens_seen": 303936, "step": 305 }, { "epoch": 0.16436903499469777, "grad_norm": 17.251203536987305, "learning_rate": 8.191940615058325e-07, "loss": 0.3691, "num_input_tokens_seen": 308832, "step": 310 }, { "epoch": 0.1670201484623542, "grad_norm": 59.09403991699219, "learning_rate": 8.324496288441146e-07, "loss": 0.3662, "num_input_tokens_seen": 313568, "step": 315 }, { "epoch": 0.16967126193001061, "grad_norm": 50.41740036010742, "learning_rate": 8.457051961823967e-07, "loss": 0.3904, "num_input_tokens_seen": 318464, "step": 320 }, { "epoch": 0.172322375397667, "grad_norm": 35.478458404541016, "learning_rate": 8.589607635206787e-07, "loss": 0.3767, "num_input_tokens_seen": 323680, "step": 325 }, { "epoch": 0.17497348886532343, "grad_norm": 45.41325378417969, "learning_rate": 8.722163308589608e-07, "loss": 0.3505, "num_input_tokens_seen": 328416, "step": 330 }, { "epoch": 0.17762460233297986, "grad_norm": 76.23787689208984, "learning_rate": 8.854718981972429e-07, "loss": 0.3712, "num_input_tokens_seen": 333408, "step": 335 }, { "epoch": 0.18027571580063625, "grad_norm": 17.957679748535156, "learning_rate": 8.987274655355251e-07, "loss": 0.3338, "num_input_tokens_seen": 338016, "step": 340 }, { "epoch": 0.18292682926829268, "grad_norm": 46.55139923095703, "learning_rate": 9.11983032873807e-07, "loss": 0.3473, "num_input_tokens_seen": 343616, "step": 345 }, { "epoch": 0.1855779427359491, "grad_norm": 70.43095397949219, "learning_rate": 9.252386002120892e-07, "loss": 0.3475, "num_input_tokens_seen": 347808, "step": 350 }, { "epoch": 0.18822905620360553, "grad_norm": 14.87926959991455, "learning_rate": 9.384941675503713e-07, "loss": 0.3192, "num_input_tokens_seen": 352672, "step": 355 }, { "epoch": 0.19088016967126192, "grad_norm": 20.085914611816406, "learning_rate": 9.517497348886533e-07, "loss": 0.349, "num_input_tokens_seen": 358144, "step": 360 }, { "epoch": 0.19353128313891835, "grad_norm": 38.98188400268555, "learning_rate": 9.650053022269354e-07, "loss": 0.3054, "num_input_tokens_seen": 362976, "step": 365 }, { "epoch": 0.19618239660657477, "grad_norm": 50.02574920654297, "learning_rate": 9.782608695652175e-07, "loss": 0.3682, "num_input_tokens_seen": 368768, "step": 370 }, { "epoch": 0.19883351007423117, "grad_norm": 18.82611083984375, "learning_rate": 9.915164369034994e-07, "loss": 0.3686, "num_input_tokens_seen": 374048, "step": 375 }, { "epoch": 0.2014846235418876, "grad_norm": 15.154851913452148, "learning_rate": 1.0047720042417817e-06, "loss": 0.3814, "num_input_tokens_seen": 378752, "step": 380 }, { "epoch": 0.20413573700954402, "grad_norm": 88.22554016113281, "learning_rate": 1.0180275715800638e-06, "loss": 0.3991, "num_input_tokens_seen": 382784, "step": 385 }, { "epoch": 0.2067868504772004, "grad_norm": 32.351192474365234, "learning_rate": 1.0312831389183457e-06, "loss": 0.3585, "num_input_tokens_seen": 387104, "step": 390 }, { "epoch": 0.20943796394485684, "grad_norm": 88.63370513916016, "learning_rate": 1.0445387062566278e-06, "loss": 0.3955, "num_input_tokens_seen": 392128, "step": 395 }, { "epoch": 0.21208907741251326, "grad_norm": 21.993446350097656, "learning_rate": 1.05779427359491e-06, "loss": 0.352, "num_input_tokens_seen": 397600, "step": 400 }, { "epoch": 0.21474019088016968, "grad_norm": 64.73590087890625, "learning_rate": 1.071049840933192e-06, "loss": 0.3956, "num_input_tokens_seen": 403616, "step": 405 }, { "epoch": 0.21739130434782608, "grad_norm": 38.540122985839844, "learning_rate": 1.0843054082714741e-06, "loss": 0.3985, "num_input_tokens_seen": 408736, "step": 410 }, { "epoch": 0.2200424178154825, "grad_norm": 40.624420166015625, "learning_rate": 1.0975609756097562e-06, "loss": 0.3468, "num_input_tokens_seen": 413792, "step": 415 }, { "epoch": 0.22269353128313893, "grad_norm": 49.45564651489258, "learning_rate": 1.1108165429480383e-06, "loss": 0.4116, "num_input_tokens_seen": 418592, "step": 420 }, { "epoch": 0.22534464475079533, "grad_norm": 136.34982299804688, "learning_rate": 1.1240721102863202e-06, "loss": 0.4028, "num_input_tokens_seen": 423872, "step": 425 }, { "epoch": 0.22799575821845175, "grad_norm": 14.679060935974121, "learning_rate": 1.1373276776246023e-06, "loss": 0.297, "num_input_tokens_seen": 428576, "step": 430 }, { "epoch": 0.23064687168610817, "grad_norm": 84.33354949951172, "learning_rate": 1.1505832449628844e-06, "loss": 0.328, "num_input_tokens_seen": 433088, "step": 435 }, { "epoch": 0.23329798515376457, "grad_norm": 48.23221969604492, "learning_rate": 1.1638388123011665e-06, "loss": 0.3215, "num_input_tokens_seen": 437856, "step": 440 }, { "epoch": 0.235949098621421, "grad_norm": 40.1612548828125, "learning_rate": 1.1770943796394486e-06, "loss": 0.3382, "num_input_tokens_seen": 443008, "step": 445 }, { "epoch": 0.23860021208907742, "grad_norm": 21.103782653808594, "learning_rate": 1.1903499469777307e-06, "loss": 0.3361, "num_input_tokens_seen": 448224, "step": 450 }, { "epoch": 0.24125132555673381, "grad_norm": 27.31907081604004, "learning_rate": 1.2036055143160128e-06, "loss": 0.3506, "num_input_tokens_seen": 453312, "step": 455 }, { "epoch": 0.24390243902439024, "grad_norm": 75.08248901367188, "learning_rate": 1.216861081654295e-06, "loss": 0.3944, "num_input_tokens_seen": 458048, "step": 460 }, { "epoch": 0.24655355249204666, "grad_norm": 14.86404800415039, "learning_rate": 1.230116648992577e-06, "loss": 0.3527, "num_input_tokens_seen": 463392, "step": 465 }, { "epoch": 0.2492046659597031, "grad_norm": 48.94320297241211, "learning_rate": 1.2433722163308592e-06, "loss": 0.2855, "num_input_tokens_seen": 467904, "step": 470 }, { "epoch": 0.2518557794273595, "grad_norm": 15.099491119384766, "learning_rate": 1.2566277836691413e-06, "loss": 0.3578, "num_input_tokens_seen": 472768, "step": 475 }, { "epoch": 0.2545068928950159, "grad_norm": 45.64033889770508, "learning_rate": 1.2698833510074232e-06, "loss": 0.5244, "num_input_tokens_seen": 477408, "step": 480 }, { "epoch": 0.25715800636267233, "grad_norm": 28.265949249267578, "learning_rate": 1.2831389183457053e-06, "loss": 0.4351, "num_input_tokens_seen": 482016, "step": 485 }, { "epoch": 0.2598091198303287, "grad_norm": 15.742690086364746, "learning_rate": 1.2963944856839874e-06, "loss": 0.3506, "num_input_tokens_seen": 488256, "step": 490 }, { "epoch": 0.2624602332979852, "grad_norm": 13.115278244018555, "learning_rate": 1.3096500530222695e-06, "loss": 0.371, "num_input_tokens_seen": 492992, "step": 495 }, { "epoch": 0.2651113467656416, "grad_norm": 23.02974510192871, "learning_rate": 1.3229056203605514e-06, "loss": 0.348, "num_input_tokens_seen": 498816, "step": 500 }, { "epoch": 0.26776246023329797, "grad_norm": 30.663002014160156, "learning_rate": 1.3361611876988337e-06, "loss": 0.3505, "num_input_tokens_seen": 503424, "step": 505 }, { "epoch": 0.2704135737009544, "grad_norm": 47.241397857666016, "learning_rate": 1.3494167550371158e-06, "loss": 0.3699, "num_input_tokens_seen": 507712, "step": 510 }, { "epoch": 0.2730646871686108, "grad_norm": 23.934307098388672, "learning_rate": 1.3626723223753977e-06, "loss": 0.3325, "num_input_tokens_seen": 512512, "step": 515 }, { "epoch": 0.2757158006362672, "grad_norm": 11.60228157043457, "learning_rate": 1.37592788971368e-06, "loss": 0.3134, "num_input_tokens_seen": 517024, "step": 520 }, { "epoch": 0.27836691410392367, "grad_norm": 72.741455078125, "learning_rate": 1.3891834570519619e-06, "loss": 0.3902, "num_input_tokens_seen": 521792, "step": 525 }, { "epoch": 0.28101802757158006, "grad_norm": 59.52469253540039, "learning_rate": 1.4024390243902442e-06, "loss": 0.3287, "num_input_tokens_seen": 526880, "step": 530 }, { "epoch": 0.28366914103923646, "grad_norm": 21.13273811340332, "learning_rate": 1.415694591728526e-06, "loss": 0.3555, "num_input_tokens_seen": 532576, "step": 535 }, { "epoch": 0.2863202545068929, "grad_norm": 23.358440399169922, "learning_rate": 1.4289501590668082e-06, "loss": 0.4159, "num_input_tokens_seen": 537440, "step": 540 }, { "epoch": 0.2889713679745493, "grad_norm": 46.94316482543945, "learning_rate": 1.4422057264050903e-06, "loss": 0.4269, "num_input_tokens_seen": 542464, "step": 545 }, { "epoch": 0.2916224814422057, "grad_norm": 11.675150871276855, "learning_rate": 1.4554612937433724e-06, "loss": 0.4065, "num_input_tokens_seen": 547552, "step": 550 }, { "epoch": 0.29427359490986216, "grad_norm": 47.04457473754883, "learning_rate": 1.4687168610816543e-06, "loss": 0.3744, "num_input_tokens_seen": 551968, "step": 555 }, { "epoch": 0.29692470837751855, "grad_norm": 9.455533981323242, "learning_rate": 1.4819724284199366e-06, "loss": 0.345, "num_input_tokens_seen": 556704, "step": 560 }, { "epoch": 0.29957582184517495, "grad_norm": 29.811752319335938, "learning_rate": 1.4952279957582185e-06, "loss": 0.3552, "num_input_tokens_seen": 561184, "step": 565 }, { "epoch": 0.3022269353128314, "grad_norm": 24.34227752685547, "learning_rate": 1.5084835630965006e-06, "loss": 0.3695, "num_input_tokens_seen": 565632, "step": 570 }, { "epoch": 0.3048780487804878, "grad_norm": 14.854873657226562, "learning_rate": 1.521739130434783e-06, "loss": 0.3699, "num_input_tokens_seen": 570432, "step": 575 }, { "epoch": 0.3075291622481442, "grad_norm": 39.8348503112793, "learning_rate": 1.5349946977730648e-06, "loss": 0.3546, "num_input_tokens_seen": 575456, "step": 580 }, { "epoch": 0.31018027571580065, "grad_norm": 7.581491470336914, "learning_rate": 1.5482502651113469e-06, "loss": 0.3626, "num_input_tokens_seen": 581440, "step": 585 }, { "epoch": 0.31283138918345704, "grad_norm": 35.493289947509766, "learning_rate": 1.561505832449629e-06, "loss": 0.3411, "num_input_tokens_seen": 585664, "step": 590 }, { "epoch": 0.3154825026511135, "grad_norm": 20.165380477905273, "learning_rate": 1.574761399787911e-06, "loss": 0.2802, "num_input_tokens_seen": 591808, "step": 595 }, { "epoch": 0.3181336161187699, "grad_norm": 41.46565246582031, "learning_rate": 1.588016967126193e-06, "loss": 0.5639, "num_input_tokens_seen": 599424, "step": 600 }, { "epoch": 0.3207847295864263, "grad_norm": 23.512596130371094, "learning_rate": 1.6012725344644753e-06, "loss": 0.3476, "num_input_tokens_seen": 604576, "step": 605 }, { "epoch": 0.32343584305408274, "grad_norm": 29.009986877441406, "learning_rate": 1.6145281018027572e-06, "loss": 0.3564, "num_input_tokens_seen": 610272, "step": 610 }, { "epoch": 0.32608695652173914, "grad_norm": 10.026640892028809, "learning_rate": 1.6277836691410393e-06, "loss": 0.3556, "num_input_tokens_seen": 614912, "step": 615 }, { "epoch": 0.32873806998939553, "grad_norm": 12.006712913513184, "learning_rate": 1.6410392364793214e-06, "loss": 0.3288, "num_input_tokens_seen": 621216, "step": 620 }, { "epoch": 0.331389183457052, "grad_norm": 23.015817642211914, "learning_rate": 1.6542948038176035e-06, "loss": 0.3514, "num_input_tokens_seen": 626368, "step": 625 }, { "epoch": 0.3340402969247084, "grad_norm": 41.41285705566406, "learning_rate": 1.6675503711558854e-06, "loss": 0.3816, "num_input_tokens_seen": 630944, "step": 630 }, { "epoch": 0.3366914103923648, "grad_norm": 32.92915725708008, "learning_rate": 1.6808059384941677e-06, "loss": 0.3606, "num_input_tokens_seen": 636224, "step": 635 }, { "epoch": 0.33934252386002123, "grad_norm": 12.42184066772461, "learning_rate": 1.6940615058324498e-06, "loss": 0.3205, "num_input_tokens_seen": 641248, "step": 640 }, { "epoch": 0.3419936373276776, "grad_norm": 18.954364776611328, "learning_rate": 1.707317073170732e-06, "loss": 0.3201, "num_input_tokens_seen": 645760, "step": 645 }, { "epoch": 0.344644750795334, "grad_norm": 24.101211547851562, "learning_rate": 1.720572640509014e-06, "loss": 0.242, "num_input_tokens_seen": 651008, "step": 650 }, { "epoch": 0.3472958642629905, "grad_norm": 63.47054672241211, "learning_rate": 1.733828207847296e-06, "loss": 0.4658, "num_input_tokens_seen": 657504, "step": 655 }, { "epoch": 0.34994697773064687, "grad_norm": 16.48504638671875, "learning_rate": 1.7470837751855782e-06, "loss": 0.4152, "num_input_tokens_seen": 663168, "step": 660 }, { "epoch": 0.35259809119830327, "grad_norm": 66.33118438720703, "learning_rate": 1.7603393425238601e-06, "loss": 0.3649, "num_input_tokens_seen": 668352, "step": 665 }, { "epoch": 0.3552492046659597, "grad_norm": 18.371417999267578, "learning_rate": 1.7735949098621422e-06, "loss": 0.3435, "num_input_tokens_seen": 673888, "step": 670 }, { "epoch": 0.3579003181336161, "grad_norm": 28.61855697631836, "learning_rate": 1.7868504772004243e-06, "loss": 0.3028, "num_input_tokens_seen": 678976, "step": 675 }, { "epoch": 0.3605514316012725, "grad_norm": 21.884496688842773, "learning_rate": 1.8001060445387064e-06, "loss": 0.2734, "num_input_tokens_seen": 683008, "step": 680 }, { "epoch": 0.36320254506892896, "grad_norm": 57.031105041503906, "learning_rate": 1.8133616118769883e-06, "loss": 0.3254, "num_input_tokens_seen": 688000, "step": 685 }, { "epoch": 0.36585365853658536, "grad_norm": 7.284112453460693, "learning_rate": 1.8266171792152706e-06, "loss": 0.3999, "num_input_tokens_seen": 692096, "step": 690 }, { "epoch": 0.36850477200424175, "grad_norm": 11.197851181030273, "learning_rate": 1.8398727465535527e-06, "loss": 0.3324, "num_input_tokens_seen": 697728, "step": 695 }, { "epoch": 0.3711558854718982, "grad_norm": 8.783563613891602, "learning_rate": 1.8531283138918346e-06, "loss": 0.3798, "num_input_tokens_seen": 702624, "step": 700 }, { "epoch": 0.3738069989395546, "grad_norm": 33.333614349365234, "learning_rate": 1.866383881230117e-06, "loss": 0.3913, "num_input_tokens_seen": 707296, "step": 705 }, { "epoch": 0.37645811240721105, "grad_norm": 14.196060180664062, "learning_rate": 1.8796394485683988e-06, "loss": 0.3565, "num_input_tokens_seen": 712416, "step": 710 }, { "epoch": 0.37910922587486745, "grad_norm": 8.438395500183105, "learning_rate": 1.892895015906681e-06, "loss": 0.3342, "num_input_tokens_seen": 718112, "step": 715 }, { "epoch": 0.38176033934252385, "grad_norm": 7.106090545654297, "learning_rate": 1.906150583244963e-06, "loss": 0.3794, "num_input_tokens_seen": 723360, "step": 720 }, { "epoch": 0.3844114528101803, "grad_norm": 13.925073623657227, "learning_rate": 1.919406150583245e-06, "loss": 0.3157, "num_input_tokens_seen": 728800, "step": 725 }, { "epoch": 0.3870625662778367, "grad_norm": 11.269327163696289, "learning_rate": 1.932661717921527e-06, "loss": 0.3226, "num_input_tokens_seen": 733088, "step": 730 }, { "epoch": 0.3897136797454931, "grad_norm": 18.860862731933594, "learning_rate": 1.9459172852598094e-06, "loss": 0.3557, "num_input_tokens_seen": 738528, "step": 735 }, { "epoch": 0.39236479321314954, "grad_norm": 45.32082748413086, "learning_rate": 1.9591728525980912e-06, "loss": 0.3404, "num_input_tokens_seen": 744896, "step": 740 }, { "epoch": 0.39501590668080594, "grad_norm": 14.02343463897705, "learning_rate": 1.9724284199363736e-06, "loss": 0.3424, "num_input_tokens_seen": 750624, "step": 745 }, { "epoch": 0.39766702014846234, "grad_norm": 22.40477180480957, "learning_rate": 1.9856839872746554e-06, "loss": 0.335, "num_input_tokens_seen": 757152, "step": 750 }, { "epoch": 0.4003181336161188, "grad_norm": 8.183521270751953, "learning_rate": 1.9989395546129373e-06, "loss": 0.2914, "num_input_tokens_seen": 763648, "step": 755 }, { "epoch": 0.4029692470837752, "grad_norm": 29.884248733520508, "learning_rate": 2.0121951219512197e-06, "loss": 0.3317, "num_input_tokens_seen": 768256, "step": 760 }, { "epoch": 0.4056203605514316, "grad_norm": 31.17565155029297, "learning_rate": 2.025450689289502e-06, "loss": 0.3873, "num_input_tokens_seen": 772864, "step": 765 }, { "epoch": 0.40827147401908803, "grad_norm": 12.750073432922363, "learning_rate": 2.038706256627784e-06, "loss": 0.3442, "num_input_tokens_seen": 776896, "step": 770 }, { "epoch": 0.41092258748674443, "grad_norm": 56.07828140258789, "learning_rate": 2.051961823966066e-06, "loss": 0.3582, "num_input_tokens_seen": 781664, "step": 775 }, { "epoch": 0.4135737009544008, "grad_norm": 47.6268310546875, "learning_rate": 2.065217391304348e-06, "loss": 0.3756, "num_input_tokens_seen": 786240, "step": 780 }, { "epoch": 0.4162248144220573, "grad_norm": 10.236225128173828, "learning_rate": 2.07847295864263e-06, "loss": 0.3809, "num_input_tokens_seen": 790336, "step": 785 }, { "epoch": 0.4188759278897137, "grad_norm": 13.523309707641602, "learning_rate": 2.0917285259809123e-06, "loss": 0.3347, "num_input_tokens_seen": 794976, "step": 790 }, { "epoch": 0.42152704135737007, "grad_norm": 9.882080078125, "learning_rate": 2.104984093319194e-06, "loss": 0.2828, "num_input_tokens_seen": 799616, "step": 795 }, { "epoch": 0.4241781548250265, "grad_norm": 22.894922256469727, "learning_rate": 2.118239660657476e-06, "loss": 0.4188, "num_input_tokens_seen": 803584, "step": 800 }, { "epoch": 0.4268292682926829, "grad_norm": 19.923128128051758, "learning_rate": 2.1314952279957584e-06, "loss": 0.3413, "num_input_tokens_seen": 810272, "step": 805 }, { "epoch": 0.42948038176033937, "grad_norm": 13.821854591369629, "learning_rate": 2.1447507953340403e-06, "loss": 0.343, "num_input_tokens_seen": 815200, "step": 810 }, { "epoch": 0.43213149522799577, "grad_norm": 24.855836868286133, "learning_rate": 2.1580063626723226e-06, "loss": 0.3481, "num_input_tokens_seen": 821088, "step": 815 }, { "epoch": 0.43478260869565216, "grad_norm": 40.06626892089844, "learning_rate": 2.171261930010605e-06, "loss": 0.3474, "num_input_tokens_seen": 824832, "step": 820 }, { "epoch": 0.4374337221633086, "grad_norm": 17.185033798217773, "learning_rate": 2.1845174973488868e-06, "loss": 0.3875, "num_input_tokens_seen": 829920, "step": 825 }, { "epoch": 0.440084835630965, "grad_norm": 28.277124404907227, "learning_rate": 2.1977730646871687e-06, "loss": 0.3736, "num_input_tokens_seen": 835072, "step": 830 }, { "epoch": 0.4427359490986214, "grad_norm": 10.435997009277344, "learning_rate": 2.211028632025451e-06, "loss": 0.3124, "num_input_tokens_seen": 840576, "step": 835 }, { "epoch": 0.44538706256627786, "grad_norm": 18.55091667175293, "learning_rate": 2.224284199363733e-06, "loss": 0.3834, "num_input_tokens_seen": 845536, "step": 840 }, { "epoch": 0.44803817603393425, "grad_norm": 21.654109954833984, "learning_rate": 2.2375397667020148e-06, "loss": 0.3761, "num_input_tokens_seen": 850528, "step": 845 }, { "epoch": 0.45068928950159065, "grad_norm": 17.584075927734375, "learning_rate": 2.250795334040297e-06, "loss": 0.3864, "num_input_tokens_seen": 854912, "step": 850 }, { "epoch": 0.4533404029692471, "grad_norm": 9.291881561279297, "learning_rate": 2.264050901378579e-06, "loss": 0.3476, "num_input_tokens_seen": 859712, "step": 855 }, { "epoch": 0.4559915164369035, "grad_norm": 31.395204544067383, "learning_rate": 2.2773064687168613e-06, "loss": 0.3381, "num_input_tokens_seen": 864896, "step": 860 }, { "epoch": 0.4586426299045599, "grad_norm": 13.251413345336914, "learning_rate": 2.290562036055143e-06, "loss": 0.3128, "num_input_tokens_seen": 869120, "step": 865 }, { "epoch": 0.46129374337221635, "grad_norm": 33.73545837402344, "learning_rate": 2.3038176033934255e-06, "loss": 0.4312, "num_input_tokens_seen": 873792, "step": 870 }, { "epoch": 0.46394485683987274, "grad_norm": 8.48594856262207, "learning_rate": 2.317073170731708e-06, "loss": 0.3766, "num_input_tokens_seen": 879872, "step": 875 }, { "epoch": 0.46659597030752914, "grad_norm": 41.25191116333008, "learning_rate": 2.3303287380699897e-06, "loss": 0.3637, "num_input_tokens_seen": 886112, "step": 880 }, { "epoch": 0.4692470837751856, "grad_norm": 6.582920074462891, "learning_rate": 2.3435843054082716e-06, "loss": 0.3659, "num_input_tokens_seen": 891872, "step": 885 }, { "epoch": 0.471898197242842, "grad_norm": 10.457932472229004, "learning_rate": 2.356839872746554e-06, "loss": 0.3377, "num_input_tokens_seen": 896640, "step": 890 }, { "epoch": 0.4745493107104984, "grad_norm": 21.783222198486328, "learning_rate": 2.370095440084836e-06, "loss": 0.2952, "num_input_tokens_seen": 901600, "step": 895 }, { "epoch": 0.47720042417815484, "grad_norm": 11.409085273742676, "learning_rate": 2.3833510074231177e-06, "loss": 0.364, "num_input_tokens_seen": 905536, "step": 900 }, { "epoch": 0.47985153764581123, "grad_norm": 8.830218315124512, "learning_rate": 2.3966065747614e-06, "loss": 0.3318, "num_input_tokens_seen": 910720, "step": 905 }, { "epoch": 0.48250265111346763, "grad_norm": 16.637914657592773, "learning_rate": 2.409862142099682e-06, "loss": 0.3196, "num_input_tokens_seen": 914784, "step": 910 }, { "epoch": 0.4851537645811241, "grad_norm": 8.202871322631836, "learning_rate": 2.4231177094379642e-06, "loss": 0.2902, "num_input_tokens_seen": 921728, "step": 915 }, { "epoch": 0.4878048780487805, "grad_norm": 9.92109489440918, "learning_rate": 2.436373276776246e-06, "loss": 0.3711, "num_input_tokens_seen": 926112, "step": 920 }, { "epoch": 0.49045599151643693, "grad_norm": 8.500317573547363, "learning_rate": 2.4496288441145284e-06, "loss": 0.3624, "num_input_tokens_seen": 932096, "step": 925 }, { "epoch": 0.4931071049840933, "grad_norm": 18.307472229003906, "learning_rate": 2.4628844114528103e-06, "loss": 0.3347, "num_input_tokens_seen": 936672, "step": 930 }, { "epoch": 0.4957582184517497, "grad_norm": 14.046602249145508, "learning_rate": 2.4761399787910926e-06, "loss": 0.3448, "num_input_tokens_seen": 941472, "step": 935 }, { "epoch": 0.4984093319194062, "grad_norm": 16.217933654785156, "learning_rate": 2.4893955461293745e-06, "loss": 0.3181, "num_input_tokens_seen": 946304, "step": 940 }, { "epoch": 0.5010604453870625, "grad_norm": 6.7715535163879395, "learning_rate": 2.502651113467657e-06, "loss": 0.3547, "num_input_tokens_seen": 951648, "step": 945 }, { "epoch": 0.503711558854719, "grad_norm": 14.223743438720703, "learning_rate": 2.5159066808059383e-06, "loss": 0.2751, "num_input_tokens_seen": 956672, "step": 950 }, { "epoch": 0.5063626723223754, "grad_norm": 7.035590648651123, "learning_rate": 2.5291622481442206e-06, "loss": 0.327, "num_input_tokens_seen": 961088, "step": 955 }, { "epoch": 0.5090137857900318, "grad_norm": 59.33071517944336, "learning_rate": 2.542417815482503e-06, "loss": 0.3622, "num_input_tokens_seen": 965024, "step": 960 }, { "epoch": 0.5116648992576882, "grad_norm": 6.214361667633057, "learning_rate": 2.555673382820785e-06, "loss": 0.3074, "num_input_tokens_seen": 971936, "step": 965 }, { "epoch": 0.5143160127253447, "grad_norm": 7.172580718994141, "learning_rate": 2.568928950159067e-06, "loss": 0.3296, "num_input_tokens_seen": 977760, "step": 970 }, { "epoch": 0.5169671261930011, "grad_norm": 15.568145751953125, "learning_rate": 2.582184517497349e-06, "loss": 0.4047, "num_input_tokens_seen": 983232, "step": 975 }, { "epoch": 0.5196182396606575, "grad_norm": 40.77306365966797, "learning_rate": 2.595440084835631e-06, "loss": 0.3569, "num_input_tokens_seen": 987648, "step": 980 }, { "epoch": 0.5222693531283139, "grad_norm": 12.252131462097168, "learning_rate": 2.6086956521739132e-06, "loss": 0.3396, "num_input_tokens_seen": 991456, "step": 985 }, { "epoch": 0.5249204665959704, "grad_norm": 20.0458984375, "learning_rate": 2.6219512195121956e-06, "loss": 0.236, "num_input_tokens_seen": 996640, "step": 990 }, { "epoch": 0.5275715800636267, "grad_norm": 42.24561309814453, "learning_rate": 2.635206786850477e-06, "loss": 0.3746, "num_input_tokens_seen": 1001280, "step": 995 }, { "epoch": 0.5302226935312832, "grad_norm": 18.18479347229004, "learning_rate": 2.6484623541887593e-06, "loss": 0.3757, "num_input_tokens_seen": 1006048, "step": 1000 }, { "epoch": 0.5328738069989396, "grad_norm": 7.518799781799316, "learning_rate": 2.6617179215270417e-06, "loss": 0.2708, "num_input_tokens_seen": 1010848, "step": 1005 }, { "epoch": 0.5355249204665959, "grad_norm": 18.549467086791992, "learning_rate": 2.674973488865324e-06, "loss": 0.3677, "num_input_tokens_seen": 1015104, "step": 1010 }, { "epoch": 0.5381760339342524, "grad_norm": 20.725379943847656, "learning_rate": 2.6882290562036054e-06, "loss": 0.4327, "num_input_tokens_seen": 1018624, "step": 1015 }, { "epoch": 0.5408271474019088, "grad_norm": 35.48199462890625, "learning_rate": 2.7014846235418877e-06, "loss": 0.3839, "num_input_tokens_seen": 1023328, "step": 1020 }, { "epoch": 0.5434782608695652, "grad_norm": 16.40692138671875, "learning_rate": 2.71474019088017e-06, "loss": 0.3706, "num_input_tokens_seen": 1027936, "step": 1025 }, { "epoch": 0.5461293743372216, "grad_norm": 19.650909423828125, "learning_rate": 2.727995758218452e-06, "loss": 0.3465, "num_input_tokens_seen": 1032640, "step": 1030 }, { "epoch": 0.5487804878048781, "grad_norm": 10.727865219116211, "learning_rate": 2.7412513255567343e-06, "loss": 0.3561, "num_input_tokens_seen": 1037152, "step": 1035 }, { "epoch": 0.5514316012725344, "grad_norm": 10.487411499023438, "learning_rate": 2.7545068928950166e-06, "loss": 0.2063, "num_input_tokens_seen": 1042336, "step": 1040 }, { "epoch": 0.5540827147401909, "grad_norm": 24.2930908203125, "learning_rate": 2.767762460233298e-06, "loss": 0.3705, "num_input_tokens_seen": 1046912, "step": 1045 }, { "epoch": 0.5567338282078473, "grad_norm": 17.767601013183594, "learning_rate": 2.7810180275715804e-06, "loss": 0.4238, "num_input_tokens_seen": 1051904, "step": 1050 }, { "epoch": 0.5593849416755037, "grad_norm": 27.97957992553711, "learning_rate": 2.7942735949098627e-06, "loss": 0.3609, "num_input_tokens_seen": 1057920, "step": 1055 }, { "epoch": 0.5620360551431601, "grad_norm": 12.439352035522461, "learning_rate": 2.807529162248144e-06, "loss": 0.3512, "num_input_tokens_seen": 1062496, "step": 1060 }, { "epoch": 0.5646871686108166, "grad_norm": 16.221107482910156, "learning_rate": 2.8207847295864265e-06, "loss": 0.3761, "num_input_tokens_seen": 1067168, "step": 1065 }, { "epoch": 0.5673382820784729, "grad_norm": 9.137762069702148, "learning_rate": 2.8340402969247088e-06, "loss": 0.3651, "num_input_tokens_seen": 1072384, "step": 1070 }, { "epoch": 0.5699893955461294, "grad_norm": 4.287997245788574, "learning_rate": 2.8472958642629907e-06, "loss": 0.3962, "num_input_tokens_seen": 1077728, "step": 1075 }, { "epoch": 0.5726405090137858, "grad_norm": 5.609606742858887, "learning_rate": 2.860551431601273e-06, "loss": 0.366, "num_input_tokens_seen": 1083200, "step": 1080 }, { "epoch": 0.5752916224814422, "grad_norm": 7.56614875793457, "learning_rate": 2.873806998939555e-06, "loss": 0.3483, "num_input_tokens_seen": 1088352, "step": 1085 }, { "epoch": 0.5779427359490986, "grad_norm": 6.466113090515137, "learning_rate": 2.8870625662778368e-06, "loss": 0.3575, "num_input_tokens_seen": 1093120, "step": 1090 }, { "epoch": 0.5805938494167551, "grad_norm": 23.1430721282959, "learning_rate": 2.900318133616119e-06, "loss": 0.3687, "num_input_tokens_seen": 1099456, "step": 1095 }, { "epoch": 0.5832449628844114, "grad_norm": 21.0874080657959, "learning_rate": 2.9135737009544014e-06, "loss": 0.3381, "num_input_tokens_seen": 1104416, "step": 1100 }, { "epoch": 0.5858960763520679, "grad_norm": 4.66147518157959, "learning_rate": 2.926829268292683e-06, "loss": 0.3184, "num_input_tokens_seen": 1109408, "step": 1105 }, { "epoch": 0.5885471898197243, "grad_norm": 11.238460540771484, "learning_rate": 2.940084835630965e-06, "loss": 0.2692, "num_input_tokens_seen": 1113792, "step": 1110 }, { "epoch": 0.5911983032873807, "grad_norm": 15.724274635314941, "learning_rate": 2.9533404029692475e-06, "loss": 0.351, "num_input_tokens_seen": 1118816, "step": 1115 }, { "epoch": 0.5938494167550371, "grad_norm": 23.055299758911133, "learning_rate": 2.9665959703075294e-06, "loss": 0.3409, "num_input_tokens_seen": 1122336, "step": 1120 }, { "epoch": 0.5965005302226936, "grad_norm": 22.540861129760742, "learning_rate": 2.9798515376458113e-06, "loss": 0.3626, "num_input_tokens_seen": 1126944, "step": 1125 }, { "epoch": 0.5991516436903499, "grad_norm": 12.224953651428223, "learning_rate": 2.9931071049840936e-06, "loss": 0.3348, "num_input_tokens_seen": 1131232, "step": 1130 }, { "epoch": 0.6018027571580064, "grad_norm": 5.157472610473633, "learning_rate": 3.0063626723223755e-06, "loss": 0.3663, "num_input_tokens_seen": 1136672, "step": 1135 }, { "epoch": 0.6044538706256628, "grad_norm": 8.27541446685791, "learning_rate": 3.019618239660658e-06, "loss": 0.344, "num_input_tokens_seen": 1141600, "step": 1140 }, { "epoch": 0.6071049840933191, "grad_norm": 17.1798152923584, "learning_rate": 3.03287380699894e-06, "loss": 0.3618, "num_input_tokens_seen": 1146048, "step": 1145 }, { "epoch": 0.6097560975609756, "grad_norm": 12.44214916229248, "learning_rate": 3.0461293743372216e-06, "loss": 0.3135, "num_input_tokens_seen": 1151424, "step": 1150 }, { "epoch": 0.612407211028632, "grad_norm": 20.621501922607422, "learning_rate": 3.059384941675504e-06, "loss": 0.3757, "num_input_tokens_seen": 1155808, "step": 1155 }, { "epoch": 0.6150583244962884, "grad_norm": 15.686917304992676, "learning_rate": 3.0726405090137862e-06, "loss": 0.3579, "num_input_tokens_seen": 1161376, "step": 1160 }, { "epoch": 0.6177094379639448, "grad_norm": 6.067962169647217, "learning_rate": 3.085896076352068e-06, "loss": 0.3352, "num_input_tokens_seen": 1167232, "step": 1165 }, { "epoch": 0.6203605514316013, "grad_norm": 5.645861625671387, "learning_rate": 3.09915164369035e-06, "loss": 0.3443, "num_input_tokens_seen": 1172896, "step": 1170 }, { "epoch": 0.6230116648992576, "grad_norm": 11.983319282531738, "learning_rate": 3.1124072110286323e-06, "loss": 0.2838, "num_input_tokens_seen": 1178304, "step": 1175 }, { "epoch": 0.6256627783669141, "grad_norm": 14.314291000366211, "learning_rate": 3.125662778366914e-06, "loss": 0.3094, "num_input_tokens_seen": 1183840, "step": 1180 }, { "epoch": 0.6283138918345705, "grad_norm": 6.376946449279785, "learning_rate": 3.1389183457051965e-06, "loss": 0.3426, "num_input_tokens_seen": 1188928, "step": 1185 }, { "epoch": 0.630965005302227, "grad_norm": 16.38489532470703, "learning_rate": 3.152173913043479e-06, "loss": 0.3747, "num_input_tokens_seen": 1193824, "step": 1190 }, { "epoch": 0.6336161187698833, "grad_norm": 7.218472957611084, "learning_rate": 3.1654294803817603e-06, "loss": 0.3624, "num_input_tokens_seen": 1197984, "step": 1195 }, { "epoch": 0.6362672322375398, "grad_norm": 6.187770366668701, "learning_rate": 3.1786850477200426e-06, "loss": 0.3371, "num_input_tokens_seen": 1202976, "step": 1200 }, { "epoch": 0.6389183457051962, "grad_norm": 9.599185943603516, "learning_rate": 3.191940615058325e-06, "loss": 0.3439, "num_input_tokens_seen": 1207392, "step": 1205 }, { "epoch": 0.6415694591728526, "grad_norm": 20.310224533081055, "learning_rate": 3.2051961823966064e-06, "loss": 0.2956, "num_input_tokens_seen": 1211840, "step": 1210 }, { "epoch": 0.644220572640509, "grad_norm": 11.194534301757812, "learning_rate": 3.2184517497348887e-06, "loss": 0.3507, "num_input_tokens_seen": 1215648, "step": 1215 }, { "epoch": 0.6468716861081655, "grad_norm": 15.507196426391602, "learning_rate": 3.231707317073171e-06, "loss": 0.3633, "num_input_tokens_seen": 1220480, "step": 1220 }, { "epoch": 0.6495227995758218, "grad_norm": 16.35003662109375, "learning_rate": 3.2449628844114533e-06, "loss": 0.3059, "num_input_tokens_seen": 1226656, "step": 1225 }, { "epoch": 0.6521739130434783, "grad_norm": 10.12506103515625, "learning_rate": 3.2582184517497352e-06, "loss": 0.3596, "num_input_tokens_seen": 1232704, "step": 1230 }, { "epoch": 0.6548250265111347, "grad_norm": 11.694177627563477, "learning_rate": 3.271474019088017e-06, "loss": 0.3607, "num_input_tokens_seen": 1236992, "step": 1235 }, { "epoch": 0.6574761399787911, "grad_norm": 6.458616256713867, "learning_rate": 3.2847295864262994e-06, "loss": 0.3408, "num_input_tokens_seen": 1241920, "step": 1240 }, { "epoch": 0.6601272534464475, "grad_norm": 13.352863311767578, "learning_rate": 3.2979851537645813e-06, "loss": 0.3894, "num_input_tokens_seen": 1246080, "step": 1245 }, { "epoch": 0.662778366914104, "grad_norm": 5.038018226623535, "learning_rate": 3.3112407211028636e-06, "loss": 0.3446, "num_input_tokens_seen": 1251264, "step": 1250 }, { "epoch": 0.6654294803817603, "grad_norm": 13.417299270629883, "learning_rate": 3.324496288441146e-06, "loss": 0.3592, "num_input_tokens_seen": 1255808, "step": 1255 }, { "epoch": 0.6680805938494168, "grad_norm": 6.653986930847168, "learning_rate": 3.3377518557794274e-06, "loss": 0.3447, "num_input_tokens_seen": 1260640, "step": 1260 }, { "epoch": 0.6707317073170732, "grad_norm": 23.76848030090332, "learning_rate": 3.3510074231177097e-06, "loss": 0.3241, "num_input_tokens_seen": 1268704, "step": 1265 }, { "epoch": 0.6733828207847296, "grad_norm": 16.68150520324707, "learning_rate": 3.364262990455992e-06, "loss": 0.2083, "num_input_tokens_seen": 1274240, "step": 1270 }, { "epoch": 0.676033934252386, "grad_norm": 53.24739074707031, "learning_rate": 3.3775185577942735e-06, "loss": 0.345, "num_input_tokens_seen": 1279040, "step": 1275 }, { "epoch": 0.6786850477200425, "grad_norm": 16.659549713134766, "learning_rate": 3.390774125132556e-06, "loss": 0.3788, "num_input_tokens_seen": 1283520, "step": 1280 }, { "epoch": 0.6813361611876988, "grad_norm": 10.60910701751709, "learning_rate": 3.404029692470838e-06, "loss": 0.3657, "num_input_tokens_seen": 1287872, "step": 1285 }, { "epoch": 0.6839872746553552, "grad_norm": 17.200376510620117, "learning_rate": 3.41728525980912e-06, "loss": 0.4215, "num_input_tokens_seen": 1292864, "step": 1290 }, { "epoch": 0.6866383881230117, "grad_norm": 11.1148099899292, "learning_rate": 3.4305408271474024e-06, "loss": 0.3755, "num_input_tokens_seen": 1297920, "step": 1295 }, { "epoch": 0.689289501590668, "grad_norm": 9.163270950317383, "learning_rate": 3.4437963944856847e-06, "loss": 0.3649, "num_input_tokens_seen": 1302176, "step": 1300 }, { "epoch": 0.6919406150583245, "grad_norm": 7.180002212524414, "learning_rate": 3.457051961823966e-06, "loss": 0.3599, "num_input_tokens_seen": 1308064, "step": 1305 }, { "epoch": 0.694591728525981, "grad_norm": 5.846102714538574, "learning_rate": 3.4703075291622485e-06, "loss": 0.3321, "num_input_tokens_seen": 1312896, "step": 1310 }, { "epoch": 0.6972428419936373, "grad_norm": 22.08163070678711, "learning_rate": 3.4835630965005308e-06, "loss": 0.4208, "num_input_tokens_seen": 1318016, "step": 1315 }, { "epoch": 0.6998939554612937, "grad_norm": 5.293310165405273, "learning_rate": 3.4968186638388122e-06, "loss": 0.3589, "num_input_tokens_seen": 1322368, "step": 1320 }, { "epoch": 0.7025450689289502, "grad_norm": 12.750597953796387, "learning_rate": 3.5100742311770946e-06, "loss": 0.3538, "num_input_tokens_seen": 1326784, "step": 1325 }, { "epoch": 0.7051961823966065, "grad_norm": 15.100709915161133, "learning_rate": 3.523329798515377e-06, "loss": 0.3033, "num_input_tokens_seen": 1332288, "step": 1330 }, { "epoch": 0.707847295864263, "grad_norm": 9.565451622009277, "learning_rate": 3.5365853658536588e-06, "loss": 0.3162, "num_input_tokens_seen": 1337312, "step": 1335 }, { "epoch": 0.7104984093319194, "grad_norm": 3.5592236518859863, "learning_rate": 3.549840933191941e-06, "loss": 0.4238, "num_input_tokens_seen": 1343104, "step": 1340 }, { "epoch": 0.7131495227995758, "grad_norm": 8.085143089294434, "learning_rate": 3.563096500530223e-06, "loss": 0.284, "num_input_tokens_seen": 1347360, "step": 1345 }, { "epoch": 0.7158006362672322, "grad_norm": 14.5751314163208, "learning_rate": 3.576352067868505e-06, "loss": 0.3589, "num_input_tokens_seen": 1351488, "step": 1350 }, { "epoch": 0.7184517497348887, "grad_norm": 4.437746047973633, "learning_rate": 3.589607635206787e-06, "loss": 0.3171, "num_input_tokens_seen": 1357536, "step": 1355 }, { "epoch": 0.721102863202545, "grad_norm": 5.186069488525391, "learning_rate": 3.6028632025450695e-06, "loss": 0.3455, "num_input_tokens_seen": 1362592, "step": 1360 }, { "epoch": 0.7237539766702015, "grad_norm": 3.8820555210113525, "learning_rate": 3.616118769883351e-06, "loss": 0.3142, "num_input_tokens_seen": 1370176, "step": 1365 }, { "epoch": 0.7264050901378579, "grad_norm": 6.097499847412109, "learning_rate": 3.6293743372216333e-06, "loss": 0.3477, "num_input_tokens_seen": 1375264, "step": 1370 }, { "epoch": 0.7290562036055143, "grad_norm": 5.654319763183594, "learning_rate": 3.6426299045599156e-06, "loss": 0.3279, "num_input_tokens_seen": 1379968, "step": 1375 }, { "epoch": 0.7317073170731707, "grad_norm": 3.4267938137054443, "learning_rate": 3.6558854718981975e-06, "loss": 0.2774, "num_input_tokens_seen": 1384128, "step": 1380 }, { "epoch": 0.7343584305408272, "grad_norm": 4.552292346954346, "learning_rate": 3.6691410392364794e-06, "loss": 0.2543, "num_input_tokens_seen": 1390112, "step": 1385 }, { "epoch": 0.7370095440084835, "grad_norm": 4.781541347503662, "learning_rate": 3.6823966065747617e-06, "loss": 0.3372, "num_input_tokens_seen": 1394016, "step": 1390 }, { "epoch": 0.73966065747614, "grad_norm": 5.5178914070129395, "learning_rate": 3.6956521739130436e-06, "loss": 0.362, "num_input_tokens_seen": 1399648, "step": 1395 }, { "epoch": 0.7423117709437964, "grad_norm": 2.7522151470184326, "learning_rate": 3.708907741251326e-06, "loss": 0.3454, "num_input_tokens_seen": 1403264, "step": 1400 }, { "epoch": 0.7449628844114529, "grad_norm": 5.581048965454102, "learning_rate": 3.722163308589608e-06, "loss": 0.3673, "num_input_tokens_seen": 1407552, "step": 1405 }, { "epoch": 0.7476139978791092, "grad_norm": 11.638602256774902, "learning_rate": 3.7354188759278897e-06, "loss": 0.3505, "num_input_tokens_seen": 1411360, "step": 1410 }, { "epoch": 0.7502651113467657, "grad_norm": 9.834659576416016, "learning_rate": 3.748674443266172e-06, "loss": 0.3469, "num_input_tokens_seen": 1417280, "step": 1415 }, { "epoch": 0.7529162248144221, "grad_norm": 3.1643431186676025, "learning_rate": 3.7619300106044543e-06, "loss": 0.3667, "num_input_tokens_seen": 1422464, "step": 1420 }, { "epoch": 0.7555673382820784, "grad_norm": 8.454486846923828, "learning_rate": 3.775185577942736e-06, "loss": 0.3541, "num_input_tokens_seen": 1428320, "step": 1425 }, { "epoch": 0.7582184517497349, "grad_norm": 5.221370697021484, "learning_rate": 3.788441145281018e-06, "loss": 0.3364, "num_input_tokens_seen": 1433952, "step": 1430 }, { "epoch": 0.7608695652173914, "grad_norm": 9.231751441955566, "learning_rate": 3.8016967126193004e-06, "loss": 0.3439, "num_input_tokens_seen": 1439840, "step": 1435 }, { "epoch": 0.7635206786850477, "grad_norm": 5.480750560760498, "learning_rate": 3.814952279957583e-06, "loss": 0.2818, "num_input_tokens_seen": 1444224, "step": 1440 }, { "epoch": 0.7661717921527041, "grad_norm": 11.849740982055664, "learning_rate": 3.828207847295865e-06, "loss": 0.4089, "num_input_tokens_seen": 1449504, "step": 1445 }, { "epoch": 0.7688229056203606, "grad_norm": 8.253738403320312, "learning_rate": 3.8414634146341465e-06, "loss": 0.3358, "num_input_tokens_seen": 1453728, "step": 1450 }, { "epoch": 0.7714740190880169, "grad_norm": 4.297236919403076, "learning_rate": 3.854718981972429e-06, "loss": 0.3452, "num_input_tokens_seen": 1459136, "step": 1455 }, { "epoch": 0.7741251325556734, "grad_norm": 3.491947650909424, "learning_rate": 3.86797454931071e-06, "loss": 0.3483, "num_input_tokens_seen": 1463616, "step": 1460 }, { "epoch": 0.7767762460233298, "grad_norm": 34.5661506652832, "learning_rate": 3.881230116648993e-06, "loss": 0.4338, "num_input_tokens_seen": 1468736, "step": 1465 }, { "epoch": 0.7794273594909862, "grad_norm": 20.135234832763672, "learning_rate": 3.894485683987275e-06, "loss": 0.4158, "num_input_tokens_seen": 1474176, "step": 1470 }, { "epoch": 0.7820784729586426, "grad_norm": 5.518883228302002, "learning_rate": 3.907741251325557e-06, "loss": 0.3681, "num_input_tokens_seen": 1479616, "step": 1475 }, { "epoch": 0.7847295864262991, "grad_norm": 10.873873710632324, "learning_rate": 3.9209968186638395e-06, "loss": 0.3552, "num_input_tokens_seen": 1484192, "step": 1480 }, { "epoch": 0.7873806998939554, "grad_norm": 5.264577388763428, "learning_rate": 3.9342523860021214e-06, "loss": 0.3141, "num_input_tokens_seen": 1488672, "step": 1485 }, { "epoch": 0.7900318133616119, "grad_norm": 18.423065185546875, "learning_rate": 3.947507953340403e-06, "loss": 0.3393, "num_input_tokens_seen": 1493600, "step": 1490 }, { "epoch": 0.7926829268292683, "grad_norm": 5.143199920654297, "learning_rate": 3.960763520678685e-06, "loss": 0.2286, "num_input_tokens_seen": 1499264, "step": 1495 }, { "epoch": 0.7953340402969247, "grad_norm": 4.064625263214111, "learning_rate": 3.974019088016968e-06, "loss": 0.3153, "num_input_tokens_seen": 1504800, "step": 1500 }, { "epoch": 0.7979851537645811, "grad_norm": 4.524999141693115, "learning_rate": 3.987274655355249e-06, "loss": 0.3142, "num_input_tokens_seen": 1509632, "step": 1505 }, { "epoch": 0.8006362672322376, "grad_norm": 8.362607955932617, "learning_rate": 4.000530222693532e-06, "loss": 0.4208, "num_input_tokens_seen": 1514304, "step": 1510 }, { "epoch": 0.8032873806998939, "grad_norm": 3.3655107021331787, "learning_rate": 4.013785790031814e-06, "loss": 0.3491, "num_input_tokens_seen": 1519840, "step": 1515 }, { "epoch": 0.8059384941675504, "grad_norm": 10.712566375732422, "learning_rate": 4.0270413573700955e-06, "loss": 0.3485, "num_input_tokens_seen": 1524416, "step": 1520 }, { "epoch": 0.8085896076352068, "grad_norm": 10.808219909667969, "learning_rate": 4.040296924708378e-06, "loss": 0.3627, "num_input_tokens_seen": 1528768, "step": 1525 }, { "epoch": 0.8112407211028632, "grad_norm": 5.779064655303955, "learning_rate": 4.05355249204666e-06, "loss": 0.3223, "num_input_tokens_seen": 1533024, "step": 1530 }, { "epoch": 0.8138918345705196, "grad_norm": 4.807837963104248, "learning_rate": 4.066808059384942e-06, "loss": 0.3839, "num_input_tokens_seen": 1538048, "step": 1535 }, { "epoch": 0.8165429480381761, "grad_norm": 5.270480155944824, "learning_rate": 4.080063626723224e-06, "loss": 0.3487, "num_input_tokens_seen": 1543584, "step": 1540 }, { "epoch": 0.8191940615058324, "grad_norm": 9.829469680786133, "learning_rate": 4.093319194061507e-06, "loss": 0.3023, "num_input_tokens_seen": 1548608, "step": 1545 }, { "epoch": 0.8218451749734889, "grad_norm": 8.538362503051758, "learning_rate": 4.106574761399788e-06, "loss": 0.3575, "num_input_tokens_seen": 1554528, "step": 1550 }, { "epoch": 0.8244962884411453, "grad_norm": 2.974262237548828, "learning_rate": 4.1198303287380705e-06, "loss": 0.1923, "num_input_tokens_seen": 1559968, "step": 1555 }, { "epoch": 0.8271474019088016, "grad_norm": 15.535440444946289, "learning_rate": 4.133085896076352e-06, "loss": 0.341, "num_input_tokens_seen": 1564032, "step": 1560 }, { "epoch": 0.8297985153764581, "grad_norm": 9.317663192749023, "learning_rate": 4.146341463414634e-06, "loss": 0.3946, "num_input_tokens_seen": 1568352, "step": 1565 }, { "epoch": 0.8324496288441146, "grad_norm": 16.734577178955078, "learning_rate": 4.159597030752916e-06, "loss": 0.3232, "num_input_tokens_seen": 1572800, "step": 1570 }, { "epoch": 0.8351007423117709, "grad_norm": 14.589133262634277, "learning_rate": 4.172852598091199e-06, "loss": 0.3495, "num_input_tokens_seen": 1578720, "step": 1575 }, { "epoch": 0.8377518557794273, "grad_norm": 5.984978199005127, "learning_rate": 4.186108165429481e-06, "loss": 0.3342, "num_input_tokens_seen": 1584288, "step": 1580 }, { "epoch": 0.8404029692470838, "grad_norm": 3.433580160140991, "learning_rate": 4.199363732767763e-06, "loss": 0.307, "num_input_tokens_seen": 1589632, "step": 1585 }, { "epoch": 0.8430540827147401, "grad_norm": 4.706711292266846, "learning_rate": 4.212619300106045e-06, "loss": 0.3798, "num_input_tokens_seen": 1594560, "step": 1590 }, { "epoch": 0.8457051961823966, "grad_norm": 6.009047508239746, "learning_rate": 4.2258748674443264e-06, "loss": 0.2871, "num_input_tokens_seen": 1599040, "step": 1595 }, { "epoch": 0.848356309650053, "grad_norm": 7.110427379608154, "learning_rate": 4.239130434782609e-06, "loss": 0.2858, "num_input_tokens_seen": 1606496, "step": 1600 }, { "epoch": 0.8510074231177094, "grad_norm": 4.552140235900879, "learning_rate": 4.252386002120891e-06, "loss": 0.3047, "num_input_tokens_seen": 1612544, "step": 1605 }, { "epoch": 0.8536585365853658, "grad_norm": 12.21636962890625, "learning_rate": 4.265641569459173e-06, "loss": 0.4232, "num_input_tokens_seen": 1617504, "step": 1610 }, { "epoch": 0.8563096500530223, "grad_norm": 10.536648750305176, "learning_rate": 4.278897136797455e-06, "loss": 0.3178, "num_input_tokens_seen": 1622432, "step": 1615 }, { "epoch": 0.8589607635206787, "grad_norm": 6.32003116607666, "learning_rate": 4.292152704135738e-06, "loss": 0.3335, "num_input_tokens_seen": 1628224, "step": 1620 }, { "epoch": 0.8616118769883351, "grad_norm": 5.071932315826416, "learning_rate": 4.3054082714740195e-06, "loss": 0.3538, "num_input_tokens_seen": 1632704, "step": 1625 }, { "epoch": 0.8642629904559915, "grad_norm": 9.6878080368042, "learning_rate": 4.318663838812301e-06, "loss": 0.3129, "num_input_tokens_seen": 1637344, "step": 1630 }, { "epoch": 0.866914103923648, "grad_norm": 5.7884721755981445, "learning_rate": 4.331919406150584e-06, "loss": 0.34, "num_input_tokens_seen": 1643072, "step": 1635 }, { "epoch": 0.8695652173913043, "grad_norm": 3.3346502780914307, "learning_rate": 4.345174973488865e-06, "loss": 0.2838, "num_input_tokens_seen": 1649440, "step": 1640 }, { "epoch": 0.8722163308589608, "grad_norm": 10.851434707641602, "learning_rate": 4.358430540827148e-06, "loss": 0.3856, "num_input_tokens_seen": 1654752, "step": 1645 }, { "epoch": 0.8748674443266172, "grad_norm": 7.921802997589111, "learning_rate": 4.37168610816543e-06, "loss": 0.3168, "num_input_tokens_seen": 1660160, "step": 1650 }, { "epoch": 0.8775185577942736, "grad_norm": 3.3315258026123047, "learning_rate": 4.3849416755037125e-06, "loss": 0.3627, "num_input_tokens_seen": 1664800, "step": 1655 }, { "epoch": 0.88016967126193, "grad_norm": 11.895049095153809, "learning_rate": 4.3981972428419936e-06, "loss": 0.3582, "num_input_tokens_seen": 1669376, "step": 1660 }, { "epoch": 0.8828207847295865, "grad_norm": 4.246018886566162, "learning_rate": 4.411452810180276e-06, "loss": 0.3441, "num_input_tokens_seen": 1674240, "step": 1665 }, { "epoch": 0.8854718981972428, "grad_norm": 7.519724369049072, "learning_rate": 4.424708377518558e-06, "loss": 0.3522, "num_input_tokens_seen": 1679552, "step": 1670 }, { "epoch": 0.8881230116648993, "grad_norm": 5.945228099822998, "learning_rate": 4.43796394485684e-06, "loss": 0.3562, "num_input_tokens_seen": 1684128, "step": 1675 }, { "epoch": 0.8907741251325557, "grad_norm": 9.73492431640625, "learning_rate": 4.451219512195122e-06, "loss": 0.3212, "num_input_tokens_seen": 1688512, "step": 1680 }, { "epoch": 0.8934252386002121, "grad_norm": 3.246051788330078, "learning_rate": 4.464475079533405e-06, "loss": 0.375, "num_input_tokens_seen": 1694048, "step": 1685 }, { "epoch": 0.8960763520678685, "grad_norm": 4.298588275909424, "learning_rate": 4.477730646871687e-06, "loss": 0.3601, "num_input_tokens_seen": 1698720, "step": 1690 }, { "epoch": 0.898727465535525, "grad_norm": 4.701527118682861, "learning_rate": 4.4909862142099685e-06, "loss": 0.3369, "num_input_tokens_seen": 1704000, "step": 1695 }, { "epoch": 0.9013785790031813, "grad_norm": 7.69580078125, "learning_rate": 4.504241781548251e-06, "loss": 0.2413, "num_input_tokens_seen": 1709760, "step": 1700 }, { "epoch": 0.9040296924708378, "grad_norm": 13.18864631652832, "learning_rate": 4.517497348886532e-06, "loss": 0.3453, "num_input_tokens_seen": 1713856, "step": 1705 }, { "epoch": 0.9066808059384942, "grad_norm": 9.031767845153809, "learning_rate": 4.530752916224815e-06, "loss": 0.3908, "num_input_tokens_seen": 1718720, "step": 1710 }, { "epoch": 0.9093319194061505, "grad_norm": 11.187403678894043, "learning_rate": 4.544008483563097e-06, "loss": 0.3109, "num_input_tokens_seen": 1723072, "step": 1715 }, { "epoch": 0.911983032873807, "grad_norm": 8.797430038452148, "learning_rate": 4.557264050901379e-06, "loss": 0.2931, "num_input_tokens_seen": 1727328, "step": 1720 }, { "epoch": 0.9146341463414634, "grad_norm": 4.097959518432617, "learning_rate": 4.570519618239661e-06, "loss": 0.3611, "num_input_tokens_seen": 1732320, "step": 1725 }, { "epoch": 0.9172852598091198, "grad_norm": 6.790299892425537, "learning_rate": 4.5837751855779434e-06, "loss": 0.2227, "num_input_tokens_seen": 1735968, "step": 1730 }, { "epoch": 0.9199363732767762, "grad_norm": 15.436184883117676, "learning_rate": 4.597030752916225e-06, "loss": 0.4648, "num_input_tokens_seen": 1741056, "step": 1735 }, { "epoch": 0.9225874867444327, "grad_norm": 3.7913875579833984, "learning_rate": 4.610286320254507e-06, "loss": 0.377, "num_input_tokens_seen": 1747456, "step": 1740 }, { "epoch": 0.925238600212089, "grad_norm": 5.163459777832031, "learning_rate": 4.62354188759279e-06, "loss": 0.368, "num_input_tokens_seen": 1751520, "step": 1745 }, { "epoch": 0.9278897136797455, "grad_norm": 5.66231107711792, "learning_rate": 4.636797454931071e-06, "loss": 0.3228, "num_input_tokens_seen": 1757184, "step": 1750 }, { "epoch": 0.9305408271474019, "grad_norm": 3.4465548992156982, "learning_rate": 4.650053022269354e-06, "loss": 0.3378, "num_input_tokens_seen": 1762176, "step": 1755 }, { "epoch": 0.9331919406150583, "grad_norm": 14.19305419921875, "learning_rate": 4.663308589607636e-06, "loss": 0.3895, "num_input_tokens_seen": 1767040, "step": 1760 }, { "epoch": 0.9358430540827147, "grad_norm": 4.448056221008301, "learning_rate": 4.6765641569459175e-06, "loss": 0.3188, "num_input_tokens_seen": 1771840, "step": 1765 }, { "epoch": 0.9384941675503712, "grad_norm": 4.549402713775635, "learning_rate": 4.689819724284199e-06, "loss": 0.3131, "num_input_tokens_seen": 1776000, "step": 1770 }, { "epoch": 0.9411452810180275, "grad_norm": 5.010622501373291, "learning_rate": 4.703075291622482e-06, "loss": 0.3279, "num_input_tokens_seen": 1780736, "step": 1775 }, { "epoch": 0.943796394485684, "grad_norm": 4.201654434204102, "learning_rate": 4.716330858960764e-06, "loss": 0.3583, "num_input_tokens_seen": 1786112, "step": 1780 }, { "epoch": 0.9464475079533404, "grad_norm": 3.440140724182129, "learning_rate": 4.729586426299046e-06, "loss": 0.3314, "num_input_tokens_seen": 1791488, "step": 1785 }, { "epoch": 0.9490986214209968, "grad_norm": 5.818238258361816, "learning_rate": 4.742841993637328e-06, "loss": 0.3553, "num_input_tokens_seen": 1797088, "step": 1790 }, { "epoch": 0.9517497348886532, "grad_norm": 4.290258407592773, "learning_rate": 4.75609756097561e-06, "loss": 0.3748, "num_input_tokens_seen": 1802144, "step": 1795 }, { "epoch": 0.9544008483563097, "grad_norm": 3.9874069690704346, "learning_rate": 4.7693531283138924e-06, "loss": 0.3389, "num_input_tokens_seen": 1806016, "step": 1800 }, { "epoch": 0.957051961823966, "grad_norm": 3.192455768585205, "learning_rate": 4.782608695652174e-06, "loss": 0.3014, "num_input_tokens_seen": 1811968, "step": 1805 }, { "epoch": 0.9597030752916225, "grad_norm": 5.316605091094971, "learning_rate": 4.795864262990456e-06, "loss": 0.3224, "num_input_tokens_seen": 1816544, "step": 1810 }, { "epoch": 0.9623541887592789, "grad_norm": 4.532746315002441, "learning_rate": 4.809119830328738e-06, "loss": 0.3626, "num_input_tokens_seen": 1822208, "step": 1815 }, { "epoch": 0.9650053022269353, "grad_norm": 2.169666290283203, "learning_rate": 4.822375397667021e-06, "loss": 0.3737, "num_input_tokens_seen": 1829088, "step": 1820 }, { "epoch": 0.9676564156945917, "grad_norm": 2.0407767295837402, "learning_rate": 4.835630965005303e-06, "loss": 0.3747, "num_input_tokens_seen": 1832640, "step": 1825 }, { "epoch": 0.9703075291622482, "grad_norm": 4.4990644454956055, "learning_rate": 4.848886532343585e-06, "loss": 0.3426, "num_input_tokens_seen": 1838208, "step": 1830 }, { "epoch": 0.9729586426299046, "grad_norm": 8.219486236572266, "learning_rate": 4.8621420996818665e-06, "loss": 0.353, "num_input_tokens_seen": 1844672, "step": 1835 }, { "epoch": 0.975609756097561, "grad_norm": 2.64060378074646, "learning_rate": 4.8753976670201484e-06, "loss": 0.3478, "num_input_tokens_seen": 1849248, "step": 1840 }, { "epoch": 0.9782608695652174, "grad_norm": 3.875958204269409, "learning_rate": 4.888653234358431e-06, "loss": 0.3086, "num_input_tokens_seen": 1854784, "step": 1845 }, { "epoch": 0.9809119830328739, "grad_norm": 4.606790542602539, "learning_rate": 4.901908801696713e-06, "loss": 0.3561, "num_input_tokens_seen": 1860800, "step": 1850 }, { "epoch": 0.9835630965005302, "grad_norm": 7.162702560424805, "learning_rate": 4.915164369034995e-06, "loss": 0.3919, "num_input_tokens_seen": 1865760, "step": 1855 }, { "epoch": 0.9862142099681867, "grad_norm": 5.413555145263672, "learning_rate": 4.928419936373277e-06, "loss": 0.3754, "num_input_tokens_seen": 1870720, "step": 1860 }, { "epoch": 0.9888653234358431, "grad_norm": 2.944502353668213, "learning_rate": 4.94167550371156e-06, "loss": 0.3271, "num_input_tokens_seen": 1875296, "step": 1865 }, { "epoch": 0.9915164369034994, "grad_norm": 2.759016275405884, "learning_rate": 4.9549310710498415e-06, "loss": 0.3284, "num_input_tokens_seen": 1879360, "step": 1870 }, { "epoch": 0.9941675503711559, "grad_norm": 1.8491462469100952, "learning_rate": 4.968186638388123e-06, "loss": 0.3021, "num_input_tokens_seen": 1883776, "step": 1875 }, { "epoch": 0.9968186638388123, "grad_norm": 4.7240447998046875, "learning_rate": 4.981442205726405e-06, "loss": 0.3684, "num_input_tokens_seen": 1888928, "step": 1880 }, { "epoch": 0.9994697773064687, "grad_norm": 2.252026081085205, "learning_rate": 4.994697773064688e-06, "loss": 0.3019, "num_input_tokens_seen": 1893472, "step": 1885 }, { "epoch": 1.002120890774125, "grad_norm": 5.4598069190979, "learning_rate": 5.00795334040297e-06, "loss": 0.4007, "num_input_tokens_seen": 1897736, "step": 1890 }, { "epoch": 1.0047720042417816, "grad_norm": 8.790820121765137, "learning_rate": 5.021208907741252e-06, "loss": 0.3691, "num_input_tokens_seen": 1902696, "step": 1895 }, { "epoch": 1.007423117709438, "grad_norm": 3.2783396244049072, "learning_rate": 5.034464475079534e-06, "loss": 0.3462, "num_input_tokens_seen": 1907048, "step": 1900 }, { "epoch": 1.0100742311770943, "grad_norm": 3.335737705230713, "learning_rate": 5.047720042417816e-06, "loss": 0.3225, "num_input_tokens_seen": 1911208, "step": 1905 }, { "epoch": 1.0127253446447508, "grad_norm": 3.08367657661438, "learning_rate": 5.060975609756098e-06, "loss": 0.3364, "num_input_tokens_seen": 1915208, "step": 1910 }, { "epoch": 1.0153764581124072, "grad_norm": 2.95753812789917, "learning_rate": 5.074231177094379e-06, "loss": 0.3635, "num_input_tokens_seen": 1920200, "step": 1915 }, { "epoch": 1.0180275715800637, "grad_norm": 4.510390758514404, "learning_rate": 5.087486744432663e-06, "loss": 0.3318, "num_input_tokens_seen": 1926376, "step": 1920 }, { "epoch": 1.02067868504772, "grad_norm": 3.6176178455352783, "learning_rate": 5.100742311770944e-06, "loss": 0.3251, "num_input_tokens_seen": 1931176, "step": 1925 }, { "epoch": 1.0233297985153764, "grad_norm": 2.6346747875213623, "learning_rate": 5.113997879109226e-06, "loss": 0.3327, "num_input_tokens_seen": 1936264, "step": 1930 }, { "epoch": 1.0259809119830328, "grad_norm": 3.294318914413452, "learning_rate": 5.127253446447509e-06, "loss": 0.3251, "num_input_tokens_seen": 1941288, "step": 1935 }, { "epoch": 1.0286320254506893, "grad_norm": 3.94918155670166, "learning_rate": 5.1405090137857905e-06, "loss": 0.3264, "num_input_tokens_seen": 1946280, "step": 1940 }, { "epoch": 1.0312831389183457, "grad_norm": 2.3741047382354736, "learning_rate": 5.153764581124072e-06, "loss": 0.3241, "num_input_tokens_seen": 1951784, "step": 1945 }, { "epoch": 1.0339342523860022, "grad_norm": 2.701810836791992, "learning_rate": 5.167020148462355e-06, "loss": 0.4279, "num_input_tokens_seen": 1956328, "step": 1950 }, { "epoch": 1.0365853658536586, "grad_norm": 2.7991843223571777, "learning_rate": 5.180275715800637e-06, "loss": 0.325, "num_input_tokens_seen": 1961320, "step": 1955 }, { "epoch": 1.039236479321315, "grad_norm": 5.300170421600342, "learning_rate": 5.193531283138918e-06, "loss": 0.3645, "num_input_tokens_seen": 1966056, "step": 1960 }, { "epoch": 1.0418875927889715, "grad_norm": 2.522798776626587, "learning_rate": 5.206786850477202e-06, "loss": 0.3227, "num_input_tokens_seen": 1971112, "step": 1965 }, { "epoch": 1.0445387062566278, "grad_norm": 3.0152783393859863, "learning_rate": 5.220042417815483e-06, "loss": 0.3486, "num_input_tokens_seen": 1976232, "step": 1970 }, { "epoch": 1.0471898197242842, "grad_norm": 2.9829025268554688, "learning_rate": 5.233297985153765e-06, "loss": 0.3521, "num_input_tokens_seen": 1981032, "step": 1975 }, { "epoch": 1.0498409331919407, "grad_norm": 2.9453330039978027, "learning_rate": 5.246553552492047e-06, "loss": 0.2911, "num_input_tokens_seen": 1986664, "step": 1980 }, { "epoch": 1.052492046659597, "grad_norm": 2.310710906982422, "learning_rate": 5.259809119830329e-06, "loss": 0.3503, "num_input_tokens_seen": 1991688, "step": 1985 }, { "epoch": 1.0551431601272534, "grad_norm": 4.181407451629639, "learning_rate": 5.273064687168611e-06, "loss": 0.2692, "num_input_tokens_seen": 1996072, "step": 1990 }, { "epoch": 1.05779427359491, "grad_norm": 3.8508377075195312, "learning_rate": 5.286320254506894e-06, "loss": 0.3438, "num_input_tokens_seen": 2001096, "step": 1995 }, { "epoch": 1.0604453870625663, "grad_norm": 6.340102672576904, "learning_rate": 5.299575821845176e-06, "loss": 0.3168, "num_input_tokens_seen": 2005800, "step": 2000 }, { "epoch": 1.0630965005302226, "grad_norm": 4.72323751449585, "learning_rate": 5.312831389183457e-06, "loss": 0.37, "num_input_tokens_seen": 2010696, "step": 2005 }, { "epoch": 1.0657476139978792, "grad_norm": 2.354092597961426, "learning_rate": 5.3260869565217395e-06, "loss": 0.3777, "num_input_tokens_seen": 2014952, "step": 2010 }, { "epoch": 1.0683987274655355, "grad_norm": 5.924427509307861, "learning_rate": 5.339342523860021e-06, "loss": 0.2907, "num_input_tokens_seen": 2019688, "step": 2015 }, { "epoch": 1.0710498409331919, "grad_norm": 3.6510887145996094, "learning_rate": 5.352598091198303e-06, "loss": 0.3553, "num_input_tokens_seen": 2024104, "step": 2020 }, { "epoch": 1.0737009544008485, "grad_norm": 1.8145240545272827, "learning_rate": 5.365853658536586e-06, "loss": 0.3054, "num_input_tokens_seen": 2028584, "step": 2025 }, { "epoch": 1.0763520678685048, "grad_norm": 5.2487263679504395, "learning_rate": 5.379109225874868e-06, "loss": 0.3007, "num_input_tokens_seen": 2034120, "step": 2030 }, { "epoch": 1.0790031813361611, "grad_norm": 2.586838722229004, "learning_rate": 5.39236479321315e-06, "loss": 0.3433, "num_input_tokens_seen": 2038184, "step": 2035 }, { "epoch": 1.0816542948038177, "grad_norm": 8.615992546081543, "learning_rate": 5.4056203605514326e-06, "loss": 0.2683, "num_input_tokens_seen": 2043944, "step": 2040 }, { "epoch": 1.084305408271474, "grad_norm": 3.4531989097595215, "learning_rate": 5.4188759278897144e-06, "loss": 0.2819, "num_input_tokens_seen": 2048808, "step": 2045 }, { "epoch": 1.0869565217391304, "grad_norm": 3.9527392387390137, "learning_rate": 5.4321314952279955e-06, "loss": 0.3656, "num_input_tokens_seen": 2053320, "step": 2050 }, { "epoch": 1.089607635206787, "grad_norm": 2.67539644241333, "learning_rate": 5.445387062566278e-06, "loss": 0.4409, "num_input_tokens_seen": 2058664, "step": 2055 }, { "epoch": 1.0922587486744433, "grad_norm": 2.01046085357666, "learning_rate": 5.45864262990456e-06, "loss": 0.3428, "num_input_tokens_seen": 2063400, "step": 2060 }, { "epoch": 1.0949098621420996, "grad_norm": 3.9835286140441895, "learning_rate": 5.471898197242842e-06, "loss": 0.337, "num_input_tokens_seen": 2067912, "step": 2065 }, { "epoch": 1.0975609756097562, "grad_norm": 1.8803664445877075, "learning_rate": 5.485153764581125e-06, "loss": 0.3594, "num_input_tokens_seen": 2074728, "step": 2070 }, { "epoch": 1.1002120890774125, "grad_norm": 2.7358243465423584, "learning_rate": 5.498409331919407e-06, "loss": 0.3328, "num_input_tokens_seen": 2079048, "step": 2075 }, { "epoch": 1.1028632025450689, "grad_norm": 3.099327802658081, "learning_rate": 5.5116648992576885e-06, "loss": 0.3324, "num_input_tokens_seen": 2084520, "step": 2080 }, { "epoch": 1.1055143160127254, "grad_norm": 5.593451023101807, "learning_rate": 5.524920466595971e-06, "loss": 0.3747, "num_input_tokens_seen": 2090952, "step": 2085 }, { "epoch": 1.1081654294803818, "grad_norm": 5.437244415283203, "learning_rate": 5.538176033934253e-06, "loss": 0.367, "num_input_tokens_seen": 2096488, "step": 2090 }, { "epoch": 1.110816542948038, "grad_norm": 2.0236763954162598, "learning_rate": 5.551431601272534e-06, "loss": 0.3447, "num_input_tokens_seen": 2102952, "step": 2095 }, { "epoch": 1.1134676564156947, "grad_norm": 3.6438281536102295, "learning_rate": 5.564687168610817e-06, "loss": 0.3519, "num_input_tokens_seen": 2107240, "step": 2100 }, { "epoch": 1.116118769883351, "grad_norm": 4.646177291870117, "learning_rate": 5.577942735949099e-06, "loss": 0.3453, "num_input_tokens_seen": 2112008, "step": 2105 }, { "epoch": 1.1187698833510074, "grad_norm": 5.253504276275635, "learning_rate": 5.591198303287381e-06, "loss": 0.3621, "num_input_tokens_seen": 2116360, "step": 2110 }, { "epoch": 1.121420996818664, "grad_norm": 1.9025681018829346, "learning_rate": 5.6044538706256635e-06, "loss": 0.3356, "num_input_tokens_seen": 2121096, "step": 2115 }, { "epoch": 1.1240721102863203, "grad_norm": 5.107486248016357, "learning_rate": 5.617709437963945e-06, "loss": 0.3046, "num_input_tokens_seen": 2126248, "step": 2120 }, { "epoch": 1.1267232237539766, "grad_norm": 4.852975368499756, "learning_rate": 5.630965005302227e-06, "loss": 0.3399, "num_input_tokens_seen": 2131112, "step": 2125 }, { "epoch": 1.1293743372216332, "grad_norm": 1.4870749711990356, "learning_rate": 5.64422057264051e-06, "loss": 0.4361, "num_input_tokens_seen": 2136232, "step": 2130 }, { "epoch": 1.1320254506892895, "grad_norm": 7.268287658691406, "learning_rate": 5.657476139978791e-06, "loss": 0.443, "num_input_tokens_seen": 2140936, "step": 2135 }, { "epoch": 1.1346765641569458, "grad_norm": 3.169123411178589, "learning_rate": 5.670731707317073e-06, "loss": 0.2893, "num_input_tokens_seen": 2146056, "step": 2140 }, { "epoch": 1.1373276776246024, "grad_norm": 7.006564617156982, "learning_rate": 5.683987274655356e-06, "loss": 0.3352, "num_input_tokens_seen": 2150888, "step": 2145 }, { "epoch": 1.1399787910922587, "grad_norm": 3.740334987640381, "learning_rate": 5.6972428419936376e-06, "loss": 0.3185, "num_input_tokens_seen": 2155272, "step": 2150 }, { "epoch": 1.142629904559915, "grad_norm": 17.00642967224121, "learning_rate": 5.7104984093319194e-06, "loss": 0.3362, "num_input_tokens_seen": 2160136, "step": 2155 }, { "epoch": 1.1452810180275717, "grad_norm": 15.573376655578613, "learning_rate": 5.723753976670202e-06, "loss": 0.4399, "num_input_tokens_seen": 2165032, "step": 2160 }, { "epoch": 1.147932131495228, "grad_norm": 4.128669738769531, "learning_rate": 5.737009544008484e-06, "loss": 0.3, "num_input_tokens_seen": 2170664, "step": 2165 }, { "epoch": 1.1505832449628843, "grad_norm": 3.937875747680664, "learning_rate": 5.750265111346766e-06, "loss": 0.275, "num_input_tokens_seen": 2175720, "step": 2170 }, { "epoch": 1.153234358430541, "grad_norm": 4.814082622528076, "learning_rate": 5.763520678685049e-06, "loss": 0.336, "num_input_tokens_seen": 2181224, "step": 2175 }, { "epoch": 1.1558854718981972, "grad_norm": 1.5271682739257812, "learning_rate": 5.77677624602333e-06, "loss": 0.3746, "num_input_tokens_seen": 2185160, "step": 2180 }, { "epoch": 1.1585365853658536, "grad_norm": 1.8222355842590332, "learning_rate": 5.790031813361612e-06, "loss": 0.332, "num_input_tokens_seen": 2189864, "step": 2185 }, { "epoch": 1.1611876988335101, "grad_norm": 2.8317127227783203, "learning_rate": 5.803287380699894e-06, "loss": 0.3604, "num_input_tokens_seen": 2195240, "step": 2190 }, { "epoch": 1.1638388123011665, "grad_norm": 5.427493572235107, "learning_rate": 5.816542948038176e-06, "loss": 0.3438, "num_input_tokens_seen": 2199208, "step": 2195 }, { "epoch": 1.1664899257688228, "grad_norm": 1.8532438278198242, "learning_rate": 5.829798515376459e-06, "loss": 0.3624, "num_input_tokens_seen": 2205896, "step": 2200 }, { "epoch": 1.1691410392364794, "grad_norm": 1.4079322814941406, "learning_rate": 5.843054082714741e-06, "loss": 0.2742, "num_input_tokens_seen": 2211528, "step": 2205 }, { "epoch": 1.1717921527041357, "grad_norm": 9.745203971862793, "learning_rate": 5.856309650053023e-06, "loss": 0.375, "num_input_tokens_seen": 2216584, "step": 2210 }, { "epoch": 1.174443266171792, "grad_norm": 4.463660717010498, "learning_rate": 5.8695652173913055e-06, "loss": 0.3151, "num_input_tokens_seen": 2221160, "step": 2215 }, { "epoch": 1.1770943796394486, "grad_norm": 2.1886913776397705, "learning_rate": 5.882820784729587e-06, "loss": 0.3831, "num_input_tokens_seen": 2225800, "step": 2220 }, { "epoch": 1.179745493107105, "grad_norm": 3.071580648422241, "learning_rate": 5.8960763520678685e-06, "loss": 0.362, "num_input_tokens_seen": 2230536, "step": 2225 }, { "epoch": 1.1823966065747613, "grad_norm": 4.828311920166016, "learning_rate": 5.909331919406151e-06, "loss": 0.3341, "num_input_tokens_seen": 2235080, "step": 2230 }, { "epoch": 1.1850477200424179, "grad_norm": 6.033481597900391, "learning_rate": 5.922587486744433e-06, "loss": 0.3606, "num_input_tokens_seen": 2241224, "step": 2235 }, { "epoch": 1.1876988335100742, "grad_norm": 2.4908840656280518, "learning_rate": 5.935843054082715e-06, "loss": 0.3396, "num_input_tokens_seen": 2245800, "step": 2240 }, { "epoch": 1.1903499469777306, "grad_norm": 1.7878979444503784, "learning_rate": 5.949098621420998e-06, "loss": 0.3486, "num_input_tokens_seen": 2250280, "step": 2245 }, { "epoch": 1.1930010604453871, "grad_norm": 5.358484268188477, "learning_rate": 5.96235418875928e-06, "loss": 0.341, "num_input_tokens_seen": 2254696, "step": 2250 }, { "epoch": 1.1956521739130435, "grad_norm": 2.350015640258789, "learning_rate": 5.9756097560975615e-06, "loss": 0.3052, "num_input_tokens_seen": 2259016, "step": 2255 }, { "epoch": 1.1983032873806998, "grad_norm": 7.8559441566467285, "learning_rate": 5.988865323435844e-06, "loss": 0.4447, "num_input_tokens_seen": 2264040, "step": 2260 }, { "epoch": 1.2009544008483564, "grad_norm": 2.0670440196990967, "learning_rate": 6.002120890774126e-06, "loss": 0.3195, "num_input_tokens_seen": 2269608, "step": 2265 }, { "epoch": 1.2036055143160127, "grad_norm": 2.056845188140869, "learning_rate": 6.015376458112407e-06, "loss": 0.3174, "num_input_tokens_seen": 2274792, "step": 2270 }, { "epoch": 1.206256627783669, "grad_norm": 10.532705307006836, "learning_rate": 6.02863202545069e-06, "loss": 0.308, "num_input_tokens_seen": 2280072, "step": 2275 }, { "epoch": 1.2089077412513256, "grad_norm": 8.658740043640137, "learning_rate": 6.041887592788972e-06, "loss": 0.3987, "num_input_tokens_seen": 2286280, "step": 2280 }, { "epoch": 1.211558854718982, "grad_norm": 2.1456995010375977, "learning_rate": 6.055143160127254e-06, "loss": 0.2888, "num_input_tokens_seen": 2290152, "step": 2285 }, { "epoch": 1.2142099681866383, "grad_norm": 5.539259910583496, "learning_rate": 6.0683987274655364e-06, "loss": 0.3208, "num_input_tokens_seen": 2294824, "step": 2290 }, { "epoch": 1.2168610816542949, "grad_norm": 3.08736252784729, "learning_rate": 6.081654294803818e-06, "loss": 0.3441, "num_input_tokens_seen": 2299464, "step": 2295 }, { "epoch": 1.2195121951219512, "grad_norm": 3.310061454772949, "learning_rate": 6.0949098621421e-06, "loss": 0.4133, "num_input_tokens_seen": 2305416, "step": 2300 }, { "epoch": 1.2221633085896078, "grad_norm": 1.8396108150482178, "learning_rate": 6.108165429480383e-06, "loss": 0.3578, "num_input_tokens_seen": 2310440, "step": 2305 }, { "epoch": 1.224814422057264, "grad_norm": 1.4515868425369263, "learning_rate": 6.121420996818664e-06, "loss": 0.3596, "num_input_tokens_seen": 2315432, "step": 2310 }, { "epoch": 1.2274655355249204, "grad_norm": 1.4913272857666016, "learning_rate": 6.134676564156946e-06, "loss": 0.3235, "num_input_tokens_seen": 2321032, "step": 2315 }, { "epoch": 1.2301166489925768, "grad_norm": 2.1257946491241455, "learning_rate": 6.147932131495229e-06, "loss": 0.3503, "num_input_tokens_seen": 2325800, "step": 2320 }, { "epoch": 1.2327677624602333, "grad_norm": 3.8463387489318848, "learning_rate": 6.1611876988335105e-06, "loss": 0.3676, "num_input_tokens_seen": 2329992, "step": 2325 }, { "epoch": 1.2354188759278897, "grad_norm": 2.73652982711792, "learning_rate": 6.174443266171792e-06, "loss": 0.337, "num_input_tokens_seen": 2335336, "step": 2330 }, { "epoch": 1.2380699893955462, "grad_norm": 9.917343139648438, "learning_rate": 6.187698833510075e-06, "loss": 0.3631, "num_input_tokens_seen": 2339624, "step": 2335 }, { "epoch": 1.2407211028632026, "grad_norm": 3.9242680072784424, "learning_rate": 6.200954400848357e-06, "loss": 0.3878, "num_input_tokens_seen": 2344456, "step": 2340 }, { "epoch": 1.243372216330859, "grad_norm": 3.8689136505126953, "learning_rate": 6.214209968186639e-06, "loss": 0.3091, "num_input_tokens_seen": 2349320, "step": 2345 }, { "epoch": 1.2460233297985153, "grad_norm": 2.400324821472168, "learning_rate": 6.227465535524922e-06, "loss": 0.3478, "num_input_tokens_seen": 2353832, "step": 2350 }, { "epoch": 1.2486744432661718, "grad_norm": 4.558294296264648, "learning_rate": 6.240721102863203e-06, "loss": 0.2902, "num_input_tokens_seen": 2358408, "step": 2355 }, { "epoch": 1.2513255567338282, "grad_norm": 2.2375848293304443, "learning_rate": 6.253976670201485e-06, "loss": 0.3004, "num_input_tokens_seen": 2363464, "step": 2360 }, { "epoch": 1.2539766702014847, "grad_norm": 3.3137030601501465, "learning_rate": 6.267232237539767e-06, "loss": 0.3535, "num_input_tokens_seen": 2368712, "step": 2365 }, { "epoch": 1.256627783669141, "grad_norm": 6.262844085693359, "learning_rate": 6.280487804878049e-06, "loss": 0.2835, "num_input_tokens_seen": 2373832, "step": 2370 }, { "epoch": 1.2592788971367974, "grad_norm": 5.560203552246094, "learning_rate": 6.293743372216331e-06, "loss": 0.2969, "num_input_tokens_seen": 2379240, "step": 2375 }, { "epoch": 1.2619300106044538, "grad_norm": 2.3720338344573975, "learning_rate": 6.306998939554614e-06, "loss": 0.4085, "num_input_tokens_seen": 2384808, "step": 2380 }, { "epoch": 1.2645811240721103, "grad_norm": 5.961095333099365, "learning_rate": 6.320254506892896e-06, "loss": 0.2964, "num_input_tokens_seen": 2390024, "step": 2385 }, { "epoch": 1.2672322375397667, "grad_norm": 2.3321917057037354, "learning_rate": 6.333510074231178e-06, "loss": 0.3652, "num_input_tokens_seen": 2395432, "step": 2390 }, { "epoch": 1.2698833510074232, "grad_norm": 4.39452600479126, "learning_rate": 6.34676564156946e-06, "loss": 0.313, "num_input_tokens_seen": 2400264, "step": 2395 }, { "epoch": 1.2725344644750796, "grad_norm": 5.3319244384765625, "learning_rate": 6.3600212089077414e-06, "loss": 0.3009, "num_input_tokens_seen": 2405416, "step": 2400 }, { "epoch": 1.275185577942736, "grad_norm": 2.49867582321167, "learning_rate": 6.373276776246023e-06, "loss": 0.3497, "num_input_tokens_seen": 2410728, "step": 2405 }, { "epoch": 1.2778366914103922, "grad_norm": 4.971648216247559, "learning_rate": 6.386532343584306e-06, "loss": 0.3465, "num_input_tokens_seen": 2415336, "step": 2410 }, { "epoch": 1.2804878048780488, "grad_norm": 2.6847667694091797, "learning_rate": 6.399787910922588e-06, "loss": 0.289, "num_input_tokens_seen": 2420392, "step": 2415 }, { "epoch": 1.2831389183457051, "grad_norm": 1.6234729290008545, "learning_rate": 6.41304347826087e-06, "loss": 0.3878, "num_input_tokens_seen": 2424968, "step": 2420 }, { "epoch": 1.2857900318133617, "grad_norm": 2.3366174697875977, "learning_rate": 6.426299045599153e-06, "loss": 0.3578, "num_input_tokens_seen": 2430600, "step": 2425 }, { "epoch": 1.288441145281018, "grad_norm": 4.796760559082031, "learning_rate": 6.4395546129374345e-06, "loss": 0.3309, "num_input_tokens_seen": 2435208, "step": 2430 }, { "epoch": 1.2910922587486744, "grad_norm": 2.9566712379455566, "learning_rate": 6.4528101802757155e-06, "loss": 0.3626, "num_input_tokens_seen": 2441256, "step": 2435 }, { "epoch": 1.2937433722163307, "grad_norm": 1.9216738939285278, "learning_rate": 6.466065747613999e-06, "loss": 0.3455, "num_input_tokens_seen": 2446824, "step": 2440 }, { "epoch": 1.2963944856839873, "grad_norm": 2.4754767417907715, "learning_rate": 6.47932131495228e-06, "loss": 0.3384, "num_input_tokens_seen": 2451912, "step": 2445 }, { "epoch": 1.2990455991516436, "grad_norm": 4.735531806945801, "learning_rate": 6.492576882290562e-06, "loss": 0.3126, "num_input_tokens_seen": 2456456, "step": 2450 }, { "epoch": 1.3016967126193002, "grad_norm": 3.2168734073638916, "learning_rate": 6.505832449628845e-06, "loss": 0.337, "num_input_tokens_seen": 2460680, "step": 2455 }, { "epoch": 1.3043478260869565, "grad_norm": 7.855511665344238, "learning_rate": 6.519088016967127e-06, "loss": 0.4152, "num_input_tokens_seen": 2466568, "step": 2460 }, { "epoch": 1.3069989395546129, "grad_norm": 2.58923602104187, "learning_rate": 6.5323435843054086e-06, "loss": 0.3142, "num_input_tokens_seen": 2471944, "step": 2465 }, { "epoch": 1.3096500530222692, "grad_norm": 4.456223487854004, "learning_rate": 6.545599151643691e-06, "loss": 0.3432, "num_input_tokens_seen": 2477832, "step": 2470 }, { "epoch": 1.3123011664899258, "grad_norm": 4.049005031585693, "learning_rate": 6.558854718981973e-06, "loss": 0.3546, "num_input_tokens_seen": 2482376, "step": 2475 }, { "epoch": 1.3149522799575821, "grad_norm": 3.783052682876587, "learning_rate": 6.572110286320254e-06, "loss": 0.3061, "num_input_tokens_seen": 2487048, "step": 2480 }, { "epoch": 1.3176033934252387, "grad_norm": 1.5482982397079468, "learning_rate": 6.585365853658538e-06, "loss": 0.3033, "num_input_tokens_seen": 2492232, "step": 2485 }, { "epoch": 1.320254506892895, "grad_norm": 2.785672664642334, "learning_rate": 6.598621420996819e-06, "loss": 0.3731, "num_input_tokens_seen": 2497032, "step": 2490 }, { "epoch": 1.3229056203605514, "grad_norm": 1.6062266826629639, "learning_rate": 6.611876988335101e-06, "loss": 0.3134, "num_input_tokens_seen": 2502600, "step": 2495 }, { "epoch": 1.3255567338282077, "grad_norm": 2.690358877182007, "learning_rate": 6.6251325556733835e-06, "loss": 0.2848, "num_input_tokens_seen": 2507112, "step": 2500 }, { "epoch": 1.3282078472958643, "grad_norm": 2.926271915435791, "learning_rate": 6.638388123011665e-06, "loss": 0.3213, "num_input_tokens_seen": 2514056, "step": 2505 }, { "epoch": 1.3308589607635206, "grad_norm": 2.1852023601531982, "learning_rate": 6.651643690349947e-06, "loss": 0.2414, "num_input_tokens_seen": 2519272, "step": 2510 }, { "epoch": 1.3335100742311772, "grad_norm": 6.544677257537842, "learning_rate": 6.66489925768823e-06, "loss": 0.2637, "num_input_tokens_seen": 2523720, "step": 2515 }, { "epoch": 1.3361611876988335, "grad_norm": 1.682551622390747, "learning_rate": 6.678154825026512e-06, "loss": 0.444, "num_input_tokens_seen": 2528744, "step": 2520 }, { "epoch": 1.3388123011664899, "grad_norm": 3.8807287216186523, "learning_rate": 6.691410392364793e-06, "loss": 0.307, "num_input_tokens_seen": 2534792, "step": 2525 }, { "epoch": 1.3414634146341464, "grad_norm": 3.2970404624938965, "learning_rate": 6.704665959703076e-06, "loss": 0.2491, "num_input_tokens_seen": 2540072, "step": 2530 }, { "epoch": 1.3441145281018028, "grad_norm": 1.3132336139678955, "learning_rate": 6.717921527041358e-06, "loss": 0.3111, "num_input_tokens_seen": 2544680, "step": 2535 }, { "epoch": 1.346765641569459, "grad_norm": 1.9027589559555054, "learning_rate": 6.7311770943796395e-06, "loss": 0.3236, "num_input_tokens_seen": 2549192, "step": 2540 }, { "epoch": 1.3494167550371157, "grad_norm": 1.5583829879760742, "learning_rate": 6.744432661717922e-06, "loss": 0.2509, "num_input_tokens_seen": 2553128, "step": 2545 }, { "epoch": 1.352067868504772, "grad_norm": 4.049461364746094, "learning_rate": 6.757688229056204e-06, "loss": 0.3827, "num_input_tokens_seen": 2559528, "step": 2550 }, { "epoch": 1.3547189819724283, "grad_norm": 3.3679494857788086, "learning_rate": 6.770943796394486e-06, "loss": 0.3324, "num_input_tokens_seen": 2566216, "step": 2555 }, { "epoch": 1.357370095440085, "grad_norm": 3.665308952331543, "learning_rate": 6.784199363732769e-06, "loss": 0.3338, "num_input_tokens_seen": 2570728, "step": 2560 }, { "epoch": 1.3600212089077413, "grad_norm": 1.57960045337677, "learning_rate": 6.797454931071051e-06, "loss": 0.281, "num_input_tokens_seen": 2574760, "step": 2565 }, { "epoch": 1.3626723223753976, "grad_norm": 3.366250514984131, "learning_rate": 6.810710498409332e-06, "loss": 0.4096, "num_input_tokens_seen": 2581416, "step": 2570 }, { "epoch": 1.3653234358430542, "grad_norm": 1.6010679006576538, "learning_rate": 6.823966065747614e-06, "loss": 0.3761, "num_input_tokens_seen": 2586312, "step": 2575 }, { "epoch": 1.3679745493107105, "grad_norm": 1.6803922653198242, "learning_rate": 6.837221633085896e-06, "loss": 0.3007, "num_input_tokens_seen": 2591336, "step": 2580 }, { "epoch": 1.3706256627783668, "grad_norm": 2.6657745838165283, "learning_rate": 6.850477200424178e-06, "loss": 0.3093, "num_input_tokens_seen": 2595944, "step": 2585 }, { "epoch": 1.3732767762460234, "grad_norm": 2.067051410675049, "learning_rate": 6.863732767762461e-06, "loss": 0.4109, "num_input_tokens_seen": 2601416, "step": 2590 }, { "epoch": 1.3759278897136797, "grad_norm": 3.234164237976074, "learning_rate": 6.876988335100743e-06, "loss": 0.2534, "num_input_tokens_seen": 2606504, "step": 2595 }, { "epoch": 1.378579003181336, "grad_norm": 7.284481048583984, "learning_rate": 6.890243902439025e-06, "loss": 0.4129, "num_input_tokens_seen": 2610888, "step": 2600 }, { "epoch": 1.3812301166489926, "grad_norm": 2.617619276046753, "learning_rate": 6.9034994697773075e-06, "loss": 0.3324, "num_input_tokens_seen": 2615976, "step": 2605 }, { "epoch": 1.383881230116649, "grad_norm": 2.2311599254608154, "learning_rate": 6.916755037115589e-06, "loss": 0.3725, "num_input_tokens_seen": 2620296, "step": 2610 }, { "epoch": 1.3865323435843053, "grad_norm": 2.2560131549835205, "learning_rate": 6.93001060445387e-06, "loss": 0.3462, "num_input_tokens_seen": 2625768, "step": 2615 }, { "epoch": 1.389183457051962, "grad_norm": 4.332004547119141, "learning_rate": 6.943266171792153e-06, "loss": 0.3494, "num_input_tokens_seen": 2629672, "step": 2620 }, { "epoch": 1.3918345705196182, "grad_norm": 4.494298458099365, "learning_rate": 6.956521739130435e-06, "loss": 0.3498, "num_input_tokens_seen": 2634792, "step": 2625 }, { "epoch": 1.3944856839872748, "grad_norm": 2.0404233932495117, "learning_rate": 6.969777306468718e-06, "loss": 0.3512, "num_input_tokens_seen": 2639176, "step": 2630 }, { "epoch": 1.3971367974549311, "grad_norm": 2.544912338256836, "learning_rate": 6.983032873807e-06, "loss": 0.3308, "num_input_tokens_seen": 2644392, "step": 2635 }, { "epoch": 1.3997879109225875, "grad_norm": 1.3310779333114624, "learning_rate": 6.9962884411452815e-06, "loss": 0.2992, "num_input_tokens_seen": 2649224, "step": 2640 }, { "epoch": 1.4024390243902438, "grad_norm": 0.9112206101417542, "learning_rate": 7.009544008483564e-06, "loss": 0.3183, "num_input_tokens_seen": 2654024, "step": 2645 }, { "epoch": 1.4050901378579004, "grad_norm": 5.014410018920898, "learning_rate": 7.022799575821846e-06, "loss": 0.2654, "num_input_tokens_seen": 2658696, "step": 2650 }, { "epoch": 1.4077412513255567, "grad_norm": 6.902040481567383, "learning_rate": 7.036055143160127e-06, "loss": 0.3492, "num_input_tokens_seen": 2663528, "step": 2655 }, { "epoch": 1.4103923647932133, "grad_norm": 3.030869483947754, "learning_rate": 7.049310710498411e-06, "loss": 0.3072, "num_input_tokens_seen": 2668104, "step": 2660 }, { "epoch": 1.4130434782608696, "grad_norm": 1.85331392288208, "learning_rate": 7.062566277836692e-06, "loss": 0.3553, "num_input_tokens_seen": 2672648, "step": 2665 }, { "epoch": 1.415694591728526, "grad_norm": 1.3007984161376953, "learning_rate": 7.075821845174974e-06, "loss": 0.3928, "num_input_tokens_seen": 2680744, "step": 2670 }, { "epoch": 1.4183457051961823, "grad_norm": 1.2356187105178833, "learning_rate": 7.0890774125132565e-06, "loss": 0.3852, "num_input_tokens_seen": 2686504, "step": 2675 }, { "epoch": 1.4209968186638389, "grad_norm": 2.157616138458252, "learning_rate": 7.102332979851538e-06, "loss": 0.3574, "num_input_tokens_seen": 2690984, "step": 2680 }, { "epoch": 1.4236479321314952, "grad_norm": 3.8647496700286865, "learning_rate": 7.11558854718982e-06, "loss": 0.3662, "num_input_tokens_seen": 2696808, "step": 2685 }, { "epoch": 1.4262990455991518, "grad_norm": 0.9782596230506897, "learning_rate": 7.128844114528103e-06, "loss": 0.3539, "num_input_tokens_seen": 2701768, "step": 2690 }, { "epoch": 1.4289501590668081, "grad_norm": 1.688672661781311, "learning_rate": 7.142099681866385e-06, "loss": 0.3338, "num_input_tokens_seen": 2706696, "step": 2695 }, { "epoch": 1.4316012725344645, "grad_norm": 3.4827661514282227, "learning_rate": 7.155355249204666e-06, "loss": 0.3247, "num_input_tokens_seen": 2711720, "step": 2700 }, { "epoch": 1.4342523860021208, "grad_norm": 1.547356367111206, "learning_rate": 7.1686108165429495e-06, "loss": 0.3229, "num_input_tokens_seen": 2716808, "step": 2705 }, { "epoch": 1.4369034994697774, "grad_norm": 1.4477298259735107, "learning_rate": 7.1818663838812306e-06, "loss": 0.2463, "num_input_tokens_seen": 2721640, "step": 2710 }, { "epoch": 1.4395546129374337, "grad_norm": 2.7642366886138916, "learning_rate": 7.1951219512195125e-06, "loss": 0.2208, "num_input_tokens_seen": 2726888, "step": 2715 }, { "epoch": 1.4422057264050903, "grad_norm": 8.443668365478516, "learning_rate": 7.208377518557795e-06, "loss": 0.6073, "num_input_tokens_seen": 2731656, "step": 2720 }, { "epoch": 1.4448568398727466, "grad_norm": 1.226812720298767, "learning_rate": 7.221633085896077e-06, "loss": 0.4784, "num_input_tokens_seen": 2736424, "step": 2725 }, { "epoch": 1.447507953340403, "grad_norm": 2.422173023223877, "learning_rate": 7.234888653234359e-06, "loss": 0.3011, "num_input_tokens_seen": 2741064, "step": 2730 }, { "epoch": 1.4501590668080593, "grad_norm": 1.2217291593551636, "learning_rate": 7.248144220572642e-06, "loss": 0.3855, "num_input_tokens_seen": 2745512, "step": 2735 }, { "epoch": 1.4528101802757158, "grad_norm": 2.190093755722046, "learning_rate": 7.261399787910924e-06, "loss": 0.345, "num_input_tokens_seen": 2750344, "step": 2740 }, { "epoch": 1.4554612937433722, "grad_norm": 1.6991981267929077, "learning_rate": 7.274655355249205e-06, "loss": 0.4053, "num_input_tokens_seen": 2757256, "step": 2745 }, { "epoch": 1.4581124072110287, "grad_norm": 2.1815226078033447, "learning_rate": 7.287910922587487e-06, "loss": 0.3534, "num_input_tokens_seen": 2762472, "step": 2750 }, { "epoch": 1.460763520678685, "grad_norm": 3.2062857151031494, "learning_rate": 7.301166489925769e-06, "loss": 0.3304, "num_input_tokens_seen": 2766824, "step": 2755 }, { "epoch": 1.4634146341463414, "grad_norm": 2.7380318641662598, "learning_rate": 7.314422057264051e-06, "loss": 0.3607, "num_input_tokens_seen": 2770952, "step": 2760 }, { "epoch": 1.4660657476139978, "grad_norm": 1.4770134687423706, "learning_rate": 7.327677624602334e-06, "loss": 0.322, "num_input_tokens_seen": 2775528, "step": 2765 }, { "epoch": 1.4687168610816543, "grad_norm": 5.069868564605713, "learning_rate": 7.340933191940616e-06, "loss": 0.3318, "num_input_tokens_seen": 2780904, "step": 2770 }, { "epoch": 1.4713679745493107, "grad_norm": 2.1432876586914062, "learning_rate": 7.354188759278898e-06, "loss": 0.3654, "num_input_tokens_seen": 2786600, "step": 2775 }, { "epoch": 1.4740190880169672, "grad_norm": 1.69114351272583, "learning_rate": 7.3674443266171804e-06, "loss": 0.3376, "num_input_tokens_seen": 2792040, "step": 2780 }, { "epoch": 1.4766702014846236, "grad_norm": 1.3283171653747559, "learning_rate": 7.380699893955462e-06, "loss": 0.3622, "num_input_tokens_seen": 2797128, "step": 2785 }, { "epoch": 1.47932131495228, "grad_norm": 1.0066747665405273, "learning_rate": 7.393955461293743e-06, "loss": 0.3396, "num_input_tokens_seen": 2801800, "step": 2790 }, { "epoch": 1.4819724284199363, "grad_norm": 1.9031003713607788, "learning_rate": 7.407211028632026e-06, "loss": 0.3174, "num_input_tokens_seen": 2805544, "step": 2795 }, { "epoch": 1.4846235418875928, "grad_norm": 1.007303237915039, "learning_rate": 7.420466595970308e-06, "loss": 0.2986, "num_input_tokens_seen": 2811848, "step": 2800 }, { "epoch": 1.4872746553552492, "grad_norm": 3.3107244968414307, "learning_rate": 7.43372216330859e-06, "loss": 0.3534, "num_input_tokens_seen": 2817224, "step": 2805 }, { "epoch": 1.4899257688229057, "grad_norm": 1.2683227062225342, "learning_rate": 7.446977730646873e-06, "loss": 0.4081, "num_input_tokens_seen": 2821928, "step": 2810 }, { "epoch": 1.492576882290562, "grad_norm": 1.5549565553665161, "learning_rate": 7.4602332979851545e-06, "loss": 0.3652, "num_input_tokens_seen": 2827112, "step": 2815 }, { "epoch": 1.4952279957582184, "grad_norm": 5.058408737182617, "learning_rate": 7.473488865323436e-06, "loss": 0.337, "num_input_tokens_seen": 2832232, "step": 2820 }, { "epoch": 1.4978791092258747, "grad_norm": 2.6608753204345703, "learning_rate": 7.486744432661719e-06, "loss": 0.3375, "num_input_tokens_seen": 2836872, "step": 2825 }, { "epoch": 1.5005302226935313, "grad_norm": 1.8407816886901855, "learning_rate": 7.500000000000001e-06, "loss": 0.345, "num_input_tokens_seen": 2842184, "step": 2830 }, { "epoch": 1.5031813361611877, "grad_norm": 1.3737609386444092, "learning_rate": 7.513255567338282e-06, "loss": 0.3267, "num_input_tokens_seen": 2846952, "step": 2835 }, { "epoch": 1.5058324496288442, "grad_norm": 1.6970793008804321, "learning_rate": 7.526511134676565e-06, "loss": 0.3447, "num_input_tokens_seen": 2851240, "step": 2840 }, { "epoch": 1.5084835630965006, "grad_norm": 1.9319944381713867, "learning_rate": 7.539766702014847e-06, "loss": 0.3517, "num_input_tokens_seen": 2855496, "step": 2845 }, { "epoch": 1.511134676564157, "grad_norm": 4.4472336769104, "learning_rate": 7.553022269353129e-06, "loss": 0.3575, "num_input_tokens_seen": 2861128, "step": 2850 }, { "epoch": 1.5137857900318132, "grad_norm": 3.0398082733154297, "learning_rate": 7.566277836691411e-06, "loss": 0.2911, "num_input_tokens_seen": 2865832, "step": 2855 }, { "epoch": 1.5164369034994698, "grad_norm": 1.0097779035568237, "learning_rate": 7.579533404029693e-06, "loss": 0.3035, "num_input_tokens_seen": 2870792, "step": 2860 }, { "epoch": 1.5190880169671261, "grad_norm": 0.9426175355911255, "learning_rate": 7.592788971367975e-06, "loss": 0.2635, "num_input_tokens_seen": 2879080, "step": 2865 }, { "epoch": 1.5217391304347827, "grad_norm": 4.913973808288574, "learning_rate": 7.606044538706258e-06, "loss": 0.4548, "num_input_tokens_seen": 2883400, "step": 2870 }, { "epoch": 1.524390243902439, "grad_norm": 1.8753080368041992, "learning_rate": 7.619300106044539e-06, "loss": 0.3946, "num_input_tokens_seen": 2888168, "step": 2875 }, { "epoch": 1.5270413573700954, "grad_norm": 1.2732950448989868, "learning_rate": 7.632555673382822e-06, "loss": 0.3558, "num_input_tokens_seen": 2893288, "step": 2880 }, { "epoch": 1.5296924708377517, "grad_norm": 1.0868372917175293, "learning_rate": 7.645811240721104e-06, "loss": 0.3417, "num_input_tokens_seen": 2898344, "step": 2885 }, { "epoch": 1.5323435843054083, "grad_norm": 3.078803062438965, "learning_rate": 7.659066808059385e-06, "loss": 0.3124, "num_input_tokens_seen": 2903336, "step": 2890 }, { "epoch": 1.5349946977730649, "grad_norm": 1.7468458414077759, "learning_rate": 7.672322375397666e-06, "loss": 0.3196, "num_input_tokens_seen": 2909352, "step": 2895 }, { "epoch": 1.5376458112407212, "grad_norm": 2.285568952560425, "learning_rate": 7.68557794273595e-06, "loss": 0.3243, "num_input_tokens_seen": 2914536, "step": 2900 }, { "epoch": 1.5402969247083775, "grad_norm": 2.98472261428833, "learning_rate": 7.698833510074232e-06, "loss": 0.3871, "num_input_tokens_seen": 2919624, "step": 2905 }, { "epoch": 1.5429480381760339, "grad_norm": 2.5289926528930664, "learning_rate": 7.712089077412513e-06, "loss": 0.3328, "num_input_tokens_seen": 2924328, "step": 2910 }, { "epoch": 1.5455991516436902, "grad_norm": 1.3183238506317139, "learning_rate": 7.725344644750796e-06, "loss": 0.3403, "num_input_tokens_seen": 2930088, "step": 2915 }, { "epoch": 1.5482502651113468, "grad_norm": 1.0567806959152222, "learning_rate": 7.738600212089078e-06, "loss": 0.3244, "num_input_tokens_seen": 2935880, "step": 2920 }, { "epoch": 1.5509013785790033, "grad_norm": 2.5897884368896484, "learning_rate": 7.75185577942736e-06, "loss": 0.2913, "num_input_tokens_seen": 2941800, "step": 2925 }, { "epoch": 1.5535524920466597, "grad_norm": 2.7326579093933105, "learning_rate": 7.765111346765642e-06, "loss": 0.3114, "num_input_tokens_seen": 2947464, "step": 2930 }, { "epoch": 1.556203605514316, "grad_norm": 2.5685949325561523, "learning_rate": 7.778366914103925e-06, "loss": 0.3794, "num_input_tokens_seen": 2952072, "step": 2935 }, { "epoch": 1.5588547189819724, "grad_norm": 1.534917950630188, "learning_rate": 7.791622481442206e-06, "loss": 0.452, "num_input_tokens_seen": 2957672, "step": 2940 }, { "epoch": 1.5615058324496287, "grad_norm": 4.054192066192627, "learning_rate": 7.804878048780489e-06, "loss": 0.3281, "num_input_tokens_seen": 2962792, "step": 2945 }, { "epoch": 1.5641569459172853, "grad_norm": 3.7576894760131836, "learning_rate": 7.81813361611877e-06, "loss": 0.3542, "num_input_tokens_seen": 2967176, "step": 2950 }, { "epoch": 1.5668080593849418, "grad_norm": 4.071879863739014, "learning_rate": 7.831389183457053e-06, "loss": 0.3216, "num_input_tokens_seen": 2973512, "step": 2955 }, { "epoch": 1.5694591728525982, "grad_norm": 2.5051794052124023, "learning_rate": 7.844644750795335e-06, "loss": 0.3036, "num_input_tokens_seen": 2978344, "step": 2960 }, { "epoch": 1.5721102863202545, "grad_norm": 1.1741259098052979, "learning_rate": 7.857900318133616e-06, "loss": 0.3678, "num_input_tokens_seen": 2982408, "step": 2965 }, { "epoch": 1.5747613997879109, "grad_norm": 5.229185104370117, "learning_rate": 7.871155885471899e-06, "loss": 0.3873, "num_input_tokens_seen": 2986696, "step": 2970 }, { "epoch": 1.5774125132555672, "grad_norm": 1.1341434717178345, "learning_rate": 7.884411452810182e-06, "loss": 0.2725, "num_input_tokens_seen": 2991560, "step": 2975 }, { "epoch": 1.5800636267232238, "grad_norm": 0.6553869247436523, "learning_rate": 7.897667020148463e-06, "loss": 0.3296, "num_input_tokens_seen": 2995848, "step": 2980 }, { "epoch": 1.5827147401908803, "grad_norm": 0.561317503452301, "learning_rate": 7.910922587486744e-06, "loss": 0.335, "num_input_tokens_seen": 3001192, "step": 2985 }, { "epoch": 1.5853658536585367, "grad_norm": 1.9268378019332886, "learning_rate": 7.924178154825027e-06, "loss": 0.3498, "num_input_tokens_seen": 3005736, "step": 2990 }, { "epoch": 1.588016967126193, "grad_norm": 3.5640146732330322, "learning_rate": 7.93743372216331e-06, "loss": 0.4019, "num_input_tokens_seen": 3011304, "step": 2995 }, { "epoch": 1.5906680805938493, "grad_norm": 2.5722086429595947, "learning_rate": 7.95068928950159e-06, "loss": 0.3219, "num_input_tokens_seen": 3016520, "step": 3000 }, { "epoch": 1.5933191940615057, "grad_norm": 1.1257201433181763, "learning_rate": 7.963944856839873e-06, "loss": 0.2888, "num_input_tokens_seen": 3021192, "step": 3005 }, { "epoch": 1.5959703075291622, "grad_norm": 0.9906928539276123, "learning_rate": 7.977200424178156e-06, "loss": 0.363, "num_input_tokens_seen": 3027176, "step": 3010 }, { "epoch": 1.5986214209968188, "grad_norm": 3.5242726802825928, "learning_rate": 7.990455991516437e-06, "loss": 0.3022, "num_input_tokens_seen": 3032648, "step": 3015 }, { "epoch": 1.6012725344644752, "grad_norm": 2.1112136840820312, "learning_rate": 8.00371155885472e-06, "loss": 0.3128, "num_input_tokens_seen": 3037864, "step": 3020 }, { "epoch": 1.6039236479321315, "grad_norm": 0.9076073169708252, "learning_rate": 8.016967126193e-06, "loss": 0.3473, "num_input_tokens_seen": 3042824, "step": 3025 }, { "epoch": 1.6065747613997878, "grad_norm": 1.4753291606903076, "learning_rate": 8.030222693531283e-06, "loss": 0.3502, "num_input_tokens_seen": 3048136, "step": 3030 }, { "epoch": 1.6092258748674442, "grad_norm": 0.8736535906791687, "learning_rate": 8.043478260869566e-06, "loss": 0.2633, "num_input_tokens_seen": 3053896, "step": 3035 }, { "epoch": 1.6118769883351007, "grad_norm": 0.7488529682159424, "learning_rate": 8.056733828207847e-06, "loss": 0.326, "num_input_tokens_seen": 3058408, "step": 3040 }, { "epoch": 1.6145281018027573, "grad_norm": 0.7999944686889648, "learning_rate": 8.06998939554613e-06, "loss": 0.2671, "num_input_tokens_seen": 3063720, "step": 3045 }, { "epoch": 1.6171792152704136, "grad_norm": 1.0144041776657104, "learning_rate": 8.083244962884413e-06, "loss": 0.36, "num_input_tokens_seen": 3068424, "step": 3050 }, { "epoch": 1.61983032873807, "grad_norm": 0.9670430421829224, "learning_rate": 8.096500530222694e-06, "loss": 0.3573, "num_input_tokens_seen": 3073800, "step": 3055 }, { "epoch": 1.6224814422057263, "grad_norm": 1.652234673500061, "learning_rate": 8.109756097560977e-06, "loss": 0.3655, "num_input_tokens_seen": 3078504, "step": 3060 }, { "epoch": 1.6251325556733827, "grad_norm": 1.312745213508606, "learning_rate": 8.12301166489926e-06, "loss": 0.3125, "num_input_tokens_seen": 3083528, "step": 3065 }, { "epoch": 1.6277836691410392, "grad_norm": 1.131526231765747, "learning_rate": 8.13626723223754e-06, "loss": 0.3337, "num_input_tokens_seen": 3088296, "step": 3070 }, { "epoch": 1.6304347826086958, "grad_norm": 2.4428651332855225, "learning_rate": 8.149522799575823e-06, "loss": 0.3442, "num_input_tokens_seen": 3092232, "step": 3075 }, { "epoch": 1.6330858960763521, "grad_norm": 2.000783920288086, "learning_rate": 8.162778366914104e-06, "loss": 0.3143, "num_input_tokens_seen": 3097768, "step": 3080 }, { "epoch": 1.6357370095440085, "grad_norm": 1.2220290899276733, "learning_rate": 8.176033934252387e-06, "loss": 0.3058, "num_input_tokens_seen": 3103336, "step": 3085 }, { "epoch": 1.6383881230116648, "grad_norm": 2.05247163772583, "learning_rate": 8.18928950159067e-06, "loss": 0.3222, "num_input_tokens_seen": 3107624, "step": 3090 }, { "epoch": 1.6410392364793212, "grad_norm": 2.132530450820923, "learning_rate": 8.20254506892895e-06, "loss": 0.3906, "num_input_tokens_seen": 3111944, "step": 3095 }, { "epoch": 1.6436903499469777, "grad_norm": 1.6325839757919312, "learning_rate": 8.215800636267233e-06, "loss": 0.2794, "num_input_tokens_seen": 3117512, "step": 3100 }, { "epoch": 1.6463414634146343, "grad_norm": 1.9764732122421265, "learning_rate": 8.229056203605516e-06, "loss": 0.3242, "num_input_tokens_seen": 3122408, "step": 3105 }, { "epoch": 1.6489925768822906, "grad_norm": 1.830971360206604, "learning_rate": 8.242311770943797e-06, "loss": 0.3316, "num_input_tokens_seen": 3127208, "step": 3110 }, { "epoch": 1.651643690349947, "grad_norm": 2.566654682159424, "learning_rate": 8.255567338282078e-06, "loss": 0.4342, "num_input_tokens_seen": 3132328, "step": 3115 }, { "epoch": 1.6542948038176033, "grad_norm": 3.2954330444335938, "learning_rate": 8.268822905620361e-06, "loss": 0.3563, "num_input_tokens_seen": 3137032, "step": 3120 }, { "epoch": 1.6569459172852599, "grad_norm": 3.086599349975586, "learning_rate": 8.282078472958644e-06, "loss": 0.3454, "num_input_tokens_seen": 3141832, "step": 3125 }, { "epoch": 1.6595970307529162, "grad_norm": 1.2666234970092773, "learning_rate": 8.295334040296925e-06, "loss": 0.3592, "num_input_tokens_seen": 3146504, "step": 3130 }, { "epoch": 1.6622481442205728, "grad_norm": 1.486030101776123, "learning_rate": 8.308589607635207e-06, "loss": 0.3462, "num_input_tokens_seen": 3150824, "step": 3135 }, { "epoch": 1.664899257688229, "grad_norm": 1.0035545825958252, "learning_rate": 8.32184517497349e-06, "loss": 0.3376, "num_input_tokens_seen": 3156392, "step": 3140 }, { "epoch": 1.6675503711558854, "grad_norm": 1.465119481086731, "learning_rate": 8.335100742311771e-06, "loss": 0.3552, "num_input_tokens_seen": 3162888, "step": 3145 }, { "epoch": 1.6702014846235418, "grad_norm": 0.9886478781700134, "learning_rate": 8.348356309650054e-06, "loss": 0.34, "num_input_tokens_seen": 3168168, "step": 3150 }, { "epoch": 1.6728525980911984, "grad_norm": 0.89689701795578, "learning_rate": 8.361611876988337e-06, "loss": 0.3156, "num_input_tokens_seen": 3173000, "step": 3155 }, { "epoch": 1.6755037115588547, "grad_norm": 1.1612122058868408, "learning_rate": 8.374867444326618e-06, "loss": 0.3503, "num_input_tokens_seen": 3177992, "step": 3160 }, { "epoch": 1.6781548250265113, "grad_norm": 3.0193188190460205, "learning_rate": 8.3881230116649e-06, "loss": 0.2982, "num_input_tokens_seen": 3183304, "step": 3165 }, { "epoch": 1.6808059384941676, "grad_norm": 0.7121695876121521, "learning_rate": 8.401378579003182e-06, "loss": 0.3216, "num_input_tokens_seen": 3187560, "step": 3170 }, { "epoch": 1.683457051961824, "grad_norm": 2.263303756713867, "learning_rate": 8.414634146341464e-06, "loss": 0.3791, "num_input_tokens_seen": 3191656, "step": 3175 }, { "epoch": 1.6861081654294803, "grad_norm": 2.0244300365448, "learning_rate": 8.427889713679747e-06, "loss": 0.3363, "num_input_tokens_seen": 3197992, "step": 3180 }, { "epoch": 1.6887592788971368, "grad_norm": 1.0041414499282837, "learning_rate": 8.441145281018028e-06, "loss": 0.3002, "num_input_tokens_seen": 3202696, "step": 3185 }, { "epoch": 1.6914103923647932, "grad_norm": 1.0364971160888672, "learning_rate": 8.45440084835631e-06, "loss": 0.3056, "num_input_tokens_seen": 3207496, "step": 3190 }, { "epoch": 1.6940615058324497, "grad_norm": 1.8821876049041748, "learning_rate": 8.467656415694594e-06, "loss": 0.397, "num_input_tokens_seen": 3212072, "step": 3195 }, { "epoch": 1.696712619300106, "grad_norm": 1.0106459856033325, "learning_rate": 8.480911983032875e-06, "loss": 0.3312, "num_input_tokens_seen": 3216904, "step": 3200 }, { "epoch": 1.6993637327677624, "grad_norm": 1.75818932056427, "learning_rate": 8.494167550371156e-06, "loss": 0.3475, "num_input_tokens_seen": 3223080, "step": 3205 }, { "epoch": 1.7020148462354188, "grad_norm": 1.2515482902526855, "learning_rate": 8.507423117709438e-06, "loss": 0.359, "num_input_tokens_seen": 3227560, "step": 3210 }, { "epoch": 1.7046659597030753, "grad_norm": 1.18337881565094, "learning_rate": 8.520678685047721e-06, "loss": 0.3102, "num_input_tokens_seen": 3233512, "step": 3215 }, { "epoch": 1.7073170731707317, "grad_norm": 1.143583059310913, "learning_rate": 8.533934252386002e-06, "loss": 0.3274, "num_input_tokens_seen": 3239048, "step": 3220 }, { "epoch": 1.7099681866383882, "grad_norm": 1.1978884935379028, "learning_rate": 8.547189819724285e-06, "loss": 0.4254, "num_input_tokens_seen": 3244104, "step": 3225 }, { "epoch": 1.7126193001060446, "grad_norm": 0.9978301525115967, "learning_rate": 8.560445387062568e-06, "loss": 0.3277, "num_input_tokens_seen": 3249448, "step": 3230 }, { "epoch": 1.715270413573701, "grad_norm": 1.341477870941162, "learning_rate": 8.573700954400849e-06, "loss": 0.3103, "num_input_tokens_seen": 3254344, "step": 3235 }, { "epoch": 1.7179215270413573, "grad_norm": 2.0875368118286133, "learning_rate": 8.586956521739131e-06, "loss": 0.316, "num_input_tokens_seen": 3259528, "step": 3240 }, { "epoch": 1.7205726405090138, "grad_norm": 0.7330546975135803, "learning_rate": 8.600212089077412e-06, "loss": 0.3052, "num_input_tokens_seen": 3264104, "step": 3245 }, { "epoch": 1.7232237539766702, "grad_norm": 0.8045488595962524, "learning_rate": 8.613467656415695e-06, "loss": 0.3205, "num_input_tokens_seen": 3269288, "step": 3250 }, { "epoch": 1.7258748674443267, "grad_norm": 2.875115156173706, "learning_rate": 8.626723223753978e-06, "loss": 0.3885, "num_input_tokens_seen": 3273864, "step": 3255 }, { "epoch": 1.728525980911983, "grad_norm": 2.053762674331665, "learning_rate": 8.639978791092259e-06, "loss": 0.2553, "num_input_tokens_seen": 3278664, "step": 3260 }, { "epoch": 1.7311770943796394, "grad_norm": 1.6378449201583862, "learning_rate": 8.653234358430542e-06, "loss": 0.4163, "num_input_tokens_seen": 3283656, "step": 3265 }, { "epoch": 1.7338282078472957, "grad_norm": 1.0383445024490356, "learning_rate": 8.666489925768824e-06, "loss": 0.3483, "num_input_tokens_seen": 3288296, "step": 3270 }, { "epoch": 1.7364793213149523, "grad_norm": 1.7154470682144165, "learning_rate": 8.679745493107105e-06, "loss": 0.3281, "num_input_tokens_seen": 3294152, "step": 3275 }, { "epoch": 1.7391304347826086, "grad_norm": 1.9847846031188965, "learning_rate": 8.693001060445388e-06, "loss": 0.3584, "num_input_tokens_seen": 3299784, "step": 3280 }, { "epoch": 1.7417815482502652, "grad_norm": 2.792787551879883, "learning_rate": 8.706256627783671e-06, "loss": 0.3392, "num_input_tokens_seen": 3305032, "step": 3285 }, { "epoch": 1.7444326617179216, "grad_norm": 3.686734437942505, "learning_rate": 8.719512195121952e-06, "loss": 0.3022, "num_input_tokens_seen": 3308968, "step": 3290 }, { "epoch": 1.747083775185578, "grad_norm": 3.475872039794922, "learning_rate": 8.732767762460233e-06, "loss": 0.3444, "num_input_tokens_seen": 3313800, "step": 3295 }, { "epoch": 1.7497348886532342, "grad_norm": 1.7899328470230103, "learning_rate": 8.746023329798516e-06, "loss": 0.3357, "num_input_tokens_seen": 3317928, "step": 3300 }, { "epoch": 1.7523860021208908, "grad_norm": 0.983420729637146, "learning_rate": 8.759278897136799e-06, "loss": 0.2972, "num_input_tokens_seen": 3321736, "step": 3305 }, { "epoch": 1.7550371155885471, "grad_norm": 1.4092952013015747, "learning_rate": 8.77253446447508e-06, "loss": 0.3954, "num_input_tokens_seen": 3326600, "step": 3310 }, { "epoch": 1.7576882290562037, "grad_norm": 2.094761610031128, "learning_rate": 8.785790031813362e-06, "loss": 0.3089, "num_input_tokens_seen": 3331144, "step": 3315 }, { "epoch": 1.76033934252386, "grad_norm": 1.3707869052886963, "learning_rate": 8.799045599151645e-06, "loss": 0.2963, "num_input_tokens_seen": 3335400, "step": 3320 }, { "epoch": 1.7629904559915164, "grad_norm": 3.3401496410369873, "learning_rate": 8.812301166489926e-06, "loss": 0.2918, "num_input_tokens_seen": 3339912, "step": 3325 }, { "epoch": 1.7656415694591727, "grad_norm": 5.482647895812988, "learning_rate": 8.825556733828209e-06, "loss": 0.2954, "num_input_tokens_seen": 3344360, "step": 3330 }, { "epoch": 1.7682926829268293, "grad_norm": 12.760357856750488, "learning_rate": 8.83881230116649e-06, "loss": 0.4422, "num_input_tokens_seen": 3349640, "step": 3335 }, { "epoch": 1.7709437963944858, "grad_norm": 1.9082622528076172, "learning_rate": 8.852067868504773e-06, "loss": 0.3004, "num_input_tokens_seen": 3354696, "step": 3340 }, { "epoch": 1.7735949098621422, "grad_norm": 2.906261682510376, "learning_rate": 8.865323435843055e-06, "loss": 0.374, "num_input_tokens_seen": 3359624, "step": 3345 }, { "epoch": 1.7762460233297985, "grad_norm": 4.925124645233154, "learning_rate": 8.878579003181336e-06, "loss": 0.2643, "num_input_tokens_seen": 3364168, "step": 3350 }, { "epoch": 1.7788971367974549, "grad_norm": 5.236072063446045, "learning_rate": 8.891834570519619e-06, "loss": 0.4055, "num_input_tokens_seen": 3370056, "step": 3355 }, { "epoch": 1.7815482502651112, "grad_norm": 3.0121641159057617, "learning_rate": 8.905090137857902e-06, "loss": 0.3634, "num_input_tokens_seen": 3374824, "step": 3360 }, { "epoch": 1.7841993637327678, "grad_norm": 2.952296733856201, "learning_rate": 8.918345705196183e-06, "loss": 0.3812, "num_input_tokens_seen": 3380072, "step": 3365 }, { "epoch": 1.7868504772004243, "grad_norm": 3.3353614807128906, "learning_rate": 8.931601272534464e-06, "loss": 0.3535, "num_input_tokens_seen": 3385032, "step": 3370 }, { "epoch": 1.7895015906680807, "grad_norm": 1.4999767541885376, "learning_rate": 8.944856839872748e-06, "loss": 0.3254, "num_input_tokens_seen": 3390504, "step": 3375 }, { "epoch": 1.792152704135737, "grad_norm": 1.067877173423767, "learning_rate": 8.95811240721103e-06, "loss": 0.3649, "num_input_tokens_seen": 3396104, "step": 3380 }, { "epoch": 1.7948038176033934, "grad_norm": 1.1947733163833618, "learning_rate": 8.97136797454931e-06, "loss": 0.3226, "num_input_tokens_seen": 3401160, "step": 3385 }, { "epoch": 1.7974549310710497, "grad_norm": 3.8351638317108154, "learning_rate": 8.984623541887593e-06, "loss": 0.3195, "num_input_tokens_seen": 3406248, "step": 3390 }, { "epoch": 1.8001060445387063, "grad_norm": 0.9947619438171387, "learning_rate": 8.997879109225876e-06, "loss": 0.2794, "num_input_tokens_seen": 3410984, "step": 3395 }, { "epoch": 1.8027571580063628, "grad_norm": 1.0595146417617798, "learning_rate": 9.011134676564157e-06, "loss": 0.3554, "num_input_tokens_seen": 3416072, "step": 3400 }, { "epoch": 1.8054082714740192, "grad_norm": 3.5387556552886963, "learning_rate": 9.02439024390244e-06, "loss": 0.3983, "num_input_tokens_seen": 3420904, "step": 3405 }, { "epoch": 1.8080593849416755, "grad_norm": 2.5657947063446045, "learning_rate": 9.037645811240722e-06, "loss": 0.3422, "num_input_tokens_seen": 3426440, "step": 3410 }, { "epoch": 1.8107104984093318, "grad_norm": 1.681320309638977, "learning_rate": 9.050901378579004e-06, "loss": 0.35, "num_input_tokens_seen": 3431272, "step": 3415 }, { "epoch": 1.8133616118769882, "grad_norm": 5.483736515045166, "learning_rate": 9.064156945917286e-06, "loss": 0.3938, "num_input_tokens_seen": 3435464, "step": 3420 }, { "epoch": 1.8160127253446448, "grad_norm": 1.8682692050933838, "learning_rate": 9.077412513255567e-06, "loss": 0.3509, "num_input_tokens_seen": 3440616, "step": 3425 }, { "epoch": 1.8186638388123013, "grad_norm": 1.1985163688659668, "learning_rate": 9.09066808059385e-06, "loss": 0.3249, "num_input_tokens_seen": 3445256, "step": 3430 }, { "epoch": 1.8213149522799577, "grad_norm": 1.0247782468795776, "learning_rate": 9.103923647932133e-06, "loss": 0.3389, "num_input_tokens_seen": 3450152, "step": 3435 }, { "epoch": 1.823966065747614, "grad_norm": 1.3089128732681274, "learning_rate": 9.117179215270414e-06, "loss": 0.3464, "num_input_tokens_seen": 3454312, "step": 3440 }, { "epoch": 1.8266171792152703, "grad_norm": 1.088629126548767, "learning_rate": 9.130434782608697e-06, "loss": 0.3437, "num_input_tokens_seen": 3460104, "step": 3445 }, { "epoch": 1.8292682926829267, "grad_norm": 2.0439095497131348, "learning_rate": 9.14369034994698e-06, "loss": 0.3597, "num_input_tokens_seen": 3464136, "step": 3450 }, { "epoch": 1.8319194061505832, "grad_norm": 3.409231662750244, "learning_rate": 9.15694591728526e-06, "loss": 0.3733, "num_input_tokens_seen": 3468552, "step": 3455 }, { "epoch": 1.8345705196182398, "grad_norm": 0.6449555158615112, "learning_rate": 9.170201484623541e-06, "loss": 0.3483, "num_input_tokens_seen": 3472488, "step": 3460 }, { "epoch": 1.8372216330858961, "grad_norm": 1.5528491735458374, "learning_rate": 9.183457051961824e-06, "loss": 0.3409, "num_input_tokens_seen": 3476072, "step": 3465 }, { "epoch": 1.8398727465535525, "grad_norm": 0.8574497103691101, "learning_rate": 9.196712619300107e-06, "loss": 0.3473, "num_input_tokens_seen": 3480264, "step": 3470 }, { "epoch": 1.8425238600212088, "grad_norm": 1.005354404449463, "learning_rate": 9.20996818663839e-06, "loss": 0.3028, "num_input_tokens_seen": 3485896, "step": 3475 }, { "epoch": 1.8451749734888652, "grad_norm": 2.1988587379455566, "learning_rate": 9.22322375397667e-06, "loss": 0.299, "num_input_tokens_seen": 3493992, "step": 3480 }, { "epoch": 1.8478260869565217, "grad_norm": 2.926603078842163, "learning_rate": 9.236479321314953e-06, "loss": 0.3149, "num_input_tokens_seen": 3499240, "step": 3485 }, { "epoch": 1.8504772004241783, "grad_norm": 1.7073930501937866, "learning_rate": 9.249734888653236e-06, "loss": 0.4564, "num_input_tokens_seen": 3505160, "step": 3490 }, { "epoch": 1.8531283138918346, "grad_norm": 0.8470478057861328, "learning_rate": 9.262990455991517e-06, "loss": 0.309, "num_input_tokens_seen": 3510120, "step": 3495 }, { "epoch": 1.855779427359491, "grad_norm": 1.4931180477142334, "learning_rate": 9.2762460233298e-06, "loss": 0.3258, "num_input_tokens_seen": 3514920, "step": 3500 }, { "epoch": 1.8584305408271473, "grad_norm": 2.450784921646118, "learning_rate": 9.289501590668083e-06, "loss": 0.3176, "num_input_tokens_seen": 3520296, "step": 3505 }, { "epoch": 1.8610816542948037, "grad_norm": 1.0051296949386597, "learning_rate": 9.302757158006364e-06, "loss": 0.3003, "num_input_tokens_seen": 3524488, "step": 3510 }, { "epoch": 1.8637327677624602, "grad_norm": 1.3701915740966797, "learning_rate": 9.316012725344645e-06, "loss": 0.3657, "num_input_tokens_seen": 3527976, "step": 3515 }, { "epoch": 1.8663838812301168, "grad_norm": 1.3676164150238037, "learning_rate": 9.329268292682927e-06, "loss": 0.3383, "num_input_tokens_seen": 3532136, "step": 3520 }, { "epoch": 1.8690349946977731, "grad_norm": 0.9998632669448853, "learning_rate": 9.34252386002121e-06, "loss": 0.3321, "num_input_tokens_seen": 3536776, "step": 3525 }, { "epoch": 1.8716861081654295, "grad_norm": 1.3708542585372925, "learning_rate": 9.355779427359491e-06, "loss": 0.3465, "num_input_tokens_seen": 3541448, "step": 3530 }, { "epoch": 1.8743372216330858, "grad_norm": 1.2624917030334473, "learning_rate": 9.369034994697774e-06, "loss": 0.3144, "num_input_tokens_seen": 3545768, "step": 3535 }, { "epoch": 1.8769883351007424, "grad_norm": 3.3667869567871094, "learning_rate": 9.382290562036057e-06, "loss": 0.3302, "num_input_tokens_seen": 3549736, "step": 3540 }, { "epoch": 1.8796394485683987, "grad_norm": 2.1362555027008057, "learning_rate": 9.395546129374338e-06, "loss": 0.3893, "num_input_tokens_seen": 3554344, "step": 3545 }, { "epoch": 1.8822905620360553, "grad_norm": 2.6623952388763428, "learning_rate": 9.40880169671262e-06, "loss": 0.3917, "num_input_tokens_seen": 3560136, "step": 3550 }, { "epoch": 1.8849416755037116, "grad_norm": 1.4404734373092651, "learning_rate": 9.422057264050902e-06, "loss": 0.2951, "num_input_tokens_seen": 3564360, "step": 3555 }, { "epoch": 1.887592788971368, "grad_norm": 2.4315083026885986, "learning_rate": 9.435312831389184e-06, "loss": 0.3229, "num_input_tokens_seen": 3569192, "step": 3560 }, { "epoch": 1.8902439024390243, "grad_norm": 1.7359511852264404, "learning_rate": 9.448568398727467e-06, "loss": 0.3082, "num_input_tokens_seen": 3574152, "step": 3565 }, { "epoch": 1.8928950159066809, "grad_norm": 1.2507356405258179, "learning_rate": 9.461823966065748e-06, "loss": 0.339, "num_input_tokens_seen": 3579944, "step": 3570 }, { "epoch": 1.8955461293743372, "grad_norm": 3.1430797576904297, "learning_rate": 9.47507953340403e-06, "loss": 0.2922, "num_input_tokens_seen": 3584808, "step": 3575 }, { "epoch": 1.8981972428419938, "grad_norm": 1.2607841491699219, "learning_rate": 9.488335100742314e-06, "loss": 0.3446, "num_input_tokens_seen": 3589640, "step": 3580 }, { "epoch": 1.90084835630965, "grad_norm": 1.9004510641098022, "learning_rate": 9.501590668080595e-06, "loss": 0.3982, "num_input_tokens_seen": 3594376, "step": 3585 }, { "epoch": 1.9034994697773064, "grad_norm": 1.7445321083068848, "learning_rate": 9.514846235418876e-06, "loss": 0.3272, "num_input_tokens_seen": 3598632, "step": 3590 }, { "epoch": 1.9061505832449628, "grad_norm": 1.7495591640472412, "learning_rate": 9.52810180275716e-06, "loss": 0.3541, "num_input_tokens_seen": 3603720, "step": 3595 }, { "epoch": 1.9088016967126193, "grad_norm": 1.7179832458496094, "learning_rate": 9.541357370095441e-06, "loss": 0.3078, "num_input_tokens_seen": 3609704, "step": 3600 }, { "epoch": 1.9114528101802757, "grad_norm": 1.4328171014785767, "learning_rate": 9.554612937433722e-06, "loss": 0.3239, "num_input_tokens_seen": 3614888, "step": 3605 }, { "epoch": 1.9141039236479322, "grad_norm": 3.7855303287506104, "learning_rate": 9.567868504772005e-06, "loss": 0.3223, "num_input_tokens_seen": 3619976, "step": 3610 }, { "epoch": 1.9167550371155886, "grad_norm": 4.087214469909668, "learning_rate": 9.581124072110288e-06, "loss": 0.3098, "num_input_tokens_seen": 3625128, "step": 3615 }, { "epoch": 1.919406150583245, "grad_norm": 2.1526660919189453, "learning_rate": 9.594379639448569e-06, "loss": 0.2815, "num_input_tokens_seen": 3630824, "step": 3620 }, { "epoch": 1.9220572640509013, "grad_norm": 1.8465619087219238, "learning_rate": 9.607635206786851e-06, "loss": 0.3447, "num_input_tokens_seen": 3635784, "step": 3625 }, { "epoch": 1.9247083775185578, "grad_norm": 2.972574234008789, "learning_rate": 9.620890774125134e-06, "loss": 0.2878, "num_input_tokens_seen": 3641160, "step": 3630 }, { "epoch": 1.9273594909862142, "grad_norm": 5.048880100250244, "learning_rate": 9.634146341463415e-06, "loss": 0.4426, "num_input_tokens_seen": 3645736, "step": 3635 }, { "epoch": 1.9300106044538707, "grad_norm": 2.820141315460205, "learning_rate": 9.647401908801698e-06, "loss": 0.2905, "num_input_tokens_seen": 3650600, "step": 3640 }, { "epoch": 1.932661717921527, "grad_norm": 3.107917547225952, "learning_rate": 9.660657476139979e-06, "loss": 0.3111, "num_input_tokens_seen": 3655304, "step": 3645 }, { "epoch": 1.9353128313891834, "grad_norm": 1.9605566263198853, "learning_rate": 9.673913043478262e-06, "loss": 0.3459, "num_input_tokens_seen": 3660872, "step": 3650 }, { "epoch": 1.9379639448568398, "grad_norm": 2.171137809753418, "learning_rate": 9.687168610816544e-06, "loss": 0.3536, "num_input_tokens_seen": 3666632, "step": 3655 }, { "epoch": 1.9406150583244963, "grad_norm": 2.3954861164093018, "learning_rate": 9.700424178154825e-06, "loss": 0.3831, "num_input_tokens_seen": 3671240, "step": 3660 }, { "epoch": 1.9432661717921527, "grad_norm": 2.3409552574157715, "learning_rate": 9.713679745493108e-06, "loss": 0.384, "num_input_tokens_seen": 3676520, "step": 3665 }, { "epoch": 1.9459172852598092, "grad_norm": 1.9008415937423706, "learning_rate": 9.726935312831391e-06, "loss": 0.3465, "num_input_tokens_seen": 3681160, "step": 3670 }, { "epoch": 1.9485683987274656, "grad_norm": 2.356229543685913, "learning_rate": 9.740190880169672e-06, "loss": 0.3225, "num_input_tokens_seen": 3685128, "step": 3675 }, { "epoch": 1.951219512195122, "grad_norm": 2.3086724281311035, "learning_rate": 9.753446447507953e-06, "loss": 0.3608, "num_input_tokens_seen": 3689640, "step": 3680 }, { "epoch": 1.9538706256627782, "grad_norm": 1.459743857383728, "learning_rate": 9.766702014846236e-06, "loss": 0.3478, "num_input_tokens_seen": 3694216, "step": 3685 }, { "epoch": 1.9565217391304348, "grad_norm": 1.5540614128112793, "learning_rate": 9.779957582184519e-06, "loss": 0.3095, "num_input_tokens_seen": 3699624, "step": 3690 }, { "epoch": 1.9591728525980912, "grad_norm": 5.996768951416016, "learning_rate": 9.7932131495228e-06, "loss": 0.3542, "num_input_tokens_seen": 3704584, "step": 3695 }, { "epoch": 1.9618239660657477, "grad_norm": 2.7477781772613525, "learning_rate": 9.806468716861082e-06, "loss": 0.3416, "num_input_tokens_seen": 3709864, "step": 3700 }, { "epoch": 1.964475079533404, "grad_norm": 1.2466795444488525, "learning_rate": 9.819724284199365e-06, "loss": 0.3849, "num_input_tokens_seen": 3714920, "step": 3705 }, { "epoch": 1.9671261930010604, "grad_norm": 1.3192247152328491, "learning_rate": 9.832979851537646e-06, "loss": 0.3558, "num_input_tokens_seen": 3722664, "step": 3710 }, { "epoch": 1.9697773064687167, "grad_norm": 1.4396979808807373, "learning_rate": 9.846235418875929e-06, "loss": 0.3113, "num_input_tokens_seen": 3727720, "step": 3715 }, { "epoch": 1.9724284199363733, "grad_norm": 1.5091869831085205, "learning_rate": 9.859490986214212e-06, "loss": 0.3254, "num_input_tokens_seen": 3733160, "step": 3720 }, { "epoch": 1.9750795334040296, "grad_norm": 1.2507517337799072, "learning_rate": 9.872746553552493e-06, "loss": 0.3603, "num_input_tokens_seen": 3738664, "step": 3725 }, { "epoch": 1.9777306468716862, "grad_norm": 2.0841376781463623, "learning_rate": 9.886002120890775e-06, "loss": 0.3175, "num_input_tokens_seen": 3743368, "step": 3730 }, { "epoch": 1.9803817603393425, "grad_norm": 1.4721601009368896, "learning_rate": 9.899257688229056e-06, "loss": 0.3456, "num_input_tokens_seen": 3750152, "step": 3735 }, { "epoch": 1.9830328738069989, "grad_norm": 1.1719605922698975, "learning_rate": 9.912513255567339e-06, "loss": 0.337, "num_input_tokens_seen": 3756648, "step": 3740 }, { "epoch": 1.9856839872746552, "grad_norm": 1.775428295135498, "learning_rate": 9.925768822905622e-06, "loss": 0.361, "num_input_tokens_seen": 3761256, "step": 3745 }, { "epoch": 1.9883351007423118, "grad_norm": 1.7366381883621216, "learning_rate": 9.939024390243903e-06, "loss": 0.3599, "num_input_tokens_seen": 3765576, "step": 3750 }, { "epoch": 1.9909862142099684, "grad_norm": 2.48786997795105, "learning_rate": 9.952279957582186e-06, "loss": 0.3468, "num_input_tokens_seen": 3769864, "step": 3755 }, { "epoch": 1.9936373276776247, "grad_norm": 1.0788158178329468, "learning_rate": 9.965535524920468e-06, "loss": 0.3079, "num_input_tokens_seen": 3775176, "step": 3760 }, { "epoch": 1.996288441145281, "grad_norm": 2.592529296875, "learning_rate": 9.97879109225875e-06, "loss": 0.33, "num_input_tokens_seen": 3781096, "step": 3765 }, { "epoch": 1.9989395546129374, "grad_norm": 3.4258971214294434, "learning_rate": 9.99204665959703e-06, "loss": 0.4523, "num_input_tokens_seen": 3785704, "step": 3770 }, { "epoch": 2.0, "eval_loss": 0.32950884103775024, "eval_runtime": 29.2182, "eval_samples_per_second": 64.549, "eval_steps_per_second": 16.154, "num_input_tokens_seen": 3786792, "step": 3772 }, { "epoch": 2.0015906680805937, "grad_norm": 1.4639918804168701, "learning_rate": 9.999999914361046e-06, "loss": 0.3555, "num_input_tokens_seen": 3789288, "step": 3775 }, { "epoch": 2.00424178154825, "grad_norm": 2.408226728439331, "learning_rate": 9.99999895092284e-06, "loss": 0.3407, "num_input_tokens_seen": 3795144, "step": 3780 }, { "epoch": 2.006892895015907, "grad_norm": 1.5259562730789185, "learning_rate": 9.999996916997943e-06, "loss": 0.3378, "num_input_tokens_seen": 3799592, "step": 3785 }, { "epoch": 2.009544008483563, "grad_norm": 2.205695867538452, "learning_rate": 9.99999381258679e-06, "loss": 0.361, "num_input_tokens_seen": 3803272, "step": 3790 }, { "epoch": 2.0121951219512195, "grad_norm": 1.4625104665756226, "learning_rate": 9.999989637690046e-06, "loss": 0.318, "num_input_tokens_seen": 3808456, "step": 3795 }, { "epoch": 2.014846235418876, "grad_norm": 1.1207523345947266, "learning_rate": 9.999984392308602e-06, "loss": 0.3333, "num_input_tokens_seen": 3813704, "step": 3800 }, { "epoch": 2.017497348886532, "grad_norm": 2.1348092555999756, "learning_rate": 9.999978076443586e-06, "loss": 0.3353, "num_input_tokens_seen": 3818792, "step": 3805 }, { "epoch": 2.0201484623541885, "grad_norm": 1.6425188779830933, "learning_rate": 9.999970690096345e-06, "loss": 0.3593, "num_input_tokens_seen": 3824744, "step": 3810 }, { "epoch": 2.0227995758218453, "grad_norm": 1.5479648113250732, "learning_rate": 9.999962233268463e-06, "loss": 0.3072, "num_input_tokens_seen": 3829224, "step": 3815 }, { "epoch": 2.0254506892895017, "grad_norm": 3.479700803756714, "learning_rate": 9.999952705961752e-06, "loss": 0.315, "num_input_tokens_seen": 3833960, "step": 3820 }, { "epoch": 2.028101802757158, "grad_norm": 1.7385318279266357, "learning_rate": 9.99994210817825e-06, "loss": 0.3493, "num_input_tokens_seen": 3838760, "step": 3825 }, { "epoch": 2.0307529162248144, "grad_norm": 1.728268027305603, "learning_rate": 9.999930439920225e-06, "loss": 0.332, "num_input_tokens_seen": 3844232, "step": 3830 }, { "epoch": 2.0334040296924707, "grad_norm": 1.2590038776397705, "learning_rate": 9.999917701190176e-06, "loss": 0.3621, "num_input_tokens_seen": 3849192, "step": 3835 }, { "epoch": 2.0360551431601275, "grad_norm": 1.4956721067428589, "learning_rate": 9.999903891990832e-06, "loss": 0.311, "num_input_tokens_seen": 3854024, "step": 3840 }, { "epoch": 2.038706256627784, "grad_norm": 2.8484387397766113, "learning_rate": 9.999889012325148e-06, "loss": 0.3125, "num_input_tokens_seen": 3861160, "step": 3845 }, { "epoch": 2.04135737009544, "grad_norm": 1.147659182548523, "learning_rate": 9.99987306219631e-06, "loss": 0.3, "num_input_tokens_seen": 3866440, "step": 3850 }, { "epoch": 2.0440084835630965, "grad_norm": 2.06652569770813, "learning_rate": 9.999856041607732e-06, "loss": 0.3283, "num_input_tokens_seen": 3870536, "step": 3855 }, { "epoch": 2.046659597030753, "grad_norm": 1.63395357131958, "learning_rate": 9.999837950563059e-06, "loss": 0.3516, "num_input_tokens_seen": 3874600, "step": 3860 }, { "epoch": 2.049310710498409, "grad_norm": 2.0683252811431885, "learning_rate": 9.999818789066164e-06, "loss": 0.2973, "num_input_tokens_seen": 3879560, "step": 3865 }, { "epoch": 2.0519618239660655, "grad_norm": 1.5750685930252075, "learning_rate": 9.99979855712115e-06, "loss": 0.3428, "num_input_tokens_seen": 3884392, "step": 3870 }, { "epoch": 2.0546129374337223, "grad_norm": 1.5054960250854492, "learning_rate": 9.999777254732347e-06, "loss": 0.2776, "num_input_tokens_seen": 3889736, "step": 3875 }, { "epoch": 2.0572640509013786, "grad_norm": 1.4297832250595093, "learning_rate": 9.999754881904317e-06, "loss": 0.3514, "num_input_tokens_seen": 3894728, "step": 3880 }, { "epoch": 2.059915164369035, "grad_norm": 4.326128959655762, "learning_rate": 9.999731438641851e-06, "loss": 0.3556, "num_input_tokens_seen": 3899720, "step": 3885 }, { "epoch": 2.0625662778366913, "grad_norm": 2.7146077156066895, "learning_rate": 9.999706924949966e-06, "loss": 0.3339, "num_input_tokens_seen": 3904712, "step": 3890 }, { "epoch": 2.0652173913043477, "grad_norm": 2.381166458129883, "learning_rate": 9.999681340833911e-06, "loss": 0.3303, "num_input_tokens_seen": 3909352, "step": 3895 }, { "epoch": 2.0678685047720045, "grad_norm": 1.6810177564620972, "learning_rate": 9.999654686299165e-06, "loss": 0.2915, "num_input_tokens_seen": 3914344, "step": 3900 }, { "epoch": 2.070519618239661, "grad_norm": 2.517723560333252, "learning_rate": 9.999626961351434e-06, "loss": 0.2936, "num_input_tokens_seen": 3918696, "step": 3905 }, { "epoch": 2.073170731707317, "grad_norm": 1.6751279830932617, "learning_rate": 9.999598165996651e-06, "loss": 0.3225, "num_input_tokens_seen": 3923368, "step": 3910 }, { "epoch": 2.0758218451749735, "grad_norm": 3.3273110389709473, "learning_rate": 9.999568300240985e-06, "loss": 0.2646, "num_input_tokens_seen": 3929096, "step": 3915 }, { "epoch": 2.07847295864263, "grad_norm": 2.1011462211608887, "learning_rate": 9.999537364090828e-06, "loss": 0.1812, "num_input_tokens_seen": 3933352, "step": 3920 }, { "epoch": 2.081124072110286, "grad_norm": 1.7628991603851318, "learning_rate": 9.999505357552804e-06, "loss": 0.4739, "num_input_tokens_seen": 3938760, "step": 3925 }, { "epoch": 2.083775185577943, "grad_norm": 3.453199863433838, "learning_rate": 9.999472280633764e-06, "loss": 0.3923, "num_input_tokens_seen": 3943208, "step": 3930 }, { "epoch": 2.0864262990455993, "grad_norm": 1.9780856370925903, "learning_rate": 9.999438133340792e-06, "loss": 0.3138, "num_input_tokens_seen": 3947624, "step": 3935 }, { "epoch": 2.0890774125132556, "grad_norm": 4.419776916503906, "learning_rate": 9.999402915681198e-06, "loss": 0.306, "num_input_tokens_seen": 3952296, "step": 3940 }, { "epoch": 2.091728525980912, "grad_norm": 2.33111572265625, "learning_rate": 9.999366627662522e-06, "loss": 0.3075, "num_input_tokens_seen": 3957448, "step": 3945 }, { "epoch": 2.0943796394485683, "grad_norm": 4.830143928527832, "learning_rate": 9.999329269292533e-06, "loss": 0.408, "num_input_tokens_seen": 3962344, "step": 3950 }, { "epoch": 2.0970307529162246, "grad_norm": 1.980584979057312, "learning_rate": 9.999290840579229e-06, "loss": 0.323, "num_input_tokens_seen": 3966760, "step": 3955 }, { "epoch": 2.0996818663838814, "grad_norm": 2.1130728721618652, "learning_rate": 9.999251341530839e-06, "loss": 0.3198, "num_input_tokens_seen": 3971336, "step": 3960 }, { "epoch": 2.1023329798515378, "grad_norm": 1.407677173614502, "learning_rate": 9.999210772155816e-06, "loss": 0.3273, "num_input_tokens_seen": 3977160, "step": 3965 }, { "epoch": 2.104984093319194, "grad_norm": 1.362854242324829, "learning_rate": 9.999169132462849e-06, "loss": 0.3147, "num_input_tokens_seen": 3982248, "step": 3970 }, { "epoch": 2.1076352067868505, "grad_norm": 5.064838409423828, "learning_rate": 9.999126422460854e-06, "loss": 0.3701, "num_input_tokens_seen": 3986952, "step": 3975 }, { "epoch": 2.110286320254507, "grad_norm": 1.2653611898422241, "learning_rate": 9.999082642158972e-06, "loss": 0.2447, "num_input_tokens_seen": 3992328, "step": 3980 }, { "epoch": 2.112937433722163, "grad_norm": 1.582877516746521, "learning_rate": 9.999037791566579e-06, "loss": 0.3752, "num_input_tokens_seen": 3997192, "step": 3985 }, { "epoch": 2.11558854718982, "grad_norm": 2.0539774894714355, "learning_rate": 9.998991870693273e-06, "loss": 0.2989, "num_input_tokens_seen": 4001416, "step": 3990 }, { "epoch": 2.1182396606574763, "grad_norm": 2.827834367752075, "learning_rate": 9.998944879548893e-06, "loss": 0.41, "num_input_tokens_seen": 4005320, "step": 3995 }, { "epoch": 2.1208907741251326, "grad_norm": 1.8350270986557007, "learning_rate": 9.998896818143492e-06, "loss": 0.2319, "num_input_tokens_seen": 4009608, "step": 4000 }, { "epoch": 2.123541887592789, "grad_norm": 2.076072931289673, "learning_rate": 9.998847686487363e-06, "loss": 0.3123, "num_input_tokens_seen": 4014376, "step": 4005 }, { "epoch": 2.1261930010604453, "grad_norm": 4.838057518005371, "learning_rate": 9.998797484591027e-06, "loss": 0.3204, "num_input_tokens_seen": 4019144, "step": 4010 }, { "epoch": 2.1288441145281016, "grad_norm": 2.452533483505249, "learning_rate": 9.998746212465229e-06, "loss": 0.3, "num_input_tokens_seen": 4024040, "step": 4015 }, { "epoch": 2.1314952279957584, "grad_norm": 1.3009705543518066, "learning_rate": 9.998693870120945e-06, "loss": 0.2738, "num_input_tokens_seen": 4029096, "step": 4020 }, { "epoch": 2.1341463414634148, "grad_norm": 1.6232308149337769, "learning_rate": 9.998640457569386e-06, "loss": 0.3884, "num_input_tokens_seen": 4034920, "step": 4025 }, { "epoch": 2.136797454931071, "grad_norm": 2.8586740493774414, "learning_rate": 9.998585974821986e-06, "loss": 0.3529, "num_input_tokens_seen": 4039624, "step": 4030 }, { "epoch": 2.1394485683987274, "grad_norm": 1.6512142419815063, "learning_rate": 9.998530421890407e-06, "loss": 0.3755, "num_input_tokens_seen": 4044072, "step": 4035 }, { "epoch": 2.1420996818663838, "grad_norm": 2.3425381183624268, "learning_rate": 9.998473798786546e-06, "loss": 0.3214, "num_input_tokens_seen": 4049960, "step": 4040 }, { "epoch": 2.14475079533404, "grad_norm": 1.8312472105026245, "learning_rate": 9.998416105522524e-06, "loss": 0.2812, "num_input_tokens_seen": 4055144, "step": 4045 }, { "epoch": 2.147401908801697, "grad_norm": 1.9925678968429565, "learning_rate": 9.998357342110693e-06, "loss": 0.3035, "num_input_tokens_seen": 4059176, "step": 4050 }, { "epoch": 2.1500530222693532, "grad_norm": 2.202847719192505, "learning_rate": 9.998297508563634e-06, "loss": 0.2466, "num_input_tokens_seen": 4064200, "step": 4055 }, { "epoch": 2.1527041357370096, "grad_norm": 2.8479528427124023, "learning_rate": 9.998236604894158e-06, "loss": 0.3066, "num_input_tokens_seen": 4070632, "step": 4060 }, { "epoch": 2.155355249204666, "grad_norm": 3.212773561477661, "learning_rate": 9.998174631115303e-06, "loss": 0.5457, "num_input_tokens_seen": 4074696, "step": 4065 }, { "epoch": 2.1580063626723223, "grad_norm": 4.745420455932617, "learning_rate": 9.99811158724034e-06, "loss": 0.3051, "num_input_tokens_seen": 4078984, "step": 4070 }, { "epoch": 2.1606574761399786, "grad_norm": 2.5609335899353027, "learning_rate": 9.998047473282765e-06, "loss": 0.3996, "num_input_tokens_seen": 4083432, "step": 4075 }, { "epoch": 2.1633085896076354, "grad_norm": 2.1555798053741455, "learning_rate": 9.997982289256302e-06, "loss": 0.4031, "num_input_tokens_seen": 4088424, "step": 4080 }, { "epoch": 2.1659597030752917, "grad_norm": 1.5893701314926147, "learning_rate": 9.997916035174913e-06, "loss": 0.3249, "num_input_tokens_seen": 4092744, "step": 4085 }, { "epoch": 2.168610816542948, "grad_norm": 1.6321381330490112, "learning_rate": 9.997848711052777e-06, "loss": 0.3212, "num_input_tokens_seen": 4097352, "step": 4090 }, { "epoch": 2.1712619300106044, "grad_norm": 1.6562272310256958, "learning_rate": 9.99778031690431e-06, "loss": 0.3354, "num_input_tokens_seen": 4102632, "step": 4095 }, { "epoch": 2.1739130434782608, "grad_norm": 1.7897478342056274, "learning_rate": 9.997710852744157e-06, "loss": 0.3335, "num_input_tokens_seen": 4108264, "step": 4100 }, { "epoch": 2.176564156945917, "grad_norm": 1.3018629550933838, "learning_rate": 9.997640318587186e-06, "loss": 0.3373, "num_input_tokens_seen": 4113768, "step": 4105 }, { "epoch": 2.179215270413574, "grad_norm": 1.733642339706421, "learning_rate": 9.9975687144485e-06, "loss": 0.3417, "num_input_tokens_seen": 4117992, "step": 4110 }, { "epoch": 2.18186638388123, "grad_norm": 1.7558513879776, "learning_rate": 9.99749604034343e-06, "loss": 0.3126, "num_input_tokens_seen": 4122920, "step": 4115 }, { "epoch": 2.1845174973488866, "grad_norm": 1.8243309259414673, "learning_rate": 9.997422296287534e-06, "loss": 0.3413, "num_input_tokens_seen": 4128040, "step": 4120 }, { "epoch": 2.187168610816543, "grad_norm": 1.8081051111221313, "learning_rate": 9.997347482296603e-06, "loss": 0.3219, "num_input_tokens_seen": 4134120, "step": 4125 }, { "epoch": 2.1898197242841992, "grad_norm": 1.6625264883041382, "learning_rate": 9.997271598386653e-06, "loss": 0.2923, "num_input_tokens_seen": 4139208, "step": 4130 }, { "epoch": 2.1924708377518556, "grad_norm": 2.1161105632781982, "learning_rate": 9.99719464457393e-06, "loss": 0.3111, "num_input_tokens_seen": 4145160, "step": 4135 }, { "epoch": 2.1951219512195124, "grad_norm": 1.6280951499938965, "learning_rate": 9.997116620874908e-06, "loss": 0.3324, "num_input_tokens_seen": 4149544, "step": 4140 }, { "epoch": 2.1977730646871687, "grad_norm": 1.708295226097107, "learning_rate": 9.997037527306294e-06, "loss": 0.3235, "num_input_tokens_seen": 4154568, "step": 4145 }, { "epoch": 2.200424178154825, "grad_norm": 2.2993433475494385, "learning_rate": 9.996957363885024e-06, "loss": 0.2715, "num_input_tokens_seen": 4158984, "step": 4150 }, { "epoch": 2.2030752916224814, "grad_norm": 2.825035572052002, "learning_rate": 9.996876130628257e-06, "loss": 0.2889, "num_input_tokens_seen": 4164360, "step": 4155 }, { "epoch": 2.2057264050901377, "grad_norm": 3.733164072036743, "learning_rate": 9.996793827553383e-06, "loss": 0.3511, "num_input_tokens_seen": 4170568, "step": 4160 }, { "epoch": 2.208377518557794, "grad_norm": 4.716299057006836, "learning_rate": 9.996710454678029e-06, "loss": 0.3601, "num_input_tokens_seen": 4175752, "step": 4165 }, { "epoch": 2.211028632025451, "grad_norm": 4.974800109863281, "learning_rate": 9.996626012020043e-06, "loss": 0.4195, "num_input_tokens_seen": 4179720, "step": 4170 }, { "epoch": 2.213679745493107, "grad_norm": 1.77295982837677, "learning_rate": 9.9965404995975e-06, "loss": 0.3342, "num_input_tokens_seen": 4184104, "step": 4175 }, { "epoch": 2.2163308589607635, "grad_norm": 2.473114252090454, "learning_rate": 9.99645391742871e-06, "loss": 0.3764, "num_input_tokens_seen": 4189352, "step": 4180 }, { "epoch": 2.21898197242842, "grad_norm": 1.5241763591766357, "learning_rate": 9.996366265532213e-06, "loss": 0.3545, "num_input_tokens_seen": 4194728, "step": 4185 }, { "epoch": 2.221633085896076, "grad_norm": 2.5167200565338135, "learning_rate": 9.99627754392677e-06, "loss": 0.3353, "num_input_tokens_seen": 4200072, "step": 4190 }, { "epoch": 2.2242841993637326, "grad_norm": 2.2590487003326416, "learning_rate": 9.996187752631381e-06, "loss": 0.2719, "num_input_tokens_seen": 4204680, "step": 4195 }, { "epoch": 2.2269353128313893, "grad_norm": 2.1616432666778564, "learning_rate": 9.996096891665268e-06, "loss": 0.3352, "num_input_tokens_seen": 4209544, "step": 4200 }, { "epoch": 2.2295864262990457, "grad_norm": 3.7294468879699707, "learning_rate": 9.996004961047883e-06, "loss": 0.2972, "num_input_tokens_seen": 4214280, "step": 4205 }, { "epoch": 2.232237539766702, "grad_norm": 4.482620716094971, "learning_rate": 9.99591196079891e-06, "loss": 0.4045, "num_input_tokens_seen": 4219048, "step": 4210 }, { "epoch": 2.2348886532343584, "grad_norm": 2.6314637660980225, "learning_rate": 9.995817890938258e-06, "loss": 0.3315, "num_input_tokens_seen": 4224360, "step": 4215 }, { "epoch": 2.2375397667020147, "grad_norm": 2.1567723751068115, "learning_rate": 9.99572275148607e-06, "loss": 0.311, "num_input_tokens_seen": 4229000, "step": 4220 }, { "epoch": 2.2401908801696715, "grad_norm": 2.9207332134246826, "learning_rate": 9.995626542462712e-06, "loss": 0.3027, "num_input_tokens_seen": 4232936, "step": 4225 }, { "epoch": 2.242841993637328, "grad_norm": 2.3232433795928955, "learning_rate": 9.995529263888783e-06, "loss": 0.3344, "num_input_tokens_seen": 4238152, "step": 4230 }, { "epoch": 2.245493107104984, "grad_norm": 1.9399901628494263, "learning_rate": 9.995430915785112e-06, "loss": 0.3218, "num_input_tokens_seen": 4242568, "step": 4235 }, { "epoch": 2.2481442205726405, "grad_norm": 1.9970290660858154, "learning_rate": 9.995331498172754e-06, "loss": 0.3435, "num_input_tokens_seen": 4247688, "step": 4240 }, { "epoch": 2.250795334040297, "grad_norm": 2.3674936294555664, "learning_rate": 9.995231011072993e-06, "loss": 0.3631, "num_input_tokens_seen": 4251944, "step": 4245 }, { "epoch": 2.253446447507953, "grad_norm": 1.6259046792984009, "learning_rate": 9.995129454507342e-06, "loss": 0.329, "num_input_tokens_seen": 4257128, "step": 4250 }, { "epoch": 2.2560975609756095, "grad_norm": 1.23585045337677, "learning_rate": 9.995026828497547e-06, "loss": 0.3454, "num_input_tokens_seen": 4261256, "step": 4255 }, { "epoch": 2.2587486744432663, "grad_norm": 2.558190107345581, "learning_rate": 9.994923133065579e-06, "loss": 0.3737, "num_input_tokens_seen": 4266120, "step": 4260 }, { "epoch": 2.2613997879109227, "grad_norm": 1.239846110343933, "learning_rate": 9.994818368233639e-06, "loss": 0.3593, "num_input_tokens_seen": 4272072, "step": 4265 }, { "epoch": 2.264050901378579, "grad_norm": 1.4396393299102783, "learning_rate": 9.994712534024155e-06, "loss": 0.3087, "num_input_tokens_seen": 4276968, "step": 4270 }, { "epoch": 2.2667020148462353, "grad_norm": 2.1718051433563232, "learning_rate": 9.994605630459788e-06, "loss": 0.3514, "num_input_tokens_seen": 4281640, "step": 4275 }, { "epoch": 2.2693531283138917, "grad_norm": 1.6059575080871582, "learning_rate": 9.994497657563426e-06, "loss": 0.2959, "num_input_tokens_seen": 4286728, "step": 4280 }, { "epoch": 2.2720042417815485, "grad_norm": 1.7791502475738525, "learning_rate": 9.994388615358183e-06, "loss": 0.2972, "num_input_tokens_seen": 4291528, "step": 4285 }, { "epoch": 2.274655355249205, "grad_norm": 1.6495227813720703, "learning_rate": 9.99427850386741e-06, "loss": 0.3856, "num_input_tokens_seen": 4295464, "step": 4290 }, { "epoch": 2.277306468716861, "grad_norm": 1.600366234779358, "learning_rate": 9.994167323114675e-06, "loss": 0.2752, "num_input_tokens_seen": 4301192, "step": 4295 }, { "epoch": 2.2799575821845175, "grad_norm": 2.1739397048950195, "learning_rate": 9.994055073123785e-06, "loss": 0.2944, "num_input_tokens_seen": 4306408, "step": 4300 }, { "epoch": 2.282608695652174, "grad_norm": 2.339418888092041, "learning_rate": 9.993941753918773e-06, "loss": 0.3796, "num_input_tokens_seen": 4313832, "step": 4305 }, { "epoch": 2.28525980911983, "grad_norm": 1.470942497253418, "learning_rate": 9.993827365523897e-06, "loss": 0.297, "num_input_tokens_seen": 4319080, "step": 4310 }, { "epoch": 2.2879109225874865, "grad_norm": 1.7593445777893066, "learning_rate": 9.993711907963651e-06, "loss": 0.3239, "num_input_tokens_seen": 4323912, "step": 4315 }, { "epoch": 2.2905620360551433, "grad_norm": 1.4734457731246948, "learning_rate": 9.993595381262753e-06, "loss": 0.3736, "num_input_tokens_seen": 4329512, "step": 4320 }, { "epoch": 2.2932131495227996, "grad_norm": 1.5417529344558716, "learning_rate": 9.993477785446151e-06, "loss": 0.3256, "num_input_tokens_seen": 4333832, "step": 4325 }, { "epoch": 2.295864262990456, "grad_norm": 2.378765106201172, "learning_rate": 9.993359120539022e-06, "loss": 0.3205, "num_input_tokens_seen": 4338056, "step": 4330 }, { "epoch": 2.2985153764581123, "grad_norm": 1.4487082958221436, "learning_rate": 9.99323938656677e-06, "loss": 0.2972, "num_input_tokens_seen": 4344840, "step": 4335 }, { "epoch": 2.3011664899257687, "grad_norm": 1.9550249576568604, "learning_rate": 9.993118583555033e-06, "loss": 0.3184, "num_input_tokens_seen": 4350888, "step": 4340 }, { "epoch": 2.3038176033934255, "grad_norm": 3.7276036739349365, "learning_rate": 9.992996711529671e-06, "loss": 0.3687, "num_input_tokens_seen": 4355720, "step": 4345 }, { "epoch": 2.306468716861082, "grad_norm": 3.48157000541687, "learning_rate": 9.99287377051678e-06, "loss": 0.3674, "num_input_tokens_seen": 4360168, "step": 4350 }, { "epoch": 2.309119830328738, "grad_norm": 1.5477986335754395, "learning_rate": 9.99274976054268e-06, "loss": 0.3008, "num_input_tokens_seen": 4364584, "step": 4355 }, { "epoch": 2.3117709437963945, "grad_norm": 1.2086178064346313, "learning_rate": 9.99262468163392e-06, "loss": 0.3027, "num_input_tokens_seen": 4370312, "step": 4360 }, { "epoch": 2.314422057264051, "grad_norm": 2.745114326477051, "learning_rate": 9.99249853381728e-06, "loss": 0.3266, "num_input_tokens_seen": 4374888, "step": 4365 }, { "epoch": 2.317073170731707, "grad_norm": 3.348611831665039, "learning_rate": 9.992371317119767e-06, "loss": 0.3552, "num_input_tokens_seen": 4379816, "step": 4370 }, { "epoch": 2.3197242841993635, "grad_norm": 3.7829971313476562, "learning_rate": 9.99224303156862e-06, "loss": 0.3927, "num_input_tokens_seen": 4385128, "step": 4375 }, { "epoch": 2.3223753976670203, "grad_norm": 1.7303000688552856, "learning_rate": 9.992113677191304e-06, "loss": 0.3818, "num_input_tokens_seen": 4390824, "step": 4380 }, { "epoch": 2.3250265111346766, "grad_norm": 1.7286456823349, "learning_rate": 9.991983254015513e-06, "loss": 0.303, "num_input_tokens_seen": 4396040, "step": 4385 }, { "epoch": 2.327677624602333, "grad_norm": 1.75588059425354, "learning_rate": 9.991851762069169e-06, "loss": 0.3263, "num_input_tokens_seen": 4401224, "step": 4390 }, { "epoch": 2.3303287380699893, "grad_norm": 3.151418685913086, "learning_rate": 9.991719201380425e-06, "loss": 0.3392, "num_input_tokens_seen": 4406632, "step": 4395 }, { "epoch": 2.3329798515376456, "grad_norm": 1.7113425731658936, "learning_rate": 9.991585571977663e-06, "loss": 0.3162, "num_input_tokens_seen": 4411432, "step": 4400 }, { "epoch": 2.3356309650053024, "grad_norm": 2.1331827640533447, "learning_rate": 9.99145087388949e-06, "loss": 0.2559, "num_input_tokens_seen": 4416424, "step": 4405 }, { "epoch": 2.3382820784729588, "grad_norm": 1.4677972793579102, "learning_rate": 9.991315107144748e-06, "loss": 0.4566, "num_input_tokens_seen": 4421096, "step": 4410 }, { "epoch": 2.340933191940615, "grad_norm": 2.1914937496185303, "learning_rate": 9.991178271772501e-06, "loss": 0.2492, "num_input_tokens_seen": 4425992, "step": 4415 }, { "epoch": 2.3435843054082715, "grad_norm": 1.508065104484558, "learning_rate": 9.991040367802047e-06, "loss": 0.2139, "num_input_tokens_seen": 4431304, "step": 4420 }, { "epoch": 2.346235418875928, "grad_norm": 1.7055732011795044, "learning_rate": 9.990901395262911e-06, "loss": 0.3267, "num_input_tokens_seen": 4436072, "step": 4425 }, { "epoch": 2.348886532343584, "grad_norm": 4.783508777618408, "learning_rate": 9.990761354184846e-06, "loss": 0.3798, "num_input_tokens_seen": 4440616, "step": 4430 }, { "epoch": 2.3515376458112405, "grad_norm": 1.6841068267822266, "learning_rate": 9.990620244597833e-06, "loss": 0.228, "num_input_tokens_seen": 4445608, "step": 4435 }, { "epoch": 2.3541887592788973, "grad_norm": 2.4450695514678955, "learning_rate": 9.990478066532088e-06, "loss": 0.3631, "num_input_tokens_seen": 4450184, "step": 4440 }, { "epoch": 2.3568398727465536, "grad_norm": 3.0907411575317383, "learning_rate": 9.990334820018046e-06, "loss": 0.3338, "num_input_tokens_seen": 4455912, "step": 4445 }, { "epoch": 2.35949098621421, "grad_norm": 1.4580949544906616, "learning_rate": 9.990190505086377e-06, "loss": 0.3815, "num_input_tokens_seen": 4460584, "step": 4450 }, { "epoch": 2.3621420996818663, "grad_norm": 1.5104683637619019, "learning_rate": 9.99004512176798e-06, "loss": 0.3694, "num_input_tokens_seen": 4465960, "step": 4455 }, { "epoch": 2.3647932131495226, "grad_norm": 1.6548292636871338, "learning_rate": 9.989898670093979e-06, "loss": 0.3065, "num_input_tokens_seen": 4470664, "step": 4460 }, { "epoch": 2.3674443266171794, "grad_norm": 2.1633970737457275, "learning_rate": 9.98975115009573e-06, "loss": 0.3456, "num_input_tokens_seen": 4475528, "step": 4465 }, { "epoch": 2.3700954400848357, "grad_norm": 1.3983345031738281, "learning_rate": 9.989602561804816e-06, "loss": 0.3256, "num_input_tokens_seen": 4479688, "step": 4470 }, { "epoch": 2.372746553552492, "grad_norm": 1.359516978263855, "learning_rate": 9.989452905253052e-06, "loss": 0.3374, "num_input_tokens_seen": 4484840, "step": 4475 }, { "epoch": 2.3753976670201484, "grad_norm": 3.520049571990967, "learning_rate": 9.989302180472475e-06, "loss": 0.3432, "num_input_tokens_seen": 4490472, "step": 4480 }, { "epoch": 2.3780487804878048, "grad_norm": 1.367875576019287, "learning_rate": 9.989150387495358e-06, "loss": 0.3191, "num_input_tokens_seen": 4497032, "step": 4485 }, { "epoch": 2.380699893955461, "grad_norm": 0.9143961668014526, "learning_rate": 9.988997526354199e-06, "loss": 0.2881, "num_input_tokens_seen": 4501320, "step": 4490 }, { "epoch": 2.383351007423118, "grad_norm": 2.607023239135742, "learning_rate": 9.988843597081723e-06, "loss": 0.4676, "num_input_tokens_seen": 4506152, "step": 4495 }, { "epoch": 2.3860021208907742, "grad_norm": 3.0903234481811523, "learning_rate": 9.988688599710889e-06, "loss": 0.3678, "num_input_tokens_seen": 4510824, "step": 4500 }, { "epoch": 2.3886532343584306, "grad_norm": 1.7076308727264404, "learning_rate": 9.988532534274878e-06, "loss": 0.3251, "num_input_tokens_seen": 4517128, "step": 4505 }, { "epoch": 2.391304347826087, "grad_norm": 2.5147507190704346, "learning_rate": 9.988375400807106e-06, "loss": 0.3578, "num_input_tokens_seen": 4521928, "step": 4510 }, { "epoch": 2.3939554612937433, "grad_norm": 1.6423227787017822, "learning_rate": 9.988217199341215e-06, "loss": 0.3579, "num_input_tokens_seen": 4525800, "step": 4515 }, { "epoch": 2.3966065747613996, "grad_norm": 0.9727085828781128, "learning_rate": 9.988057929911073e-06, "loss": 0.3408, "num_input_tokens_seen": 4531976, "step": 4520 }, { "epoch": 2.3992576882290564, "grad_norm": 3.2829315662384033, "learning_rate": 9.98789759255078e-06, "loss": 0.3545, "num_input_tokens_seen": 4537224, "step": 4525 }, { "epoch": 2.4019088016967127, "grad_norm": 0.80263751745224, "learning_rate": 9.987736187294666e-06, "loss": 0.2434, "num_input_tokens_seen": 4541448, "step": 4530 }, { "epoch": 2.404559915164369, "grad_norm": 1.1546047925949097, "learning_rate": 9.987573714177285e-06, "loss": 0.3178, "num_input_tokens_seen": 4547240, "step": 4535 }, { "epoch": 2.4072110286320254, "grad_norm": 1.1106642484664917, "learning_rate": 9.987410173233424e-06, "loss": 0.2637, "num_input_tokens_seen": 4551752, "step": 4540 }, { "epoch": 2.4098621420996817, "grad_norm": 1.2190451622009277, "learning_rate": 9.987245564498096e-06, "loss": 0.2829, "num_input_tokens_seen": 4557992, "step": 4545 }, { "epoch": 2.412513255567338, "grad_norm": 1.0834264755249023, "learning_rate": 9.98707988800654e-06, "loss": 0.4106, "num_input_tokens_seen": 4562824, "step": 4550 }, { "epoch": 2.415164369034995, "grad_norm": 2.2718327045440674, "learning_rate": 9.986913143794232e-06, "loss": 0.2955, "num_input_tokens_seen": 4566888, "step": 4555 }, { "epoch": 2.417815482502651, "grad_norm": 1.3709276914596558, "learning_rate": 9.98674533189687e-06, "loss": 0.2567, "num_input_tokens_seen": 4572008, "step": 4560 }, { "epoch": 2.4204665959703076, "grad_norm": 2.7051331996917725, "learning_rate": 9.986576452350381e-06, "loss": 0.2945, "num_input_tokens_seen": 4576552, "step": 4565 }, { "epoch": 2.423117709437964, "grad_norm": 1.1696598529815674, "learning_rate": 9.98640650519092e-06, "loss": 0.3033, "num_input_tokens_seen": 4581928, "step": 4570 }, { "epoch": 2.4257688229056202, "grad_norm": 1.897847294807434, "learning_rate": 9.986235490454877e-06, "loss": 0.3899, "num_input_tokens_seen": 4586472, "step": 4575 }, { "epoch": 2.4284199363732766, "grad_norm": 1.4848648309707642, "learning_rate": 9.986063408178861e-06, "loss": 0.2408, "num_input_tokens_seen": 4592040, "step": 4580 }, { "epoch": 2.4310710498409334, "grad_norm": 1.1773149967193604, "learning_rate": 9.98589025839972e-06, "loss": 0.2912, "num_input_tokens_seen": 4596840, "step": 4585 }, { "epoch": 2.4337221633085897, "grad_norm": 1.1682875156402588, "learning_rate": 9.98571604115452e-06, "loss": 0.2686, "num_input_tokens_seen": 4603080, "step": 4590 }, { "epoch": 2.436373276776246, "grad_norm": 3.6571249961853027, "learning_rate": 9.985540756480559e-06, "loss": 0.3319, "num_input_tokens_seen": 4607976, "step": 4595 }, { "epoch": 2.4390243902439024, "grad_norm": 2.3860740661621094, "learning_rate": 9.98536440441537e-06, "loss": 0.3437, "num_input_tokens_seen": 4612104, "step": 4600 }, { "epoch": 2.4416755037115587, "grad_norm": 3.723193645477295, "learning_rate": 9.985186984996707e-06, "loss": 0.3689, "num_input_tokens_seen": 4616136, "step": 4605 }, { "epoch": 2.4443266171792155, "grad_norm": 2.722214937210083, "learning_rate": 9.985008498262556e-06, "loss": 0.3193, "num_input_tokens_seen": 4620968, "step": 4610 }, { "epoch": 2.446977730646872, "grad_norm": 1.6474449634552002, "learning_rate": 9.98482894425113e-06, "loss": 0.3654, "num_input_tokens_seen": 4625224, "step": 4615 }, { "epoch": 2.449628844114528, "grad_norm": 2.5003416538238525, "learning_rate": 9.98464832300087e-06, "loss": 0.3502, "num_input_tokens_seen": 4629928, "step": 4620 }, { "epoch": 2.4522799575821845, "grad_norm": 2.2178471088409424, "learning_rate": 9.98446663455045e-06, "loss": 0.3341, "num_input_tokens_seen": 4634952, "step": 4625 }, { "epoch": 2.454931071049841, "grad_norm": 3.589728593826294, "learning_rate": 9.984283878938763e-06, "loss": 0.3331, "num_input_tokens_seen": 4640232, "step": 4630 }, { "epoch": 2.457582184517497, "grad_norm": 3.7530102729797363, "learning_rate": 9.984100056204942e-06, "loss": 0.3284, "num_input_tokens_seen": 4644808, "step": 4635 }, { "epoch": 2.4602332979851536, "grad_norm": 1.4139134883880615, "learning_rate": 9.983915166388342e-06, "loss": 0.339, "num_input_tokens_seen": 4649192, "step": 4640 }, { "epoch": 2.4628844114528103, "grad_norm": 2.153416395187378, "learning_rate": 9.983729209528546e-06, "loss": 0.2868, "num_input_tokens_seen": 4654024, "step": 4645 }, { "epoch": 2.4655355249204667, "grad_norm": 1.577866792678833, "learning_rate": 9.983542185665367e-06, "loss": 0.2995, "num_input_tokens_seen": 4659048, "step": 4650 }, { "epoch": 2.468186638388123, "grad_norm": 2.390167236328125, "learning_rate": 9.983354094838844e-06, "loss": 0.3113, "num_input_tokens_seen": 4664872, "step": 4655 }, { "epoch": 2.4708377518557794, "grad_norm": 1.8166067600250244, "learning_rate": 9.983164937089254e-06, "loss": 0.3936, "num_input_tokens_seen": 4669512, "step": 4660 }, { "epoch": 2.4734888653234357, "grad_norm": 2.3130526542663574, "learning_rate": 9.982974712457087e-06, "loss": 0.3074, "num_input_tokens_seen": 4676072, "step": 4665 }, { "epoch": 2.4761399787910925, "grad_norm": 2.032601833343506, "learning_rate": 9.982783420983075e-06, "loss": 0.3744, "num_input_tokens_seen": 4681160, "step": 4670 }, { "epoch": 2.478791092258749, "grad_norm": 1.6185040473937988, "learning_rate": 9.982591062708172e-06, "loss": 0.3649, "num_input_tokens_seen": 4685448, "step": 4675 }, { "epoch": 2.481442205726405, "grad_norm": 1.2931517362594604, "learning_rate": 9.982397637673558e-06, "loss": 0.3597, "num_input_tokens_seen": 4689192, "step": 4680 }, { "epoch": 2.4840933191940615, "grad_norm": 1.0171538591384888, "learning_rate": 9.98220314592065e-06, "loss": 0.2952, "num_input_tokens_seen": 4694664, "step": 4685 }, { "epoch": 2.486744432661718, "grad_norm": 1.8282222747802734, "learning_rate": 9.982007587491084e-06, "loss": 0.3413, "num_input_tokens_seen": 4698888, "step": 4690 }, { "epoch": 2.489395546129374, "grad_norm": 2.287631034851074, "learning_rate": 9.98181096242673e-06, "loss": 0.3461, "num_input_tokens_seen": 4703464, "step": 4695 }, { "epoch": 2.4920466595970305, "grad_norm": 1.1712628602981567, "learning_rate": 9.981613270769686e-06, "loss": 0.2755, "num_input_tokens_seen": 4709384, "step": 4700 }, { "epoch": 2.4946977730646873, "grad_norm": 1.5933585166931152, "learning_rate": 9.981414512562277e-06, "loss": 0.3432, "num_input_tokens_seen": 4714472, "step": 4705 }, { "epoch": 2.4973488865323437, "grad_norm": 1.7987855672836304, "learning_rate": 9.981214687847054e-06, "loss": 0.302, "num_input_tokens_seen": 4719656, "step": 4710 }, { "epoch": 2.5, "grad_norm": 2.2629218101501465, "learning_rate": 9.981013796666802e-06, "loss": 0.3271, "num_input_tokens_seen": 4724680, "step": 4715 }, { "epoch": 2.5026511134676563, "grad_norm": 1.2800958156585693, "learning_rate": 9.98081183906453e-06, "loss": 0.2724, "num_input_tokens_seen": 4730632, "step": 4720 }, { "epoch": 2.5053022269353127, "grad_norm": 2.573147773742676, "learning_rate": 9.980608815083477e-06, "loss": 0.3297, "num_input_tokens_seen": 4735688, "step": 4725 }, { "epoch": 2.5079533404029695, "grad_norm": 2.687422752380371, "learning_rate": 9.980404724767107e-06, "loss": 0.2879, "num_input_tokens_seen": 4740072, "step": 4730 }, { "epoch": 2.510604453870626, "grad_norm": 1.3021624088287354, "learning_rate": 9.980199568159122e-06, "loss": 0.2462, "num_input_tokens_seen": 4744328, "step": 4735 }, { "epoch": 2.513255567338282, "grad_norm": 3.070315361022949, "learning_rate": 9.979993345303439e-06, "loss": 0.3883, "num_input_tokens_seen": 4748936, "step": 4740 }, { "epoch": 2.5159066808059385, "grad_norm": 2.291142702102661, "learning_rate": 9.979786056244211e-06, "loss": 0.3652, "num_input_tokens_seen": 4754920, "step": 4745 }, { "epoch": 2.518557794273595, "grad_norm": 3.184499979019165, "learning_rate": 9.979577701025821e-06, "loss": 0.2931, "num_input_tokens_seen": 4760392, "step": 4750 }, { "epoch": 2.521208907741251, "grad_norm": 1.657728910446167, "learning_rate": 9.979368279692877e-06, "loss": 0.2817, "num_input_tokens_seen": 4765320, "step": 4755 }, { "epoch": 2.5238600212089075, "grad_norm": 2.1784396171569824, "learning_rate": 9.97915779229021e-06, "loss": 0.299, "num_input_tokens_seen": 4770728, "step": 4760 }, { "epoch": 2.5265111346765643, "grad_norm": 1.9730360507965088, "learning_rate": 9.97894623886289e-06, "loss": 0.3114, "num_input_tokens_seen": 4776072, "step": 4765 }, { "epoch": 2.5291622481442206, "grad_norm": 1.5987528562545776, "learning_rate": 9.978733619456211e-06, "loss": 0.2756, "num_input_tokens_seen": 4781416, "step": 4770 }, { "epoch": 2.531813361611877, "grad_norm": 4.088924407958984, "learning_rate": 9.978519934115691e-06, "loss": 0.3633, "num_input_tokens_seen": 4786152, "step": 4775 }, { "epoch": 2.5344644750795333, "grad_norm": 1.8873506784439087, "learning_rate": 9.978305182887081e-06, "loss": 0.3202, "num_input_tokens_seen": 4791016, "step": 4780 }, { "epoch": 2.5371155885471897, "grad_norm": 3.3650267124176025, "learning_rate": 9.978089365816357e-06, "loss": 0.3659, "num_input_tokens_seen": 4795496, "step": 4785 }, { "epoch": 2.5397667020148464, "grad_norm": 2.4548232555389404, "learning_rate": 9.977872482949726e-06, "loss": 0.3421, "num_input_tokens_seen": 4800040, "step": 4790 }, { "epoch": 2.542417815482503, "grad_norm": 4.544750213623047, "learning_rate": 9.977654534333623e-06, "loss": 0.3267, "num_input_tokens_seen": 4804456, "step": 4795 }, { "epoch": 2.545068928950159, "grad_norm": 2.1259214878082275, "learning_rate": 9.977435520014708e-06, "loss": 0.3722, "num_input_tokens_seen": 4809416, "step": 4800 }, { "epoch": 2.5477200424178155, "grad_norm": 3.013707399368286, "learning_rate": 9.977215440039875e-06, "loss": 0.325, "num_input_tokens_seen": 4814120, "step": 4805 }, { "epoch": 2.550371155885472, "grad_norm": 3.29555606842041, "learning_rate": 9.97699429445624e-06, "loss": 0.335, "num_input_tokens_seen": 4818536, "step": 4810 }, { "epoch": 2.553022269353128, "grad_norm": 2.938520669937134, "learning_rate": 9.97677208331115e-06, "loss": 0.279, "num_input_tokens_seen": 4824168, "step": 4815 }, { "epoch": 2.5556733828207845, "grad_norm": 5.221179485321045, "learning_rate": 9.97654880665218e-06, "loss": 0.3798, "num_input_tokens_seen": 4827816, "step": 4820 }, { "epoch": 2.5583244962884413, "grad_norm": 1.5248223543167114, "learning_rate": 9.976324464527131e-06, "loss": 0.3157, "num_input_tokens_seen": 4832456, "step": 4825 }, { "epoch": 2.5609756097560976, "grad_norm": 3.360704183578491, "learning_rate": 9.976099056984036e-06, "loss": 0.3122, "num_input_tokens_seen": 4836552, "step": 4830 }, { "epoch": 2.563626723223754, "grad_norm": 3.2770683765411377, "learning_rate": 9.975872584071156e-06, "loss": 0.3303, "num_input_tokens_seen": 4843048, "step": 4835 }, { "epoch": 2.5662778366914103, "grad_norm": 2.977635383605957, "learning_rate": 9.975645045836976e-06, "loss": 0.3538, "num_input_tokens_seen": 4848968, "step": 4840 }, { "epoch": 2.5689289501590666, "grad_norm": 1.8620516061782837, "learning_rate": 9.97541644233021e-06, "loss": 0.3458, "num_input_tokens_seen": 4853800, "step": 4845 }, { "epoch": 2.5715800636267234, "grad_norm": 1.972456693649292, "learning_rate": 9.975186773599806e-06, "loss": 0.3238, "num_input_tokens_seen": 4858824, "step": 4850 }, { "epoch": 2.5742311770943798, "grad_norm": 2.028103828430176, "learning_rate": 9.97495603969493e-06, "loss": 0.3051, "num_input_tokens_seen": 4864264, "step": 4855 }, { "epoch": 2.576882290562036, "grad_norm": 1.7975759506225586, "learning_rate": 9.974724240664983e-06, "loss": 0.2895, "num_input_tokens_seen": 4869544, "step": 4860 }, { "epoch": 2.5795334040296924, "grad_norm": 4.12934684753418, "learning_rate": 9.974491376559596e-06, "loss": 0.3358, "num_input_tokens_seen": 4874632, "step": 4865 }, { "epoch": 2.582184517497349, "grad_norm": 3.3712222576141357, "learning_rate": 9.974257447428621e-06, "loss": 0.2725, "num_input_tokens_seen": 4880680, "step": 4870 }, { "epoch": 2.5848356309650056, "grad_norm": 2.2009332180023193, "learning_rate": 9.974022453322143e-06, "loss": 0.2953, "num_input_tokens_seen": 4884648, "step": 4875 }, { "epoch": 2.5874867444326615, "grad_norm": 4.543708801269531, "learning_rate": 9.973786394290475e-06, "loss": 0.5189, "num_input_tokens_seen": 4890120, "step": 4880 }, { "epoch": 2.5901378579003183, "grad_norm": 1.9070825576782227, "learning_rate": 9.973549270384153e-06, "loss": 0.3863, "num_input_tokens_seen": 4894728, "step": 4885 }, { "epoch": 2.5927889713679746, "grad_norm": 6.509688854217529, "learning_rate": 9.973311081653947e-06, "loss": 0.3464, "num_input_tokens_seen": 4899528, "step": 4890 }, { "epoch": 2.595440084835631, "grad_norm": 2.0356218814849854, "learning_rate": 9.973071828150855e-06, "loss": 0.3787, "num_input_tokens_seen": 4905128, "step": 4895 }, { "epoch": 2.5980911983032873, "grad_norm": 3.4030213356018066, "learning_rate": 9.972831509926094e-06, "loss": 0.3199, "num_input_tokens_seen": 4909576, "step": 4900 }, { "epoch": 2.6007423117709436, "grad_norm": 1.6536813974380493, "learning_rate": 9.972590127031121e-06, "loss": 0.3166, "num_input_tokens_seen": 4914248, "step": 4905 }, { "epoch": 2.6033934252386004, "grad_norm": 2.963632583618164, "learning_rate": 9.972347679517614e-06, "loss": 0.3413, "num_input_tokens_seen": 4918824, "step": 4910 }, { "epoch": 2.6060445387062567, "grad_norm": 2.1897735595703125, "learning_rate": 9.97210416743748e-06, "loss": 0.3361, "num_input_tokens_seen": 4923784, "step": 4915 }, { "epoch": 2.608695652173913, "grad_norm": 1.6906440258026123, "learning_rate": 9.971859590842853e-06, "loss": 0.3554, "num_input_tokens_seen": 4928808, "step": 4920 }, { "epoch": 2.6113467656415694, "grad_norm": 3.1080715656280518, "learning_rate": 9.971613949786099e-06, "loss": 0.3731, "num_input_tokens_seen": 4933320, "step": 4925 }, { "epoch": 2.6139978791092258, "grad_norm": 1.6709421873092651, "learning_rate": 9.971367244319808e-06, "loss": 0.3568, "num_input_tokens_seen": 4939016, "step": 4930 }, { "epoch": 2.6166489925768825, "grad_norm": 2.029660224914551, "learning_rate": 9.971119474496799e-06, "loss": 0.2735, "num_input_tokens_seen": 4945000, "step": 4935 }, { "epoch": 2.6193001060445384, "grad_norm": 1.529923677444458, "learning_rate": 9.970870640370118e-06, "loss": 0.3356, "num_input_tokens_seen": 4951016, "step": 4940 }, { "epoch": 2.6219512195121952, "grad_norm": 2.3740315437316895, "learning_rate": 9.97062074199304e-06, "loss": 0.3283, "num_input_tokens_seen": 4955688, "step": 4945 }, { "epoch": 2.6246023329798516, "grad_norm": 1.8063719272613525, "learning_rate": 9.97036977941907e-06, "loss": 0.3773, "num_input_tokens_seen": 4961064, "step": 4950 }, { "epoch": 2.627253446447508, "grad_norm": 2.9998505115509033, "learning_rate": 9.970117752701933e-06, "loss": 0.3155, "num_input_tokens_seen": 4966344, "step": 4955 }, { "epoch": 2.6299045599151643, "grad_norm": 2.346479654312134, "learning_rate": 9.969864661895591e-06, "loss": 0.2967, "num_input_tokens_seen": 4971176, "step": 4960 }, { "epoch": 2.6325556733828206, "grad_norm": 4.571146488189697, "learning_rate": 9.969610507054233e-06, "loss": 0.3085, "num_input_tokens_seen": 4975720, "step": 4965 }, { "epoch": 2.6352067868504774, "grad_norm": 1.7810981273651123, "learning_rate": 9.969355288232268e-06, "loss": 0.347, "num_input_tokens_seen": 4981096, "step": 4970 }, { "epoch": 2.6378579003181337, "grad_norm": 1.889158844947815, "learning_rate": 9.969099005484336e-06, "loss": 0.3008, "num_input_tokens_seen": 4985480, "step": 4975 }, { "epoch": 2.64050901378579, "grad_norm": 3.9166948795318604, "learning_rate": 9.968841658865314e-06, "loss": 0.3977, "num_input_tokens_seen": 4989480, "step": 4980 }, { "epoch": 2.6431601272534464, "grad_norm": 1.6556257009506226, "learning_rate": 9.968583248430294e-06, "loss": 0.3743, "num_input_tokens_seen": 4994344, "step": 4985 }, { "epoch": 2.6458112407211027, "grad_norm": 2.58345365524292, "learning_rate": 9.968323774234601e-06, "loss": 0.3027, "num_input_tokens_seen": 4999304, "step": 4990 }, { "epoch": 2.6484623541887595, "grad_norm": 2.3939828872680664, "learning_rate": 9.96806323633379e-06, "loss": 0.3861, "num_input_tokens_seen": 5004776, "step": 4995 }, { "epoch": 2.6511134676564154, "grad_norm": 1.473334789276123, "learning_rate": 9.96780163478364e-06, "loss": 0.3291, "num_input_tokens_seen": 5009128, "step": 5000 }, { "epoch": 2.653764581124072, "grad_norm": 1.9744658470153809, "learning_rate": 9.967538969640158e-06, "loss": 0.3181, "num_input_tokens_seen": 5014408, "step": 5005 }, { "epoch": 2.6564156945917285, "grad_norm": 2.171259641647339, "learning_rate": 9.967275240959583e-06, "loss": 0.3095, "num_input_tokens_seen": 5019048, "step": 5010 }, { "epoch": 2.659066808059385, "grad_norm": 1.4644873142242432, "learning_rate": 9.967010448798376e-06, "loss": 0.3044, "num_input_tokens_seen": 5023880, "step": 5015 }, { "epoch": 2.6617179215270412, "grad_norm": 1.4564101696014404, "learning_rate": 9.966744593213229e-06, "loss": 0.3172, "num_input_tokens_seen": 5028424, "step": 5020 }, { "epoch": 2.6643690349946976, "grad_norm": 2.3546957969665527, "learning_rate": 9.966477674261063e-06, "loss": 0.3779, "num_input_tokens_seen": 5032744, "step": 5025 }, { "epoch": 2.6670201484623544, "grad_norm": 1.8366203308105469, "learning_rate": 9.96620969199902e-06, "loss": 0.2496, "num_input_tokens_seen": 5037960, "step": 5030 }, { "epoch": 2.6696712619300107, "grad_norm": 2.784464120864868, "learning_rate": 9.96594064648448e-06, "loss": 0.3301, "num_input_tokens_seen": 5043144, "step": 5035 }, { "epoch": 2.672322375397667, "grad_norm": 1.9869132041931152, "learning_rate": 9.965670537775041e-06, "loss": 0.2706, "num_input_tokens_seen": 5049160, "step": 5040 }, { "epoch": 2.6749734888653234, "grad_norm": 1.68059504032135, "learning_rate": 9.965399365928535e-06, "loss": 0.3708, "num_input_tokens_seen": 5053736, "step": 5045 }, { "epoch": 2.6776246023329797, "grad_norm": 2.555830955505371, "learning_rate": 9.965127131003016e-06, "loss": 0.3712, "num_input_tokens_seen": 5058728, "step": 5050 }, { "epoch": 2.6802757158006365, "grad_norm": 1.1440049409866333, "learning_rate": 9.964853833056771e-06, "loss": 0.3091, "num_input_tokens_seen": 5063272, "step": 5055 }, { "epoch": 2.682926829268293, "grad_norm": 2.1093356609344482, "learning_rate": 9.964579472148313e-06, "loss": 0.3512, "num_input_tokens_seen": 5067208, "step": 5060 }, { "epoch": 2.685577942735949, "grad_norm": 2.769991874694824, "learning_rate": 9.964304048336379e-06, "loss": 0.2947, "num_input_tokens_seen": 5072552, "step": 5065 }, { "epoch": 2.6882290562036055, "grad_norm": 3.373852252960205, "learning_rate": 9.964027561679941e-06, "loss": 0.3217, "num_input_tokens_seen": 5076680, "step": 5070 }, { "epoch": 2.690880169671262, "grad_norm": 4.220776557922363, "learning_rate": 9.963750012238189e-06, "loss": 0.2557, "num_input_tokens_seen": 5082184, "step": 5075 }, { "epoch": 2.693531283138918, "grad_norm": 2.0422849655151367, "learning_rate": 9.96347140007055e-06, "loss": 0.3587, "num_input_tokens_seen": 5087464, "step": 5080 }, { "epoch": 2.6961823966065745, "grad_norm": 2.362098455429077, "learning_rate": 9.963191725236672e-06, "loss": 0.3143, "num_input_tokens_seen": 5092456, "step": 5085 }, { "epoch": 2.6988335100742313, "grad_norm": 3.280280828475952, "learning_rate": 9.962910987796433e-06, "loss": 0.3342, "num_input_tokens_seen": 5098824, "step": 5090 }, { "epoch": 2.7014846235418877, "grad_norm": 4.000919342041016, "learning_rate": 9.962629187809938e-06, "loss": 0.3657, "num_input_tokens_seen": 5104072, "step": 5095 }, { "epoch": 2.704135737009544, "grad_norm": 1.800770878791809, "learning_rate": 9.96234632533752e-06, "loss": 0.2821, "num_input_tokens_seen": 5111048, "step": 5100 }, { "epoch": 2.7067868504772004, "grad_norm": 1.3177549839019775, "learning_rate": 9.96206240043974e-06, "loss": 0.3494, "num_input_tokens_seen": 5116936, "step": 5105 }, { "epoch": 2.7094379639448567, "grad_norm": 2.3318018913269043, "learning_rate": 9.961777413177383e-06, "loss": 0.2995, "num_input_tokens_seen": 5121672, "step": 5110 }, { "epoch": 2.7120890774125135, "grad_norm": 2.2208337783813477, "learning_rate": 9.961491363611466e-06, "loss": 0.3485, "num_input_tokens_seen": 5126280, "step": 5115 }, { "epoch": 2.71474019088017, "grad_norm": 2.3101789951324463, "learning_rate": 9.96120425180323e-06, "loss": 0.3056, "num_input_tokens_seen": 5131976, "step": 5120 }, { "epoch": 2.717391304347826, "grad_norm": 3.9120781421661377, "learning_rate": 9.960916077814147e-06, "loss": 0.4431, "num_input_tokens_seen": 5138760, "step": 5125 }, { "epoch": 2.7200424178154825, "grad_norm": 2.1377909183502197, "learning_rate": 9.960626841705913e-06, "loss": 0.3515, "num_input_tokens_seen": 5144744, "step": 5130 }, { "epoch": 2.722693531283139, "grad_norm": 1.828887701034546, "learning_rate": 9.960336543540451e-06, "loss": 0.3967, "num_input_tokens_seen": 5150696, "step": 5135 }, { "epoch": 2.725344644750795, "grad_norm": 3.5042216777801514, "learning_rate": 9.960045183379918e-06, "loss": 0.3332, "num_input_tokens_seen": 5155144, "step": 5140 }, { "epoch": 2.7279957582184515, "grad_norm": 2.840740203857422, "learning_rate": 9.95975276128669e-06, "loss": 0.2962, "num_input_tokens_seen": 5160040, "step": 5145 }, { "epoch": 2.7306468716861083, "grad_norm": 1.804474115371704, "learning_rate": 9.959459277323372e-06, "loss": 0.2954, "num_input_tokens_seen": 5165192, "step": 5150 }, { "epoch": 2.7332979851537647, "grad_norm": 3.3440804481506348, "learning_rate": 9.959164731552801e-06, "loss": 0.3814, "num_input_tokens_seen": 5171464, "step": 5155 }, { "epoch": 2.735949098621421, "grad_norm": 1.7915478944778442, "learning_rate": 9.958869124038038e-06, "loss": 0.2644, "num_input_tokens_seen": 5176296, "step": 5160 }, { "epoch": 2.7386002120890773, "grad_norm": 1.4581600427627563, "learning_rate": 9.958572454842373e-06, "loss": 0.259, "num_input_tokens_seen": 5181128, "step": 5165 }, { "epoch": 2.7412513255567337, "grad_norm": 3.9526753425598145, "learning_rate": 9.95827472402932e-06, "loss": 0.4084, "num_input_tokens_seen": 5185960, "step": 5170 }, { "epoch": 2.7439024390243905, "grad_norm": 3.550363063812256, "learning_rate": 9.957975931662623e-06, "loss": 0.326, "num_input_tokens_seen": 5190184, "step": 5175 }, { "epoch": 2.746553552492047, "grad_norm": 2.030776023864746, "learning_rate": 9.957676077806252e-06, "loss": 0.302, "num_input_tokens_seen": 5195016, "step": 5180 }, { "epoch": 2.749204665959703, "grad_norm": 4.064992904663086, "learning_rate": 9.957375162524405e-06, "loss": 0.3882, "num_input_tokens_seen": 5200872, "step": 5185 }, { "epoch": 2.7518557794273595, "grad_norm": 1.2912344932556152, "learning_rate": 9.95707318588151e-06, "loss": 0.2994, "num_input_tokens_seen": 5207048, "step": 5190 }, { "epoch": 2.754506892895016, "grad_norm": 1.5337339639663696, "learning_rate": 9.956770147942214e-06, "loss": 0.2639, "num_input_tokens_seen": 5211432, "step": 5195 }, { "epoch": 2.757158006362672, "grad_norm": 2.229057788848877, "learning_rate": 9.956466048771402e-06, "loss": 0.29, "num_input_tokens_seen": 5217640, "step": 5200 }, { "epoch": 2.7598091198303285, "grad_norm": 3.3204152584075928, "learning_rate": 9.95616088843418e-06, "loss": 0.3767, "num_input_tokens_seen": 5222248, "step": 5205 }, { "epoch": 2.7624602332979853, "grad_norm": 1.6975057125091553, "learning_rate": 9.955854666995879e-06, "loss": 0.2583, "num_input_tokens_seen": 5226824, "step": 5210 }, { "epoch": 2.7651113467656416, "grad_norm": 2.6599161624908447, "learning_rate": 9.955547384522063e-06, "loss": 0.3192, "num_input_tokens_seen": 5231656, "step": 5215 }, { "epoch": 2.767762460233298, "grad_norm": 1.3554377555847168, "learning_rate": 9.955239041078519e-06, "loss": 0.3785, "num_input_tokens_seen": 5236776, "step": 5220 }, { "epoch": 2.7704135737009543, "grad_norm": 2.4889121055603027, "learning_rate": 9.954929636731263e-06, "loss": 0.3305, "num_input_tokens_seen": 5241480, "step": 5225 }, { "epoch": 2.7730646871686107, "grad_norm": 3.406820297241211, "learning_rate": 9.954619171546535e-06, "loss": 0.3887, "num_input_tokens_seen": 5248840, "step": 5230 }, { "epoch": 2.7757158006362674, "grad_norm": 1.981106162071228, "learning_rate": 9.95430764559081e-06, "loss": 0.3094, "num_input_tokens_seen": 5254216, "step": 5235 }, { "epoch": 2.778366914103924, "grad_norm": 3.26011323928833, "learning_rate": 9.953995058930782e-06, "loss": 0.3564, "num_input_tokens_seen": 5258696, "step": 5240 }, { "epoch": 2.78101802757158, "grad_norm": 3.433810234069824, "learning_rate": 9.953681411633376e-06, "loss": 0.3668, "num_input_tokens_seen": 5263560, "step": 5245 }, { "epoch": 2.7836691410392365, "grad_norm": 1.6835403442382812, "learning_rate": 9.95336670376574e-06, "loss": 0.3333, "num_input_tokens_seen": 5267880, "step": 5250 }, { "epoch": 2.786320254506893, "grad_norm": 2.3489491939544678, "learning_rate": 9.953050935395256e-06, "loss": 0.3946, "num_input_tokens_seen": 5273480, "step": 5255 }, { "epoch": 2.7889713679745496, "grad_norm": 2.646480083465576, "learning_rate": 9.952734106589527e-06, "loss": 0.3275, "num_input_tokens_seen": 5279336, "step": 5260 }, { "epoch": 2.7916224814422055, "grad_norm": 2.378345251083374, "learning_rate": 9.952416217416385e-06, "loss": 0.3913, "num_input_tokens_seen": 5284328, "step": 5265 }, { "epoch": 2.7942735949098623, "grad_norm": 1.4668362140655518, "learning_rate": 9.952097267943891e-06, "loss": 0.3747, "num_input_tokens_seen": 5291528, "step": 5270 }, { "epoch": 2.7969247083775186, "grad_norm": 2.35672926902771, "learning_rate": 9.95177725824033e-06, "loss": 0.3666, "num_input_tokens_seen": 5297896, "step": 5275 }, { "epoch": 2.799575821845175, "grad_norm": 1.2701425552368164, "learning_rate": 9.951456188374216e-06, "loss": 0.2425, "num_input_tokens_seen": 5302472, "step": 5280 }, { "epoch": 2.8022269353128313, "grad_norm": 1.4811232089996338, "learning_rate": 9.951134058414289e-06, "loss": 0.3148, "num_input_tokens_seen": 5307336, "step": 5285 }, { "epoch": 2.8048780487804876, "grad_norm": 2.092021942138672, "learning_rate": 9.950810868429515e-06, "loss": 0.391, "num_input_tokens_seen": 5311976, "step": 5290 }, { "epoch": 2.8075291622481444, "grad_norm": 1.632820963859558, "learning_rate": 9.950486618489088e-06, "loss": 0.2909, "num_input_tokens_seen": 5316136, "step": 5295 }, { "epoch": 2.8101802757158008, "grad_norm": 1.6918513774871826, "learning_rate": 9.950161308662431e-06, "loss": 0.3327, "num_input_tokens_seen": 5321064, "step": 5300 }, { "epoch": 2.812831389183457, "grad_norm": 1.836511254310608, "learning_rate": 9.949834939019191e-06, "loss": 0.3041, "num_input_tokens_seen": 5325192, "step": 5305 }, { "epoch": 2.8154825026511134, "grad_norm": 1.8548634052276611, "learning_rate": 9.949507509629244e-06, "loss": 0.3173, "num_input_tokens_seen": 5331176, "step": 5310 }, { "epoch": 2.81813361611877, "grad_norm": 3.2798001766204834, "learning_rate": 9.949179020562691e-06, "loss": 0.401, "num_input_tokens_seen": 5336488, "step": 5315 }, { "epoch": 2.8207847295864266, "grad_norm": 1.1074527502059937, "learning_rate": 9.948849471889859e-06, "loss": 0.3584, "num_input_tokens_seen": 5341192, "step": 5320 }, { "epoch": 2.8234358430540825, "grad_norm": 1.7870029211044312, "learning_rate": 9.948518863681306e-06, "loss": 0.323, "num_input_tokens_seen": 5345992, "step": 5325 }, { "epoch": 2.8260869565217392, "grad_norm": 1.844692587852478, "learning_rate": 9.948187196007812e-06, "loss": 0.2834, "num_input_tokens_seen": 5351048, "step": 5330 }, { "epoch": 2.8287380699893956, "grad_norm": 1.2871328592300415, "learning_rate": 9.947854468940388e-06, "loss": 0.3386, "num_input_tokens_seen": 5356232, "step": 5335 }, { "epoch": 2.831389183457052, "grad_norm": 1.0625920295715332, "learning_rate": 9.94752068255027e-06, "loss": 0.3348, "num_input_tokens_seen": 5360328, "step": 5340 }, { "epoch": 2.8340402969247083, "grad_norm": 1.9323410987854004, "learning_rate": 9.94718583690892e-06, "loss": 0.2903, "num_input_tokens_seen": 5366312, "step": 5345 }, { "epoch": 2.8366914103923646, "grad_norm": 0.9000908136367798, "learning_rate": 9.946849932088027e-06, "loss": 0.2766, "num_input_tokens_seen": 5370984, "step": 5350 }, { "epoch": 2.8393425238600214, "grad_norm": 3.239537239074707, "learning_rate": 9.946512968159509e-06, "loss": 0.3175, "num_input_tokens_seen": 5377160, "step": 5355 }, { "epoch": 2.8419936373276777, "grad_norm": 3.2535152435302734, "learning_rate": 9.946174945195508e-06, "loss": 0.3935, "num_input_tokens_seen": 5380872, "step": 5360 }, { "epoch": 2.844644750795334, "grad_norm": 1.6011508703231812, "learning_rate": 9.945835863268393e-06, "loss": 0.3342, "num_input_tokens_seen": 5386088, "step": 5365 }, { "epoch": 2.8472958642629904, "grad_norm": 2.246155023574829, "learning_rate": 9.945495722450765e-06, "loss": 0.3312, "num_input_tokens_seen": 5391784, "step": 5370 }, { "epoch": 2.8499469777306468, "grad_norm": 1.4689619541168213, "learning_rate": 9.94515452281544e-06, "loss": 0.3312, "num_input_tokens_seen": 5396200, "step": 5375 }, { "epoch": 2.8525980911983035, "grad_norm": 1.302746295928955, "learning_rate": 9.944812264435475e-06, "loss": 0.297, "num_input_tokens_seen": 5400648, "step": 5380 }, { "epoch": 2.8552492046659594, "grad_norm": 1.6829196214675903, "learning_rate": 9.94446894738414e-06, "loss": 0.3667, "num_input_tokens_seen": 5405992, "step": 5385 }, { "epoch": 2.8579003181336162, "grad_norm": 4.222191333770752, "learning_rate": 9.944124571734945e-06, "loss": 0.3761, "num_input_tokens_seen": 5411720, "step": 5390 }, { "epoch": 2.8605514316012726, "grad_norm": 1.8854058980941772, "learning_rate": 9.943779137561614e-06, "loss": 0.351, "num_input_tokens_seen": 5417096, "step": 5395 }, { "epoch": 2.863202545068929, "grad_norm": 2.057615280151367, "learning_rate": 9.943432644938107e-06, "loss": 0.3344, "num_input_tokens_seen": 5421352, "step": 5400 }, { "epoch": 2.8658536585365852, "grad_norm": 3.9503562450408936, "learning_rate": 9.943085093938607e-06, "loss": 0.3302, "num_input_tokens_seen": 5425992, "step": 5405 }, { "epoch": 2.8685047720042416, "grad_norm": 1.3400033712387085, "learning_rate": 9.942736484637523e-06, "loss": 0.304, "num_input_tokens_seen": 5430536, "step": 5410 }, { "epoch": 2.8711558854718984, "grad_norm": 1.6559994220733643, "learning_rate": 9.942386817109492e-06, "loss": 0.3665, "num_input_tokens_seen": 5435176, "step": 5415 }, { "epoch": 2.8738069989395547, "grad_norm": 1.3210614919662476, "learning_rate": 9.942036091429376e-06, "loss": 0.2885, "num_input_tokens_seen": 5440008, "step": 5420 }, { "epoch": 2.876458112407211, "grad_norm": 1.8794153928756714, "learning_rate": 9.941684307672263e-06, "loss": 0.2895, "num_input_tokens_seen": 5444936, "step": 5425 }, { "epoch": 2.8791092258748674, "grad_norm": 1.3694576025009155, "learning_rate": 9.941331465913473e-06, "loss": 0.2715, "num_input_tokens_seen": 5450120, "step": 5430 }, { "epoch": 2.8817603393425237, "grad_norm": 2.2543394565582275, "learning_rate": 9.940977566228548e-06, "loss": 0.2761, "num_input_tokens_seen": 5455816, "step": 5435 }, { "epoch": 2.8844114528101805, "grad_norm": 1.0455247163772583, "learning_rate": 9.94062260869325e-06, "loss": 0.2399, "num_input_tokens_seen": 5460936, "step": 5440 }, { "epoch": 2.8870625662778364, "grad_norm": 2.5029549598693848, "learning_rate": 9.940266593383585e-06, "loss": 0.3912, "num_input_tokens_seen": 5466280, "step": 5445 }, { "epoch": 2.889713679745493, "grad_norm": 1.3010402917861938, "learning_rate": 9.939909520375766e-06, "loss": 0.3266, "num_input_tokens_seen": 5471944, "step": 5450 }, { "epoch": 2.8923647932131495, "grad_norm": 1.1239256858825684, "learning_rate": 9.939551389746247e-06, "loss": 0.3676, "num_input_tokens_seen": 5476936, "step": 5455 }, { "epoch": 2.895015906680806, "grad_norm": 1.0406631231307983, "learning_rate": 9.939192201571701e-06, "loss": 0.2994, "num_input_tokens_seen": 5481480, "step": 5460 }, { "epoch": 2.8976670201484622, "grad_norm": 1.2531019449234009, "learning_rate": 9.93883195592903e-06, "loss": 0.366, "num_input_tokens_seen": 5486344, "step": 5465 }, { "epoch": 2.9003181336161186, "grad_norm": 1.4462616443634033, "learning_rate": 9.938470652895358e-06, "loss": 0.2926, "num_input_tokens_seen": 5490952, "step": 5470 }, { "epoch": 2.9029692470837754, "grad_norm": 1.6768476963043213, "learning_rate": 9.938108292548044e-06, "loss": 0.377, "num_input_tokens_seen": 5495816, "step": 5475 }, { "epoch": 2.9056203605514317, "grad_norm": 1.245050311088562, "learning_rate": 9.937744874964667e-06, "loss": 0.3296, "num_input_tokens_seen": 5500872, "step": 5480 }, { "epoch": 2.908271474019088, "grad_norm": 1.001334547996521, "learning_rate": 9.937380400223031e-06, "loss": 0.3262, "num_input_tokens_seen": 5504872, "step": 5485 }, { "epoch": 2.9109225874867444, "grad_norm": 1.1880114078521729, "learning_rate": 9.937014868401173e-06, "loss": 0.322, "num_input_tokens_seen": 5510664, "step": 5490 }, { "epoch": 2.9135737009544007, "grad_norm": 1.7926074266433716, "learning_rate": 9.93664827957735e-06, "loss": 0.3438, "num_input_tokens_seen": 5516136, "step": 5495 }, { "epoch": 2.9162248144220575, "grad_norm": 1.5148249864578247, "learning_rate": 9.936280633830047e-06, "loss": 0.336, "num_input_tokens_seen": 5521608, "step": 5500 }, { "epoch": 2.918875927889714, "grad_norm": 1.701119065284729, "learning_rate": 9.935911931237978e-06, "loss": 0.2969, "num_input_tokens_seen": 5527400, "step": 5505 }, { "epoch": 2.92152704135737, "grad_norm": 1.393584966659546, "learning_rate": 9.93554217188008e-06, "loss": 0.2882, "num_input_tokens_seen": 5532168, "step": 5510 }, { "epoch": 2.9241781548250265, "grad_norm": 1.5946073532104492, "learning_rate": 9.935171355835519e-06, "loss": 0.3811, "num_input_tokens_seen": 5536520, "step": 5515 }, { "epoch": 2.926829268292683, "grad_norm": 1.3910722732543945, "learning_rate": 9.934799483183684e-06, "loss": 0.3283, "num_input_tokens_seen": 5541416, "step": 5520 }, { "epoch": 2.929480381760339, "grad_norm": 2.3154985904693604, "learning_rate": 9.934426554004193e-06, "loss": 0.3634, "num_input_tokens_seen": 5546184, "step": 5525 }, { "epoch": 2.9321314952279955, "grad_norm": 1.1864933967590332, "learning_rate": 9.93405256837689e-06, "loss": 0.399, "num_input_tokens_seen": 5551016, "step": 5530 }, { "epoch": 2.9347826086956523, "grad_norm": 1.46334707736969, "learning_rate": 9.933677526381842e-06, "loss": 0.298, "num_input_tokens_seen": 5557160, "step": 5535 }, { "epoch": 2.9374337221633087, "grad_norm": 1.4376330375671387, "learning_rate": 9.933301428099345e-06, "loss": 0.3136, "num_input_tokens_seen": 5562088, "step": 5540 }, { "epoch": 2.940084835630965, "grad_norm": 2.564458131790161, "learning_rate": 9.932924273609923e-06, "loss": 0.32, "num_input_tokens_seen": 5568232, "step": 5545 }, { "epoch": 2.9427359490986214, "grad_norm": 1.5440123081207275, "learning_rate": 9.93254606299432e-06, "loss": 0.2932, "num_input_tokens_seen": 5572744, "step": 5550 }, { "epoch": 2.9453870625662777, "grad_norm": 1.3202790021896362, "learning_rate": 9.932166796333516e-06, "loss": 0.2784, "num_input_tokens_seen": 5577224, "step": 5555 }, { "epoch": 2.9480381760339345, "grad_norm": 1.483247995376587, "learning_rate": 9.931786473708704e-06, "loss": 0.2946, "num_input_tokens_seen": 5582408, "step": 5560 }, { "epoch": 2.950689289501591, "grad_norm": 3.0815463066101074, "learning_rate": 9.931405095201314e-06, "loss": 0.2799, "num_input_tokens_seen": 5587944, "step": 5565 }, { "epoch": 2.953340402969247, "grad_norm": 1.6807671785354614, "learning_rate": 9.931022660892997e-06, "loss": 0.2913, "num_input_tokens_seen": 5592360, "step": 5570 }, { "epoch": 2.9559915164369035, "grad_norm": 2.8143539428710938, "learning_rate": 9.930639170865632e-06, "loss": 0.4146, "num_input_tokens_seen": 5597000, "step": 5575 }, { "epoch": 2.95864262990456, "grad_norm": 2.8586292266845703, "learning_rate": 9.930254625201322e-06, "loss": 0.409, "num_input_tokens_seen": 5602344, "step": 5580 }, { "epoch": 2.961293743372216, "grad_norm": 1.3832967281341553, "learning_rate": 9.929869023982398e-06, "loss": 0.3366, "num_input_tokens_seen": 5608008, "step": 5585 }, { "epoch": 2.9639448568398725, "grad_norm": 1.4715656042099, "learning_rate": 9.929482367291417e-06, "loss": 0.3198, "num_input_tokens_seen": 5612296, "step": 5590 }, { "epoch": 2.9665959703075293, "grad_norm": 1.2592592239379883, "learning_rate": 9.929094655211159e-06, "loss": 0.3269, "num_input_tokens_seen": 5617224, "step": 5595 }, { "epoch": 2.9692470837751856, "grad_norm": 1.1815940141677856, "learning_rate": 9.928705887824635e-06, "loss": 0.3311, "num_input_tokens_seen": 5625128, "step": 5600 }, { "epoch": 2.971898197242842, "grad_norm": 2.0794527530670166, "learning_rate": 9.928316065215076e-06, "loss": 0.3062, "num_input_tokens_seen": 5629992, "step": 5605 }, { "epoch": 2.9745493107104983, "grad_norm": 2.6820600032806396, "learning_rate": 9.927925187465944e-06, "loss": 0.3152, "num_input_tokens_seen": 5635848, "step": 5610 }, { "epoch": 2.9772004241781547, "grad_norm": 1.1350017786026, "learning_rate": 9.927533254660925e-06, "loss": 0.3801, "num_input_tokens_seen": 5641288, "step": 5615 }, { "epoch": 2.9798515376458115, "grad_norm": 1.1542870998382568, "learning_rate": 9.92714026688393e-06, "loss": 0.2881, "num_input_tokens_seen": 5645384, "step": 5620 }, { "epoch": 2.982502651113468, "grad_norm": 1.6562879085540771, "learning_rate": 9.926746224219096e-06, "loss": 0.3059, "num_input_tokens_seen": 5650760, "step": 5625 }, { "epoch": 2.985153764581124, "grad_norm": 1.8286359310150146, "learning_rate": 9.926351126750789e-06, "loss": 0.3171, "num_input_tokens_seen": 5656040, "step": 5630 }, { "epoch": 2.9878048780487805, "grad_norm": 1.6466128826141357, "learning_rate": 9.925954974563597e-06, "loss": 0.2412, "num_input_tokens_seen": 5660936, "step": 5635 }, { "epoch": 2.990455991516437, "grad_norm": 1.0031259059906006, "learning_rate": 9.925557767742333e-06, "loss": 0.2545, "num_input_tokens_seen": 5665192, "step": 5640 }, { "epoch": 2.993107104984093, "grad_norm": 1.2654229402542114, "learning_rate": 9.92515950637204e-06, "loss": 0.3478, "num_input_tokens_seen": 5670184, "step": 5645 }, { "epoch": 2.9957582184517495, "grad_norm": 1.6650681495666504, "learning_rate": 9.924760190537985e-06, "loss": 0.3216, "num_input_tokens_seen": 5674152, "step": 5650 }, { "epoch": 2.9984093319194063, "grad_norm": 1.4394774436950684, "learning_rate": 9.92435982032566e-06, "loss": 0.1983, "num_input_tokens_seen": 5679688, "step": 5655 }, { "epoch": 3.0010604453870626, "grad_norm": 2.660623550415039, "learning_rate": 9.923958395820784e-06, "loss": 0.3395, "num_input_tokens_seen": 5684664, "step": 5660 }, { "epoch": 3.003711558854719, "grad_norm": 1.2792364358901978, "learning_rate": 9.9235559171093e-06, "loss": 0.4149, "num_input_tokens_seen": 5690072, "step": 5665 }, { "epoch": 3.0063626723223753, "grad_norm": 1.1659891605377197, "learning_rate": 9.923152384277376e-06, "loss": 0.3382, "num_input_tokens_seen": 5694360, "step": 5670 }, { "epoch": 3.0090137857900316, "grad_norm": 2.213657855987549, "learning_rate": 9.922747797411412e-06, "loss": 0.2837, "num_input_tokens_seen": 5698520, "step": 5675 }, { "epoch": 3.0116648992576884, "grad_norm": 1.9253689050674438, "learning_rate": 9.922342156598025e-06, "loss": 0.3753, "num_input_tokens_seen": 5706232, "step": 5680 }, { "epoch": 3.0143160127253448, "grad_norm": 1.7313767671585083, "learning_rate": 9.921935461924061e-06, "loss": 0.3498, "num_input_tokens_seen": 5711640, "step": 5685 }, { "epoch": 3.016967126193001, "grad_norm": 2.007113218307495, "learning_rate": 9.921527713476596e-06, "loss": 0.3386, "num_input_tokens_seen": 5716952, "step": 5690 }, { "epoch": 3.0196182396606575, "grad_norm": 1.3363778591156006, "learning_rate": 9.921118911342924e-06, "loss": 0.345, "num_input_tokens_seen": 5722104, "step": 5695 }, { "epoch": 3.022269353128314, "grad_norm": 2.8042757511138916, "learning_rate": 9.920709055610572e-06, "loss": 0.3215, "num_input_tokens_seen": 5725880, "step": 5700 }, { "epoch": 3.02492046659597, "grad_norm": 1.6016566753387451, "learning_rate": 9.920298146367287e-06, "loss": 0.2751, "num_input_tokens_seen": 5731672, "step": 5705 }, { "epoch": 3.027571580063627, "grad_norm": 1.6872591972351074, "learning_rate": 9.919886183701044e-06, "loss": 0.3065, "num_input_tokens_seen": 5737368, "step": 5710 }, { "epoch": 3.0302226935312833, "grad_norm": 0.9538717865943909, "learning_rate": 9.919473167700042e-06, "loss": 0.361, "num_input_tokens_seen": 5743704, "step": 5715 }, { "epoch": 3.0328738069989396, "grad_norm": 1.7332419157028198, "learning_rate": 9.919059098452709e-06, "loss": 0.4019, "num_input_tokens_seen": 5749112, "step": 5720 }, { "epoch": 3.035524920466596, "grad_norm": 1.1074631214141846, "learning_rate": 9.918643976047694e-06, "loss": 0.3069, "num_input_tokens_seen": 5752888, "step": 5725 }, { "epoch": 3.0381760339342523, "grad_norm": 0.8922601342201233, "learning_rate": 9.918227800573876e-06, "loss": 0.3232, "num_input_tokens_seen": 5757176, "step": 5730 }, { "epoch": 3.0408271474019086, "grad_norm": 1.3191481828689575, "learning_rate": 9.917810572120355e-06, "loss": 0.219, "num_input_tokens_seen": 5762392, "step": 5735 }, { "epoch": 3.0434782608695654, "grad_norm": 2.8259096145629883, "learning_rate": 9.917392290776459e-06, "loss": 0.4055, "num_input_tokens_seen": 5767416, "step": 5740 }, { "epoch": 3.0461293743372218, "grad_norm": 1.1908624172210693, "learning_rate": 9.916972956631742e-06, "loss": 0.3527, "num_input_tokens_seen": 5771928, "step": 5745 }, { "epoch": 3.048780487804878, "grad_norm": 1.363996148109436, "learning_rate": 9.91655256977598e-06, "loss": 0.2556, "num_input_tokens_seen": 5777112, "step": 5750 }, { "epoch": 3.0514316012725344, "grad_norm": 1.0837979316711426, "learning_rate": 9.916131130299178e-06, "loss": 0.3031, "num_input_tokens_seen": 5781784, "step": 5755 }, { "epoch": 3.0540827147401908, "grad_norm": 2.5798633098602295, "learning_rate": 9.915708638291567e-06, "loss": 0.4533, "num_input_tokens_seen": 5785944, "step": 5760 }, { "epoch": 3.056733828207847, "grad_norm": 1.3150124549865723, "learning_rate": 9.915285093843595e-06, "loss": 0.2128, "num_input_tokens_seen": 5791064, "step": 5765 }, { "epoch": 3.059384941675504, "grad_norm": 1.1695406436920166, "learning_rate": 9.914860497045951e-06, "loss": 0.2195, "num_input_tokens_seen": 5795384, "step": 5770 }, { "epoch": 3.0620360551431602, "grad_norm": 2.54042911529541, "learning_rate": 9.914434847989535e-06, "loss": 0.3103, "num_input_tokens_seen": 5801976, "step": 5775 }, { "epoch": 3.0646871686108166, "grad_norm": 2.2332863807678223, "learning_rate": 9.914008146765477e-06, "loss": 0.3425, "num_input_tokens_seen": 5806872, "step": 5780 }, { "epoch": 3.067338282078473, "grad_norm": 1.9911441802978516, "learning_rate": 9.913580393465133e-06, "loss": 0.3559, "num_input_tokens_seen": 5812280, "step": 5785 }, { "epoch": 3.0699893955461293, "grad_norm": 2.0168025493621826, "learning_rate": 9.913151588180085e-06, "loss": 0.3268, "num_input_tokens_seen": 5817336, "step": 5790 }, { "epoch": 3.0726405090137856, "grad_norm": 1.9230608940124512, "learning_rate": 9.912721731002139e-06, "loss": 0.3141, "num_input_tokens_seen": 5821688, "step": 5795 }, { "epoch": 3.0752916224814424, "grad_norm": 1.7508748769760132, "learning_rate": 9.912290822023325e-06, "loss": 0.2902, "num_input_tokens_seen": 5826872, "step": 5800 }, { "epoch": 3.0779427359490987, "grad_norm": 1.7847315073013306, "learning_rate": 9.911858861335899e-06, "loss": 0.3253, "num_input_tokens_seen": 5830680, "step": 5805 }, { "epoch": 3.080593849416755, "grad_norm": 1.7607054710388184, "learning_rate": 9.911425849032346e-06, "loss": 0.2968, "num_input_tokens_seen": 5836600, "step": 5810 }, { "epoch": 3.0832449628844114, "grad_norm": 3.0736799240112305, "learning_rate": 9.910991785205369e-06, "loss": 0.3056, "num_input_tokens_seen": 5841560, "step": 5815 }, { "epoch": 3.0858960763520678, "grad_norm": 1.2619166374206543, "learning_rate": 9.910556669947902e-06, "loss": 0.2971, "num_input_tokens_seen": 5846936, "step": 5820 }, { "epoch": 3.088547189819724, "grad_norm": 1.1916565895080566, "learning_rate": 9.910120503353102e-06, "loss": 0.3237, "num_input_tokens_seen": 5851160, "step": 5825 }, { "epoch": 3.091198303287381, "grad_norm": 2.8722569942474365, "learning_rate": 9.90968328551435e-06, "loss": 0.3043, "num_input_tokens_seen": 5855544, "step": 5830 }, { "epoch": 3.093849416755037, "grad_norm": 3.369112491607666, "learning_rate": 9.909245016525255e-06, "loss": 0.3711, "num_input_tokens_seen": 5859864, "step": 5835 }, { "epoch": 3.0965005302226936, "grad_norm": 2.051082134246826, "learning_rate": 9.908805696479647e-06, "loss": 0.293, "num_input_tokens_seen": 5865208, "step": 5840 }, { "epoch": 3.09915164369035, "grad_norm": 2.0143747329711914, "learning_rate": 9.908365325471586e-06, "loss": 0.3105, "num_input_tokens_seen": 5870712, "step": 5845 }, { "epoch": 3.1018027571580062, "grad_norm": 1.9842045307159424, "learning_rate": 9.907923903595351e-06, "loss": 0.3379, "num_input_tokens_seen": 5875640, "step": 5850 }, { "epoch": 3.1044538706256626, "grad_norm": 1.924788475036621, "learning_rate": 9.907481430945452e-06, "loss": 0.3148, "num_input_tokens_seen": 5882520, "step": 5855 }, { "epoch": 3.1071049840933194, "grad_norm": 2.342857599258423, "learning_rate": 9.90703790761662e-06, "loss": 0.3503, "num_input_tokens_seen": 5886968, "step": 5860 }, { "epoch": 3.1097560975609757, "grad_norm": 2.691033124923706, "learning_rate": 9.906593333703812e-06, "loss": 0.2926, "num_input_tokens_seen": 5892440, "step": 5865 }, { "epoch": 3.112407211028632, "grad_norm": 2.237422227859497, "learning_rate": 9.90614770930221e-06, "loss": 0.3323, "num_input_tokens_seen": 5899704, "step": 5870 }, { "epoch": 3.1150583244962884, "grad_norm": 1.8929976224899292, "learning_rate": 9.905701034507223e-06, "loss": 0.3595, "num_input_tokens_seen": 5904376, "step": 5875 }, { "epoch": 3.1177094379639447, "grad_norm": 1.906461477279663, "learning_rate": 9.90525330941448e-06, "loss": 0.3368, "num_input_tokens_seen": 5909528, "step": 5880 }, { "epoch": 3.120360551431601, "grad_norm": 1.4794176816940308, "learning_rate": 9.90480453411984e-06, "loss": 0.3129, "num_input_tokens_seen": 5915512, "step": 5885 }, { "epoch": 3.123011664899258, "grad_norm": 1.866586685180664, "learning_rate": 9.904354708719384e-06, "loss": 0.2606, "num_input_tokens_seen": 5921496, "step": 5890 }, { "epoch": 3.125662778366914, "grad_norm": 2.3336451053619385, "learning_rate": 9.903903833309418e-06, "loss": 0.3176, "num_input_tokens_seen": 5926520, "step": 5895 }, { "epoch": 3.1283138918345705, "grad_norm": 2.9293625354766846, "learning_rate": 9.903451907986475e-06, "loss": 0.3279, "num_input_tokens_seen": 5931704, "step": 5900 }, { "epoch": 3.130965005302227, "grad_norm": 1.6851451396942139, "learning_rate": 9.902998932847308e-06, "loss": 0.369, "num_input_tokens_seen": 5935960, "step": 5905 }, { "epoch": 3.133616118769883, "grad_norm": 1.78294038772583, "learning_rate": 9.902544907988898e-06, "loss": 0.4087, "num_input_tokens_seen": 5940856, "step": 5910 }, { "epoch": 3.1362672322375396, "grad_norm": 3.4000203609466553, "learning_rate": 9.902089833508453e-06, "loss": 0.3456, "num_input_tokens_seen": 5944952, "step": 5915 }, { "epoch": 3.1389183457051963, "grad_norm": 1.7638791799545288, "learning_rate": 9.901633709503403e-06, "loss": 0.3533, "num_input_tokens_seen": 5949624, "step": 5920 }, { "epoch": 3.1415694591728527, "grad_norm": 3.567509889602661, "learning_rate": 9.901176536071401e-06, "loss": 0.343, "num_input_tokens_seen": 5954200, "step": 5925 }, { "epoch": 3.144220572640509, "grad_norm": 3.803525924682617, "learning_rate": 9.900718313310329e-06, "loss": 0.3287, "num_input_tokens_seen": 5961848, "step": 5930 }, { "epoch": 3.1468716861081654, "grad_norm": 1.7876296043395996, "learning_rate": 9.90025904131829e-06, "loss": 0.3275, "num_input_tokens_seen": 5965880, "step": 5935 }, { "epoch": 3.1495227995758217, "grad_norm": 1.7992777824401855, "learning_rate": 9.899798720193611e-06, "loss": 0.3004, "num_input_tokens_seen": 5970904, "step": 5940 }, { "epoch": 3.1521739130434785, "grad_norm": 1.7514740228652954, "learning_rate": 9.899337350034849e-06, "loss": 0.2837, "num_input_tokens_seen": 5975928, "step": 5945 }, { "epoch": 3.154825026511135, "grad_norm": 2.1507012844085693, "learning_rate": 9.898874930940781e-06, "loss": 0.272, "num_input_tokens_seen": 5981208, "step": 5950 }, { "epoch": 3.157476139978791, "grad_norm": 1.826375126838684, "learning_rate": 9.898411463010408e-06, "loss": 0.2892, "num_input_tokens_seen": 5987576, "step": 5955 }, { "epoch": 3.1601272534464475, "grad_norm": 3.426370143890381, "learning_rate": 9.897946946342959e-06, "loss": 0.397, "num_input_tokens_seen": 5992120, "step": 5960 }, { "epoch": 3.162778366914104, "grad_norm": 1.3389078378677368, "learning_rate": 9.897481381037886e-06, "loss": 0.3086, "num_input_tokens_seen": 5996824, "step": 5965 }, { "epoch": 3.16542948038176, "grad_norm": 1.9737176895141602, "learning_rate": 9.897014767194864e-06, "loss": 0.3483, "num_input_tokens_seen": 6001336, "step": 5970 }, { "epoch": 3.1680805938494165, "grad_norm": 2.6814982891082764, "learning_rate": 9.896547104913795e-06, "loss": 0.3122, "num_input_tokens_seen": 6006104, "step": 5975 }, { "epoch": 3.1707317073170733, "grad_norm": 2.219510793685913, "learning_rate": 9.896078394294804e-06, "loss": 0.2814, "num_input_tokens_seen": 6010552, "step": 5980 }, { "epoch": 3.1733828207847297, "grad_norm": 2.429180860519409, "learning_rate": 9.89560863543824e-06, "loss": 0.3659, "num_input_tokens_seen": 6015160, "step": 5985 }, { "epoch": 3.176033934252386, "grad_norm": 2.7335972785949707, "learning_rate": 9.895137828444678e-06, "loss": 0.3751, "num_input_tokens_seen": 6019192, "step": 5990 }, { "epoch": 3.1786850477200423, "grad_norm": 2.6600699424743652, "learning_rate": 9.894665973414916e-06, "loss": 0.3381, "num_input_tokens_seen": 6023512, "step": 5995 }, { "epoch": 3.1813361611876987, "grad_norm": 1.8897004127502441, "learning_rate": 9.894193070449978e-06, "loss": 0.3205, "num_input_tokens_seen": 6028600, "step": 6000 }, { "epoch": 3.1839872746553555, "grad_norm": 1.7566715478897095, "learning_rate": 9.893719119651109e-06, "loss": 0.254, "num_input_tokens_seen": 6033240, "step": 6005 }, { "epoch": 3.186638388123012, "grad_norm": 2.56673526763916, "learning_rate": 9.893244121119783e-06, "loss": 0.2434, "num_input_tokens_seen": 6039672, "step": 6010 }, { "epoch": 3.189289501590668, "grad_norm": 1.882104754447937, "learning_rate": 9.892768074957695e-06, "loss": 0.3737, "num_input_tokens_seen": 6045560, "step": 6015 }, { "epoch": 3.1919406150583245, "grad_norm": 2.937410831451416, "learning_rate": 9.892290981266764e-06, "loss": 0.3338, "num_input_tokens_seen": 6049752, "step": 6020 }, { "epoch": 3.194591728525981, "grad_norm": 1.6729097366333008, "learning_rate": 9.891812840149138e-06, "loss": 0.3056, "num_input_tokens_seen": 6054264, "step": 6025 }, { "epoch": 3.197242841993637, "grad_norm": 3.018108606338501, "learning_rate": 9.891333651707182e-06, "loss": 0.3458, "num_input_tokens_seen": 6059992, "step": 6030 }, { "epoch": 3.1998939554612935, "grad_norm": 1.7431305646896362, "learning_rate": 9.890853416043492e-06, "loss": 0.2974, "num_input_tokens_seen": 6064664, "step": 6035 }, { "epoch": 3.2025450689289503, "grad_norm": 1.504824161529541, "learning_rate": 9.890372133260881e-06, "loss": 0.3018, "num_input_tokens_seen": 6069208, "step": 6040 }, { "epoch": 3.2051961823966066, "grad_norm": 3.3824968338012695, "learning_rate": 9.889889803462394e-06, "loss": 0.334, "num_input_tokens_seen": 6074712, "step": 6045 }, { "epoch": 3.207847295864263, "grad_norm": 2.4320380687713623, "learning_rate": 9.889406426751296e-06, "loss": 0.2776, "num_input_tokens_seen": 6079032, "step": 6050 }, { "epoch": 3.2104984093319193, "grad_norm": 2.6125049591064453, "learning_rate": 9.888922003231078e-06, "loss": 0.2843, "num_input_tokens_seen": 6083448, "step": 6055 }, { "epoch": 3.2131495227995757, "grad_norm": 3.96478009223938, "learning_rate": 9.88843653300545e-06, "loss": 0.3372, "num_input_tokens_seen": 6088280, "step": 6060 }, { "epoch": 3.2158006362672324, "grad_norm": 1.8517677783966064, "learning_rate": 9.887950016178352e-06, "loss": 0.2891, "num_input_tokens_seen": 6092184, "step": 6065 }, { "epoch": 3.218451749734889, "grad_norm": 1.6983423233032227, "learning_rate": 9.887462452853947e-06, "loss": 0.2839, "num_input_tokens_seen": 6097336, "step": 6070 }, { "epoch": 3.221102863202545, "grad_norm": 3.9088287353515625, "learning_rate": 9.886973843136621e-06, "loss": 0.2919, "num_input_tokens_seen": 6103096, "step": 6075 }, { "epoch": 3.2237539766702015, "grad_norm": 1.5092169046401978, "learning_rate": 9.88648418713098e-06, "loss": 0.3986, "num_input_tokens_seen": 6108088, "step": 6080 }, { "epoch": 3.226405090137858, "grad_norm": 4.349833965301514, "learning_rate": 9.885993484941864e-06, "loss": 0.3847, "num_input_tokens_seen": 6112152, "step": 6085 }, { "epoch": 3.229056203605514, "grad_norm": 2.1031386852264404, "learning_rate": 9.885501736674327e-06, "loss": 0.329, "num_input_tokens_seen": 6117368, "step": 6090 }, { "epoch": 3.231707317073171, "grad_norm": 2.1996169090270996, "learning_rate": 9.885008942433653e-06, "loss": 0.3084, "num_input_tokens_seen": 6121304, "step": 6095 }, { "epoch": 3.2343584305408273, "grad_norm": 2.483250856399536, "learning_rate": 9.884515102325345e-06, "loss": 0.2813, "num_input_tokens_seen": 6126520, "step": 6100 }, { "epoch": 3.2370095440084836, "grad_norm": 2.942196846008301, "learning_rate": 9.884020216455138e-06, "loss": 0.3542, "num_input_tokens_seen": 6130520, "step": 6105 }, { "epoch": 3.23966065747614, "grad_norm": 1.8526769876480103, "learning_rate": 9.883524284928982e-06, "loss": 0.3581, "num_input_tokens_seen": 6135224, "step": 6110 }, { "epoch": 3.2423117709437963, "grad_norm": 2.2246673107147217, "learning_rate": 9.883027307853055e-06, "loss": 0.297, "num_input_tokens_seen": 6140792, "step": 6115 }, { "epoch": 3.2449628844114526, "grad_norm": 3.391726493835449, "learning_rate": 9.882529285333758e-06, "loss": 0.3349, "num_input_tokens_seen": 6146072, "step": 6120 }, { "epoch": 3.2476139978791094, "grad_norm": 2.987734317779541, "learning_rate": 9.882030217477718e-06, "loss": 0.3466, "num_input_tokens_seen": 6151256, "step": 6125 }, { "epoch": 3.2502651113467658, "grad_norm": 2.1100728511810303, "learning_rate": 9.881530104391785e-06, "loss": 0.3441, "num_input_tokens_seen": 6155448, "step": 6130 }, { "epoch": 3.252916224814422, "grad_norm": 2.760686159133911, "learning_rate": 9.881028946183028e-06, "loss": 0.3367, "num_input_tokens_seen": 6160280, "step": 6135 }, { "epoch": 3.2555673382820784, "grad_norm": 1.9976508617401123, "learning_rate": 9.880526742958748e-06, "loss": 0.281, "num_input_tokens_seen": 6164696, "step": 6140 }, { "epoch": 3.258218451749735, "grad_norm": 1.9971928596496582, "learning_rate": 9.880023494826463e-06, "loss": 0.315, "num_input_tokens_seen": 6169176, "step": 6145 }, { "epoch": 3.260869565217391, "grad_norm": 2.238617420196533, "learning_rate": 9.879519201893918e-06, "loss": 0.3514, "num_input_tokens_seen": 6174200, "step": 6150 }, { "epoch": 3.2635206786850475, "grad_norm": 3.1454010009765625, "learning_rate": 9.879013864269079e-06, "loss": 0.3139, "num_input_tokens_seen": 6179992, "step": 6155 }, { "epoch": 3.2661717921527043, "grad_norm": 1.4444589614868164, "learning_rate": 9.878507482060138e-06, "loss": 0.2849, "num_input_tokens_seen": 6185048, "step": 6160 }, { "epoch": 3.2688229056203606, "grad_norm": 3.217843770980835, "learning_rate": 9.878000055375512e-06, "loss": 0.402, "num_input_tokens_seen": 6190424, "step": 6165 }, { "epoch": 3.271474019088017, "grad_norm": 3.2132411003112793, "learning_rate": 9.87749158432384e-06, "loss": 0.3365, "num_input_tokens_seen": 6194776, "step": 6170 }, { "epoch": 3.2741251325556733, "grad_norm": 1.7145729064941406, "learning_rate": 9.87698206901398e-06, "loss": 0.2777, "num_input_tokens_seen": 6199000, "step": 6175 }, { "epoch": 3.2767762460233296, "grad_norm": 2.486027479171753, "learning_rate": 9.876471509555021e-06, "loss": 0.3416, "num_input_tokens_seen": 6203736, "step": 6180 }, { "epoch": 3.2794273594909864, "grad_norm": 2.318505048751831, "learning_rate": 9.875959906056274e-06, "loss": 0.2634, "num_input_tokens_seen": 6209208, "step": 6185 }, { "epoch": 3.2820784729586427, "grad_norm": 3.4909374713897705, "learning_rate": 9.875447258627269e-06, "loss": 0.3331, "num_input_tokens_seen": 6214648, "step": 6190 }, { "epoch": 3.284729586426299, "grad_norm": 3.503173589706421, "learning_rate": 9.874933567377765e-06, "loss": 0.3731, "num_input_tokens_seen": 6220184, "step": 6195 }, { "epoch": 3.2873806998939554, "grad_norm": 2.1822328567504883, "learning_rate": 9.87441883241774e-06, "loss": 0.3332, "num_input_tokens_seen": 6225016, "step": 6200 }, { "epoch": 3.2900318133616118, "grad_norm": 3.4192891120910645, "learning_rate": 9.8739030538574e-06, "loss": 0.3652, "num_input_tokens_seen": 6229816, "step": 6205 }, { "epoch": 3.292682926829268, "grad_norm": 2.4552364349365234, "learning_rate": 9.873386231807166e-06, "loss": 0.2686, "num_input_tokens_seen": 6234392, "step": 6210 }, { "epoch": 3.295334040296925, "grad_norm": 2.336777925491333, "learning_rate": 9.872868366377695e-06, "loss": 0.3044, "num_input_tokens_seen": 6239192, "step": 6215 }, { "epoch": 3.2979851537645812, "grad_norm": 3.137230634689331, "learning_rate": 9.872349457679856e-06, "loss": 0.3275, "num_input_tokens_seen": 6244696, "step": 6220 }, { "epoch": 3.3006362672322376, "grad_norm": 4.613373279571533, "learning_rate": 9.871829505824747e-06, "loss": 0.316, "num_input_tokens_seen": 6249400, "step": 6225 }, { "epoch": 3.303287380699894, "grad_norm": 2.9306511878967285, "learning_rate": 9.871308510923692e-06, "loss": 0.3076, "num_input_tokens_seen": 6254296, "step": 6230 }, { "epoch": 3.3059384941675503, "grad_norm": 3.109279155731201, "learning_rate": 9.87078647308823e-06, "loss": 0.3367, "num_input_tokens_seen": 6260536, "step": 6235 }, { "epoch": 3.3085896076352066, "grad_norm": 5.785363674163818, "learning_rate": 9.87026339243013e-06, "loss": 0.3316, "num_input_tokens_seen": 6265304, "step": 6240 }, { "epoch": 3.3112407211028634, "grad_norm": 3.6005873680114746, "learning_rate": 9.869739269061383e-06, "loss": 0.3481, "num_input_tokens_seen": 6269560, "step": 6245 }, { "epoch": 3.3138918345705197, "grad_norm": 2.684107542037964, "learning_rate": 9.8692141030942e-06, "loss": 0.3074, "num_input_tokens_seen": 6273976, "step": 6250 }, { "epoch": 3.316542948038176, "grad_norm": 5.428035259246826, "learning_rate": 9.86868789464102e-06, "loss": 0.4224, "num_input_tokens_seen": 6277816, "step": 6255 }, { "epoch": 3.3191940615058324, "grad_norm": 2.271782875061035, "learning_rate": 9.8681606438145e-06, "loss": 0.26, "num_input_tokens_seen": 6283032, "step": 6260 }, { "epoch": 3.3218451749734887, "grad_norm": 2.3140647411346436, "learning_rate": 9.867632350727527e-06, "loss": 0.373, "num_input_tokens_seen": 6288344, "step": 6265 }, { "epoch": 3.3244962884411455, "grad_norm": 2.1529948711395264, "learning_rate": 9.867103015493204e-06, "loss": 0.3233, "num_input_tokens_seen": 6293304, "step": 6270 }, { "epoch": 3.327147401908802, "grad_norm": 2.895756959915161, "learning_rate": 9.86657263822486e-06, "loss": 0.3348, "num_input_tokens_seen": 6298616, "step": 6275 }, { "epoch": 3.329798515376458, "grad_norm": 1.6624668836593628, "learning_rate": 9.866041219036051e-06, "loss": 0.3058, "num_input_tokens_seen": 6305496, "step": 6280 }, { "epoch": 3.3324496288441146, "grad_norm": 3.826225519180298, "learning_rate": 9.86550875804055e-06, "loss": 0.3744, "num_input_tokens_seen": 6309400, "step": 6285 }, { "epoch": 3.335100742311771, "grad_norm": 2.1825172901153564, "learning_rate": 9.864975255352354e-06, "loss": 0.3213, "num_input_tokens_seen": 6314680, "step": 6290 }, { "epoch": 3.3377518557794272, "grad_norm": 2.924736976623535, "learning_rate": 9.864440711085687e-06, "loss": 0.343, "num_input_tokens_seen": 6320024, "step": 6295 }, { "epoch": 3.3404029692470836, "grad_norm": 2.3977038860321045, "learning_rate": 9.863905125354992e-06, "loss": 0.2756, "num_input_tokens_seen": 6325656, "step": 6300 }, { "epoch": 3.3430540827147404, "grad_norm": 3.6082870960235596, "learning_rate": 9.86336849827494e-06, "loss": 0.2902, "num_input_tokens_seen": 6330584, "step": 6305 }, { "epoch": 3.3457051961823967, "grad_norm": 2.8761579990386963, "learning_rate": 9.862830829960414e-06, "loss": 0.2773, "num_input_tokens_seen": 6335224, "step": 6310 }, { "epoch": 3.348356309650053, "grad_norm": 2.5109779834747314, "learning_rate": 9.862292120526536e-06, "loss": 0.3123, "num_input_tokens_seen": 6340568, "step": 6315 }, { "epoch": 3.3510074231177094, "grad_norm": 4.022862911224365, "learning_rate": 9.861752370088637e-06, "loss": 0.365, "num_input_tokens_seen": 6345560, "step": 6320 }, { "epoch": 3.3536585365853657, "grad_norm": 3.9428131580352783, "learning_rate": 9.861211578762276e-06, "loss": 0.326, "num_input_tokens_seen": 6352632, "step": 6325 }, { "epoch": 3.3563096500530225, "grad_norm": 1.7322500944137573, "learning_rate": 9.860669746663238e-06, "loss": 0.3257, "num_input_tokens_seen": 6358840, "step": 6330 }, { "epoch": 3.358960763520679, "grad_norm": 2.1373724937438965, "learning_rate": 9.860126873907527e-06, "loss": 0.2652, "num_input_tokens_seen": 6363640, "step": 6335 }, { "epoch": 3.361611876988335, "grad_norm": 2.2313268184661865, "learning_rate": 9.859582960611369e-06, "loss": 0.2539, "num_input_tokens_seen": 6367768, "step": 6340 }, { "epoch": 3.3642629904559915, "grad_norm": 2.439635992050171, "learning_rate": 9.859038006891215e-06, "loss": 0.2646, "num_input_tokens_seen": 6372376, "step": 6345 }, { "epoch": 3.366914103923648, "grad_norm": 2.3321585655212402, "learning_rate": 9.858492012863739e-06, "loss": 0.2928, "num_input_tokens_seen": 6376248, "step": 6350 }, { "epoch": 3.369565217391304, "grad_norm": 2.284290313720703, "learning_rate": 9.857944978645838e-06, "loss": 0.2839, "num_input_tokens_seen": 6380856, "step": 6355 }, { "epoch": 3.3722163308589606, "grad_norm": 2.933971881866455, "learning_rate": 9.857396904354627e-06, "loss": 0.2799, "num_input_tokens_seen": 6385816, "step": 6360 }, { "epoch": 3.3748674443266173, "grad_norm": 3.4215445518493652, "learning_rate": 9.85684779010745e-06, "loss": 0.3518, "num_input_tokens_seen": 6390904, "step": 6365 }, { "epoch": 3.3775185577942737, "grad_norm": 2.7537693977355957, "learning_rate": 9.85629763602187e-06, "loss": 0.3311, "num_input_tokens_seen": 6395512, "step": 6370 }, { "epoch": 3.38016967126193, "grad_norm": 4.653378486633301, "learning_rate": 9.855746442215675e-06, "loss": 0.3206, "num_input_tokens_seen": 6400408, "step": 6375 }, { "epoch": 3.3828207847295864, "grad_norm": 2.8695359230041504, "learning_rate": 9.855194208806871e-06, "loss": 0.28, "num_input_tokens_seen": 6405272, "step": 6380 }, { "epoch": 3.3854718981972427, "grad_norm": 2.437791585922241, "learning_rate": 9.854640935913693e-06, "loss": 0.259, "num_input_tokens_seen": 6409912, "step": 6385 }, { "epoch": 3.3881230116648995, "grad_norm": 3.6889877319335938, "learning_rate": 9.854086623654594e-06, "loss": 0.2759, "num_input_tokens_seen": 6414936, "step": 6390 }, { "epoch": 3.390774125132556, "grad_norm": 5.037117004394531, "learning_rate": 9.853531272148248e-06, "loss": 0.3675, "num_input_tokens_seen": 6420472, "step": 6395 }, { "epoch": 3.393425238600212, "grad_norm": 3.6316449642181396, "learning_rate": 9.852974881513559e-06, "loss": 0.2854, "num_input_tokens_seen": 6425528, "step": 6400 }, { "epoch": 3.3960763520678685, "grad_norm": 2.8533341884613037, "learning_rate": 9.852417451869649e-06, "loss": 0.253, "num_input_tokens_seen": 6431000, "step": 6405 }, { "epoch": 3.398727465535525, "grad_norm": 2.6568479537963867, "learning_rate": 9.851858983335857e-06, "loss": 0.3354, "num_input_tokens_seen": 6436088, "step": 6410 }, { "epoch": 3.401378579003181, "grad_norm": 1.4414536952972412, "learning_rate": 9.851299476031755e-06, "loss": 0.3493, "num_input_tokens_seen": 6441016, "step": 6415 }, { "epoch": 3.4040296924708375, "grad_norm": 4.231828212738037, "learning_rate": 9.850738930077129e-06, "loss": 0.3423, "num_input_tokens_seen": 6445496, "step": 6420 }, { "epoch": 3.4066808059384943, "grad_norm": 2.283458948135376, "learning_rate": 9.850177345591988e-06, "loss": 0.2412, "num_input_tokens_seen": 6450296, "step": 6425 }, { "epoch": 3.4093319194061507, "grad_norm": 2.8732235431671143, "learning_rate": 9.849614722696574e-06, "loss": 0.4377, "num_input_tokens_seen": 6456056, "step": 6430 }, { "epoch": 3.411983032873807, "grad_norm": 3.083784580230713, "learning_rate": 9.849051061511334e-06, "loss": 0.2945, "num_input_tokens_seen": 6461976, "step": 6435 }, { "epoch": 3.4146341463414633, "grad_norm": 2.6753735542297363, "learning_rate": 9.848486362156952e-06, "loss": 0.3876, "num_input_tokens_seen": 6466616, "step": 6440 }, { "epoch": 3.4172852598091197, "grad_norm": 1.7077982425689697, "learning_rate": 9.847920624754328e-06, "loss": 0.322, "num_input_tokens_seen": 6471096, "step": 6445 }, { "epoch": 3.4199363732767765, "grad_norm": 2.3505852222442627, "learning_rate": 9.847353849424583e-06, "loss": 0.3092, "num_input_tokens_seen": 6475288, "step": 6450 }, { "epoch": 3.422587486744433, "grad_norm": 1.568376898765564, "learning_rate": 9.846786036289064e-06, "loss": 0.3481, "num_input_tokens_seen": 6480440, "step": 6455 }, { "epoch": 3.425238600212089, "grad_norm": 4.773524761199951, "learning_rate": 9.846217185469335e-06, "loss": 0.3247, "num_input_tokens_seen": 6485752, "step": 6460 }, { "epoch": 3.4278897136797455, "grad_norm": 1.6111854314804077, "learning_rate": 9.84564729708719e-06, "loss": 0.2814, "num_input_tokens_seen": 6492280, "step": 6465 }, { "epoch": 3.430540827147402, "grad_norm": 1.9263168573379517, "learning_rate": 9.845076371264638e-06, "loss": 0.2213, "num_input_tokens_seen": 6498392, "step": 6470 }, { "epoch": 3.433191940615058, "grad_norm": 3.197970151901245, "learning_rate": 9.844504408123914e-06, "loss": 0.3364, "num_input_tokens_seen": 6503128, "step": 6475 }, { "epoch": 3.4358430540827145, "grad_norm": 3.124413251876831, "learning_rate": 9.843931407787472e-06, "loss": 0.3425, "num_input_tokens_seen": 6507768, "step": 6480 }, { "epoch": 3.4384941675503713, "grad_norm": 3.3536453247070312, "learning_rate": 9.84335737037799e-06, "loss": 0.3946, "num_input_tokens_seen": 6512600, "step": 6485 }, { "epoch": 3.4411452810180276, "grad_norm": 4.058743000030518, "learning_rate": 9.842782296018371e-06, "loss": 0.377, "num_input_tokens_seen": 6517592, "step": 6490 }, { "epoch": 3.443796394485684, "grad_norm": 2.7563869953155518, "learning_rate": 9.842206184831734e-06, "loss": 0.3465, "num_input_tokens_seen": 6522488, "step": 6495 }, { "epoch": 3.4464475079533403, "grad_norm": 2.3332159519195557, "learning_rate": 9.841629036941423e-06, "loss": 0.353, "num_input_tokens_seen": 6528248, "step": 6500 }, { "epoch": 3.4490986214209967, "grad_norm": 2.6160027980804443, "learning_rate": 9.841050852471005e-06, "loss": 0.2563, "num_input_tokens_seen": 6533336, "step": 6505 }, { "epoch": 3.4517497348886534, "grad_norm": 2.218707323074341, "learning_rate": 9.840471631544266e-06, "loss": 0.3418, "num_input_tokens_seen": 6538392, "step": 6510 }, { "epoch": 3.45440084835631, "grad_norm": 2.301093578338623, "learning_rate": 9.839891374285217e-06, "loss": 0.2803, "num_input_tokens_seen": 6542840, "step": 6515 }, { "epoch": 3.457051961823966, "grad_norm": 2.858103036880493, "learning_rate": 9.839310080818092e-06, "loss": 0.3266, "num_input_tokens_seen": 6549176, "step": 6520 }, { "epoch": 3.4597030752916225, "grad_norm": 2.2283449172973633, "learning_rate": 9.83872775126734e-06, "loss": 0.4152, "num_input_tokens_seen": 6554296, "step": 6525 }, { "epoch": 3.462354188759279, "grad_norm": 1.888743281364441, "learning_rate": 9.838144385757637e-06, "loss": 0.3195, "num_input_tokens_seen": 6559480, "step": 6530 }, { "epoch": 3.465005302226935, "grad_norm": 2.269221305847168, "learning_rate": 9.837559984413883e-06, "loss": 0.344, "num_input_tokens_seen": 6565688, "step": 6535 }, { "epoch": 3.4676564156945915, "grad_norm": 1.7364530563354492, "learning_rate": 9.836974547361196e-06, "loss": 0.2838, "num_input_tokens_seen": 6571288, "step": 6540 }, { "epoch": 3.4703075291622483, "grad_norm": 2.0036189556121826, "learning_rate": 9.836388074724913e-06, "loss": 0.2353, "num_input_tokens_seen": 6576856, "step": 6545 }, { "epoch": 3.4729586426299046, "grad_norm": 2.9279208183288574, "learning_rate": 9.8358005666306e-06, "loss": 0.2961, "num_input_tokens_seen": 6581528, "step": 6550 }, { "epoch": 3.475609756097561, "grad_norm": 4.0263800621032715, "learning_rate": 9.835212023204039e-06, "loss": 0.3386, "num_input_tokens_seen": 6586104, "step": 6555 }, { "epoch": 3.4782608695652173, "grad_norm": 3.406116247177124, "learning_rate": 9.834622444571235e-06, "loss": 0.3614, "num_input_tokens_seen": 6590456, "step": 6560 }, { "epoch": 3.4809119830328736, "grad_norm": 2.934072256088257, "learning_rate": 9.834031830858419e-06, "loss": 0.3474, "num_input_tokens_seen": 6596280, "step": 6565 }, { "epoch": 3.4835630965005304, "grad_norm": 2.586231231689453, "learning_rate": 9.833440182192038e-06, "loss": 0.3326, "num_input_tokens_seen": 6600216, "step": 6570 }, { "epoch": 3.4862142099681868, "grad_norm": 3.3697288036346436, "learning_rate": 9.83284749869876e-06, "loss": 0.3197, "num_input_tokens_seen": 6604024, "step": 6575 }, { "epoch": 3.488865323435843, "grad_norm": 2.2110538482666016, "learning_rate": 9.832253780505479e-06, "loss": 0.2941, "num_input_tokens_seen": 6608760, "step": 6580 }, { "epoch": 3.4915164369034994, "grad_norm": 2.65781307220459, "learning_rate": 9.831659027739308e-06, "loss": 0.2222, "num_input_tokens_seen": 6614104, "step": 6585 }, { "epoch": 3.494167550371156, "grad_norm": 3.9955320358276367, "learning_rate": 9.831063240527582e-06, "loss": 0.3785, "num_input_tokens_seen": 6618680, "step": 6590 }, { "epoch": 3.496818663838812, "grad_norm": 3.2640082836151123, "learning_rate": 9.83046641899786e-06, "loss": 0.3271, "num_input_tokens_seen": 6624376, "step": 6595 }, { "epoch": 3.499469777306469, "grad_norm": 2.5128774642944336, "learning_rate": 9.829868563277918e-06, "loss": 0.2688, "num_input_tokens_seen": 6629240, "step": 6600 }, { "epoch": 3.5021208907741253, "grad_norm": 2.3034257888793945, "learning_rate": 9.829269673495752e-06, "loss": 0.3229, "num_input_tokens_seen": 6634136, "step": 6605 }, { "epoch": 3.5047720042417816, "grad_norm": 2.4610347747802734, "learning_rate": 9.828669749779589e-06, "loss": 0.3301, "num_input_tokens_seen": 6638648, "step": 6610 }, { "epoch": 3.507423117709438, "grad_norm": 2.734368085861206, "learning_rate": 9.828068792257865e-06, "loss": 0.2473, "num_input_tokens_seen": 6643544, "step": 6615 }, { "epoch": 3.5100742311770943, "grad_norm": 2.959641456604004, "learning_rate": 9.827466801059247e-06, "loss": 0.3248, "num_input_tokens_seen": 6647896, "step": 6620 }, { "epoch": 3.5127253446447506, "grad_norm": 1.9242711067199707, "learning_rate": 9.826863776312621e-06, "loss": 0.3032, "num_input_tokens_seen": 6652216, "step": 6625 }, { "epoch": 3.5153764581124074, "grad_norm": 1.7953715324401855, "learning_rate": 9.82625971814709e-06, "loss": 0.2851, "num_input_tokens_seen": 6656984, "step": 6630 }, { "epoch": 3.5180275715800637, "grad_norm": 3.984525442123413, "learning_rate": 9.825654626691982e-06, "loss": 0.3143, "num_input_tokens_seen": 6662072, "step": 6635 }, { "epoch": 3.52067868504772, "grad_norm": 1.6500533819198608, "learning_rate": 9.825048502076846e-06, "loss": 0.2284, "num_input_tokens_seen": 6667064, "step": 6640 }, { "epoch": 3.5233297985153764, "grad_norm": 1.7849220037460327, "learning_rate": 9.824441344431453e-06, "loss": 0.2917, "num_input_tokens_seen": 6671800, "step": 6645 }, { "epoch": 3.5259809119830328, "grad_norm": 2.9593067169189453, "learning_rate": 9.82383315388579e-06, "loss": 0.3588, "num_input_tokens_seen": 6678424, "step": 6650 }, { "epoch": 3.5286320254506895, "grad_norm": 3.6051852703094482, "learning_rate": 9.823223930570073e-06, "loss": 0.3022, "num_input_tokens_seen": 6683640, "step": 6655 }, { "epoch": 3.5312831389183454, "grad_norm": 3.0287845134735107, "learning_rate": 9.822613674614734e-06, "loss": 0.3094, "num_input_tokens_seen": 6688856, "step": 6660 }, { "epoch": 3.5339342523860022, "grad_norm": 2.6870155334472656, "learning_rate": 9.822002386150426e-06, "loss": 0.2435, "num_input_tokens_seen": 6694296, "step": 6665 }, { "epoch": 3.5365853658536586, "grad_norm": 4.999189853668213, "learning_rate": 9.821390065308026e-06, "loss": 0.3482, "num_input_tokens_seen": 6699288, "step": 6670 }, { "epoch": 3.539236479321315, "grad_norm": 3.7854580879211426, "learning_rate": 9.820776712218626e-06, "loss": 0.2793, "num_input_tokens_seen": 6704056, "step": 6675 }, { "epoch": 3.5418875927889713, "grad_norm": 3.0133605003356934, "learning_rate": 9.82016232701355e-06, "loss": 0.2852, "num_input_tokens_seen": 6710008, "step": 6680 }, { "epoch": 3.5445387062566276, "grad_norm": 6.504159927368164, "learning_rate": 9.819546909824332e-06, "loss": 0.3789, "num_input_tokens_seen": 6714616, "step": 6685 }, { "epoch": 3.5471898197242844, "grad_norm": 2.728144407272339, "learning_rate": 9.818930460782732e-06, "loss": 0.2887, "num_input_tokens_seen": 6720920, "step": 6690 }, { "epoch": 3.5498409331919407, "grad_norm": 2.691136121749878, "learning_rate": 9.81831298002073e-06, "loss": 0.3085, "num_input_tokens_seen": 6726776, "step": 6695 }, { "epoch": 3.552492046659597, "grad_norm": 4.489621162414551, "learning_rate": 9.817694467670527e-06, "loss": 0.3192, "num_input_tokens_seen": 6732248, "step": 6700 }, { "epoch": 3.5551431601272534, "grad_norm": 2.5153005123138428, "learning_rate": 9.817074923864546e-06, "loss": 0.2952, "num_input_tokens_seen": 6737112, "step": 6705 }, { "epoch": 3.5577942735949097, "grad_norm": 3.2876250743865967, "learning_rate": 9.816454348735429e-06, "loss": 0.3175, "num_input_tokens_seen": 6741528, "step": 6710 }, { "epoch": 3.5604453870625665, "grad_norm": 3.645113468170166, "learning_rate": 9.815832742416038e-06, "loss": 0.3256, "num_input_tokens_seen": 6747000, "step": 6715 }, { "epoch": 3.5630965005302224, "grad_norm": 4.323393821716309, "learning_rate": 9.81521010503946e-06, "loss": 0.4039, "num_input_tokens_seen": 6751864, "step": 6720 }, { "epoch": 3.565747613997879, "grad_norm": 2.287567377090454, "learning_rate": 9.814586436738998e-06, "loss": 0.2428, "num_input_tokens_seen": 6756152, "step": 6725 }, { "epoch": 3.5683987274655355, "grad_norm": 4.297470569610596, "learning_rate": 9.813961737648178e-06, "loss": 0.3024, "num_input_tokens_seen": 6760920, "step": 6730 }, { "epoch": 3.571049840933192, "grad_norm": 4.790125846862793, "learning_rate": 9.813336007900747e-06, "loss": 0.3113, "num_input_tokens_seen": 6766616, "step": 6735 }, { "epoch": 3.5737009544008482, "grad_norm": 2.59946346282959, "learning_rate": 9.812709247630671e-06, "loss": 0.2914, "num_input_tokens_seen": 6771832, "step": 6740 }, { "epoch": 3.5763520678685046, "grad_norm": 2.4719467163085938, "learning_rate": 9.81208145697214e-06, "loss": 0.3233, "num_input_tokens_seen": 6777176, "step": 6745 }, { "epoch": 3.5790031813361614, "grad_norm": 3.015824556350708, "learning_rate": 9.811452636059562e-06, "loss": 0.2927, "num_input_tokens_seen": 6781752, "step": 6750 }, { "epoch": 3.5816542948038177, "grad_norm": 2.8888561725616455, "learning_rate": 9.810822785027563e-06, "loss": 0.338, "num_input_tokens_seen": 6787064, "step": 6755 }, { "epoch": 3.584305408271474, "grad_norm": 2.9463090896606445, "learning_rate": 9.810191904010995e-06, "loss": 0.3078, "num_input_tokens_seen": 6793016, "step": 6760 }, { "epoch": 3.5869565217391304, "grad_norm": 2.6753828525543213, "learning_rate": 9.809559993144927e-06, "loss": 0.3378, "num_input_tokens_seen": 6797592, "step": 6765 }, { "epoch": 3.5896076352067867, "grad_norm": 3.465250015258789, "learning_rate": 9.808927052564651e-06, "loss": 0.3034, "num_input_tokens_seen": 6802136, "step": 6770 }, { "epoch": 3.5922587486744435, "grad_norm": 4.382126331329346, "learning_rate": 9.808293082405675e-06, "loss": 0.3047, "num_input_tokens_seen": 6807544, "step": 6775 }, { "epoch": 3.5949098621421, "grad_norm": 2.899468421936035, "learning_rate": 9.807658082803735e-06, "loss": 0.3467, "num_input_tokens_seen": 6812856, "step": 6780 }, { "epoch": 3.597560975609756, "grad_norm": 3.8113937377929688, "learning_rate": 9.807022053894777e-06, "loss": 0.2826, "num_input_tokens_seen": 6818744, "step": 6785 }, { "epoch": 3.6002120890774125, "grad_norm": 2.2592194080352783, "learning_rate": 9.806384995814977e-06, "loss": 0.3172, "num_input_tokens_seen": 6825336, "step": 6790 }, { "epoch": 3.602863202545069, "grad_norm": 1.5957125425338745, "learning_rate": 9.805746908700726e-06, "loss": 0.2749, "num_input_tokens_seen": 6829720, "step": 6795 }, { "epoch": 3.605514316012725, "grad_norm": 4.045586109161377, "learning_rate": 9.805107792688638e-06, "loss": 0.321, "num_input_tokens_seen": 6833848, "step": 6800 }, { "epoch": 3.6081654294803815, "grad_norm": 3.7757341861724854, "learning_rate": 9.804467647915546e-06, "loss": 0.461, "num_input_tokens_seen": 6838840, "step": 6805 }, { "epoch": 3.6108165429480383, "grad_norm": 2.213986873626709, "learning_rate": 9.803826474518501e-06, "loss": 0.3388, "num_input_tokens_seen": 6842872, "step": 6810 }, { "epoch": 3.6134676564156947, "grad_norm": 3.2147367000579834, "learning_rate": 9.803184272634778e-06, "loss": 0.3496, "num_input_tokens_seen": 6847480, "step": 6815 }, { "epoch": 3.616118769883351, "grad_norm": 2.2460250854492188, "learning_rate": 9.802541042401872e-06, "loss": 0.3331, "num_input_tokens_seen": 6851256, "step": 6820 }, { "epoch": 3.6187698833510074, "grad_norm": 3.1435844898223877, "learning_rate": 9.801896783957495e-06, "loss": 0.2573, "num_input_tokens_seen": 6855736, "step": 6825 }, { "epoch": 3.6214209968186637, "grad_norm": 2.331561326980591, "learning_rate": 9.801251497439582e-06, "loss": 0.2718, "num_input_tokens_seen": 6861272, "step": 6830 }, { "epoch": 3.6240721102863205, "grad_norm": 3.3228018283843994, "learning_rate": 9.800605182986287e-06, "loss": 0.3425, "num_input_tokens_seen": 6866456, "step": 6835 }, { "epoch": 3.626723223753977, "grad_norm": 2.9784793853759766, "learning_rate": 9.799957840735986e-06, "loss": 0.2742, "num_input_tokens_seen": 6870936, "step": 6840 }, { "epoch": 3.629374337221633, "grad_norm": 3.3336102962493896, "learning_rate": 9.799309470827268e-06, "loss": 0.3803, "num_input_tokens_seen": 6878168, "step": 6845 }, { "epoch": 3.6320254506892895, "grad_norm": 2.5911002159118652, "learning_rate": 9.798660073398953e-06, "loss": 0.3806, "num_input_tokens_seen": 6882456, "step": 6850 }, { "epoch": 3.634676564156946, "grad_norm": 4.53442907333374, "learning_rate": 9.798009648590073e-06, "loss": 0.4174, "num_input_tokens_seen": 6886680, "step": 6855 }, { "epoch": 3.637327677624602, "grad_norm": 2.015073299407959, "learning_rate": 9.797358196539883e-06, "loss": 0.3519, "num_input_tokens_seen": 6892280, "step": 6860 }, { "epoch": 3.6399787910922585, "grad_norm": 2.2529613971710205, "learning_rate": 9.796705717387856e-06, "loss": 0.3025, "num_input_tokens_seen": 6896888, "step": 6865 }, { "epoch": 3.6426299045599153, "grad_norm": 3.038663864135742, "learning_rate": 9.796052211273686e-06, "loss": 0.3305, "num_input_tokens_seen": 6902040, "step": 6870 }, { "epoch": 3.6452810180275717, "grad_norm": 1.6990277767181396, "learning_rate": 9.795397678337289e-06, "loss": 0.3143, "num_input_tokens_seen": 6907064, "step": 6875 }, { "epoch": 3.647932131495228, "grad_norm": 1.5869895219802856, "learning_rate": 9.794742118718794e-06, "loss": 0.2875, "num_input_tokens_seen": 6912344, "step": 6880 }, { "epoch": 3.6505832449628843, "grad_norm": 2.5048224925994873, "learning_rate": 9.794085532558561e-06, "loss": 0.3177, "num_input_tokens_seen": 6917304, "step": 6885 }, { "epoch": 3.6532343584305407, "grad_norm": 2.2562148571014404, "learning_rate": 9.79342791999716e-06, "loss": 0.3201, "num_input_tokens_seen": 6922264, "step": 6890 }, { "epoch": 3.6558854718981975, "grad_norm": 2.3633341789245605, "learning_rate": 9.792769281175384e-06, "loss": 0.3102, "num_input_tokens_seen": 6927928, "step": 6895 }, { "epoch": 3.658536585365854, "grad_norm": 2.2164418697357178, "learning_rate": 9.792109616234247e-06, "loss": 0.2647, "num_input_tokens_seen": 6933528, "step": 6900 }, { "epoch": 3.66118769883351, "grad_norm": 1.4946179389953613, "learning_rate": 9.791448925314979e-06, "loss": 0.232, "num_input_tokens_seen": 6938584, "step": 6905 }, { "epoch": 3.6638388123011665, "grad_norm": 2.9087586402893066, "learning_rate": 9.790787208559036e-06, "loss": 0.2351, "num_input_tokens_seen": 6943832, "step": 6910 }, { "epoch": 3.666489925768823, "grad_norm": 2.3698668479919434, "learning_rate": 9.790124466108086e-06, "loss": 0.2901, "num_input_tokens_seen": 6949208, "step": 6915 }, { "epoch": 3.669141039236479, "grad_norm": 5.668125152587891, "learning_rate": 9.789460698104025e-06, "loss": 0.4464, "num_input_tokens_seen": 6954968, "step": 6920 }, { "epoch": 3.6717921527041355, "grad_norm": 3.2248575687408447, "learning_rate": 9.788795904688959e-06, "loss": 0.3021, "num_input_tokens_seen": 6960120, "step": 6925 }, { "epoch": 3.6744432661717923, "grad_norm": 2.266167402267456, "learning_rate": 9.788130086005222e-06, "loss": 0.3078, "num_input_tokens_seen": 6965112, "step": 6930 }, { "epoch": 3.6770943796394486, "grad_norm": 3.37200665473938, "learning_rate": 9.787463242195362e-06, "loss": 0.2864, "num_input_tokens_seen": 6969560, "step": 6935 }, { "epoch": 3.679745493107105, "grad_norm": 3.788750171661377, "learning_rate": 9.786795373402151e-06, "loss": 0.3992, "num_input_tokens_seen": 6975000, "step": 6940 }, { "epoch": 3.6823966065747613, "grad_norm": 2.4354054927825928, "learning_rate": 9.786126479768574e-06, "loss": 0.2176, "num_input_tokens_seen": 6980408, "step": 6945 }, { "epoch": 3.6850477200424177, "grad_norm": 2.116508960723877, "learning_rate": 9.785456561437843e-06, "loss": 0.2227, "num_input_tokens_seen": 6986264, "step": 6950 }, { "epoch": 3.6876988335100744, "grad_norm": 2.423370599746704, "learning_rate": 9.784785618553384e-06, "loss": 0.3121, "num_input_tokens_seen": 6990936, "step": 6955 }, { "epoch": 3.6903499469777308, "grad_norm": 2.335563898086548, "learning_rate": 9.784113651258845e-06, "loss": 0.2146, "num_input_tokens_seen": 6996088, "step": 6960 }, { "epoch": 3.693001060445387, "grad_norm": 2.7717902660369873, "learning_rate": 9.783440659698092e-06, "loss": 0.288, "num_input_tokens_seen": 7001368, "step": 6965 }, { "epoch": 3.6956521739130435, "grad_norm": 3.566831588745117, "learning_rate": 9.782766644015212e-06, "loss": 0.3496, "num_input_tokens_seen": 7006136, "step": 6970 }, { "epoch": 3.6983032873807, "grad_norm": 2.722048044204712, "learning_rate": 9.782091604354507e-06, "loss": 0.278, "num_input_tokens_seen": 7010904, "step": 6975 }, { "epoch": 3.7009544008483566, "grad_norm": 2.3402836322784424, "learning_rate": 9.781415540860503e-06, "loss": 0.1828, "num_input_tokens_seen": 7015768, "step": 6980 }, { "epoch": 3.7036055143160125, "grad_norm": 3.7042083740234375, "learning_rate": 9.780738453677943e-06, "loss": 0.274, "num_input_tokens_seen": 7021880, "step": 6985 }, { "epoch": 3.7062566277836693, "grad_norm": 2.8689627647399902, "learning_rate": 9.78006034295179e-06, "loss": 0.364, "num_input_tokens_seen": 7026680, "step": 6990 }, { "epoch": 3.7089077412513256, "grad_norm": 2.262120246887207, "learning_rate": 9.779381208827228e-06, "loss": 0.2491, "num_input_tokens_seen": 7032088, "step": 6995 }, { "epoch": 3.711558854718982, "grad_norm": 3.0141282081604004, "learning_rate": 9.778701051449655e-06, "loss": 0.3159, "num_input_tokens_seen": 7036920, "step": 7000 }, { "epoch": 3.7142099681866383, "grad_norm": 2.0405261516571045, "learning_rate": 9.77801987096469e-06, "loss": 0.328, "num_input_tokens_seen": 7041656, "step": 7005 }, { "epoch": 3.7168610816542946, "grad_norm": 4.540583610534668, "learning_rate": 9.777337667518175e-06, "loss": 0.3258, "num_input_tokens_seen": 7045944, "step": 7010 }, { "epoch": 3.7195121951219514, "grad_norm": 4.118709564208984, "learning_rate": 9.776654441256164e-06, "loss": 0.2742, "num_input_tokens_seen": 7051736, "step": 7015 }, { "epoch": 3.7221633085896078, "grad_norm": 2.0337789058685303, "learning_rate": 9.77597019232494e-06, "loss": 0.2473, "num_input_tokens_seen": 7056280, "step": 7020 }, { "epoch": 3.724814422057264, "grad_norm": 3.3310601711273193, "learning_rate": 9.775284920870992e-06, "loss": 0.2918, "num_input_tokens_seen": 7061816, "step": 7025 }, { "epoch": 3.7274655355249204, "grad_norm": 2.311312198638916, "learning_rate": 9.77459862704104e-06, "loss": 0.2715, "num_input_tokens_seen": 7067032, "step": 7030 }, { "epoch": 3.7301166489925768, "grad_norm": 3.392975091934204, "learning_rate": 9.773911310982014e-06, "loss": 0.2563, "num_input_tokens_seen": 7072280, "step": 7035 }, { "epoch": 3.7327677624602336, "grad_norm": 3.54777455329895, "learning_rate": 9.77322297284107e-06, "loss": 0.4481, "num_input_tokens_seen": 7077432, "step": 7040 }, { "epoch": 3.7354188759278895, "grad_norm": 2.2697386741638184, "learning_rate": 9.772533612765578e-06, "loss": 0.2212, "num_input_tokens_seen": 7082232, "step": 7045 }, { "epoch": 3.7380699893955462, "grad_norm": 2.83034610748291, "learning_rate": 9.771843230903126e-06, "loss": 0.2801, "num_input_tokens_seen": 7088152, "step": 7050 }, { "epoch": 3.7407211028632026, "grad_norm": 4.359287261962891, "learning_rate": 9.771151827401525e-06, "loss": 0.3301, "num_input_tokens_seen": 7093080, "step": 7055 }, { "epoch": 3.743372216330859, "grad_norm": 4.577280044555664, "learning_rate": 9.770459402408806e-06, "loss": 0.3318, "num_input_tokens_seen": 7098136, "step": 7060 }, { "epoch": 3.7460233297985153, "grad_norm": 4.7785797119140625, "learning_rate": 9.76976595607321e-06, "loss": 0.2806, "num_input_tokens_seen": 7102072, "step": 7065 }, { "epoch": 3.7486744432661716, "grad_norm": 3.935695171356201, "learning_rate": 9.769071488543203e-06, "loss": 0.3765, "num_input_tokens_seen": 7107064, "step": 7070 }, { "epoch": 3.7513255567338284, "grad_norm": 3.6892693042755127, "learning_rate": 9.76837599996747e-06, "loss": 0.2999, "num_input_tokens_seen": 7111480, "step": 7075 }, { "epoch": 3.7539766702014847, "grad_norm": 3.363959789276123, "learning_rate": 9.767679490494913e-06, "loss": 0.2566, "num_input_tokens_seen": 7116760, "step": 7080 }, { "epoch": 3.756627783669141, "grad_norm": 2.6764776706695557, "learning_rate": 9.766981960274653e-06, "loss": 0.3122, "num_input_tokens_seen": 7121560, "step": 7085 }, { "epoch": 3.7592788971367974, "grad_norm": 2.8376426696777344, "learning_rate": 9.76628340945603e-06, "loss": 0.3095, "num_input_tokens_seen": 7126680, "step": 7090 }, { "epoch": 3.7619300106044538, "grad_norm": 5.029613971710205, "learning_rate": 9.765583838188602e-06, "loss": 0.3799, "num_input_tokens_seen": 7131416, "step": 7095 }, { "epoch": 3.7645811240721105, "grad_norm": 4.690761566162109, "learning_rate": 9.764883246622142e-06, "loss": 0.2465, "num_input_tokens_seen": 7135224, "step": 7100 }, { "epoch": 3.7672322375397664, "grad_norm": 4.8207688331604, "learning_rate": 9.76418163490665e-06, "loss": 0.3284, "num_input_tokens_seen": 7140184, "step": 7105 }, { "epoch": 3.7698833510074232, "grad_norm": 4.226078510284424, "learning_rate": 9.763479003192333e-06, "loss": 0.2932, "num_input_tokens_seen": 7145784, "step": 7110 }, { "epoch": 3.7725344644750796, "grad_norm": 3.9906563758850098, "learning_rate": 9.762775351629628e-06, "loss": 0.3321, "num_input_tokens_seen": 7149848, "step": 7115 }, { "epoch": 3.775185577942736, "grad_norm": 4.908949375152588, "learning_rate": 9.762070680369185e-06, "loss": 0.3384, "num_input_tokens_seen": 7154936, "step": 7120 }, { "epoch": 3.7778366914103922, "grad_norm": 2.772242784500122, "learning_rate": 9.761364989561868e-06, "loss": 0.3375, "num_input_tokens_seen": 7160472, "step": 7125 }, { "epoch": 3.7804878048780486, "grad_norm": 1.6423165798187256, "learning_rate": 9.760658279358767e-06, "loss": 0.2219, "num_input_tokens_seen": 7166840, "step": 7130 }, { "epoch": 3.7831389183457054, "grad_norm": 2.523148775100708, "learning_rate": 9.759950549911185e-06, "loss": 0.2319, "num_input_tokens_seen": 7171896, "step": 7135 }, { "epoch": 3.7857900318133617, "grad_norm": 3.128155469894409, "learning_rate": 9.759241801370646e-06, "loss": 0.3241, "num_input_tokens_seen": 7176856, "step": 7140 }, { "epoch": 3.788441145281018, "grad_norm": 2.200883626937866, "learning_rate": 9.758532033888893e-06, "loss": 0.3755, "num_input_tokens_seen": 7181560, "step": 7145 }, { "epoch": 3.7910922587486744, "grad_norm": 2.250439167022705, "learning_rate": 9.75782124761788e-06, "loss": 0.4005, "num_input_tokens_seen": 7186840, "step": 7150 }, { "epoch": 3.7937433722163307, "grad_norm": 3.7253501415252686, "learning_rate": 9.757109442709791e-06, "loss": 0.3487, "num_input_tokens_seen": 7192408, "step": 7155 }, { "epoch": 3.7963944856839875, "grad_norm": 3.10357666015625, "learning_rate": 9.756396619317017e-06, "loss": 0.2791, "num_input_tokens_seen": 7197912, "step": 7160 }, { "epoch": 3.7990455991516434, "grad_norm": 3.563096284866333, "learning_rate": 9.755682777592173e-06, "loss": 0.2774, "num_input_tokens_seen": 7202648, "step": 7165 }, { "epoch": 3.8016967126193, "grad_norm": 3.7636451721191406, "learning_rate": 9.754967917688089e-06, "loss": 0.325, "num_input_tokens_seen": 7207064, "step": 7170 }, { "epoch": 3.8043478260869565, "grad_norm": 2.9229886531829834, "learning_rate": 9.754252039757818e-06, "loss": 0.3067, "num_input_tokens_seen": 7211672, "step": 7175 }, { "epoch": 3.806998939554613, "grad_norm": 2.383805751800537, "learning_rate": 9.753535143954626e-06, "loss": 0.2747, "num_input_tokens_seen": 7215448, "step": 7180 }, { "epoch": 3.8096500530222692, "grad_norm": 1.96214759349823, "learning_rate": 9.752817230431999e-06, "loss": 0.3354, "num_input_tokens_seen": 7220472, "step": 7185 }, { "epoch": 3.8123011664899256, "grad_norm": 3.2972095012664795, "learning_rate": 9.75209829934364e-06, "loss": 0.3077, "num_input_tokens_seen": 7225528, "step": 7190 }, { "epoch": 3.8149522799575823, "grad_norm": 5.197932243347168, "learning_rate": 9.751378350843469e-06, "loss": 0.4332, "num_input_tokens_seen": 7230680, "step": 7195 }, { "epoch": 3.8176033934252387, "grad_norm": 1.8867472410202026, "learning_rate": 9.750657385085627e-06, "loss": 0.329, "num_input_tokens_seen": 7235416, "step": 7200 }, { "epoch": 3.820254506892895, "grad_norm": 3.1173696517944336, "learning_rate": 9.749935402224468e-06, "loss": 0.3414, "num_input_tokens_seen": 7241912, "step": 7205 }, { "epoch": 3.8229056203605514, "grad_norm": 3.7879061698913574, "learning_rate": 9.749212402414571e-06, "loss": 0.3261, "num_input_tokens_seen": 7246872, "step": 7210 }, { "epoch": 3.8255567338282077, "grad_norm": 3.433128833770752, "learning_rate": 9.748488385810726e-06, "loss": 0.299, "num_input_tokens_seen": 7251032, "step": 7215 }, { "epoch": 3.8282078472958645, "grad_norm": 5.692037105560303, "learning_rate": 9.747763352567942e-06, "loss": 0.3564, "num_input_tokens_seen": 7256376, "step": 7220 }, { "epoch": 3.830858960763521, "grad_norm": 4.316887855529785, "learning_rate": 9.74703730284145e-06, "loss": 0.3336, "num_input_tokens_seen": 7261496, "step": 7225 }, { "epoch": 3.833510074231177, "grad_norm": 6.001941204071045, "learning_rate": 9.74631023678669e-06, "loss": 0.3589, "num_input_tokens_seen": 7266360, "step": 7230 }, { "epoch": 3.8361611876988335, "grad_norm": 4.733035087585449, "learning_rate": 9.74558215455933e-06, "loss": 0.2956, "num_input_tokens_seen": 7271704, "step": 7235 }, { "epoch": 3.83881230116649, "grad_norm": 4.79730224609375, "learning_rate": 9.744853056315248e-06, "loss": 0.2847, "num_input_tokens_seen": 7276152, "step": 7240 }, { "epoch": 3.841463414634146, "grad_norm": 2.6886518001556396, "learning_rate": 9.744122942210544e-06, "loss": 0.2917, "num_input_tokens_seen": 7280472, "step": 7245 }, { "epoch": 3.8441145281018025, "grad_norm": 2.4362552165985107, "learning_rate": 9.74339181240153e-06, "loss": 0.4237, "num_input_tokens_seen": 7285016, "step": 7250 }, { "epoch": 3.8467656415694593, "grad_norm": 6.642955780029297, "learning_rate": 9.742659667044743e-06, "loss": 0.3998, "num_input_tokens_seen": 7289880, "step": 7255 }, { "epoch": 3.8494167550371157, "grad_norm": 2.767820119857788, "learning_rate": 9.741926506296931e-06, "loss": 0.3382, "num_input_tokens_seen": 7294616, "step": 7260 }, { "epoch": 3.852067868504772, "grad_norm": 4.023975849151611, "learning_rate": 9.741192330315062e-06, "loss": 0.3353, "num_input_tokens_seen": 7300600, "step": 7265 }, { "epoch": 3.8547189819724283, "grad_norm": 2.350238800048828, "learning_rate": 9.740457139256323e-06, "loss": 0.3447, "num_input_tokens_seen": 7304632, "step": 7270 }, { "epoch": 3.8573700954400847, "grad_norm": 2.0482499599456787, "learning_rate": 9.739720933278114e-06, "loss": 0.2654, "num_input_tokens_seen": 7309208, "step": 7275 }, { "epoch": 3.8600212089077415, "grad_norm": 2.527784824371338, "learning_rate": 9.738983712538055e-06, "loss": 0.2686, "num_input_tokens_seen": 7315160, "step": 7280 }, { "epoch": 3.862672322375398, "grad_norm": 2.9083151817321777, "learning_rate": 9.738245477193984e-06, "loss": 0.2598, "num_input_tokens_seen": 7320312, "step": 7285 }, { "epoch": 3.865323435843054, "grad_norm": 3.7913825511932373, "learning_rate": 9.737506227403956e-06, "loss": 0.2991, "num_input_tokens_seen": 7325080, "step": 7290 }, { "epoch": 3.8679745493107105, "grad_norm": 1.8917393684387207, "learning_rate": 9.736765963326241e-06, "loss": 0.2783, "num_input_tokens_seen": 7329400, "step": 7295 }, { "epoch": 3.870625662778367, "grad_norm": 3.430950403213501, "learning_rate": 9.736024685119328e-06, "loss": 0.2718, "num_input_tokens_seen": 7334008, "step": 7300 }, { "epoch": 3.873276776246023, "grad_norm": 1.758609414100647, "learning_rate": 9.735282392941924e-06, "loss": 0.2783, "num_input_tokens_seen": 7339032, "step": 7305 }, { "epoch": 3.8759278897136795, "grad_norm": 3.8806312084198, "learning_rate": 9.73453908695295e-06, "loss": 0.362, "num_input_tokens_seen": 7344856, "step": 7310 }, { "epoch": 3.8785790031813363, "grad_norm": 3.4294490814208984, "learning_rate": 9.733794767311545e-06, "loss": 0.2538, "num_input_tokens_seen": 7349496, "step": 7315 }, { "epoch": 3.8812301166489926, "grad_norm": 5.834209442138672, "learning_rate": 9.733049434177068e-06, "loss": 0.2938, "num_input_tokens_seen": 7354584, "step": 7320 }, { "epoch": 3.883881230116649, "grad_norm": 4.43411111831665, "learning_rate": 9.732303087709094e-06, "loss": 0.3115, "num_input_tokens_seen": 7359224, "step": 7325 }, { "epoch": 3.8865323435843053, "grad_norm": 6.258910655975342, "learning_rate": 9.73155572806741e-06, "loss": 0.4383, "num_input_tokens_seen": 7364312, "step": 7330 }, { "epoch": 3.8891834570519617, "grad_norm": 2.2184417247772217, "learning_rate": 9.730807355412029e-06, "loss": 0.3277, "num_input_tokens_seen": 7368792, "step": 7335 }, { "epoch": 3.8918345705196185, "grad_norm": 2.5560150146484375, "learning_rate": 9.73005796990317e-06, "loss": 0.2516, "num_input_tokens_seen": 7375000, "step": 7340 }, { "epoch": 3.894485683987275, "grad_norm": 2.7851152420043945, "learning_rate": 9.729307571701277e-06, "loss": 0.2683, "num_input_tokens_seen": 7379256, "step": 7345 }, { "epoch": 3.897136797454931, "grad_norm": 1.9826303720474243, "learning_rate": 9.728556160967009e-06, "loss": 0.2772, "num_input_tokens_seen": 7384152, "step": 7350 }, { "epoch": 3.8997879109225875, "grad_norm": 2.201449394226074, "learning_rate": 9.727803737861239e-06, "loss": 0.3087, "num_input_tokens_seen": 7388984, "step": 7355 }, { "epoch": 3.902439024390244, "grad_norm": 3.680527448654175, "learning_rate": 9.72705030254506e-06, "loss": 0.372, "num_input_tokens_seen": 7393272, "step": 7360 }, { "epoch": 3.9050901378579, "grad_norm": 4.102544784545898, "learning_rate": 9.726295855179783e-06, "loss": 0.4882, "num_input_tokens_seen": 7398840, "step": 7365 }, { "epoch": 3.9077412513255565, "grad_norm": 3.4181900024414062, "learning_rate": 9.72554039592693e-06, "loss": 0.2502, "num_input_tokens_seen": 7403640, "step": 7370 }, { "epoch": 3.9103923647932133, "grad_norm": 3.786867380142212, "learning_rate": 9.724783924948243e-06, "loss": 0.2925, "num_input_tokens_seen": 7408152, "step": 7375 }, { "epoch": 3.9130434782608696, "grad_norm": 3.664804458618164, "learning_rate": 9.724026442405681e-06, "loss": 0.2072, "num_input_tokens_seen": 7412760, "step": 7380 }, { "epoch": 3.915694591728526, "grad_norm": 2.577958822250366, "learning_rate": 9.72326794846142e-06, "loss": 0.2265, "num_input_tokens_seen": 7417848, "step": 7385 }, { "epoch": 3.9183457051961823, "grad_norm": 2.9106452465057373, "learning_rate": 9.722508443277851e-06, "loss": 0.2537, "num_input_tokens_seen": 7422424, "step": 7390 }, { "epoch": 3.9209968186638386, "grad_norm": 5.046348571777344, "learning_rate": 9.721747927017581e-06, "loss": 0.3592, "num_input_tokens_seen": 7426776, "step": 7395 }, { "epoch": 3.9236479321314954, "grad_norm": 3.373392105102539, "learning_rate": 9.720986399843435e-06, "loss": 0.3159, "num_input_tokens_seen": 7431448, "step": 7400 }, { "epoch": 3.9262990455991518, "grad_norm": 2.733987808227539, "learning_rate": 9.720223861918457e-06, "loss": 0.3553, "num_input_tokens_seen": 7436280, "step": 7405 }, { "epoch": 3.928950159066808, "grad_norm": 7.832640171051025, "learning_rate": 9.719460313405898e-06, "loss": 0.3657, "num_input_tokens_seen": 7443384, "step": 7410 }, { "epoch": 3.9316012725344645, "grad_norm": 3.5235700607299805, "learning_rate": 9.718695754469238e-06, "loss": 0.2456, "num_input_tokens_seen": 7448120, "step": 7415 }, { "epoch": 3.934252386002121, "grad_norm": 4.9732866287231445, "learning_rate": 9.717930185272165e-06, "loss": 0.403, "num_input_tokens_seen": 7454104, "step": 7420 }, { "epoch": 3.9369034994697776, "grad_norm": 3.905548095703125, "learning_rate": 9.717163605978583e-06, "loss": 0.2213, "num_input_tokens_seen": 7459256, "step": 7425 }, { "epoch": 3.9395546129374335, "grad_norm": 4.100707530975342, "learning_rate": 9.716396016752616e-06, "loss": 0.3816, "num_input_tokens_seen": 7464024, "step": 7430 }, { "epoch": 3.9422057264050903, "grad_norm": 6.8213043212890625, "learning_rate": 9.715627417758606e-06, "loss": 0.3521, "num_input_tokens_seen": 7470296, "step": 7435 }, { "epoch": 3.9448568398727466, "grad_norm": 5.256969451904297, "learning_rate": 9.714857809161103e-06, "loss": 0.4985, "num_input_tokens_seen": 7475384, "step": 7440 }, { "epoch": 3.947507953340403, "grad_norm": 3.678638219833374, "learning_rate": 9.714087191124882e-06, "loss": 0.2859, "num_input_tokens_seen": 7481272, "step": 7445 }, { "epoch": 3.9501590668080593, "grad_norm": 2.848257541656494, "learning_rate": 9.713315563814928e-06, "loss": 0.2788, "num_input_tokens_seen": 7486200, "step": 7450 }, { "epoch": 3.9528101802757156, "grad_norm": 3.5370917320251465, "learning_rate": 9.712542927396447e-06, "loss": 0.2999, "num_input_tokens_seen": 7491160, "step": 7455 }, { "epoch": 3.9554612937433724, "grad_norm": 3.4632298946380615, "learning_rate": 9.711769282034855e-06, "loss": 0.2998, "num_input_tokens_seen": 7495736, "step": 7460 }, { "epoch": 3.9581124072110287, "grad_norm": 3.7468674182891846, "learning_rate": 9.71099462789579e-06, "loss": 0.2133, "num_input_tokens_seen": 7500696, "step": 7465 }, { "epoch": 3.960763520678685, "grad_norm": 1.4493764638900757, "learning_rate": 9.710218965145102e-06, "loss": 0.2927, "num_input_tokens_seen": 7505016, "step": 7470 }, { "epoch": 3.9634146341463414, "grad_norm": 5.876739025115967, "learning_rate": 9.70944229394886e-06, "loss": 0.209, "num_input_tokens_seen": 7509464, "step": 7475 }, { "epoch": 3.9660657476139978, "grad_norm": 2.469789743423462, "learning_rate": 9.708664614473345e-06, "loss": 0.5282, "num_input_tokens_seen": 7515192, "step": 7480 }, { "epoch": 3.9687168610816546, "grad_norm": 2.8216121196746826, "learning_rate": 9.707885926885059e-06, "loss": 0.4168, "num_input_tokens_seen": 7520856, "step": 7485 }, { "epoch": 3.9713679745493105, "grad_norm": 3.5013773441314697, "learning_rate": 9.707106231350714e-06, "loss": 0.3877, "num_input_tokens_seen": 7526104, "step": 7490 }, { "epoch": 3.9740190880169672, "grad_norm": 4.70474910736084, "learning_rate": 9.706325528037241e-06, "loss": 0.3606, "num_input_tokens_seen": 7531512, "step": 7495 }, { "epoch": 3.9766702014846236, "grad_norm": 4.250648021697998, "learning_rate": 9.70554381711179e-06, "loss": 0.3611, "num_input_tokens_seen": 7536152, "step": 7500 }, { "epoch": 3.97932131495228, "grad_norm": 5.524711608886719, "learning_rate": 9.70476109874172e-06, "loss": 0.3294, "num_input_tokens_seen": 7540952, "step": 7505 }, { "epoch": 3.9819724284199363, "grad_norm": 2.5454530715942383, "learning_rate": 9.70397737309461e-06, "loss": 0.3579, "num_input_tokens_seen": 7545304, "step": 7510 }, { "epoch": 3.9846235418875926, "grad_norm": 1.966206669807434, "learning_rate": 9.703192640338253e-06, "loss": 0.293, "num_input_tokens_seen": 7550200, "step": 7515 }, { "epoch": 3.9872746553552494, "grad_norm": 1.9047660827636719, "learning_rate": 9.70240690064066e-06, "loss": 0.2423, "num_input_tokens_seen": 7554968, "step": 7520 }, { "epoch": 3.9899257688229057, "grad_norm": 2.9154551029205322, "learning_rate": 9.701620154170053e-06, "loss": 0.4798, "num_input_tokens_seen": 7561016, "step": 7525 }, { "epoch": 3.992576882290562, "grad_norm": 2.17421293258667, "learning_rate": 9.700832401094876e-06, "loss": 0.332, "num_input_tokens_seen": 7566232, "step": 7530 }, { "epoch": 3.9952279957582184, "grad_norm": 3.3731443881988525, "learning_rate": 9.70004364158378e-06, "loss": 0.2883, "num_input_tokens_seen": 7571128, "step": 7535 }, { "epoch": 3.9978791092258747, "grad_norm": 4.18191385269165, "learning_rate": 9.69925387580564e-06, "loss": 0.3608, "num_input_tokens_seen": 7576600, "step": 7540 }, { "epoch": 4.0, "eval_loss": 0.31705325841903687, "eval_runtime": 29.1912, "eval_samples_per_second": 64.609, "eval_steps_per_second": 16.169, "num_input_tokens_seen": 7579824, "step": 7544 }, { "epoch": 4.0005302226935315, "grad_norm": 2.339348316192627, "learning_rate": 9.698463103929542e-06, "loss": 0.272, "num_input_tokens_seen": 7580592, "step": 7545 }, { "epoch": 4.003181336161187, "grad_norm": 2.934096574783325, "learning_rate": 9.697671326124789e-06, "loss": 0.3286, "num_input_tokens_seen": 7584880, "step": 7550 }, { "epoch": 4.005832449628844, "grad_norm": 2.0224528312683105, "learning_rate": 9.696878542560896e-06, "loss": 0.3572, "num_input_tokens_seen": 7590320, "step": 7555 }, { "epoch": 4.0084835630965, "grad_norm": 2.493910789489746, "learning_rate": 9.6960847534076e-06, "loss": 0.2648, "num_input_tokens_seen": 7595376, "step": 7560 }, { "epoch": 4.011134676564157, "grad_norm": 2.59262752532959, "learning_rate": 9.695289958834845e-06, "loss": 0.2409, "num_input_tokens_seen": 7599600, "step": 7565 }, { "epoch": 4.013785790031814, "grad_norm": 2.7176754474639893, "learning_rate": 9.694494159012796e-06, "loss": 0.2828, "num_input_tokens_seen": 7605456, "step": 7570 }, { "epoch": 4.01643690349947, "grad_norm": 2.6747922897338867, "learning_rate": 9.693697354111832e-06, "loss": 0.2718, "num_input_tokens_seen": 7610320, "step": 7575 }, { "epoch": 4.019088016967126, "grad_norm": 2.4492669105529785, "learning_rate": 9.692899544302547e-06, "loss": 0.2205, "num_input_tokens_seen": 7616208, "step": 7580 }, { "epoch": 4.021739130434782, "grad_norm": 7.055381774902344, "learning_rate": 9.69210072975575e-06, "loss": 0.3263, "num_input_tokens_seen": 7622320, "step": 7585 }, { "epoch": 4.024390243902439, "grad_norm": 3.0840721130371094, "learning_rate": 9.691300910642465e-06, "loss": 0.3678, "num_input_tokens_seen": 7627632, "step": 7590 }, { "epoch": 4.027041357370096, "grad_norm": 5.7286553382873535, "learning_rate": 9.69050008713393e-06, "loss": 0.2227, "num_input_tokens_seen": 7632432, "step": 7595 }, { "epoch": 4.029692470837752, "grad_norm": 3.419126033782959, "learning_rate": 9.6896982594016e-06, "loss": 0.3032, "num_input_tokens_seen": 7636912, "step": 7600 }, { "epoch": 4.0323435843054085, "grad_norm": 2.255366086959839, "learning_rate": 9.688895427617144e-06, "loss": 0.2225, "num_input_tokens_seen": 7642416, "step": 7605 }, { "epoch": 4.034994697773064, "grad_norm": 5.503284931182861, "learning_rate": 9.688091591952448e-06, "loss": 0.3038, "num_input_tokens_seen": 7647536, "step": 7610 }, { "epoch": 4.037645811240721, "grad_norm": 3.6870598793029785, "learning_rate": 9.687286752579608e-06, "loss": 0.2191, "num_input_tokens_seen": 7652624, "step": 7615 }, { "epoch": 4.040296924708377, "grad_norm": 4.547236442565918, "learning_rate": 9.68648090967094e-06, "loss": 0.3732, "num_input_tokens_seen": 7658928, "step": 7620 }, { "epoch": 4.042948038176034, "grad_norm": 4.352234840393066, "learning_rate": 9.685674063398974e-06, "loss": 0.3196, "num_input_tokens_seen": 7662864, "step": 7625 }, { "epoch": 4.045599151643691, "grad_norm": 6.683608531951904, "learning_rate": 9.684866213936453e-06, "loss": 0.3523, "num_input_tokens_seen": 7667152, "step": 7630 }, { "epoch": 4.048250265111347, "grad_norm": 3.99589204788208, "learning_rate": 9.684057361456331e-06, "loss": 0.2906, "num_input_tokens_seen": 7672080, "step": 7635 }, { "epoch": 4.050901378579003, "grad_norm": 2.89542293548584, "learning_rate": 9.683247506131787e-06, "loss": 0.3242, "num_input_tokens_seen": 7677104, "step": 7640 }, { "epoch": 4.053552492046659, "grad_norm": 4.472675323486328, "learning_rate": 9.682436648136207e-06, "loss": 0.2225, "num_input_tokens_seen": 7683600, "step": 7645 }, { "epoch": 4.056203605514316, "grad_norm": 5.333066940307617, "learning_rate": 9.681624787643193e-06, "loss": 0.2803, "num_input_tokens_seen": 7688368, "step": 7650 }, { "epoch": 4.058854718981973, "grad_norm": 6.358186721801758, "learning_rate": 9.680811924826563e-06, "loss": 0.519, "num_input_tokens_seen": 7693840, "step": 7655 }, { "epoch": 4.061505832449629, "grad_norm": 2.796617031097412, "learning_rate": 9.679998059860347e-06, "loss": 0.3184, "num_input_tokens_seen": 7697968, "step": 7660 }, { "epoch": 4.0641569459172855, "grad_norm": 1.979000449180603, "learning_rate": 9.679183192918795e-06, "loss": 0.2632, "num_input_tokens_seen": 7703184, "step": 7665 }, { "epoch": 4.066808059384941, "grad_norm": 7.04100227355957, "learning_rate": 9.678367324176363e-06, "loss": 0.3661, "num_input_tokens_seen": 7707952, "step": 7670 }, { "epoch": 4.069459172852598, "grad_norm": 3.014408588409424, "learning_rate": 9.67755045380773e-06, "loss": 0.2777, "num_input_tokens_seen": 7713296, "step": 7675 }, { "epoch": 4.072110286320255, "grad_norm": 3.4781241416931152, "learning_rate": 9.676732581987784e-06, "loss": 0.329, "num_input_tokens_seen": 7717392, "step": 7680 }, { "epoch": 4.074761399787911, "grad_norm": 2.8392441272735596, "learning_rate": 9.67591370889163e-06, "loss": 0.3974, "num_input_tokens_seen": 7722320, "step": 7685 }, { "epoch": 4.077412513255568, "grad_norm": 2.863442897796631, "learning_rate": 9.675093834694588e-06, "loss": 0.3089, "num_input_tokens_seen": 7726640, "step": 7690 }, { "epoch": 4.0800636267232235, "grad_norm": 3.649232864379883, "learning_rate": 9.674272959572189e-06, "loss": 0.2164, "num_input_tokens_seen": 7731984, "step": 7695 }, { "epoch": 4.08271474019088, "grad_norm": 3.7159018516540527, "learning_rate": 9.673451083700181e-06, "loss": 0.3043, "num_input_tokens_seen": 7736720, "step": 7700 }, { "epoch": 4.085365853658536, "grad_norm": 4.811644077301025, "learning_rate": 9.672628207254525e-06, "loss": 0.3814, "num_input_tokens_seen": 7741392, "step": 7705 }, { "epoch": 4.088016967126193, "grad_norm": 2.8663582801818848, "learning_rate": 9.671804330411397e-06, "loss": 0.3742, "num_input_tokens_seen": 7746704, "step": 7710 }, { "epoch": 4.09066808059385, "grad_norm": 5.233295917510986, "learning_rate": 9.670979453347185e-06, "loss": 0.3854, "num_input_tokens_seen": 7752304, "step": 7715 }, { "epoch": 4.093319194061506, "grad_norm": 2.652689218521118, "learning_rate": 9.670153576238497e-06, "loss": 0.2593, "num_input_tokens_seen": 7756112, "step": 7720 }, { "epoch": 4.0959703075291625, "grad_norm": 3.5266339778900146, "learning_rate": 9.669326699262148e-06, "loss": 0.335, "num_input_tokens_seen": 7760624, "step": 7725 }, { "epoch": 4.098621420996818, "grad_norm": 4.051380634307861, "learning_rate": 9.668498822595171e-06, "loss": 0.3707, "num_input_tokens_seen": 7765968, "step": 7730 }, { "epoch": 4.101272534464475, "grad_norm": 3.2463152408599854, "learning_rate": 9.667669946414812e-06, "loss": 0.3384, "num_input_tokens_seen": 7770704, "step": 7735 }, { "epoch": 4.103923647932131, "grad_norm": 2.646996021270752, "learning_rate": 9.666840070898533e-06, "loss": 0.2796, "num_input_tokens_seen": 7774416, "step": 7740 }, { "epoch": 4.106574761399788, "grad_norm": 4.266456127166748, "learning_rate": 9.666009196224005e-06, "loss": 0.2556, "num_input_tokens_seen": 7779984, "step": 7745 }, { "epoch": 4.109225874867445, "grad_norm": 3.4190316200256348, "learning_rate": 9.665177322569117e-06, "loss": 0.2405, "num_input_tokens_seen": 7784400, "step": 7750 }, { "epoch": 4.1118769883351005, "grad_norm": 3.5920138359069824, "learning_rate": 9.664344450111974e-06, "loss": 0.2411, "num_input_tokens_seen": 7789616, "step": 7755 }, { "epoch": 4.114528101802757, "grad_norm": 3.902404308319092, "learning_rate": 9.663510579030888e-06, "loss": 0.2781, "num_input_tokens_seen": 7794160, "step": 7760 }, { "epoch": 4.117179215270413, "grad_norm": 2.8484623432159424, "learning_rate": 9.662675709504392e-06, "loss": 0.2335, "num_input_tokens_seen": 7799856, "step": 7765 }, { "epoch": 4.11983032873807, "grad_norm": 3.8742589950561523, "learning_rate": 9.661839841711227e-06, "loss": 0.3487, "num_input_tokens_seen": 7804656, "step": 7770 }, { "epoch": 4.122481442205727, "grad_norm": 5.470076084136963, "learning_rate": 9.66100297583035e-06, "loss": 0.3372, "num_input_tokens_seen": 7809232, "step": 7775 }, { "epoch": 4.125132555673383, "grad_norm": 3.4373855590820312, "learning_rate": 9.660165112040933e-06, "loss": 0.136, "num_input_tokens_seen": 7815856, "step": 7780 }, { "epoch": 4.1277836691410394, "grad_norm": 5.499693393707275, "learning_rate": 9.659326250522358e-06, "loss": 0.3888, "num_input_tokens_seen": 7820944, "step": 7785 }, { "epoch": 4.130434782608695, "grad_norm": 4.643935203552246, "learning_rate": 9.658486391454228e-06, "loss": 0.3919, "num_input_tokens_seen": 7826640, "step": 7790 }, { "epoch": 4.133085896076352, "grad_norm": 3.902449607849121, "learning_rate": 9.657645535016349e-06, "loss": 0.3166, "num_input_tokens_seen": 7831728, "step": 7795 }, { "epoch": 4.135737009544009, "grad_norm": 3.240372657775879, "learning_rate": 9.656803681388751e-06, "loss": 0.299, "num_input_tokens_seen": 7836944, "step": 7800 }, { "epoch": 4.138388123011665, "grad_norm": 2.7195539474487305, "learning_rate": 9.655960830751669e-06, "loss": 0.214, "num_input_tokens_seen": 7842096, "step": 7805 }, { "epoch": 4.141039236479322, "grad_norm": 4.248956203460693, "learning_rate": 9.655116983285558e-06, "loss": 0.4741, "num_input_tokens_seen": 7847376, "step": 7810 }, { "epoch": 4.1436903499469775, "grad_norm": 4.07131814956665, "learning_rate": 9.654272139171081e-06, "loss": 0.2863, "num_input_tokens_seen": 7854096, "step": 7815 }, { "epoch": 4.146341463414634, "grad_norm": 5.621731281280518, "learning_rate": 9.653426298589117e-06, "loss": 0.2967, "num_input_tokens_seen": 7859056, "step": 7820 }, { "epoch": 4.14899257688229, "grad_norm": 8.091150283813477, "learning_rate": 9.652579461720762e-06, "loss": 0.4355, "num_input_tokens_seen": 7863408, "step": 7825 }, { "epoch": 4.151643690349947, "grad_norm": 3.6249420642852783, "learning_rate": 9.651731628747316e-06, "loss": 0.2402, "num_input_tokens_seen": 7870192, "step": 7830 }, { "epoch": 4.154294803817604, "grad_norm": 3.085050344467163, "learning_rate": 9.650882799850302e-06, "loss": 0.284, "num_input_tokens_seen": 7874384, "step": 7835 }, { "epoch": 4.15694591728526, "grad_norm": 4.148222923278809, "learning_rate": 9.650032975211451e-06, "loss": 0.3575, "num_input_tokens_seen": 7879152, "step": 7840 }, { "epoch": 4.159597030752916, "grad_norm": 3.6121716499328613, "learning_rate": 9.649182155012706e-06, "loss": 0.2124, "num_input_tokens_seen": 7883536, "step": 7845 }, { "epoch": 4.162248144220572, "grad_norm": 3.659792900085449, "learning_rate": 9.64833033943623e-06, "loss": 0.3351, "num_input_tokens_seen": 7887664, "step": 7850 }, { "epoch": 4.164899257688229, "grad_norm": 2.197615385055542, "learning_rate": 9.64747752866439e-06, "loss": 0.259, "num_input_tokens_seen": 7891952, "step": 7855 }, { "epoch": 4.167550371155886, "grad_norm": 6.0352277755737305, "learning_rate": 9.646623722879774e-06, "loss": 0.3426, "num_input_tokens_seen": 7897392, "step": 7860 }, { "epoch": 4.170201484623542, "grad_norm": 2.052795648574829, "learning_rate": 9.645768922265176e-06, "loss": 0.2127, "num_input_tokens_seen": 7901232, "step": 7865 }, { "epoch": 4.172852598091199, "grad_norm": 1.5825353860855103, "learning_rate": 9.64491312700361e-06, "loss": 0.2056, "num_input_tokens_seen": 7907440, "step": 7870 }, { "epoch": 4.1755037115588545, "grad_norm": 3.714686393737793, "learning_rate": 9.644056337278295e-06, "loss": 0.3297, "num_input_tokens_seen": 7912880, "step": 7875 }, { "epoch": 4.178154825026511, "grad_norm": 5.125101566314697, "learning_rate": 9.643198553272674e-06, "loss": 0.3684, "num_input_tokens_seen": 7917712, "step": 7880 }, { "epoch": 4.180805938494167, "grad_norm": 2.9379727840423584, "learning_rate": 9.642339775170392e-06, "loss": 0.3046, "num_input_tokens_seen": 7922064, "step": 7885 }, { "epoch": 4.183457051961824, "grad_norm": 5.130656719207764, "learning_rate": 9.64148000315531e-06, "loss": 0.3299, "num_input_tokens_seen": 7926448, "step": 7890 }, { "epoch": 4.186108165429481, "grad_norm": 3.119292974472046, "learning_rate": 9.640619237411508e-06, "loss": 0.2933, "num_input_tokens_seen": 7930704, "step": 7895 }, { "epoch": 4.188759278897137, "grad_norm": 4.490871429443359, "learning_rate": 9.639757478123268e-06, "loss": 0.3506, "num_input_tokens_seen": 7936144, "step": 7900 }, { "epoch": 4.191410392364793, "grad_norm": 6.422787189483643, "learning_rate": 9.638894725475094e-06, "loss": 0.3839, "num_input_tokens_seen": 7940880, "step": 7905 }, { "epoch": 4.194061505832449, "grad_norm": 7.954066753387451, "learning_rate": 9.638030979651698e-06, "loss": 0.3031, "num_input_tokens_seen": 7946064, "step": 7910 }, { "epoch": 4.196712619300106, "grad_norm": 3.4192235469818115, "learning_rate": 9.637166240838005e-06, "loss": 0.2431, "num_input_tokens_seen": 7951088, "step": 7915 }, { "epoch": 4.199363732767763, "grad_norm": 6.013655185699463, "learning_rate": 9.636300509219155e-06, "loss": 0.2544, "num_input_tokens_seen": 7955152, "step": 7920 }, { "epoch": 4.202014846235419, "grad_norm": 5.386206150054932, "learning_rate": 9.635433784980497e-06, "loss": 0.3089, "num_input_tokens_seen": 7959568, "step": 7925 }, { "epoch": 4.2046659597030756, "grad_norm": 3.942338705062866, "learning_rate": 9.634566068307598e-06, "loss": 0.319, "num_input_tokens_seen": 7964432, "step": 7930 }, { "epoch": 4.2073170731707314, "grad_norm": 4.4476094245910645, "learning_rate": 9.633697359386228e-06, "loss": 0.4095, "num_input_tokens_seen": 7969424, "step": 7935 }, { "epoch": 4.209968186638388, "grad_norm": 2.587111234664917, "learning_rate": 9.63282765840238e-06, "loss": 0.3081, "num_input_tokens_seen": 7974768, "step": 7940 }, { "epoch": 4.212619300106044, "grad_norm": 3.5811429023742676, "learning_rate": 9.631956965542253e-06, "loss": 0.283, "num_input_tokens_seen": 7979472, "step": 7945 }, { "epoch": 4.215270413573701, "grad_norm": 3.13065505027771, "learning_rate": 9.631085280992261e-06, "loss": 0.2159, "num_input_tokens_seen": 7984560, "step": 7950 }, { "epoch": 4.217921527041358, "grad_norm": 3.0174155235290527, "learning_rate": 9.630212604939026e-06, "loss": 0.2842, "num_input_tokens_seen": 7989136, "step": 7955 }, { "epoch": 4.220572640509014, "grad_norm": 3.69775128364563, "learning_rate": 9.629338937569392e-06, "loss": 0.3813, "num_input_tokens_seen": 7992880, "step": 7960 }, { "epoch": 4.22322375397667, "grad_norm": 2.9384865760803223, "learning_rate": 9.628464279070403e-06, "loss": 0.3142, "num_input_tokens_seen": 7998928, "step": 7965 }, { "epoch": 4.225874867444326, "grad_norm": 3.1056113243103027, "learning_rate": 9.627588629629324e-06, "loss": 0.2954, "num_input_tokens_seen": 8005168, "step": 7970 }, { "epoch": 4.228525980911983, "grad_norm": 2.61811900138855, "learning_rate": 9.626711989433628e-06, "loss": 0.291, "num_input_tokens_seen": 8010288, "step": 7975 }, { "epoch": 4.23117709437964, "grad_norm": 2.5745255947113037, "learning_rate": 9.625834358671002e-06, "loss": 0.3774, "num_input_tokens_seen": 8014064, "step": 7980 }, { "epoch": 4.233828207847296, "grad_norm": 1.8756134510040283, "learning_rate": 9.624955737529345e-06, "loss": 0.2293, "num_input_tokens_seen": 8019088, "step": 7985 }, { "epoch": 4.2364793213149525, "grad_norm": 1.8623496294021606, "learning_rate": 9.624076126196767e-06, "loss": 0.2416, "num_input_tokens_seen": 8023216, "step": 7990 }, { "epoch": 4.239130434782608, "grad_norm": 3.5484676361083984, "learning_rate": 9.62319552486159e-06, "loss": 0.2528, "num_input_tokens_seen": 8028144, "step": 7995 }, { "epoch": 4.241781548250265, "grad_norm": 1.0340365171432495, "learning_rate": 9.622313933712348e-06, "loss": 0.2104, "num_input_tokens_seen": 8033424, "step": 8000 }, { "epoch": 4.244432661717921, "grad_norm": 3.051669120788574, "learning_rate": 9.62143135293779e-06, "loss": 0.2634, "num_input_tokens_seen": 8039248, "step": 8005 }, { "epoch": 4.247083775185578, "grad_norm": 4.678811550140381, "learning_rate": 9.62054778272687e-06, "loss": 0.5645, "num_input_tokens_seen": 8045936, "step": 8010 }, { "epoch": 4.249734888653235, "grad_norm": 1.2824199199676514, "learning_rate": 9.619663223268763e-06, "loss": 0.2226, "num_input_tokens_seen": 8050192, "step": 8015 }, { "epoch": 4.252386002120891, "grad_norm": 2.7113239765167236, "learning_rate": 9.618777674752847e-06, "loss": 0.4267, "num_input_tokens_seen": 8055216, "step": 8020 }, { "epoch": 4.255037115588547, "grad_norm": 2.5602307319641113, "learning_rate": 9.617891137368715e-06, "loss": 0.2934, "num_input_tokens_seen": 8061264, "step": 8025 }, { "epoch": 4.257688229056203, "grad_norm": 2.505462408065796, "learning_rate": 9.617003611306178e-06, "loss": 0.2845, "num_input_tokens_seen": 8065328, "step": 8030 }, { "epoch": 4.26033934252386, "grad_norm": 4.312906742095947, "learning_rate": 9.616115096755247e-06, "loss": 0.294, "num_input_tokens_seen": 8069904, "step": 8035 }, { "epoch": 4.262990455991517, "grad_norm": 4.376289367675781, "learning_rate": 9.615225593906154e-06, "loss": 0.3612, "num_input_tokens_seen": 8074640, "step": 8040 }, { "epoch": 4.265641569459173, "grad_norm": 2.6478734016418457, "learning_rate": 9.614335102949338e-06, "loss": 0.2964, "num_input_tokens_seen": 8080336, "step": 8045 }, { "epoch": 4.2682926829268295, "grad_norm": 3.5885612964630127, "learning_rate": 9.61344362407545e-06, "loss": 0.2672, "num_input_tokens_seen": 8085680, "step": 8050 }, { "epoch": 4.270943796394485, "grad_norm": 4.821183681488037, "learning_rate": 9.612551157475356e-06, "loss": 0.2931, "num_input_tokens_seen": 8090800, "step": 8055 }, { "epoch": 4.273594909862142, "grad_norm": 3.4559876918792725, "learning_rate": 9.611657703340127e-06, "loss": 0.2749, "num_input_tokens_seen": 8094960, "step": 8060 }, { "epoch": 4.276246023329799, "grad_norm": 4.183812618255615, "learning_rate": 9.610763261861052e-06, "loss": 0.3136, "num_input_tokens_seen": 8099280, "step": 8065 }, { "epoch": 4.278897136797455, "grad_norm": 1.8060485124588013, "learning_rate": 9.60986783322963e-06, "loss": 0.3089, "num_input_tokens_seen": 8103760, "step": 8070 }, { "epoch": 4.281548250265112, "grad_norm": 4.743368148803711, "learning_rate": 9.608971417637565e-06, "loss": 0.2959, "num_input_tokens_seen": 8110288, "step": 8075 }, { "epoch": 4.2841993637327676, "grad_norm": 2.8547074794769287, "learning_rate": 9.60807401527678e-06, "loss": 0.2743, "num_input_tokens_seen": 8115344, "step": 8080 }, { "epoch": 4.286850477200424, "grad_norm": 8.587517738342285, "learning_rate": 9.607175626339407e-06, "loss": 0.318, "num_input_tokens_seen": 8121136, "step": 8085 }, { "epoch": 4.28950159066808, "grad_norm": 4.960387706756592, "learning_rate": 9.606276251017788e-06, "loss": 0.2358, "num_input_tokens_seen": 8125584, "step": 8090 }, { "epoch": 4.292152704135737, "grad_norm": 4.302561283111572, "learning_rate": 9.605375889504478e-06, "loss": 0.3302, "num_input_tokens_seen": 8129808, "step": 8095 }, { "epoch": 4.294803817603394, "grad_norm": 6.486279010772705, "learning_rate": 9.60447454199224e-06, "loss": 0.48, "num_input_tokens_seen": 8134000, "step": 8100 }, { "epoch": 4.29745493107105, "grad_norm": 4.50283145904541, "learning_rate": 9.603572208674052e-06, "loss": 0.2882, "num_input_tokens_seen": 8139408, "step": 8105 }, { "epoch": 4.3001060445387065, "grad_norm": 7.497051239013672, "learning_rate": 9.6026688897431e-06, "loss": 0.3536, "num_input_tokens_seen": 8143792, "step": 8110 }, { "epoch": 4.302757158006362, "grad_norm": 3.9407050609588623, "learning_rate": 9.601764585392783e-06, "loss": 0.3218, "num_input_tokens_seen": 8148496, "step": 8115 }, { "epoch": 4.305408271474019, "grad_norm": 3.234036684036255, "learning_rate": 9.600859295816708e-06, "loss": 0.3176, "num_input_tokens_seen": 8153488, "step": 8120 }, { "epoch": 4.308059384941675, "grad_norm": 4.217057228088379, "learning_rate": 9.599953021208701e-06, "loss": 0.2995, "num_input_tokens_seen": 8158096, "step": 8125 }, { "epoch": 4.310710498409332, "grad_norm": 4.5998969078063965, "learning_rate": 9.599045761762786e-06, "loss": 0.3306, "num_input_tokens_seen": 8162480, "step": 8130 }, { "epoch": 4.313361611876989, "grad_norm": 4.908044338226318, "learning_rate": 9.59813751767321e-06, "loss": 0.3796, "num_input_tokens_seen": 8167248, "step": 8135 }, { "epoch": 4.3160127253446445, "grad_norm": 5.332406520843506, "learning_rate": 9.597228289134422e-06, "loss": 0.2799, "num_input_tokens_seen": 8172464, "step": 8140 }, { "epoch": 4.318663838812301, "grad_norm": 6.331389427185059, "learning_rate": 9.596318076341086e-06, "loss": 0.3682, "num_input_tokens_seen": 8176944, "step": 8145 }, { "epoch": 4.321314952279957, "grad_norm": 3.261112928390503, "learning_rate": 9.595406879488078e-06, "loss": 0.2322, "num_input_tokens_seen": 8182480, "step": 8150 }, { "epoch": 4.323966065747614, "grad_norm": 3.4293742179870605, "learning_rate": 9.594494698770483e-06, "loss": 0.2414, "num_input_tokens_seen": 8186896, "step": 8155 }, { "epoch": 4.326617179215271, "grad_norm": 6.047082424163818, "learning_rate": 9.593581534383595e-06, "loss": 0.2681, "num_input_tokens_seen": 8191248, "step": 8160 }, { "epoch": 4.329268292682927, "grad_norm": 6.315337657928467, "learning_rate": 9.59266738652292e-06, "loss": 0.499, "num_input_tokens_seen": 8195952, "step": 8165 }, { "epoch": 4.3319194061505835, "grad_norm": 4.7439470291137695, "learning_rate": 9.591752255384176e-06, "loss": 0.431, "num_input_tokens_seen": 8201200, "step": 8170 }, { "epoch": 4.334570519618239, "grad_norm": 3.9513659477233887, "learning_rate": 9.59083614116329e-06, "loss": 0.3616, "num_input_tokens_seen": 8205456, "step": 8175 }, { "epoch": 4.337221633085896, "grad_norm": 2.651029348373413, "learning_rate": 9.5899190440564e-06, "loss": 0.2168, "num_input_tokens_seen": 8210480, "step": 8180 }, { "epoch": 4.339872746553553, "grad_norm": 4.326486110687256, "learning_rate": 9.589000964259852e-06, "loss": 0.3655, "num_input_tokens_seen": 8214704, "step": 8185 }, { "epoch": 4.342523860021209, "grad_norm": 2.720820903778076, "learning_rate": 9.588081901970205e-06, "loss": 0.2819, "num_input_tokens_seen": 8219600, "step": 8190 }, { "epoch": 4.345174973488866, "grad_norm": 1.8042489290237427, "learning_rate": 9.58716185738423e-06, "loss": 0.24, "num_input_tokens_seen": 8224336, "step": 8195 }, { "epoch": 4.3478260869565215, "grad_norm": 3.4630658626556396, "learning_rate": 9.586240830698904e-06, "loss": 0.3983, "num_input_tokens_seen": 8229840, "step": 8200 }, { "epoch": 4.350477200424178, "grad_norm": 2.9303057193756104, "learning_rate": 9.585318822111417e-06, "loss": 0.2828, "num_input_tokens_seen": 8234736, "step": 8205 }, { "epoch": 4.353128313891834, "grad_norm": 4.25535774230957, "learning_rate": 9.584395831819169e-06, "loss": 0.3898, "num_input_tokens_seen": 8239184, "step": 8210 }, { "epoch": 4.355779427359491, "grad_norm": 2.400542974472046, "learning_rate": 9.583471860019771e-06, "loss": 0.2753, "num_input_tokens_seen": 8245072, "step": 8215 }, { "epoch": 4.358430540827148, "grad_norm": 2.7043867111206055, "learning_rate": 9.582546906911039e-06, "loss": 0.2608, "num_input_tokens_seen": 8249936, "step": 8220 }, { "epoch": 4.361081654294804, "grad_norm": 4.929440021514893, "learning_rate": 9.581620972691006e-06, "loss": 0.2662, "num_input_tokens_seen": 8256688, "step": 8225 }, { "epoch": 4.36373276776246, "grad_norm": 2.126783847808838, "learning_rate": 9.580694057557913e-06, "loss": 0.329, "num_input_tokens_seen": 8262032, "step": 8230 }, { "epoch": 4.366383881230116, "grad_norm": 3.7037265300750732, "learning_rate": 9.579766161710209e-06, "loss": 0.2943, "num_input_tokens_seen": 8267664, "step": 8235 }, { "epoch": 4.369034994697773, "grad_norm": 4.254008769989014, "learning_rate": 9.578837285346552e-06, "loss": 0.3174, "num_input_tokens_seen": 8271152, "step": 8240 }, { "epoch": 4.371686108165429, "grad_norm": 2.2873647212982178, "learning_rate": 9.577907428665815e-06, "loss": 0.3298, "num_input_tokens_seen": 8276272, "step": 8245 }, { "epoch": 4.374337221633086, "grad_norm": 4.298961639404297, "learning_rate": 9.576976591867077e-06, "loss": 0.3996, "num_input_tokens_seen": 8281200, "step": 8250 }, { "epoch": 4.376988335100743, "grad_norm": 4.830214500427246, "learning_rate": 9.576044775149626e-06, "loss": 0.2969, "num_input_tokens_seen": 8286704, "step": 8255 }, { "epoch": 4.3796394485683985, "grad_norm": 2.877985954284668, "learning_rate": 9.575111978712963e-06, "loss": 0.2634, "num_input_tokens_seen": 8291440, "step": 8260 }, { "epoch": 4.382290562036055, "grad_norm": 5.840717315673828, "learning_rate": 9.5741782027568e-06, "loss": 0.2921, "num_input_tokens_seen": 8296080, "step": 8265 }, { "epoch": 4.384941675503711, "grad_norm": 3.4627251625061035, "learning_rate": 9.57324344748105e-06, "loss": 0.3295, "num_input_tokens_seen": 8301392, "step": 8270 }, { "epoch": 4.387592788971368, "grad_norm": 2.5700888633728027, "learning_rate": 9.572307713085846e-06, "loss": 0.2929, "num_input_tokens_seen": 8306576, "step": 8275 }, { "epoch": 4.390243902439025, "grad_norm": 4.1222758293151855, "learning_rate": 9.571370999771525e-06, "loss": 0.2669, "num_input_tokens_seen": 8311056, "step": 8280 }, { "epoch": 4.392895015906681, "grad_norm": 3.5876307487487793, "learning_rate": 9.570433307738636e-06, "loss": 0.3614, "num_input_tokens_seen": 8315728, "step": 8285 }, { "epoch": 4.395546129374337, "grad_norm": 3.2271063327789307, "learning_rate": 9.569494637187935e-06, "loss": 0.2596, "num_input_tokens_seen": 8320336, "step": 8290 }, { "epoch": 4.398197242841993, "grad_norm": 3.0541281700134277, "learning_rate": 9.568554988320388e-06, "loss": 0.3067, "num_input_tokens_seen": 8325520, "step": 8295 }, { "epoch": 4.40084835630965, "grad_norm": 4.076274394989014, "learning_rate": 9.567614361337174e-06, "loss": 0.3209, "num_input_tokens_seen": 8331056, "step": 8300 }, { "epoch": 4.403499469777307, "grad_norm": 5.3656511306762695, "learning_rate": 9.566672756439676e-06, "loss": 0.2793, "num_input_tokens_seen": 8336464, "step": 8305 }, { "epoch": 4.406150583244963, "grad_norm": 4.284735679626465, "learning_rate": 9.565730173829493e-06, "loss": 0.3258, "num_input_tokens_seen": 8341008, "step": 8310 }, { "epoch": 4.40880169671262, "grad_norm": 3.8783586025238037, "learning_rate": 9.564786613708425e-06, "loss": 0.1875, "num_input_tokens_seen": 8344816, "step": 8315 }, { "epoch": 4.4114528101802755, "grad_norm": 2.608715534210205, "learning_rate": 9.56384207627849e-06, "loss": 0.2302, "num_input_tokens_seen": 8349872, "step": 8320 }, { "epoch": 4.414103923647932, "grad_norm": 3.697300910949707, "learning_rate": 9.562896561741907e-06, "loss": 0.3997, "num_input_tokens_seen": 8354256, "step": 8325 }, { "epoch": 4.416755037115588, "grad_norm": 4.028566837310791, "learning_rate": 9.56195007030111e-06, "loss": 0.3773, "num_input_tokens_seen": 8359312, "step": 8330 }, { "epoch": 4.419406150583245, "grad_norm": 3.64216685295105, "learning_rate": 9.561002602158742e-06, "loss": 0.2858, "num_input_tokens_seen": 8363664, "step": 8335 }, { "epoch": 4.422057264050902, "grad_norm": 3.783508062362671, "learning_rate": 9.560054157517651e-06, "loss": 0.3398, "num_input_tokens_seen": 8368688, "step": 8340 }, { "epoch": 4.424708377518558, "grad_norm": 2.1523239612579346, "learning_rate": 9.559104736580897e-06, "loss": 0.2575, "num_input_tokens_seen": 8373456, "step": 8345 }, { "epoch": 4.427359490986214, "grad_norm": 3.503631114959717, "learning_rate": 9.558154339551748e-06, "loss": 0.3538, "num_input_tokens_seen": 8378960, "step": 8350 }, { "epoch": 4.43001060445387, "grad_norm": 3.239305019378662, "learning_rate": 9.557202966633684e-06, "loss": 0.3222, "num_input_tokens_seen": 8384592, "step": 8355 }, { "epoch": 4.432661717921527, "grad_norm": 4.004849433898926, "learning_rate": 9.556250618030389e-06, "loss": 0.3347, "num_input_tokens_seen": 8389520, "step": 8360 }, { "epoch": 4.435312831389184, "grad_norm": 1.951295018196106, "learning_rate": 9.55529729394576e-06, "loss": 0.2039, "num_input_tokens_seen": 8394224, "step": 8365 }, { "epoch": 4.43796394485684, "grad_norm": 10.00972843170166, "learning_rate": 9.554342994583899e-06, "loss": 0.3163, "num_input_tokens_seen": 8399024, "step": 8370 }, { "epoch": 4.4406150583244965, "grad_norm": 6.872629642486572, "learning_rate": 9.553387720149121e-06, "loss": 0.3138, "num_input_tokens_seen": 8404272, "step": 8375 }, { "epoch": 4.443266171792152, "grad_norm": 4.12307596206665, "learning_rate": 9.552431470845947e-06, "loss": 0.2945, "num_input_tokens_seen": 8409264, "step": 8380 }, { "epoch": 4.445917285259809, "grad_norm": 4.977369785308838, "learning_rate": 9.551474246879108e-06, "loss": 0.2717, "num_input_tokens_seen": 8414800, "step": 8385 }, { "epoch": 4.448568398727465, "grad_norm": 3.2751212120056152, "learning_rate": 9.550516048453544e-06, "loss": 0.3133, "num_input_tokens_seen": 8419728, "step": 8390 }, { "epoch": 4.451219512195122, "grad_norm": 5.053792953491211, "learning_rate": 9.5495568757744e-06, "loss": 0.2714, "num_input_tokens_seen": 8423888, "step": 8395 }, { "epoch": 4.453870625662779, "grad_norm": 5.245071887969971, "learning_rate": 9.548596729047034e-06, "loss": 0.3072, "num_input_tokens_seen": 8428784, "step": 8400 }, { "epoch": 4.456521739130435, "grad_norm": 4.184142112731934, "learning_rate": 9.547635608477011e-06, "loss": 0.2748, "num_input_tokens_seen": 8434800, "step": 8405 }, { "epoch": 4.459172852598091, "grad_norm": 3.397503137588501, "learning_rate": 9.546673514270103e-06, "loss": 0.3676, "num_input_tokens_seen": 8439696, "step": 8410 }, { "epoch": 4.461823966065747, "grad_norm": 2.380171537399292, "learning_rate": 9.545710446632294e-06, "loss": 0.2641, "num_input_tokens_seen": 8444304, "step": 8415 }, { "epoch": 4.464475079533404, "grad_norm": 6.8843536376953125, "learning_rate": 9.544746405769774e-06, "loss": 0.4274, "num_input_tokens_seen": 8449648, "step": 8420 }, { "epoch": 4.467126193001061, "grad_norm": 5.903154373168945, "learning_rate": 9.54378139188894e-06, "loss": 0.2812, "num_input_tokens_seen": 8453680, "step": 8425 }, { "epoch": 4.469777306468717, "grad_norm": 5.068695068359375, "learning_rate": 9.5428154051964e-06, "loss": 0.3101, "num_input_tokens_seen": 8458864, "step": 8430 }, { "epoch": 4.4724284199363735, "grad_norm": 5.295475006103516, "learning_rate": 9.54184844589897e-06, "loss": 0.2875, "num_input_tokens_seen": 8463856, "step": 8435 }, { "epoch": 4.475079533404029, "grad_norm": 3.9896838665008545, "learning_rate": 9.540880514203672e-06, "loss": 0.3616, "num_input_tokens_seen": 8469296, "step": 8440 }, { "epoch": 4.477730646871686, "grad_norm": 4.745184421539307, "learning_rate": 9.539911610317738e-06, "loss": 0.2659, "num_input_tokens_seen": 8474768, "step": 8445 }, { "epoch": 4.480381760339343, "grad_norm": 1.840113639831543, "learning_rate": 9.538941734448608e-06, "loss": 0.2684, "num_input_tokens_seen": 8479760, "step": 8450 }, { "epoch": 4.483032873806999, "grad_norm": 4.991092681884766, "learning_rate": 9.53797088680393e-06, "loss": 0.3526, "num_input_tokens_seen": 8484784, "step": 8455 }, { "epoch": 4.485683987274656, "grad_norm": 5.243530750274658, "learning_rate": 9.53699906759156e-06, "loss": 0.2174, "num_input_tokens_seen": 8489648, "step": 8460 }, { "epoch": 4.488335100742312, "grad_norm": 4.388354301452637, "learning_rate": 9.536026277019562e-06, "loss": 0.295, "num_input_tokens_seen": 8493648, "step": 8465 }, { "epoch": 4.490986214209968, "grad_norm": 4.694240570068359, "learning_rate": 9.535052515296205e-06, "loss": 0.2601, "num_input_tokens_seen": 8498288, "step": 8470 }, { "epoch": 4.493637327677624, "grad_norm": 4.497250080108643, "learning_rate": 9.534077782629974e-06, "loss": 0.2243, "num_input_tokens_seen": 8502960, "step": 8475 }, { "epoch": 4.496288441145281, "grad_norm": 4.973400115966797, "learning_rate": 9.533102079229555e-06, "loss": 0.3393, "num_input_tokens_seen": 8507728, "step": 8480 }, { "epoch": 4.498939554612938, "grad_norm": 4.258647441864014, "learning_rate": 9.53212540530384e-06, "loss": 0.3548, "num_input_tokens_seen": 8512912, "step": 8485 }, { "epoch": 4.501590668080594, "grad_norm": 3.818554401397705, "learning_rate": 9.531147761061937e-06, "loss": 0.2542, "num_input_tokens_seen": 8518416, "step": 8490 }, { "epoch": 4.5042417815482505, "grad_norm": 3.7647690773010254, "learning_rate": 9.530169146713155e-06, "loss": 0.2928, "num_input_tokens_seen": 8523024, "step": 8495 }, { "epoch": 4.506892895015906, "grad_norm": 4.147728443145752, "learning_rate": 9.529189562467014e-06, "loss": 0.2919, "num_input_tokens_seen": 8527952, "step": 8500 }, { "epoch": 4.509544008483563, "grad_norm": 4.464080810546875, "learning_rate": 9.528209008533238e-06, "loss": 0.3297, "num_input_tokens_seen": 8533520, "step": 8505 }, { "epoch": 4.512195121951219, "grad_norm": 2.7625811100006104, "learning_rate": 9.527227485121762e-06, "loss": 0.2629, "num_input_tokens_seen": 8538064, "step": 8510 }, { "epoch": 4.514846235418876, "grad_norm": 3.3913252353668213, "learning_rate": 9.52624499244273e-06, "loss": 0.2542, "num_input_tokens_seen": 8542576, "step": 8515 }, { "epoch": 4.517497348886533, "grad_norm": 3.2795753479003906, "learning_rate": 9.525261530706487e-06, "loss": 0.2757, "num_input_tokens_seen": 8547216, "step": 8520 }, { "epoch": 4.5201484623541885, "grad_norm": 5.922802448272705, "learning_rate": 9.524277100123592e-06, "loss": 0.4307, "num_input_tokens_seen": 8552688, "step": 8525 }, { "epoch": 4.522799575821845, "grad_norm": 3.3228557109832764, "learning_rate": 9.523291700904808e-06, "loss": 0.3308, "num_input_tokens_seen": 8557104, "step": 8530 }, { "epoch": 4.525450689289501, "grad_norm": 2.176407814025879, "learning_rate": 9.522305333261109e-06, "loss": 0.2536, "num_input_tokens_seen": 8561776, "step": 8535 }, { "epoch": 4.528101802757158, "grad_norm": 4.705324172973633, "learning_rate": 9.521317997403672e-06, "loss": 0.3427, "num_input_tokens_seen": 8566672, "step": 8540 }, { "epoch": 4.530752916224815, "grad_norm": 4.939235210418701, "learning_rate": 9.520329693543881e-06, "loss": 0.266, "num_input_tokens_seen": 8570736, "step": 8545 }, { "epoch": 4.533404029692471, "grad_norm": 4.152224063873291, "learning_rate": 9.519340421893333e-06, "loss": 0.3357, "num_input_tokens_seen": 8575056, "step": 8550 }, { "epoch": 4.5360551431601275, "grad_norm": 4.356329917907715, "learning_rate": 9.518350182663824e-06, "loss": 0.2633, "num_input_tokens_seen": 8580016, "step": 8555 }, { "epoch": 4.538706256627783, "grad_norm": 5.243633270263672, "learning_rate": 9.517358976067366e-06, "loss": 0.3561, "num_input_tokens_seen": 8585072, "step": 8560 }, { "epoch": 4.54135737009544, "grad_norm": 4.903926849365234, "learning_rate": 9.516366802316173e-06, "loss": 0.3303, "num_input_tokens_seen": 8589296, "step": 8565 }, { "epoch": 4.544008483563097, "grad_norm": 7.639822483062744, "learning_rate": 9.515373661622665e-06, "loss": 0.347, "num_input_tokens_seen": 8597040, "step": 8570 }, { "epoch": 4.546659597030753, "grad_norm": 2.785896062850952, "learning_rate": 9.51437955419947e-06, "loss": 0.2844, "num_input_tokens_seen": 8601520, "step": 8575 }, { "epoch": 4.54931071049841, "grad_norm": 7.070740699768066, "learning_rate": 9.513384480259427e-06, "loss": 0.3494, "num_input_tokens_seen": 8606576, "step": 8580 }, { "epoch": 4.5519618239660655, "grad_norm": 3.299224853515625, "learning_rate": 9.512388440015577e-06, "loss": 0.2564, "num_input_tokens_seen": 8614576, "step": 8585 }, { "epoch": 4.554612937433722, "grad_norm": 3.9574778079986572, "learning_rate": 9.51139143368117e-06, "loss": 0.2297, "num_input_tokens_seen": 8619792, "step": 8590 }, { "epoch": 4.557264050901378, "grad_norm": 4.772799491882324, "learning_rate": 9.51039346146966e-06, "loss": 0.3639, "num_input_tokens_seen": 8624368, "step": 8595 }, { "epoch": 4.559915164369035, "grad_norm": 6.386314392089844, "learning_rate": 9.509394523594715e-06, "loss": 0.3963, "num_input_tokens_seen": 8629104, "step": 8600 }, { "epoch": 4.562566277836692, "grad_norm": 4.750685214996338, "learning_rate": 9.5083946202702e-06, "loss": 0.2593, "num_input_tokens_seen": 8634416, "step": 8605 }, { "epoch": 4.565217391304348, "grad_norm": 3.7661726474761963, "learning_rate": 9.507393751710196e-06, "loss": 0.3375, "num_input_tokens_seen": 8640784, "step": 8610 }, { "epoch": 4.5678685047720045, "grad_norm": 4.8257975578308105, "learning_rate": 9.506391918128984e-06, "loss": 0.2949, "num_input_tokens_seen": 8645232, "step": 8615 }, { "epoch": 4.57051961823966, "grad_norm": 3.6407880783081055, "learning_rate": 9.505389119741054e-06, "loss": 0.2501, "num_input_tokens_seen": 8650288, "step": 8620 }, { "epoch": 4.573170731707317, "grad_norm": 2.3671114444732666, "learning_rate": 9.504385356761103e-06, "loss": 0.2849, "num_input_tokens_seen": 8654128, "step": 8625 }, { "epoch": 4.575821845174973, "grad_norm": 5.018180847167969, "learning_rate": 9.503380629404035e-06, "loss": 0.3766, "num_input_tokens_seen": 8659248, "step": 8630 }, { "epoch": 4.57847295864263, "grad_norm": 6.4628801345825195, "learning_rate": 9.502374937884957e-06, "loss": 0.3482, "num_input_tokens_seen": 8663824, "step": 8635 }, { "epoch": 4.581124072110287, "grad_norm": 4.524603843688965, "learning_rate": 9.501368282419187e-06, "loss": 0.2847, "num_input_tokens_seen": 8669040, "step": 8640 }, { "epoch": 4.5837751855779425, "grad_norm": 6.070146560668945, "learning_rate": 9.500360663222246e-06, "loss": 0.3269, "num_input_tokens_seen": 8674672, "step": 8645 }, { "epoch": 4.586426299045599, "grad_norm": 6.5152587890625, "learning_rate": 9.499352080509865e-06, "loss": 0.2938, "num_input_tokens_seen": 8678480, "step": 8650 }, { "epoch": 4.589077412513255, "grad_norm": 2.435303211212158, "learning_rate": 9.498342534497975e-06, "loss": 0.2315, "num_input_tokens_seen": 8682192, "step": 8655 }, { "epoch": 4.591728525980912, "grad_norm": 3.9376986026763916, "learning_rate": 9.497332025402719e-06, "loss": 0.3134, "num_input_tokens_seen": 8686160, "step": 8660 }, { "epoch": 4.594379639448569, "grad_norm": 4.813634395599365, "learning_rate": 9.496320553440446e-06, "loss": 0.3256, "num_input_tokens_seen": 8690576, "step": 8665 }, { "epoch": 4.597030752916225, "grad_norm": 4.525151252746582, "learning_rate": 9.495308118827708e-06, "loss": 0.3019, "num_input_tokens_seen": 8695600, "step": 8670 }, { "epoch": 4.599681866383881, "grad_norm": 5.205461025238037, "learning_rate": 9.494294721781266e-06, "loss": 0.277, "num_input_tokens_seen": 8700944, "step": 8675 }, { "epoch": 4.602332979851537, "grad_norm": 4.782978057861328, "learning_rate": 9.493280362518082e-06, "loss": 0.2581, "num_input_tokens_seen": 8707504, "step": 8680 }, { "epoch": 4.604984093319194, "grad_norm": 5.241699695587158, "learning_rate": 9.492265041255332e-06, "loss": 0.3145, "num_input_tokens_seen": 8712208, "step": 8685 }, { "epoch": 4.607635206786851, "grad_norm": 4.448662281036377, "learning_rate": 9.49124875821039e-06, "loss": 0.3006, "num_input_tokens_seen": 8717520, "step": 8690 }, { "epoch": 4.610286320254507, "grad_norm": 11.099225044250488, "learning_rate": 9.490231513600842e-06, "loss": 0.4277, "num_input_tokens_seen": 8722480, "step": 8695 }, { "epoch": 4.612937433722164, "grad_norm": 4.351428985595703, "learning_rate": 9.489213307644478e-06, "loss": 0.26, "num_input_tokens_seen": 8726864, "step": 8700 }, { "epoch": 4.6155885471898195, "grad_norm": 3.5669946670532227, "learning_rate": 9.488194140559292e-06, "loss": 0.2645, "num_input_tokens_seen": 8732592, "step": 8705 }, { "epoch": 4.618239660657476, "grad_norm": 8.745896339416504, "learning_rate": 9.487174012563484e-06, "loss": 0.3072, "num_input_tokens_seen": 8736560, "step": 8710 }, { "epoch": 4.620890774125133, "grad_norm": 5.789897918701172, "learning_rate": 9.48615292387546e-06, "loss": 0.3304, "num_input_tokens_seen": 8742384, "step": 8715 }, { "epoch": 4.623541887592789, "grad_norm": 2.3702633380889893, "learning_rate": 9.485130874713837e-06, "loss": 0.3153, "num_input_tokens_seen": 8746448, "step": 8720 }, { "epoch": 4.626193001060446, "grad_norm": 4.427849292755127, "learning_rate": 9.48410786529743e-06, "loss": 0.2751, "num_input_tokens_seen": 8751600, "step": 8725 }, { "epoch": 4.628844114528102, "grad_norm": 2.8108766078948975, "learning_rate": 9.48308389584526e-06, "loss": 0.1878, "num_input_tokens_seen": 8756816, "step": 8730 }, { "epoch": 4.631495227995758, "grad_norm": 2.7518081665039062, "learning_rate": 9.48205896657656e-06, "loss": 0.2186, "num_input_tokens_seen": 8762320, "step": 8735 }, { "epoch": 4.634146341463414, "grad_norm": 7.613460063934326, "learning_rate": 9.481033077710765e-06, "loss": 0.3746, "num_input_tokens_seen": 8767120, "step": 8740 }, { "epoch": 4.636797454931071, "grad_norm": 6.340289115905762, "learning_rate": 9.480006229467513e-06, "loss": 0.3336, "num_input_tokens_seen": 8771728, "step": 8745 }, { "epoch": 4.639448568398727, "grad_norm": 6.582716941833496, "learning_rate": 9.47897842206665e-06, "loss": 0.3177, "num_input_tokens_seen": 8777936, "step": 8750 }, { "epoch": 4.642099681866384, "grad_norm": 5.215304374694824, "learning_rate": 9.477949655728227e-06, "loss": 0.2612, "num_input_tokens_seen": 8783408, "step": 8755 }, { "epoch": 4.644750795334041, "grad_norm": 6.0042219161987305, "learning_rate": 9.476919930672502e-06, "loss": 0.2879, "num_input_tokens_seen": 8788176, "step": 8760 }, { "epoch": 4.6474019088016965, "grad_norm": 2.196472406387329, "learning_rate": 9.475889247119931e-06, "loss": 0.2532, "num_input_tokens_seen": 8792944, "step": 8765 }, { "epoch": 4.650053022269353, "grad_norm": 4.736571788787842, "learning_rate": 9.474857605291188e-06, "loss": 0.3597, "num_input_tokens_seen": 8798128, "step": 8770 }, { "epoch": 4.652704135737009, "grad_norm": 4.858503818511963, "learning_rate": 9.47382500540714e-06, "loss": 0.2617, "num_input_tokens_seen": 8803888, "step": 8775 }, { "epoch": 4.655355249204666, "grad_norm": 8.157379150390625, "learning_rate": 9.472791447688865e-06, "loss": 0.2559, "num_input_tokens_seen": 8808112, "step": 8780 }, { "epoch": 4.658006362672323, "grad_norm": 3.712303400039673, "learning_rate": 9.471756932357645e-06, "loss": 0.5211, "num_input_tokens_seen": 8812176, "step": 8785 }, { "epoch": 4.660657476139979, "grad_norm": 6.069282531738281, "learning_rate": 9.470721459634968e-06, "loss": 0.4074, "num_input_tokens_seen": 8816688, "step": 8790 }, { "epoch": 4.663308589607635, "grad_norm": 7.423392295837402, "learning_rate": 9.469685029742524e-06, "loss": 0.2834, "num_input_tokens_seen": 8821616, "step": 8795 }, { "epoch": 4.665959703075291, "grad_norm": 7.70087194442749, "learning_rate": 9.468647642902213e-06, "loss": 0.4471, "num_input_tokens_seen": 8826352, "step": 8800 }, { "epoch": 4.668610816542948, "grad_norm": 4.833395481109619, "learning_rate": 9.467609299336133e-06, "loss": 0.387, "num_input_tokens_seen": 8831504, "step": 8805 }, { "epoch": 4.671261930010605, "grad_norm": 4.011366844177246, "learning_rate": 9.466569999266595e-06, "loss": 0.3686, "num_input_tokens_seen": 8836048, "step": 8810 }, { "epoch": 4.673913043478261, "grad_norm": 5.9477314949035645, "learning_rate": 9.465529742916105e-06, "loss": 0.3185, "num_input_tokens_seen": 8840848, "step": 8815 }, { "epoch": 4.6765641569459175, "grad_norm": 3.8083913326263428, "learning_rate": 9.464488530507383e-06, "loss": 0.4138, "num_input_tokens_seen": 8847184, "step": 8820 }, { "epoch": 4.679215270413573, "grad_norm": 2.8930397033691406, "learning_rate": 9.46344636226335e-06, "loss": 0.3456, "num_input_tokens_seen": 8853040, "step": 8825 }, { "epoch": 4.68186638388123, "grad_norm": 3.333963394165039, "learning_rate": 9.46240323840713e-06, "loss": 0.3034, "num_input_tokens_seen": 8857136, "step": 8830 }, { "epoch": 4.684517497348887, "grad_norm": 1.8044676780700684, "learning_rate": 9.461359159162053e-06, "loss": 0.2524, "num_input_tokens_seen": 8862960, "step": 8835 }, { "epoch": 4.687168610816543, "grad_norm": 3.273223400115967, "learning_rate": 9.460314124751654e-06, "loss": 0.3028, "num_input_tokens_seen": 8868208, "step": 8840 }, { "epoch": 4.6898197242842, "grad_norm": 3.2097692489624023, "learning_rate": 9.459268135399675e-06, "loss": 0.3138, "num_input_tokens_seen": 8872656, "step": 8845 }, { "epoch": 4.692470837751856, "grad_norm": 3.265883684158325, "learning_rate": 9.458221191330055e-06, "loss": 0.3367, "num_input_tokens_seen": 8878064, "step": 8850 }, { "epoch": 4.695121951219512, "grad_norm": 5.059750080108643, "learning_rate": 9.457173292766944e-06, "loss": 0.3077, "num_input_tokens_seen": 8882448, "step": 8855 }, { "epoch": 4.697773064687168, "grad_norm": 4.2005767822265625, "learning_rate": 9.456124439934694e-06, "loss": 0.3296, "num_input_tokens_seen": 8887440, "step": 8860 }, { "epoch": 4.700424178154825, "grad_norm": 3.0615298748016357, "learning_rate": 9.455074633057862e-06, "loss": 0.2687, "num_input_tokens_seen": 8892208, "step": 8865 }, { "epoch": 4.703075291622481, "grad_norm": 2.927264928817749, "learning_rate": 9.454023872361209e-06, "loss": 0.3835, "num_input_tokens_seen": 8897680, "step": 8870 }, { "epoch": 4.705726405090138, "grad_norm": 6.733297348022461, "learning_rate": 9.4529721580697e-06, "loss": 0.3337, "num_input_tokens_seen": 8902992, "step": 8875 }, { "epoch": 4.7083775185577945, "grad_norm": 2.542300224304199, "learning_rate": 9.451919490408504e-06, "loss": 0.2914, "num_input_tokens_seen": 8908016, "step": 8880 }, { "epoch": 4.71102863202545, "grad_norm": 4.233837604522705, "learning_rate": 9.450865869602996e-06, "loss": 0.2351, "num_input_tokens_seen": 8912080, "step": 8885 }, { "epoch": 4.713679745493107, "grad_norm": 4.073307514190674, "learning_rate": 9.44981129587875e-06, "loss": 0.2029, "num_input_tokens_seen": 8917040, "step": 8890 }, { "epoch": 4.716330858960763, "grad_norm": 3.6788957118988037, "learning_rate": 9.44875576946155e-06, "loss": 0.3372, "num_input_tokens_seen": 8922032, "step": 8895 }, { "epoch": 4.71898197242842, "grad_norm": 5.199697494506836, "learning_rate": 9.447699290577382e-06, "loss": 0.3229, "num_input_tokens_seen": 8927952, "step": 8900 }, { "epoch": 4.721633085896077, "grad_norm": 5.545238494873047, "learning_rate": 9.446641859452433e-06, "loss": 0.2812, "num_input_tokens_seen": 8932656, "step": 8905 }, { "epoch": 4.724284199363733, "grad_norm": 3.9607932567596436, "learning_rate": 9.445583476313098e-06, "loss": 0.3117, "num_input_tokens_seen": 8936912, "step": 8910 }, { "epoch": 4.726935312831389, "grad_norm": 4.24063777923584, "learning_rate": 9.444524141385975e-06, "loss": 0.2347, "num_input_tokens_seen": 8941712, "step": 8915 }, { "epoch": 4.729586426299045, "grad_norm": 5.687101364135742, "learning_rate": 9.443463854897862e-06, "loss": 0.2735, "num_input_tokens_seen": 8946992, "step": 8920 }, { "epoch": 4.732237539766702, "grad_norm": 3.5274171829223633, "learning_rate": 9.442402617075765e-06, "loss": 0.2501, "num_input_tokens_seen": 8952944, "step": 8925 }, { "epoch": 4.734888653234359, "grad_norm": 2.326399087905884, "learning_rate": 9.441340428146893e-06, "loss": 0.256, "num_input_tokens_seen": 8957136, "step": 8930 }, { "epoch": 4.737539766702015, "grad_norm": 4.365431308746338, "learning_rate": 9.440277288338657e-06, "loss": 0.315, "num_input_tokens_seen": 8962480, "step": 8935 }, { "epoch": 4.7401908801696715, "grad_norm": 4.273962497711182, "learning_rate": 9.439213197878674e-06, "loss": 0.2457, "num_input_tokens_seen": 8968016, "step": 8940 }, { "epoch": 4.742841993637327, "grad_norm": 3.9511396884918213, "learning_rate": 9.438148156994758e-06, "loss": 0.4171, "num_input_tokens_seen": 8972400, "step": 8945 }, { "epoch": 4.745493107104984, "grad_norm": 3.814324378967285, "learning_rate": 9.437082165914937e-06, "loss": 0.2359, "num_input_tokens_seen": 8977840, "step": 8950 }, { "epoch": 4.748144220572641, "grad_norm": 4.631649017333984, "learning_rate": 9.436015224867436e-06, "loss": 0.4102, "num_input_tokens_seen": 8983856, "step": 8955 }, { "epoch": 4.750795334040297, "grad_norm": 3.9255847930908203, "learning_rate": 9.434947334080682e-06, "loss": 0.2939, "num_input_tokens_seen": 8988624, "step": 8960 }, { "epoch": 4.753446447507954, "grad_norm": 3.643206834793091, "learning_rate": 9.433878493783308e-06, "loss": 0.2557, "num_input_tokens_seen": 8993200, "step": 8965 }, { "epoch": 4.7560975609756095, "grad_norm": 2.7879488468170166, "learning_rate": 9.432808704204154e-06, "loss": 0.2109, "num_input_tokens_seen": 9000912, "step": 8970 }, { "epoch": 4.758748674443266, "grad_norm": 3.2014575004577637, "learning_rate": 9.431737965572253e-06, "loss": 0.2529, "num_input_tokens_seen": 9004848, "step": 8975 }, { "epoch": 4.761399787910922, "grad_norm": 6.681353569030762, "learning_rate": 9.43066627811685e-06, "loss": 0.2407, "num_input_tokens_seen": 9010416, "step": 8980 }, { "epoch": 4.764050901378579, "grad_norm": 3.6532692909240723, "learning_rate": 9.429593642067392e-06, "loss": 0.3781, "num_input_tokens_seen": 9015984, "step": 8985 }, { "epoch": 4.766702014846236, "grad_norm": 5.429947376251221, "learning_rate": 9.428520057653527e-06, "loss": 0.2793, "num_input_tokens_seen": 9021392, "step": 8990 }, { "epoch": 4.769353128313892, "grad_norm": 4.931720733642578, "learning_rate": 9.427445525105106e-06, "loss": 0.2897, "num_input_tokens_seen": 9026256, "step": 8995 }, { "epoch": 4.7720042417815485, "grad_norm": 4.593873500823975, "learning_rate": 9.426370044652182e-06, "loss": 0.3279, "num_input_tokens_seen": 9031152, "step": 9000 }, { "epoch": 4.774655355249204, "grad_norm": 3.303272008895874, "learning_rate": 9.425293616525015e-06, "loss": 0.2789, "num_input_tokens_seen": 9036336, "step": 9005 }, { "epoch": 4.777306468716861, "grad_norm": 3.834059238433838, "learning_rate": 9.424216240954064e-06, "loss": 0.2681, "num_input_tokens_seen": 9040592, "step": 9010 }, { "epoch": 4.779957582184517, "grad_norm": 3.619215965270996, "learning_rate": 9.423137918169994e-06, "loss": 0.2608, "num_input_tokens_seen": 9045424, "step": 9015 }, { "epoch": 4.782608695652174, "grad_norm": 5.769137382507324, "learning_rate": 9.422058648403666e-06, "loss": 0.3876, "num_input_tokens_seen": 9051792, "step": 9020 }, { "epoch": 4.785259809119831, "grad_norm": 7.224005222320557, "learning_rate": 9.420978431886157e-06, "loss": 0.3728, "num_input_tokens_seen": 9056688, "step": 9025 }, { "epoch": 4.7879109225874865, "grad_norm": 4.626974582672119, "learning_rate": 9.419897268848733e-06, "loss": 0.4214, "num_input_tokens_seen": 9062576, "step": 9030 }, { "epoch": 4.790562036055143, "grad_norm": 5.121217250823975, "learning_rate": 9.41881515952287e-06, "loss": 0.3621, "num_input_tokens_seen": 9066992, "step": 9035 }, { "epoch": 4.793213149522799, "grad_norm": 5.47654914855957, "learning_rate": 9.417732104140243e-06, "loss": 0.3729, "num_input_tokens_seen": 9071568, "step": 9040 }, { "epoch": 4.795864262990456, "grad_norm": 4.909581184387207, "learning_rate": 9.416648102932733e-06, "loss": 0.2798, "num_input_tokens_seen": 9076336, "step": 9045 }, { "epoch": 4.798515376458113, "grad_norm": 4.508213520050049, "learning_rate": 9.415563156132422e-06, "loss": 0.2684, "num_input_tokens_seen": 9080720, "step": 9050 }, { "epoch": 4.801166489925769, "grad_norm": 9.404743194580078, "learning_rate": 9.414477263971594e-06, "loss": 0.3687, "num_input_tokens_seen": 9085488, "step": 9055 }, { "epoch": 4.8038176033934255, "grad_norm": 4.456618309020996, "learning_rate": 9.413390426682733e-06, "loss": 0.2887, "num_input_tokens_seen": 9090288, "step": 9060 }, { "epoch": 4.806468716861081, "grad_norm": 2.9797964096069336, "learning_rate": 9.412302644498532e-06, "loss": 0.2875, "num_input_tokens_seen": 9095696, "step": 9065 }, { "epoch": 4.809119830328738, "grad_norm": 2.2140684127807617, "learning_rate": 9.411213917651881e-06, "loss": 0.2314, "num_input_tokens_seen": 9100912, "step": 9070 }, { "epoch": 4.811770943796395, "grad_norm": 2.0174202919006348, "learning_rate": 9.410124246375873e-06, "loss": 0.2733, "num_input_tokens_seen": 9106896, "step": 9075 }, { "epoch": 4.814422057264051, "grad_norm": 4.450577259063721, "learning_rate": 9.409033630903803e-06, "loss": 0.2481, "num_input_tokens_seen": 9110928, "step": 9080 }, { "epoch": 4.817073170731708, "grad_norm": 8.12596321105957, "learning_rate": 9.40794207146917e-06, "loss": 0.3907, "num_input_tokens_seen": 9116080, "step": 9085 }, { "epoch": 4.8197242841993635, "grad_norm": 2.0891356468200684, "learning_rate": 9.406849568305675e-06, "loss": 0.2118, "num_input_tokens_seen": 9121168, "step": 9090 }, { "epoch": 4.82237539766702, "grad_norm": 4.203258037567139, "learning_rate": 9.405756121647218e-06, "loss": 0.2819, "num_input_tokens_seen": 9125872, "step": 9095 }, { "epoch": 4.825026511134676, "grad_norm": 2.2140800952911377, "learning_rate": 9.404661731727904e-06, "loss": 0.2717, "num_input_tokens_seen": 9130704, "step": 9100 }, { "epoch": 4.827677624602333, "grad_norm": 8.695026397705078, "learning_rate": 9.40356639878204e-06, "loss": 0.3099, "num_input_tokens_seen": 9135216, "step": 9105 }, { "epoch": 4.83032873806999, "grad_norm": 6.731655120849609, "learning_rate": 9.402470123044132e-06, "loss": 0.3715, "num_input_tokens_seen": 9141072, "step": 9110 }, { "epoch": 4.832979851537646, "grad_norm": 6.653932094573975, "learning_rate": 9.401372904748893e-06, "loss": 0.3682, "num_input_tokens_seen": 9145392, "step": 9115 }, { "epoch": 4.835630965005302, "grad_norm": 4.574862003326416, "learning_rate": 9.40027474413123e-06, "loss": 0.2439, "num_input_tokens_seen": 9150224, "step": 9120 }, { "epoch": 4.838282078472958, "grad_norm": 4.949057102203369, "learning_rate": 9.39917564142626e-06, "loss": 0.2981, "num_input_tokens_seen": 9154032, "step": 9125 }, { "epoch": 4.840933191940615, "grad_norm": 5.457191467285156, "learning_rate": 9.398075596869295e-06, "loss": 0.3623, "num_input_tokens_seen": 9159248, "step": 9130 }, { "epoch": 4.843584305408271, "grad_norm": 5.2175116539001465, "learning_rate": 9.396974610695855e-06, "loss": 0.3698, "num_input_tokens_seen": 9163760, "step": 9135 }, { "epoch": 4.846235418875928, "grad_norm": 4.267009258270264, "learning_rate": 9.395872683141656e-06, "loss": 0.3096, "num_input_tokens_seen": 9168592, "step": 9140 }, { "epoch": 4.848886532343585, "grad_norm": 6.520581245422363, "learning_rate": 9.394769814442616e-06, "loss": 0.3155, "num_input_tokens_seen": 9173904, "step": 9145 }, { "epoch": 4.8515376458112405, "grad_norm": 5.980603218078613, "learning_rate": 9.393666004834862e-06, "loss": 0.3163, "num_input_tokens_seen": 9177680, "step": 9150 }, { "epoch": 4.854188759278897, "grad_norm": 6.675129413604736, "learning_rate": 9.392561254554712e-06, "loss": 0.2814, "num_input_tokens_seen": 9183664, "step": 9155 }, { "epoch": 4.856839872746553, "grad_norm": 6.414597511291504, "learning_rate": 9.391455563838691e-06, "loss": 0.357, "num_input_tokens_seen": 9189616, "step": 9160 }, { "epoch": 4.85949098621421, "grad_norm": 5.304319381713867, "learning_rate": 9.390348932923528e-06, "loss": 0.2382, "num_input_tokens_seen": 9194096, "step": 9165 }, { "epoch": 4.862142099681867, "grad_norm": 4.4735798835754395, "learning_rate": 9.389241362046144e-06, "loss": 0.3145, "num_input_tokens_seen": 9199312, "step": 9170 }, { "epoch": 4.864793213149523, "grad_norm": 6.714357376098633, "learning_rate": 9.388132851443671e-06, "loss": 0.4218, "num_input_tokens_seen": 9203952, "step": 9175 }, { "epoch": 4.867444326617179, "grad_norm": 2.7913975715637207, "learning_rate": 9.387023401353436e-06, "loss": 0.273, "num_input_tokens_seen": 9209328, "step": 9180 }, { "epoch": 4.870095440084835, "grad_norm": 4.082531452178955, "learning_rate": 9.385913012012972e-06, "loss": 0.2867, "num_input_tokens_seen": 9213200, "step": 9185 }, { "epoch": 4.872746553552492, "grad_norm": 3.5891997814178467, "learning_rate": 9.384801683660007e-06, "loss": 0.299, "num_input_tokens_seen": 9219120, "step": 9190 }, { "epoch": 4.875397667020149, "grad_norm": 5.082268714904785, "learning_rate": 9.383689416532478e-06, "loss": 0.3184, "num_input_tokens_seen": 9224624, "step": 9195 }, { "epoch": 4.878048780487805, "grad_norm": 4.587832450866699, "learning_rate": 9.382576210868515e-06, "loss": 0.3217, "num_input_tokens_seen": 9230448, "step": 9200 }, { "epoch": 4.8806998939554616, "grad_norm": 3.0587871074676514, "learning_rate": 9.381462066906452e-06, "loss": 0.2788, "num_input_tokens_seen": 9235248, "step": 9205 }, { "epoch": 4.8833510074231175, "grad_norm": 5.717060565948486, "learning_rate": 9.380346984884827e-06, "loss": 0.344, "num_input_tokens_seen": 9239632, "step": 9210 }, { "epoch": 4.886002120890774, "grad_norm": 5.791599750518799, "learning_rate": 9.379230965042372e-06, "loss": 0.3463, "num_input_tokens_seen": 9244304, "step": 9215 }, { "epoch": 4.888653234358431, "grad_norm": 3.1952571868896484, "learning_rate": 9.37811400761803e-06, "loss": 0.2285, "num_input_tokens_seen": 9249232, "step": 9220 }, { "epoch": 4.891304347826087, "grad_norm": 3.759563684463501, "learning_rate": 9.376996112850934e-06, "loss": 0.2589, "num_input_tokens_seen": 9253968, "step": 9225 }, { "epoch": 4.893955461293744, "grad_norm": 5.403943061828613, "learning_rate": 9.375877280980424e-06, "loss": 0.3095, "num_input_tokens_seen": 9258000, "step": 9230 }, { "epoch": 4.8966065747614, "grad_norm": 5.12213659286499, "learning_rate": 9.374757512246037e-06, "loss": 0.2907, "num_input_tokens_seen": 9262320, "step": 9235 }, { "epoch": 4.899257688229056, "grad_norm": 3.9457550048828125, "learning_rate": 9.373636806887515e-06, "loss": 0.3062, "num_input_tokens_seen": 9268464, "step": 9240 }, { "epoch": 4.901908801696712, "grad_norm": 3.29992938041687, "learning_rate": 9.3725151651448e-06, "loss": 0.3013, "num_input_tokens_seen": 9274512, "step": 9245 }, { "epoch": 4.904559915164369, "grad_norm": 9.506550788879395, "learning_rate": 9.371392587258028e-06, "loss": 0.387, "num_input_tokens_seen": 9280720, "step": 9250 }, { "epoch": 4.907211028632025, "grad_norm": 4.299831867218018, "learning_rate": 9.37026907346754e-06, "loss": 0.3321, "num_input_tokens_seen": 9285520, "step": 9255 }, { "epoch": 4.909862142099682, "grad_norm": 5.6325201988220215, "learning_rate": 9.369144624013882e-06, "loss": 0.3272, "num_input_tokens_seen": 9290288, "step": 9260 }, { "epoch": 4.9125132555673385, "grad_norm": 3.5628952980041504, "learning_rate": 9.368019239137792e-06, "loss": 0.2711, "num_input_tokens_seen": 9295888, "step": 9265 }, { "epoch": 4.915164369034994, "grad_norm": 4.572097301483154, "learning_rate": 9.366892919080213e-06, "loss": 0.2733, "num_input_tokens_seen": 9300720, "step": 9270 }, { "epoch": 4.917815482502651, "grad_norm": 5.827660083770752, "learning_rate": 9.365765664082286e-06, "loss": 0.3451, "num_input_tokens_seen": 9305040, "step": 9275 }, { "epoch": 4.920466595970307, "grad_norm": 5.429084300994873, "learning_rate": 9.364637474385354e-06, "loss": 0.2949, "num_input_tokens_seen": 9309424, "step": 9280 }, { "epoch": 4.923117709437964, "grad_norm": 3.0668866634368896, "learning_rate": 9.36350835023096e-06, "loss": 0.3025, "num_input_tokens_seen": 9315600, "step": 9285 }, { "epoch": 4.925768822905621, "grad_norm": 5.0848846435546875, "learning_rate": 9.36237829186085e-06, "loss": 0.3311, "num_input_tokens_seen": 9319760, "step": 9290 }, { "epoch": 4.928419936373277, "grad_norm": 4.688967704772949, "learning_rate": 9.361247299516959e-06, "loss": 0.2423, "num_input_tokens_seen": 9325328, "step": 9295 }, { "epoch": 4.931071049840933, "grad_norm": 8.606282234191895, "learning_rate": 9.360115373441434e-06, "loss": 0.3302, "num_input_tokens_seen": 9330800, "step": 9300 }, { "epoch": 4.933722163308589, "grad_norm": 2.9713001251220703, "learning_rate": 9.358982513876617e-06, "loss": 0.2857, "num_input_tokens_seen": 9336176, "step": 9305 }, { "epoch": 4.936373276776246, "grad_norm": 6.728859901428223, "learning_rate": 9.357848721065052e-06, "loss": 0.2646, "num_input_tokens_seen": 9342800, "step": 9310 }, { "epoch": 4.939024390243903, "grad_norm": 5.356739521026611, "learning_rate": 9.356713995249476e-06, "loss": 0.4345, "num_input_tokens_seen": 9347760, "step": 9315 }, { "epoch": 4.941675503711559, "grad_norm": 3.9370055198669434, "learning_rate": 9.355578336672837e-06, "loss": 0.3201, "num_input_tokens_seen": 9352816, "step": 9320 }, { "epoch": 4.9443266171792155, "grad_norm": 5.286738395690918, "learning_rate": 9.354441745578272e-06, "loss": 0.2476, "num_input_tokens_seen": 9360336, "step": 9325 }, { "epoch": 4.946977730646871, "grad_norm": 4.880620002746582, "learning_rate": 9.353304222209122e-06, "loss": 0.3141, "num_input_tokens_seen": 9365008, "step": 9330 }, { "epoch": 4.949628844114528, "grad_norm": 2.0615479946136475, "learning_rate": 9.352165766808933e-06, "loss": 0.3409, "num_input_tokens_seen": 9370640, "step": 9335 }, { "epoch": 4.952279957582185, "grad_norm": 5.398608684539795, "learning_rate": 9.35102637962144e-06, "loss": 0.3435, "num_input_tokens_seen": 9376624, "step": 9340 }, { "epoch": 4.954931071049841, "grad_norm": 3.338261127471924, "learning_rate": 9.349886060890585e-06, "loss": 0.2339, "num_input_tokens_seen": 9381552, "step": 9345 }, { "epoch": 4.957582184517498, "grad_norm": 7.291370391845703, "learning_rate": 9.348744810860506e-06, "loss": 0.3835, "num_input_tokens_seen": 9386128, "step": 9350 }, { "epoch": 4.9602332979851536, "grad_norm": 5.044161796569824, "learning_rate": 9.347602629775543e-06, "loss": 0.3657, "num_input_tokens_seen": 9391888, "step": 9355 }, { "epoch": 4.96288441145281, "grad_norm": 7.381908893585205, "learning_rate": 9.346459517880234e-06, "loss": 0.427, "num_input_tokens_seen": 9397648, "step": 9360 }, { "epoch": 4.965535524920466, "grad_norm": 3.3903841972351074, "learning_rate": 9.345315475419315e-06, "loss": 0.2391, "num_input_tokens_seen": 9402416, "step": 9365 }, { "epoch": 4.968186638388123, "grad_norm": 2.8164262771606445, "learning_rate": 9.344170502637724e-06, "loss": 0.2953, "num_input_tokens_seen": 9408336, "step": 9370 }, { "epoch": 4.97083775185578, "grad_norm": 3.5081562995910645, "learning_rate": 9.343024599780595e-06, "loss": 0.335, "num_input_tokens_seen": 9414352, "step": 9375 }, { "epoch": 4.973488865323436, "grad_norm": 3.096388578414917, "learning_rate": 9.341877767093264e-06, "loss": 0.2319, "num_input_tokens_seen": 9419344, "step": 9380 }, { "epoch": 4.9761399787910925, "grad_norm": 3.3612234592437744, "learning_rate": 9.340730004821266e-06, "loss": 0.2747, "num_input_tokens_seen": 9423952, "step": 9385 }, { "epoch": 4.978791092258748, "grad_norm": 2.7602431774139404, "learning_rate": 9.339581313210332e-06, "loss": 0.2838, "num_input_tokens_seen": 9429392, "step": 9390 }, { "epoch": 4.981442205726405, "grad_norm": 2.4103450775146484, "learning_rate": 9.338431692506392e-06, "loss": 0.225, "num_input_tokens_seen": 9434928, "step": 9395 }, { "epoch": 4.984093319194061, "grad_norm": 4.970109462738037, "learning_rate": 9.337281142955581e-06, "loss": 0.3317, "num_input_tokens_seen": 9438704, "step": 9400 }, { "epoch": 4.986744432661718, "grad_norm": 2.920301675796509, "learning_rate": 9.336129664804228e-06, "loss": 0.2056, "num_input_tokens_seen": 9443184, "step": 9405 }, { "epoch": 4.989395546129375, "grad_norm": 4.195879936218262, "learning_rate": 9.334977258298858e-06, "loss": 0.3311, "num_input_tokens_seen": 9447184, "step": 9410 }, { "epoch": 4.9920466595970305, "grad_norm": 4.228339672088623, "learning_rate": 9.333823923686202e-06, "loss": 0.3382, "num_input_tokens_seen": 9452016, "step": 9415 }, { "epoch": 4.994697773064687, "grad_norm": 5.378475666046143, "learning_rate": 9.332669661213183e-06, "loss": 0.3038, "num_input_tokens_seen": 9456752, "step": 9420 }, { "epoch": 4.997348886532343, "grad_norm": 5.923582077026367, "learning_rate": 9.331514471126927e-06, "loss": 0.2543, "num_input_tokens_seen": 9461264, "step": 9425 }, { "epoch": 5.0, "grad_norm": 11.432605743408203, "learning_rate": 9.33035835367476e-06, "loss": 0.3138, "num_input_tokens_seen": 9466216, "step": 9430 }, { "epoch": 5.002651113467657, "grad_norm": 2.2417352199554443, "learning_rate": 9.3292013091042e-06, "loss": 0.2154, "num_input_tokens_seen": 9471240, "step": 9435 }, { "epoch": 5.005302226935313, "grad_norm": 3.3894052505493164, "learning_rate": 9.328043337662964e-06, "loss": 0.2139, "num_input_tokens_seen": 9475272, "step": 9440 }, { "epoch": 5.0079533404029695, "grad_norm": 6.057399749755859, "learning_rate": 9.326884439598978e-06, "loss": 0.328, "num_input_tokens_seen": 9480104, "step": 9445 }, { "epoch": 5.010604453870625, "grad_norm": 5.888204574584961, "learning_rate": 9.325724615160356e-06, "loss": 0.3448, "num_input_tokens_seen": 9485608, "step": 9450 }, { "epoch": 5.013255567338282, "grad_norm": 2.65794038772583, "learning_rate": 9.324563864595412e-06, "loss": 0.2458, "num_input_tokens_seen": 9490280, "step": 9455 }, { "epoch": 5.015906680805939, "grad_norm": 3.40692400932312, "learning_rate": 9.323402188152662e-06, "loss": 0.3174, "num_input_tokens_seen": 9494600, "step": 9460 }, { "epoch": 5.018557794273595, "grad_norm": 3.9850094318389893, "learning_rate": 9.322239586080818e-06, "loss": 0.2443, "num_input_tokens_seen": 9499720, "step": 9465 }, { "epoch": 5.021208907741252, "grad_norm": 2.7598657608032227, "learning_rate": 9.321076058628787e-06, "loss": 0.2497, "num_input_tokens_seen": 9504680, "step": 9470 }, { "epoch": 5.0238600212089075, "grad_norm": 6.276782035827637, "learning_rate": 9.319911606045679e-06, "loss": 0.2597, "num_input_tokens_seen": 9508968, "step": 9475 }, { "epoch": 5.026511134676564, "grad_norm": 5.984258651733398, "learning_rate": 9.3187462285808e-06, "loss": 0.3316, "num_input_tokens_seen": 9513384, "step": 9480 }, { "epoch": 5.02916224814422, "grad_norm": 3.712315082550049, "learning_rate": 9.317579926483655e-06, "loss": 0.2132, "num_input_tokens_seen": 9519592, "step": 9485 }, { "epoch": 5.031813361611877, "grad_norm": 5.6908769607543945, "learning_rate": 9.316412700003946e-06, "loss": 0.2823, "num_input_tokens_seen": 9525416, "step": 9490 }, { "epoch": 5.034464475079534, "grad_norm": 4.967284679412842, "learning_rate": 9.31524454939157e-06, "loss": 0.2754, "num_input_tokens_seen": 9531432, "step": 9495 }, { "epoch": 5.03711558854719, "grad_norm": 6.585669040679932, "learning_rate": 9.314075474896631e-06, "loss": 0.3694, "num_input_tokens_seen": 9537800, "step": 9500 }, { "epoch": 5.0397667020148464, "grad_norm": 8.487334251403809, "learning_rate": 9.31290547676942e-06, "loss": 0.3269, "num_input_tokens_seen": 9543080, "step": 9505 }, { "epoch": 5.042417815482502, "grad_norm": 7.730371952056885, "learning_rate": 9.311734555260435e-06, "loss": 0.3867, "num_input_tokens_seen": 9547880, "step": 9510 }, { "epoch": 5.045068928950159, "grad_norm": 5.2885284423828125, "learning_rate": 9.310562710620361e-06, "loss": 0.1924, "num_input_tokens_seen": 9552712, "step": 9515 }, { "epoch": 5.047720042417816, "grad_norm": 2.4303956031799316, "learning_rate": 9.309389943100091e-06, "loss": 0.3102, "num_input_tokens_seen": 9557352, "step": 9520 }, { "epoch": 5.050371155885472, "grad_norm": 4.466437816619873, "learning_rate": 9.308216252950713e-06, "loss": 0.2854, "num_input_tokens_seen": 9562600, "step": 9525 }, { "epoch": 5.053022269353129, "grad_norm": 4.726023197174072, "learning_rate": 9.307041640423507e-06, "loss": 0.2164, "num_input_tokens_seen": 9568072, "step": 9530 }, { "epoch": 5.0556733828207845, "grad_norm": 5.485479831695557, "learning_rate": 9.305866105769955e-06, "loss": 0.2661, "num_input_tokens_seen": 9573608, "step": 9535 }, { "epoch": 5.058324496288441, "grad_norm": 4.063749313354492, "learning_rate": 9.304689649241739e-06, "loss": 0.244, "num_input_tokens_seen": 9577672, "step": 9540 }, { "epoch": 5.060975609756097, "grad_norm": 2.567859411239624, "learning_rate": 9.303512271090733e-06, "loss": 0.2299, "num_input_tokens_seen": 9582728, "step": 9545 }, { "epoch": 5.063626723223754, "grad_norm": 5.301609039306641, "learning_rate": 9.30233397156901e-06, "loss": 0.3563, "num_input_tokens_seen": 9588232, "step": 9550 }, { "epoch": 5.066277836691411, "grad_norm": 7.475751876831055, "learning_rate": 9.301154750928842e-06, "loss": 0.2798, "num_input_tokens_seen": 9594312, "step": 9555 }, { "epoch": 5.068928950159067, "grad_norm": 5.511723518371582, "learning_rate": 9.299974609422697e-06, "loss": 0.2065, "num_input_tokens_seen": 9599784, "step": 9560 }, { "epoch": 5.071580063626723, "grad_norm": 5.435217380523682, "learning_rate": 9.29879354730324e-06, "loss": 0.1711, "num_input_tokens_seen": 9604680, "step": 9565 }, { "epoch": 5.074231177094379, "grad_norm": 1.7582991123199463, "learning_rate": 9.297611564823334e-06, "loss": 0.1637, "num_input_tokens_seen": 9609704, "step": 9570 }, { "epoch": 5.076882290562036, "grad_norm": 12.362778663635254, "learning_rate": 9.296428662236036e-06, "loss": 0.2468, "num_input_tokens_seen": 9613864, "step": 9575 }, { "epoch": 5.079533404029693, "grad_norm": 8.832904815673828, "learning_rate": 9.295244839794606e-06, "loss": 0.7952, "num_input_tokens_seen": 9618312, "step": 9580 }, { "epoch": 5.082184517497349, "grad_norm": 6.338800430297852, "learning_rate": 9.294060097752495e-06, "loss": 0.4608, "num_input_tokens_seen": 9623720, "step": 9585 }, { "epoch": 5.084835630965006, "grad_norm": 4.428074836730957, "learning_rate": 9.292874436363352e-06, "loss": 0.324, "num_input_tokens_seen": 9628648, "step": 9590 }, { "epoch": 5.0874867444326615, "grad_norm": 3.2835581302642822, "learning_rate": 9.291687855881027e-06, "loss": 0.1982, "num_input_tokens_seen": 9633608, "step": 9595 }, { "epoch": 5.090137857900318, "grad_norm": 4.536332607269287, "learning_rate": 9.290500356559561e-06, "loss": 0.283, "num_input_tokens_seen": 9638088, "step": 9600 }, { "epoch": 5.092788971367974, "grad_norm": 5.893910884857178, "learning_rate": 9.289311938653197e-06, "loss": 0.2537, "num_input_tokens_seen": 9642984, "step": 9605 }, { "epoch": 5.095440084835631, "grad_norm": 3.2175910472869873, "learning_rate": 9.28812260241637e-06, "loss": 0.2027, "num_input_tokens_seen": 9647752, "step": 9610 }, { "epoch": 5.098091198303288, "grad_norm": 3.240720748901367, "learning_rate": 9.286932348103716e-06, "loss": 0.2502, "num_input_tokens_seen": 9652680, "step": 9615 }, { "epoch": 5.100742311770944, "grad_norm": 2.3177664279937744, "learning_rate": 9.285741175970062e-06, "loss": 0.2785, "num_input_tokens_seen": 9657512, "step": 9620 }, { "epoch": 5.1033934252386, "grad_norm": 4.438521385192871, "learning_rate": 9.284549086270437e-06, "loss": 0.2126, "num_input_tokens_seen": 9663144, "step": 9625 }, { "epoch": 5.106044538706256, "grad_norm": 2.867091655731201, "learning_rate": 9.283356079260066e-06, "loss": 0.3407, "num_input_tokens_seen": 9668296, "step": 9630 }, { "epoch": 5.108695652173913, "grad_norm": 4.206178188323975, "learning_rate": 9.282162155194368e-06, "loss": 0.2257, "num_input_tokens_seen": 9673736, "step": 9635 }, { "epoch": 5.11134676564157, "grad_norm": 6.766542911529541, "learning_rate": 9.280967314328954e-06, "loss": 0.3415, "num_input_tokens_seen": 9678312, "step": 9640 }, { "epoch": 5.113997879109226, "grad_norm": 6.581257343292236, "learning_rate": 9.279771556919643e-06, "loss": 0.2663, "num_input_tokens_seen": 9684872, "step": 9645 }, { "epoch": 5.1166489925768825, "grad_norm": 5.032341480255127, "learning_rate": 9.27857488322244e-06, "loss": 0.3707, "num_input_tokens_seen": 9690312, "step": 9650 }, { "epoch": 5.1193001060445384, "grad_norm": 3.0336310863494873, "learning_rate": 9.277377293493551e-06, "loss": 0.1983, "num_input_tokens_seen": 9694728, "step": 9655 }, { "epoch": 5.121951219512195, "grad_norm": 4.858823299407959, "learning_rate": 9.276178787989373e-06, "loss": 0.3341, "num_input_tokens_seen": 9699432, "step": 9660 }, { "epoch": 5.124602332979851, "grad_norm": 7.745334148406982, "learning_rate": 9.27497936696651e-06, "loss": 0.3932, "num_input_tokens_seen": 9704680, "step": 9665 }, { "epoch": 5.127253446447508, "grad_norm": 3.3031628131866455, "learning_rate": 9.27377903068175e-06, "loss": 0.2105, "num_input_tokens_seen": 9709320, "step": 9670 }, { "epoch": 5.129904559915165, "grad_norm": 7.024253845214844, "learning_rate": 9.272577779392083e-06, "loss": 0.5548, "num_input_tokens_seen": 9714408, "step": 9675 }, { "epoch": 5.132555673382821, "grad_norm": 8.263984680175781, "learning_rate": 9.271375613354691e-06, "loss": 0.2829, "num_input_tokens_seen": 9720072, "step": 9680 }, { "epoch": 5.135206786850477, "grad_norm": 5.480541706085205, "learning_rate": 9.27017253282696e-06, "loss": 0.3996, "num_input_tokens_seen": 9727016, "step": 9685 }, { "epoch": 5.137857900318133, "grad_norm": 3.129204750061035, "learning_rate": 9.268968538066463e-06, "loss": 0.2384, "num_input_tokens_seen": 9731464, "step": 9690 }, { "epoch": 5.14050901378579, "grad_norm": 3.416867256164551, "learning_rate": 9.267763629330972e-06, "loss": 0.2648, "num_input_tokens_seen": 9735880, "step": 9695 }, { "epoch": 5.143160127253447, "grad_norm": 1.9767630100250244, "learning_rate": 9.266557806878459e-06, "loss": 0.3967, "num_input_tokens_seen": 9740328, "step": 9700 }, { "epoch": 5.145811240721103, "grad_norm": 6.604567527770996, "learning_rate": 9.265351070967079e-06, "loss": 0.2776, "num_input_tokens_seen": 9745544, "step": 9705 }, { "epoch": 5.1484623541887595, "grad_norm": 4.680464744567871, "learning_rate": 9.264143421855199e-06, "loss": 0.2462, "num_input_tokens_seen": 9749832, "step": 9710 }, { "epoch": 5.151113467656415, "grad_norm": 4.858819961547852, "learning_rate": 9.26293485980137e-06, "loss": 0.1974, "num_input_tokens_seen": 9754376, "step": 9715 }, { "epoch": 5.153764581124072, "grad_norm": 8.72851848602295, "learning_rate": 9.261725385064342e-06, "loss": 0.3712, "num_input_tokens_seen": 9758856, "step": 9720 }, { "epoch": 5.156415694591728, "grad_norm": 2.7306582927703857, "learning_rate": 9.260514997903061e-06, "loss": 0.2672, "num_input_tokens_seen": 9763368, "step": 9725 }, { "epoch": 5.159066808059385, "grad_norm": 6.934466361999512, "learning_rate": 9.259303698576669e-06, "loss": 0.2581, "num_input_tokens_seen": 9768168, "step": 9730 }, { "epoch": 5.161717921527042, "grad_norm": 7.223222732543945, "learning_rate": 9.258091487344498e-06, "loss": 0.2636, "num_input_tokens_seen": 9773512, "step": 9735 }, { "epoch": 5.164369034994698, "grad_norm": 6.549062728881836, "learning_rate": 9.256878364466084e-06, "loss": 0.402, "num_input_tokens_seen": 9777704, "step": 9740 }, { "epoch": 5.167020148462354, "grad_norm": 6.568422794342041, "learning_rate": 9.255664330201151e-06, "loss": 0.3101, "num_input_tokens_seen": 9781832, "step": 9745 }, { "epoch": 5.16967126193001, "grad_norm": 5.505851745605469, "learning_rate": 9.254449384809619e-06, "loss": 0.2971, "num_input_tokens_seen": 9787432, "step": 9750 }, { "epoch": 5.172322375397667, "grad_norm": 2.867842197418213, "learning_rate": 9.253233528551608e-06, "loss": 0.2433, "num_input_tokens_seen": 9792040, "step": 9755 }, { "epoch": 5.174973488865324, "grad_norm": 5.1266865730285645, "learning_rate": 9.252016761687428e-06, "loss": 0.2767, "num_input_tokens_seen": 9796424, "step": 9760 }, { "epoch": 5.17762460233298, "grad_norm": 6.7439093589782715, "learning_rate": 9.250799084477586e-06, "loss": 0.271, "num_input_tokens_seen": 9802216, "step": 9765 }, { "epoch": 5.1802757158006365, "grad_norm": 4.2110137939453125, "learning_rate": 9.249580497182784e-06, "loss": 0.4075, "num_input_tokens_seen": 9807336, "step": 9770 }, { "epoch": 5.182926829268292, "grad_norm": 3.15395450592041, "learning_rate": 9.248361000063916e-06, "loss": 0.19, "num_input_tokens_seen": 9812488, "step": 9775 }, { "epoch": 5.185577942735949, "grad_norm": 4.741170406341553, "learning_rate": 9.247140593382076e-06, "loss": 0.3292, "num_input_tokens_seen": 9817288, "step": 9780 }, { "epoch": 5.188229056203605, "grad_norm": 5.887520790100098, "learning_rate": 9.245919277398548e-06, "loss": 0.4144, "num_input_tokens_seen": 9822696, "step": 9785 }, { "epoch": 5.190880169671262, "grad_norm": 7.374669551849365, "learning_rate": 9.244697052374812e-06, "loss": 0.2475, "num_input_tokens_seen": 9827528, "step": 9790 }, { "epoch": 5.193531283138919, "grad_norm": 5.516624927520752, "learning_rate": 9.243473918572545e-06, "loss": 0.2113, "num_input_tokens_seen": 9833096, "step": 9795 }, { "epoch": 5.1961823966065745, "grad_norm": 4.545604228973389, "learning_rate": 9.242249876253617e-06, "loss": 0.3267, "num_input_tokens_seen": 9838088, "step": 9800 }, { "epoch": 5.198833510074231, "grad_norm": 7.250219821929932, "learning_rate": 9.241024925680091e-06, "loss": 0.3523, "num_input_tokens_seen": 9843560, "step": 9805 }, { "epoch": 5.201484623541887, "grad_norm": 5.344252586364746, "learning_rate": 9.239799067114225e-06, "loss": 0.3108, "num_input_tokens_seen": 9848808, "step": 9810 }, { "epoch": 5.204135737009544, "grad_norm": 8.987067222595215, "learning_rate": 9.238572300818474e-06, "loss": 0.3091, "num_input_tokens_seen": 9853480, "step": 9815 }, { "epoch": 5.206786850477201, "grad_norm": 4.593780994415283, "learning_rate": 9.237344627055486e-06, "loss": 0.2176, "num_input_tokens_seen": 9858984, "step": 9820 }, { "epoch": 5.209437963944857, "grad_norm": 3.940653085708618, "learning_rate": 9.2361160460881e-06, "loss": 0.3259, "num_input_tokens_seen": 9864200, "step": 9825 }, { "epoch": 5.2120890774125135, "grad_norm": 2.7724313735961914, "learning_rate": 9.234886558179352e-06, "loss": 0.2805, "num_input_tokens_seen": 9869640, "step": 9830 }, { "epoch": 5.214740190880169, "grad_norm": 3.2595717906951904, "learning_rate": 9.233656163592476e-06, "loss": 0.3467, "num_input_tokens_seen": 9874536, "step": 9835 }, { "epoch": 5.217391304347826, "grad_norm": 4.339427471160889, "learning_rate": 9.232424862590892e-06, "loss": 0.2135, "num_input_tokens_seen": 9879624, "step": 9840 }, { "epoch": 5.220042417815483, "grad_norm": 3.8336212635040283, "learning_rate": 9.231192655438222e-06, "loss": 0.271, "num_input_tokens_seen": 9884008, "step": 9845 }, { "epoch": 5.222693531283139, "grad_norm": 2.003706932067871, "learning_rate": 9.229959542398275e-06, "loss": 0.2794, "num_input_tokens_seen": 9888040, "step": 9850 }, { "epoch": 5.225344644750796, "grad_norm": 5.492460250854492, "learning_rate": 9.228725523735058e-06, "loss": 0.3363, "num_input_tokens_seen": 9892936, "step": 9855 }, { "epoch": 5.2279957582184515, "grad_norm": 6.805123805999756, "learning_rate": 9.227490599712771e-06, "loss": 0.3244, "num_input_tokens_seen": 9897832, "step": 9860 }, { "epoch": 5.230646871686108, "grad_norm": 10.948284149169922, "learning_rate": 9.226254770595812e-06, "loss": 0.3906, "num_input_tokens_seen": 9902920, "step": 9865 }, { "epoch": 5.233297985153764, "grad_norm": 3.680999517440796, "learning_rate": 9.225018036648762e-06, "loss": 0.2605, "num_input_tokens_seen": 9907656, "step": 9870 }, { "epoch": 5.235949098621421, "grad_norm": 5.168023109436035, "learning_rate": 9.223780398136409e-06, "loss": 0.2814, "num_input_tokens_seen": 9911112, "step": 9875 }, { "epoch": 5.238600212089078, "grad_norm": 3.4594109058380127, "learning_rate": 9.222541855323725e-06, "loss": 0.3054, "num_input_tokens_seen": 9916296, "step": 9880 }, { "epoch": 5.241251325556734, "grad_norm": 4.452883720397949, "learning_rate": 9.221302408475879e-06, "loss": 0.2174, "num_input_tokens_seen": 9921096, "step": 9885 }, { "epoch": 5.2439024390243905, "grad_norm": 6.026871681213379, "learning_rate": 9.220062057858232e-06, "loss": 0.3416, "num_input_tokens_seen": 9926792, "step": 9890 }, { "epoch": 5.246553552492046, "grad_norm": 8.165335655212402, "learning_rate": 9.218820803736342e-06, "loss": 0.3255, "num_input_tokens_seen": 9931784, "step": 9895 }, { "epoch": 5.249204665959703, "grad_norm": 5.949963569641113, "learning_rate": 9.217578646375958e-06, "loss": 0.3166, "num_input_tokens_seen": 9935752, "step": 9900 }, { "epoch": 5.251855779427359, "grad_norm": 7.455275535583496, "learning_rate": 9.216335586043022e-06, "loss": 0.2983, "num_input_tokens_seen": 9940616, "step": 9905 }, { "epoch": 5.254506892895016, "grad_norm": 2.7030224800109863, "learning_rate": 9.21509162300367e-06, "loss": 0.2859, "num_input_tokens_seen": 9945576, "step": 9910 }, { "epoch": 5.257158006362673, "grad_norm": 4.483240604400635, "learning_rate": 9.21384675752423e-06, "loss": 0.2329, "num_input_tokens_seen": 9950984, "step": 9915 }, { "epoch": 5.2598091198303285, "grad_norm": 5.7833075523376465, "learning_rate": 9.212600989871227e-06, "loss": 0.2454, "num_input_tokens_seen": 9956040, "step": 9920 }, { "epoch": 5.262460233297985, "grad_norm": 4.668388843536377, "learning_rate": 9.211354320311374e-06, "loss": 0.2736, "num_input_tokens_seen": 9961736, "step": 9925 }, { "epoch": 5.265111346765641, "grad_norm": 4.748852252960205, "learning_rate": 9.210106749111583e-06, "loss": 0.2429, "num_input_tokens_seen": 9966888, "step": 9930 }, { "epoch": 5.267762460233298, "grad_norm": 3.6199092864990234, "learning_rate": 9.208858276538952e-06, "loss": 0.1805, "num_input_tokens_seen": 9971624, "step": 9935 }, { "epoch": 5.270413573700955, "grad_norm": 5.7965593338012695, "learning_rate": 9.207608902860779e-06, "loss": 0.4779, "num_input_tokens_seen": 9976520, "step": 9940 }, { "epoch": 5.273064687168611, "grad_norm": 5.7311787605285645, "learning_rate": 9.20635862834455e-06, "loss": 0.269, "num_input_tokens_seen": 9981096, "step": 9945 }, { "epoch": 5.275715800636267, "grad_norm": 6.3957295417785645, "learning_rate": 9.205107453257944e-06, "loss": 0.3141, "num_input_tokens_seen": 9986216, "step": 9950 }, { "epoch": 5.278366914103923, "grad_norm": 4.906252861022949, "learning_rate": 9.203855377868837e-06, "loss": 0.2578, "num_input_tokens_seen": 9991304, "step": 9955 }, { "epoch": 5.28101802757158, "grad_norm": 6.362718105316162, "learning_rate": 9.202602402445294e-06, "loss": 0.2505, "num_input_tokens_seen": 9999048, "step": 9960 }, { "epoch": 5.283669141039237, "grad_norm": 2.7646994590759277, "learning_rate": 9.201348527255573e-06, "loss": 0.2302, "num_input_tokens_seen": 10003816, "step": 9965 }, { "epoch": 5.286320254506893, "grad_norm": 3.8482909202575684, "learning_rate": 9.200093752568127e-06, "loss": 0.3502, "num_input_tokens_seen": 10010056, "step": 9970 }, { "epoch": 5.28897136797455, "grad_norm": 10.598274230957031, "learning_rate": 9.198838078651598e-06, "loss": 0.4219, "num_input_tokens_seen": 10015144, "step": 9975 }, { "epoch": 5.2916224814422055, "grad_norm": 3.1136314868927, "learning_rate": 9.197581505774824e-06, "loss": 0.1492, "num_input_tokens_seen": 10019688, "step": 9980 }, { "epoch": 5.294273594909862, "grad_norm": 8.860387802124023, "learning_rate": 9.196324034206834e-06, "loss": 0.4294, "num_input_tokens_seen": 10023400, "step": 9985 }, { "epoch": 5.296924708377518, "grad_norm": 4.379262447357178, "learning_rate": 9.195065664216849e-06, "loss": 0.2099, "num_input_tokens_seen": 10027656, "step": 9990 }, { "epoch": 5.299575821845175, "grad_norm": 10.945230484008789, "learning_rate": 9.193806396074283e-06, "loss": 0.3313, "num_input_tokens_seen": 10032488, "step": 9995 }, { "epoch": 5.302226935312832, "grad_norm": 3.071063756942749, "learning_rate": 9.19254623004874e-06, "loss": 0.2204, "num_input_tokens_seen": 10037640, "step": 10000 }, { "epoch": 5.304878048780488, "grad_norm": 3.578153133392334, "learning_rate": 9.191285166410023e-06, "loss": 0.2095, "num_input_tokens_seen": 10043496, "step": 10005 }, { "epoch": 5.307529162248144, "grad_norm": 1.8351343870162964, "learning_rate": 9.190023205428117e-06, "loss": 0.3274, "num_input_tokens_seen": 10047624, "step": 10010 }, { "epoch": 5.3101802757158, "grad_norm": 4.973803520202637, "learning_rate": 9.188760347373207e-06, "loss": 0.2798, "num_input_tokens_seen": 10052200, "step": 10015 }, { "epoch": 5.312831389183457, "grad_norm": 6.504538536071777, "learning_rate": 9.18749659251567e-06, "loss": 0.3322, "num_input_tokens_seen": 10057352, "step": 10020 }, { "epoch": 5.315482502651114, "grad_norm": 10.386838912963867, "learning_rate": 9.186231941126066e-06, "loss": 0.3239, "num_input_tokens_seen": 10061032, "step": 10025 }, { "epoch": 5.31813361611877, "grad_norm": 5.374964714050293, "learning_rate": 9.18496639347516e-06, "loss": 0.2252, "num_input_tokens_seen": 10065512, "step": 10030 }, { "epoch": 5.320784729586427, "grad_norm": 9.256746292114258, "learning_rate": 9.1836999498339e-06, "loss": 0.3528, "num_input_tokens_seen": 10070440, "step": 10035 }, { "epoch": 5.3234358430540825, "grad_norm": 4.572093963623047, "learning_rate": 9.182432610473428e-06, "loss": 0.2385, "num_input_tokens_seen": 10074728, "step": 10040 }, { "epoch": 5.326086956521739, "grad_norm": 9.503070831298828, "learning_rate": 9.18116437566508e-06, "loss": 0.2964, "num_input_tokens_seen": 10079976, "step": 10045 }, { "epoch": 5.328738069989395, "grad_norm": 5.3912811279296875, "learning_rate": 9.17989524568038e-06, "loss": 0.3752, "num_input_tokens_seen": 10084872, "step": 10050 }, { "epoch": 5.331389183457052, "grad_norm": 8.019508361816406, "learning_rate": 9.178625220791042e-06, "loss": 0.2955, "num_input_tokens_seen": 10090792, "step": 10055 }, { "epoch": 5.334040296924709, "grad_norm": 2.338124990463257, "learning_rate": 9.177354301268981e-06, "loss": 0.1968, "num_input_tokens_seen": 10096136, "step": 10060 }, { "epoch": 5.336691410392365, "grad_norm": 7.2916483879089355, "learning_rate": 9.176082487386296e-06, "loss": 0.2838, "num_input_tokens_seen": 10100872, "step": 10065 }, { "epoch": 5.339342523860021, "grad_norm": 11.551862716674805, "learning_rate": 9.174809779415276e-06, "loss": 0.363, "num_input_tokens_seen": 10105224, "step": 10070 }, { "epoch": 5.341993637327677, "grad_norm": 2.4084460735321045, "learning_rate": 9.17353617762841e-06, "loss": 0.3079, "num_input_tokens_seen": 10109800, "step": 10075 }, { "epoch": 5.344644750795334, "grad_norm": 5.837557792663574, "learning_rate": 9.172261682298366e-06, "loss": 0.2602, "num_input_tokens_seen": 10114536, "step": 10080 }, { "epoch": 5.347295864262991, "grad_norm": 9.521281242370605, "learning_rate": 9.170986293698016e-06, "loss": 0.3471, "num_input_tokens_seen": 10119752, "step": 10085 }, { "epoch": 5.349946977730647, "grad_norm": 2.9691758155822754, "learning_rate": 9.169710012100413e-06, "loss": 0.2658, "num_input_tokens_seen": 10124488, "step": 10090 }, { "epoch": 5.3525980911983035, "grad_norm": 4.926957130432129, "learning_rate": 9.168432837778808e-06, "loss": 0.2441, "num_input_tokens_seen": 10129928, "step": 10095 }, { "epoch": 5.355249204665959, "grad_norm": 3.260338306427002, "learning_rate": 9.167154771006641e-06, "loss": 0.2526, "num_input_tokens_seen": 10135016, "step": 10100 }, { "epoch": 5.357900318133616, "grad_norm": 6.1976542472839355, "learning_rate": 9.16587581205754e-06, "loss": 0.324, "num_input_tokens_seen": 10139976, "step": 10105 }, { "epoch": 5.360551431601272, "grad_norm": 7.065187931060791, "learning_rate": 9.164595961205331e-06, "loss": 0.2164, "num_input_tokens_seen": 10145768, "step": 10110 }, { "epoch": 5.363202545068929, "grad_norm": 3.440006971359253, "learning_rate": 9.163315218724023e-06, "loss": 0.2956, "num_input_tokens_seen": 10150056, "step": 10115 }, { "epoch": 5.365853658536586, "grad_norm": 3.658604383468628, "learning_rate": 9.162033584887822e-06, "loss": 0.2464, "num_input_tokens_seen": 10155144, "step": 10120 }, { "epoch": 5.368504772004242, "grad_norm": 5.297077178955078, "learning_rate": 9.16075105997112e-06, "loss": 0.2126, "num_input_tokens_seen": 10159944, "step": 10125 }, { "epoch": 5.371155885471898, "grad_norm": 6.536257743835449, "learning_rate": 9.159467644248503e-06, "loss": 0.3244, "num_input_tokens_seen": 10165896, "step": 10130 }, { "epoch": 5.373806998939554, "grad_norm": 3.048739194869995, "learning_rate": 9.15818333799475e-06, "loss": 0.24, "num_input_tokens_seen": 10170792, "step": 10135 }, { "epoch": 5.376458112407211, "grad_norm": 8.399858474731445, "learning_rate": 9.156898141484824e-06, "loss": 0.2395, "num_input_tokens_seen": 10175688, "step": 10140 }, { "epoch": 5.379109225874868, "grad_norm": 10.251851081848145, "learning_rate": 9.155612054993884e-06, "loss": 0.3498, "num_input_tokens_seen": 10180488, "step": 10145 }, { "epoch": 5.381760339342524, "grad_norm": 5.39436674118042, "learning_rate": 9.154325078797276e-06, "loss": 0.3744, "num_input_tokens_seen": 10184264, "step": 10150 }, { "epoch": 5.3844114528101805, "grad_norm": 6.198753356933594, "learning_rate": 9.15303721317054e-06, "loss": 0.3187, "num_input_tokens_seen": 10189288, "step": 10155 }, { "epoch": 5.387062566277836, "grad_norm": 6.708630561828613, "learning_rate": 9.151748458389405e-06, "loss": 0.2101, "num_input_tokens_seen": 10194824, "step": 10160 }, { "epoch": 5.389713679745493, "grad_norm": 8.03989028930664, "learning_rate": 9.150458814729787e-06, "loss": 0.3019, "num_input_tokens_seen": 10200136, "step": 10165 }, { "epoch": 5.392364793213149, "grad_norm": 4.394494533538818, "learning_rate": 9.149168282467798e-06, "loss": 0.3136, "num_input_tokens_seen": 10205384, "step": 10170 }, { "epoch": 5.395015906680806, "grad_norm": 4.917161464691162, "learning_rate": 9.14787686187974e-06, "loss": 0.3505, "num_input_tokens_seen": 10210792, "step": 10175 }, { "epoch": 5.397667020148463, "grad_norm": 8.09502124786377, "learning_rate": 9.146584553242095e-06, "loss": 0.2857, "num_input_tokens_seen": 10215752, "step": 10180 }, { "epoch": 5.400318133616119, "grad_norm": 6.492441654205322, "learning_rate": 9.14529135683155e-06, "loss": 0.3344, "num_input_tokens_seen": 10219912, "step": 10185 }, { "epoch": 5.402969247083775, "grad_norm": 6.455022811889648, "learning_rate": 9.143997272924974e-06, "loss": 0.2108, "num_input_tokens_seen": 10225000, "step": 10190 }, { "epoch": 5.405620360551431, "grad_norm": 5.122284412384033, "learning_rate": 9.142702301799425e-06, "loss": 0.3469, "num_input_tokens_seen": 10230088, "step": 10195 }, { "epoch": 5.408271474019088, "grad_norm": 4.509887218475342, "learning_rate": 9.141406443732152e-06, "loss": 0.2779, "num_input_tokens_seen": 10235208, "step": 10200 }, { "epoch": 5.410922587486745, "grad_norm": 3.9747889041900635, "learning_rate": 9.140109699000596e-06, "loss": 0.298, "num_input_tokens_seen": 10239272, "step": 10205 }, { "epoch": 5.413573700954401, "grad_norm": 4.883535861968994, "learning_rate": 9.138812067882388e-06, "loss": 0.3803, "num_input_tokens_seen": 10244264, "step": 10210 }, { "epoch": 5.4162248144220575, "grad_norm": 5.483484745025635, "learning_rate": 9.137513550655347e-06, "loss": 0.4069, "num_input_tokens_seen": 10248776, "step": 10215 }, { "epoch": 5.418875927889713, "grad_norm": 3.2393012046813965, "learning_rate": 9.136214147597482e-06, "loss": 0.2175, "num_input_tokens_seen": 10255080, "step": 10220 }, { "epoch": 5.42152704135737, "grad_norm": 5.60604190826416, "learning_rate": 9.134913858986991e-06, "loss": 0.2245, "num_input_tokens_seen": 10259976, "step": 10225 }, { "epoch": 5.424178154825027, "grad_norm": 6.586825847625732, "learning_rate": 9.133612685102262e-06, "loss": 0.3253, "num_input_tokens_seen": 10264744, "step": 10230 }, { "epoch": 5.426829268292683, "grad_norm": 2.3437819480895996, "learning_rate": 9.132310626221874e-06, "loss": 0.2208, "num_input_tokens_seen": 10269992, "step": 10235 }, { "epoch": 5.42948038176034, "grad_norm": 3.684871196746826, "learning_rate": 9.131007682624595e-06, "loss": 0.3071, "num_input_tokens_seen": 10278312, "step": 10240 }, { "epoch": 5.4321314952279955, "grad_norm": 3.6775715351104736, "learning_rate": 9.129703854589379e-06, "loss": 0.2876, "num_input_tokens_seen": 10282856, "step": 10245 }, { "epoch": 5.434782608695652, "grad_norm": 5.537198066711426, "learning_rate": 9.128399142395376e-06, "loss": 0.2608, "num_input_tokens_seen": 10289768, "step": 10250 }, { "epoch": 5.437433722163308, "grad_norm": 4.685754776000977, "learning_rate": 9.127093546321918e-06, "loss": 0.2701, "num_input_tokens_seen": 10294984, "step": 10255 }, { "epoch": 5.440084835630965, "grad_norm": 3.4235615730285645, "learning_rate": 9.125787066648533e-06, "loss": 0.2624, "num_input_tokens_seen": 10302312, "step": 10260 }, { "epoch": 5.442735949098622, "grad_norm": 4.300668239593506, "learning_rate": 9.124479703654931e-06, "loss": 0.2314, "num_input_tokens_seen": 10306536, "step": 10265 }, { "epoch": 5.445387062566278, "grad_norm": 5.60637903213501, "learning_rate": 9.123171457621019e-06, "loss": 0.3503, "num_input_tokens_seen": 10313064, "step": 10270 }, { "epoch": 5.4480381760339345, "grad_norm": 4.5299072265625, "learning_rate": 9.121862328826885e-06, "loss": 0.3005, "num_input_tokens_seen": 10318152, "step": 10275 }, { "epoch": 5.45068928950159, "grad_norm": 3.4981398582458496, "learning_rate": 9.120552317552814e-06, "loss": 0.1844, "num_input_tokens_seen": 10322248, "step": 10280 }, { "epoch": 5.453340402969247, "grad_norm": 11.574931144714355, "learning_rate": 9.119241424079274e-06, "loss": 0.3367, "num_input_tokens_seen": 10326856, "step": 10285 }, { "epoch": 5.455991516436903, "grad_norm": 2.573418140411377, "learning_rate": 9.117929648686924e-06, "loss": 0.1614, "num_input_tokens_seen": 10332584, "step": 10290 }, { "epoch": 5.45864262990456, "grad_norm": 6.809349536895752, "learning_rate": 9.116616991656612e-06, "loss": 0.2763, "num_input_tokens_seen": 10337928, "step": 10295 }, { "epoch": 5.461293743372217, "grad_norm": 8.954740524291992, "learning_rate": 9.115303453269374e-06, "loss": 0.3937, "num_input_tokens_seen": 10342312, "step": 10300 }, { "epoch": 5.4639448568398725, "grad_norm": 5.119919300079346, "learning_rate": 9.113989033806434e-06, "loss": 0.266, "num_input_tokens_seen": 10347784, "step": 10305 }, { "epoch": 5.466595970307529, "grad_norm": 5.364945888519287, "learning_rate": 9.112673733549209e-06, "loss": 0.2525, "num_input_tokens_seen": 10352360, "step": 10310 }, { "epoch": 5.469247083775185, "grad_norm": 9.706220626831055, "learning_rate": 9.111357552779298e-06, "loss": 0.3558, "num_input_tokens_seen": 10356904, "step": 10315 }, { "epoch": 5.471898197242842, "grad_norm": 6.911895751953125, "learning_rate": 9.110040491778495e-06, "loss": 0.2687, "num_input_tokens_seen": 10362056, "step": 10320 }, { "epoch": 5.474549310710499, "grad_norm": 5.907658576965332, "learning_rate": 9.108722550828776e-06, "loss": 0.3519, "num_input_tokens_seen": 10368072, "step": 10325 }, { "epoch": 5.477200424178155, "grad_norm": 6.0319294929504395, "learning_rate": 9.107403730212311e-06, "loss": 0.3006, "num_input_tokens_seen": 10373032, "step": 10330 }, { "epoch": 5.4798515376458115, "grad_norm": 4.615431308746338, "learning_rate": 9.106084030211455e-06, "loss": 0.1881, "num_input_tokens_seen": 10378568, "step": 10335 }, { "epoch": 5.482502651113467, "grad_norm": 7.671238422393799, "learning_rate": 9.104763451108753e-06, "loss": 0.3159, "num_input_tokens_seen": 10383176, "step": 10340 }, { "epoch": 5.485153764581124, "grad_norm": 4.04014778137207, "learning_rate": 9.103441993186936e-06, "loss": 0.3511, "num_input_tokens_seen": 10387496, "step": 10345 }, { "epoch": 5.487804878048781, "grad_norm": 4.158318042755127, "learning_rate": 9.102119656728928e-06, "loss": 0.3161, "num_input_tokens_seen": 10392520, "step": 10350 }, { "epoch": 5.490455991516437, "grad_norm": 4.334410667419434, "learning_rate": 9.100796442017835e-06, "loss": 0.3158, "num_input_tokens_seen": 10398248, "step": 10355 }, { "epoch": 5.493107104984094, "grad_norm": 4.121415138244629, "learning_rate": 9.099472349336953e-06, "loss": 0.3365, "num_input_tokens_seen": 10402312, "step": 10360 }, { "epoch": 5.4957582184517495, "grad_norm": 5.472598075866699, "learning_rate": 9.098147378969771e-06, "loss": 0.3536, "num_input_tokens_seen": 10407656, "step": 10365 }, { "epoch": 5.498409331919406, "grad_norm": 4.20405912399292, "learning_rate": 9.096821531199957e-06, "loss": 0.2334, "num_input_tokens_seen": 10412872, "step": 10370 }, { "epoch": 5.501060445387062, "grad_norm": 5.955725193023682, "learning_rate": 9.095494806311374e-06, "loss": 0.2394, "num_input_tokens_seen": 10417320, "step": 10375 }, { "epoch": 5.503711558854719, "grad_norm": 3.814114809036255, "learning_rate": 9.094167204588069e-06, "loss": 0.2324, "num_input_tokens_seen": 10422792, "step": 10380 }, { "epoch": 5.506362672322376, "grad_norm": 4.150354862213135, "learning_rate": 9.092838726314279e-06, "loss": 0.2566, "num_input_tokens_seen": 10427784, "step": 10385 }, { "epoch": 5.509013785790032, "grad_norm": 4.620378017425537, "learning_rate": 9.091509371774428e-06, "loss": 0.2176, "num_input_tokens_seen": 10432008, "step": 10390 }, { "epoch": 5.511664899257688, "grad_norm": 4.212658405303955, "learning_rate": 9.090179141253128e-06, "loss": 0.4452, "num_input_tokens_seen": 10436296, "step": 10395 }, { "epoch": 5.514316012725344, "grad_norm": 2.494349956512451, "learning_rate": 9.088848035035175e-06, "loss": 0.1439, "num_input_tokens_seen": 10441064, "step": 10400 }, { "epoch": 5.516967126193001, "grad_norm": 6.744565963745117, "learning_rate": 9.087516053405557e-06, "loss": 0.3494, "num_input_tokens_seen": 10446344, "step": 10405 }, { "epoch": 5.519618239660657, "grad_norm": 2.6775617599487305, "learning_rate": 9.086183196649449e-06, "loss": 0.3645, "num_input_tokens_seen": 10453320, "step": 10410 }, { "epoch": 5.522269353128314, "grad_norm": 1.495701551437378, "learning_rate": 9.08484946505221e-06, "loss": 0.2839, "num_input_tokens_seen": 10458184, "step": 10415 }, { "epoch": 5.524920466595971, "grad_norm": 3.2518465518951416, "learning_rate": 9.083514858899391e-06, "loss": 0.3422, "num_input_tokens_seen": 10462440, "step": 10420 }, { "epoch": 5.5275715800636265, "grad_norm": 8.190238952636719, "learning_rate": 9.082179378476725e-06, "loss": 0.329, "num_input_tokens_seen": 10466856, "step": 10425 }, { "epoch": 5.530222693531283, "grad_norm": 4.25175666809082, "learning_rate": 9.080843024070137e-06, "loss": 0.2001, "num_input_tokens_seen": 10471528, "step": 10430 }, { "epoch": 5.532873806998939, "grad_norm": 2.891824960708618, "learning_rate": 9.079505795965734e-06, "loss": 0.2169, "num_input_tokens_seen": 10476808, "step": 10435 }, { "epoch": 5.535524920466596, "grad_norm": 4.029449462890625, "learning_rate": 9.078167694449818e-06, "loss": 0.2011, "num_input_tokens_seen": 10481992, "step": 10440 }, { "epoch": 5.538176033934253, "grad_norm": 3.8493335247039795, "learning_rate": 9.076828719808869e-06, "loss": 0.2716, "num_input_tokens_seen": 10488776, "step": 10445 }, { "epoch": 5.540827147401909, "grad_norm": 7.8847336769104, "learning_rate": 9.07548887232956e-06, "loss": 0.2687, "num_input_tokens_seen": 10494568, "step": 10450 }, { "epoch": 5.543478260869565, "grad_norm": 6.582437038421631, "learning_rate": 9.074148152298745e-06, "loss": 0.3381, "num_input_tokens_seen": 10499912, "step": 10455 }, { "epoch": 5.546129374337221, "grad_norm": 7.8921709060668945, "learning_rate": 9.072806560003474e-06, "loss": 0.2589, "num_input_tokens_seen": 10504712, "step": 10460 }, { "epoch": 5.548780487804878, "grad_norm": 4.529251575469971, "learning_rate": 9.071464095730975e-06, "loss": 0.2284, "num_input_tokens_seen": 10509544, "step": 10465 }, { "epoch": 5.551431601272535, "grad_norm": 5.277381896972656, "learning_rate": 9.070120759768668e-06, "loss": 0.2383, "num_input_tokens_seen": 10514696, "step": 10470 }, { "epoch": 5.554082714740191, "grad_norm": 8.02026653289795, "learning_rate": 9.068776552404154e-06, "loss": 0.398, "num_input_tokens_seen": 10520840, "step": 10475 }, { "epoch": 5.556733828207848, "grad_norm": 8.815077781677246, "learning_rate": 9.06743147392523e-06, "loss": 0.4907, "num_input_tokens_seen": 10525992, "step": 10480 }, { "epoch": 5.5593849416755035, "grad_norm": 4.626523494720459, "learning_rate": 9.06608552461987e-06, "loss": 0.2521, "num_input_tokens_seen": 10530696, "step": 10485 }, { "epoch": 5.56203605514316, "grad_norm": 4.182709217071533, "learning_rate": 9.06473870477624e-06, "loss": 0.3555, "num_input_tokens_seen": 10534632, "step": 10490 }, { "epoch": 5.564687168610817, "grad_norm": 3.793499231338501, "learning_rate": 9.063391014682688e-06, "loss": 0.2599, "num_input_tokens_seen": 10539976, "step": 10495 }, { "epoch": 5.567338282078473, "grad_norm": 4.5400824546813965, "learning_rate": 9.062042454627754e-06, "loss": 0.2545, "num_input_tokens_seen": 10545576, "step": 10500 }, { "epoch": 5.56998939554613, "grad_norm": 10.920870780944824, "learning_rate": 9.060693024900158e-06, "loss": 0.3798, "num_input_tokens_seen": 10550344, "step": 10505 }, { "epoch": 5.572640509013786, "grad_norm": 10.497093200683594, "learning_rate": 9.059342725788814e-06, "loss": 0.3353, "num_input_tokens_seen": 10554920, "step": 10510 }, { "epoch": 5.575291622481442, "grad_norm": 6.673221111297607, "learning_rate": 9.057991557582811e-06, "loss": 0.3994, "num_input_tokens_seen": 10559720, "step": 10515 }, { "epoch": 5.577942735949098, "grad_norm": 4.291275978088379, "learning_rate": 9.056639520571436e-06, "loss": 0.2155, "num_input_tokens_seen": 10564872, "step": 10520 }, { "epoch": 5.580593849416755, "grad_norm": 8.099980354309082, "learning_rate": 9.055286615044155e-06, "loss": 0.5111, "num_input_tokens_seen": 10569832, "step": 10525 }, { "epoch": 5.583244962884411, "grad_norm": 3.6252763271331787, "learning_rate": 9.053932841290622e-06, "loss": 0.2977, "num_input_tokens_seen": 10574216, "step": 10530 }, { "epoch": 5.585896076352068, "grad_norm": 1.9189070463180542, "learning_rate": 9.052578199600675e-06, "loss": 0.2446, "num_input_tokens_seen": 10579048, "step": 10535 }, { "epoch": 5.5885471898197245, "grad_norm": 6.575855255126953, "learning_rate": 9.05122269026434e-06, "loss": 0.2549, "num_input_tokens_seen": 10586408, "step": 10540 }, { "epoch": 5.59119830328738, "grad_norm": 4.896321773529053, "learning_rate": 9.049866313571827e-06, "loss": 0.327, "num_input_tokens_seen": 10591368, "step": 10545 }, { "epoch": 5.593849416755037, "grad_norm": 4.8834075927734375, "learning_rate": 9.048509069813534e-06, "loss": 0.2195, "num_input_tokens_seen": 10596904, "step": 10550 }, { "epoch": 5.596500530222693, "grad_norm": 6.046759128570557, "learning_rate": 9.047150959280044e-06, "loss": 0.3913, "num_input_tokens_seen": 10601224, "step": 10555 }, { "epoch": 5.59915164369035, "grad_norm": 7.5551042556762695, "learning_rate": 9.045791982262123e-06, "loss": 0.341, "num_input_tokens_seen": 10606568, "step": 10560 }, { "epoch": 5.601802757158007, "grad_norm": 6.060254096984863, "learning_rate": 9.044432139050726e-06, "loss": 0.3319, "num_input_tokens_seen": 10611144, "step": 10565 }, { "epoch": 5.604453870625663, "grad_norm": 8.528871536254883, "learning_rate": 9.04307142993699e-06, "loss": 0.3116, "num_input_tokens_seen": 10616232, "step": 10570 }, { "epoch": 5.607104984093319, "grad_norm": 6.629191875457764, "learning_rate": 9.041709855212243e-06, "loss": 0.2998, "num_input_tokens_seen": 10620680, "step": 10575 }, { "epoch": 5.609756097560975, "grad_norm": 6.743140697479248, "learning_rate": 9.040347415167992e-06, "loss": 0.2347, "num_input_tokens_seen": 10625128, "step": 10580 }, { "epoch": 5.612407211028632, "grad_norm": 4.2579450607299805, "learning_rate": 9.03898411009593e-06, "loss": 0.2162, "num_input_tokens_seen": 10629832, "step": 10585 }, { "epoch": 5.615058324496289, "grad_norm": 12.473219871520996, "learning_rate": 9.037619940287939e-06, "loss": 0.3661, "num_input_tokens_seen": 10634056, "step": 10590 }, { "epoch": 5.617709437963945, "grad_norm": 7.017132759094238, "learning_rate": 9.036254906036084e-06, "loss": 0.2936, "num_input_tokens_seen": 10638344, "step": 10595 }, { "epoch": 5.6203605514316015, "grad_norm": 7.947434425354004, "learning_rate": 9.034889007632616e-06, "loss": 0.396, "num_input_tokens_seen": 10643400, "step": 10600 }, { "epoch": 5.623011664899257, "grad_norm": 7.701476097106934, "learning_rate": 9.03352224536997e-06, "loss": 0.3221, "num_input_tokens_seen": 10647752, "step": 10605 }, { "epoch": 5.625662778366914, "grad_norm": 9.163152694702148, "learning_rate": 9.032154619540767e-06, "loss": 0.2258, "num_input_tokens_seen": 10652680, "step": 10610 }, { "epoch": 5.628313891834571, "grad_norm": 8.0149564743042, "learning_rate": 9.03078613043781e-06, "loss": 0.3296, "num_input_tokens_seen": 10658152, "step": 10615 }, { "epoch": 5.630965005302227, "grad_norm": 3.725879192352295, "learning_rate": 9.02941677835409e-06, "loss": 0.3034, "num_input_tokens_seen": 10663784, "step": 10620 }, { "epoch": 5.633616118769884, "grad_norm": 2.6894116401672363, "learning_rate": 9.028046563582782e-06, "loss": 0.278, "num_input_tokens_seen": 10669064, "step": 10625 }, { "epoch": 5.63626723223754, "grad_norm": 8.08132266998291, "learning_rate": 9.026675486417245e-06, "loss": 0.2794, "num_input_tokens_seen": 10673256, "step": 10630 }, { "epoch": 5.638918345705196, "grad_norm": 3.3605167865753174, "learning_rate": 9.025303547151022e-06, "loss": 0.3077, "num_input_tokens_seen": 10678600, "step": 10635 }, { "epoch": 5.641569459172852, "grad_norm": 4.900722026824951, "learning_rate": 9.023930746077844e-06, "loss": 0.2398, "num_input_tokens_seen": 10684040, "step": 10640 }, { "epoch": 5.644220572640509, "grad_norm": 7.269042015075684, "learning_rate": 9.02255708349162e-06, "loss": 0.4622, "num_input_tokens_seen": 10690024, "step": 10645 }, { "epoch": 5.646871686108166, "grad_norm": 8.209342002868652, "learning_rate": 9.021182559686454e-06, "loss": 0.3, "num_input_tokens_seen": 10696648, "step": 10650 }, { "epoch": 5.649522799575822, "grad_norm": 4.425014972686768, "learning_rate": 9.019807174956622e-06, "loss": 0.2757, "num_input_tokens_seen": 10701352, "step": 10655 }, { "epoch": 5.6521739130434785, "grad_norm": 4.937980651855469, "learning_rate": 9.018430929596595e-06, "loss": 0.2631, "num_input_tokens_seen": 10706632, "step": 10660 }, { "epoch": 5.654825026511134, "grad_norm": 7.202224254608154, "learning_rate": 9.01705382390102e-06, "loss": 0.2899, "num_input_tokens_seen": 10712008, "step": 10665 }, { "epoch": 5.657476139978791, "grad_norm": 3.0743954181671143, "learning_rate": 9.015675858164731e-06, "loss": 0.2145, "num_input_tokens_seen": 10716616, "step": 10670 }, { "epoch": 5.660127253446447, "grad_norm": 5.250890731811523, "learning_rate": 9.014297032682752e-06, "loss": 0.2704, "num_input_tokens_seen": 10721128, "step": 10675 }, { "epoch": 5.662778366914104, "grad_norm": 3.6988284587860107, "learning_rate": 9.01291734775028e-06, "loss": 0.2852, "num_input_tokens_seen": 10726120, "step": 10680 }, { "epoch": 5.665429480381761, "grad_norm": 6.845292568206787, "learning_rate": 9.011536803662706e-06, "loss": 0.2971, "num_input_tokens_seen": 10731048, "step": 10685 }, { "epoch": 5.6680805938494165, "grad_norm": 1.741318702697754, "learning_rate": 9.0101554007156e-06, "loss": 0.2535, "num_input_tokens_seen": 10736456, "step": 10690 }, { "epoch": 5.670731707317073, "grad_norm": 3.8482468128204346, "learning_rate": 9.008773139204715e-06, "loss": 0.3972, "num_input_tokens_seen": 10742088, "step": 10695 }, { "epoch": 5.673382820784729, "grad_norm": 5.0657501220703125, "learning_rate": 9.007390019425991e-06, "loss": 0.3159, "num_input_tokens_seen": 10748264, "step": 10700 }, { "epoch": 5.676033934252386, "grad_norm": 6.128678321838379, "learning_rate": 9.00600604167555e-06, "loss": 0.3124, "num_input_tokens_seen": 10753416, "step": 10705 }, { "epoch": 5.678685047720043, "grad_norm": 4.7414870262146, "learning_rate": 9.0046212062497e-06, "loss": 0.231, "num_input_tokens_seen": 10759432, "step": 10710 }, { "epoch": 5.681336161187699, "grad_norm": 5.420533180236816, "learning_rate": 9.003235513444926e-06, "loss": 0.4582, "num_input_tokens_seen": 10763592, "step": 10715 }, { "epoch": 5.6839872746553555, "grad_norm": 5.279153823852539, "learning_rate": 9.001848963557905e-06, "loss": 0.3342, "num_input_tokens_seen": 10769768, "step": 10720 }, { "epoch": 5.686638388123011, "grad_norm": 4.85471773147583, "learning_rate": 9.000461556885494e-06, "loss": 0.3808, "num_input_tokens_seen": 10774312, "step": 10725 }, { "epoch": 5.689289501590668, "grad_norm": 6.811558246612549, "learning_rate": 8.999073293724729e-06, "loss": 0.3265, "num_input_tokens_seen": 10780296, "step": 10730 }, { "epoch": 5.691940615058325, "grad_norm": 7.582120895385742, "learning_rate": 8.99768417437284e-06, "loss": 0.3323, "num_input_tokens_seen": 10784648, "step": 10735 }, { "epoch": 5.694591728525981, "grad_norm": 2.575112819671631, "learning_rate": 8.996294199127226e-06, "loss": 0.2386, "num_input_tokens_seen": 10789384, "step": 10740 }, { "epoch": 5.697242841993638, "grad_norm": 5.190952777862549, "learning_rate": 8.994903368285484e-06, "loss": 0.3685, "num_input_tokens_seen": 10794376, "step": 10745 }, { "epoch": 5.6998939554612935, "grad_norm": 4.448669910430908, "learning_rate": 8.993511682145383e-06, "loss": 0.3471, "num_input_tokens_seen": 10798728, "step": 10750 }, { "epoch": 5.70254506892895, "grad_norm": 4.429726600646973, "learning_rate": 8.992119141004882e-06, "loss": 0.3029, "num_input_tokens_seen": 10803752, "step": 10755 }, { "epoch": 5.705196182396606, "grad_norm": 5.679897785186768, "learning_rate": 8.99072574516212e-06, "loss": 0.2355, "num_input_tokens_seen": 10808040, "step": 10760 }, { "epoch": 5.707847295864263, "grad_norm": 4.524604320526123, "learning_rate": 8.989331494915417e-06, "loss": 0.2663, "num_input_tokens_seen": 10812456, "step": 10765 }, { "epoch": 5.71049840933192, "grad_norm": 7.882339000701904, "learning_rate": 8.987936390563281e-06, "loss": 0.4753, "num_input_tokens_seen": 10817672, "step": 10770 }, { "epoch": 5.713149522799576, "grad_norm": 6.603509902954102, "learning_rate": 8.986540432404398e-06, "loss": 0.2564, "num_input_tokens_seen": 10822376, "step": 10775 }, { "epoch": 5.7158006362672324, "grad_norm": 6.456178665161133, "learning_rate": 8.985143620737642e-06, "loss": 0.3182, "num_input_tokens_seen": 10828552, "step": 10780 }, { "epoch": 5.718451749734888, "grad_norm": 7.493152618408203, "learning_rate": 8.983745955862063e-06, "loss": 0.2431, "num_input_tokens_seen": 10835016, "step": 10785 }, { "epoch": 5.721102863202545, "grad_norm": 4.0078229904174805, "learning_rate": 8.982347438076901e-06, "loss": 0.2723, "num_input_tokens_seen": 10840136, "step": 10790 }, { "epoch": 5.723753976670201, "grad_norm": 4.099117755889893, "learning_rate": 8.980948067681572e-06, "loss": 0.2693, "num_input_tokens_seen": 10844616, "step": 10795 }, { "epoch": 5.726405090137858, "grad_norm": 6.830237865447998, "learning_rate": 8.979547844975678e-06, "loss": 0.3613, "num_input_tokens_seen": 10849928, "step": 10800 }, { "epoch": 5.729056203605515, "grad_norm": 3.5615601539611816, "learning_rate": 8.978146770259005e-06, "loss": 0.2664, "num_input_tokens_seen": 10854120, "step": 10805 }, { "epoch": 5.7317073170731705, "grad_norm": 6.846280574798584, "learning_rate": 8.976744843831517e-06, "loss": 0.3151, "num_input_tokens_seen": 10858408, "step": 10810 }, { "epoch": 5.734358430540827, "grad_norm": 4.2925028800964355, "learning_rate": 8.975342065993367e-06, "loss": 0.2393, "num_input_tokens_seen": 10863464, "step": 10815 }, { "epoch": 5.737009544008483, "grad_norm": 3.2527735233306885, "learning_rate": 8.97393843704488e-06, "loss": 0.1702, "num_input_tokens_seen": 10867944, "step": 10820 }, { "epoch": 5.73966065747614, "grad_norm": 3.7711358070373535, "learning_rate": 8.972533957286574e-06, "loss": 0.1864, "num_input_tokens_seen": 10872936, "step": 10825 }, { "epoch": 5.742311770943797, "grad_norm": 8.606581687927246, "learning_rate": 8.971128627019142e-06, "loss": 0.4006, "num_input_tokens_seen": 10878440, "step": 10830 }, { "epoch": 5.744962884411453, "grad_norm": 2.0565690994262695, "learning_rate": 8.969722446543462e-06, "loss": 0.2548, "num_input_tokens_seen": 10884424, "step": 10835 }, { "epoch": 5.747613997879109, "grad_norm": 4.491918087005615, "learning_rate": 8.968315416160595e-06, "loss": 0.2619, "num_input_tokens_seen": 10889000, "step": 10840 }, { "epoch": 5.750265111346765, "grad_norm": 4.863051414489746, "learning_rate": 8.966907536171782e-06, "loss": 0.2977, "num_input_tokens_seen": 10894472, "step": 10845 }, { "epoch": 5.752916224814422, "grad_norm": 2.8772685527801514, "learning_rate": 8.965498806878446e-06, "loss": 0.2328, "num_input_tokens_seen": 10900136, "step": 10850 }, { "epoch": 5.755567338282079, "grad_norm": 5.492231369018555, "learning_rate": 8.964089228582191e-06, "loss": 0.2409, "num_input_tokens_seen": 10906408, "step": 10855 }, { "epoch": 5.758218451749735, "grad_norm": 9.306965827941895, "learning_rate": 8.962678801584803e-06, "loss": 0.3418, "num_input_tokens_seen": 10910984, "step": 10860 }, { "epoch": 5.760869565217392, "grad_norm": 5.820910930633545, "learning_rate": 8.961267526188256e-06, "loss": 0.2955, "num_input_tokens_seen": 10915368, "step": 10865 }, { "epoch": 5.7635206786850475, "grad_norm": 6.0290446281433105, "learning_rate": 8.959855402694697e-06, "loss": 0.3198, "num_input_tokens_seen": 10919944, "step": 10870 }, { "epoch": 5.766171792152704, "grad_norm": 6.5347819328308105, "learning_rate": 8.958442431406457e-06, "loss": 0.4082, "num_input_tokens_seen": 10925800, "step": 10875 }, { "epoch": 5.768822905620361, "grad_norm": 4.728024005889893, "learning_rate": 8.957028612626051e-06, "loss": 0.3244, "num_input_tokens_seen": 10930376, "step": 10880 }, { "epoch": 5.771474019088017, "grad_norm": 4.091817855834961, "learning_rate": 8.955613946656173e-06, "loss": 0.226, "num_input_tokens_seen": 10935336, "step": 10885 }, { "epoch": 5.774125132555674, "grad_norm": 8.347281455993652, "learning_rate": 8.954198433799702e-06, "loss": 0.2696, "num_input_tokens_seen": 10940552, "step": 10890 }, { "epoch": 5.77677624602333, "grad_norm": 4.963850498199463, "learning_rate": 8.952782074359692e-06, "loss": 0.3049, "num_input_tokens_seen": 10944488, "step": 10895 }, { "epoch": 5.779427359490986, "grad_norm": 4.105186462402344, "learning_rate": 8.951364868639382e-06, "loss": 0.174, "num_input_tokens_seen": 10948776, "step": 10900 }, { "epoch": 5.782078472958642, "grad_norm": 7.89199686050415, "learning_rate": 8.949946816942195e-06, "loss": 0.4827, "num_input_tokens_seen": 10953576, "step": 10905 }, { "epoch": 5.784729586426299, "grad_norm": 4.236886978149414, "learning_rate": 8.948527919571729e-06, "loss": 0.2598, "num_input_tokens_seen": 10958632, "step": 10910 }, { "epoch": 5.787380699893955, "grad_norm": 6.363772392272949, "learning_rate": 8.94710817683177e-06, "loss": 0.343, "num_input_tokens_seen": 10963944, "step": 10915 }, { "epoch": 5.790031813361612, "grad_norm": 4.518927574157715, "learning_rate": 8.945687589026277e-06, "loss": 0.2291, "num_input_tokens_seen": 10968584, "step": 10920 }, { "epoch": 5.7926829268292686, "grad_norm": 4.397737979888916, "learning_rate": 8.944266156459396e-06, "loss": 0.2987, "num_input_tokens_seen": 10973576, "step": 10925 }, { "epoch": 5.7953340402969244, "grad_norm": 3.4388973712921143, "learning_rate": 8.942843879435452e-06, "loss": 0.297, "num_input_tokens_seen": 10978568, "step": 10930 }, { "epoch": 5.797985153764581, "grad_norm": 5.917847156524658, "learning_rate": 8.941420758258951e-06, "loss": 0.2294, "num_input_tokens_seen": 10983496, "step": 10935 }, { "epoch": 5.800636267232237, "grad_norm": 6.2906389236450195, "learning_rate": 8.93999679323458e-06, "loss": 0.4186, "num_input_tokens_seen": 10987528, "step": 10940 }, { "epoch": 5.803287380699894, "grad_norm": 2.7980504035949707, "learning_rate": 8.938571984667206e-06, "loss": 0.3634, "num_input_tokens_seen": 10993000, "step": 10945 }, { "epoch": 5.805938494167551, "grad_norm": 8.778934478759766, "learning_rate": 8.937146332861875e-06, "loss": 0.3661, "num_input_tokens_seen": 10997800, "step": 10950 }, { "epoch": 5.808589607635207, "grad_norm": 4.255260944366455, "learning_rate": 8.935719838123816e-06, "loss": 0.2992, "num_input_tokens_seen": 11002152, "step": 10955 }, { "epoch": 5.811240721102863, "grad_norm": 5.434264659881592, "learning_rate": 8.93429250075844e-06, "loss": 0.2894, "num_input_tokens_seen": 11007400, "step": 10960 }, { "epoch": 5.813891834570519, "grad_norm": 4.307125091552734, "learning_rate": 8.932864321071334e-06, "loss": 0.2511, "num_input_tokens_seen": 11012552, "step": 10965 }, { "epoch": 5.816542948038176, "grad_norm": 4.577129364013672, "learning_rate": 8.931435299368267e-06, "loss": 0.2448, "num_input_tokens_seen": 11016232, "step": 10970 }, { "epoch": 5.819194061505833, "grad_norm": 2.8233981132507324, "learning_rate": 8.930005435955191e-06, "loss": 0.2231, "num_input_tokens_seen": 11020520, "step": 10975 }, { "epoch": 5.821845174973489, "grad_norm": 1.457598328590393, "learning_rate": 8.928574731138235e-06, "loss": 0.3927, "num_input_tokens_seen": 11024872, "step": 10980 }, { "epoch": 5.8244962884411455, "grad_norm": 6.84694766998291, "learning_rate": 8.92714318522371e-06, "loss": 0.2834, "num_input_tokens_seen": 11030600, "step": 10985 }, { "epoch": 5.827147401908801, "grad_norm": 4.864256381988525, "learning_rate": 8.925710798518106e-06, "loss": 0.1824, "num_input_tokens_seen": 11035112, "step": 10990 }, { "epoch": 5.829798515376458, "grad_norm": 8.018640518188477, "learning_rate": 8.924277571328091e-06, "loss": 0.2662, "num_input_tokens_seen": 11039240, "step": 10995 }, { "epoch": 5.832449628844115, "grad_norm": 8.884004592895508, "learning_rate": 8.922843503960517e-06, "loss": 0.3333, "num_input_tokens_seen": 11043496, "step": 11000 }, { "epoch": 5.835100742311771, "grad_norm": 7.822418689727783, "learning_rate": 8.921408596722412e-06, "loss": 0.3029, "num_input_tokens_seen": 11048744, "step": 11005 }, { "epoch": 5.837751855779428, "grad_norm": 3.5758519172668457, "learning_rate": 8.91997284992099e-06, "loss": 0.2685, "num_input_tokens_seen": 11055048, "step": 11010 }, { "epoch": 5.840402969247084, "grad_norm": 5.1703572273254395, "learning_rate": 8.918536263863638e-06, "loss": 0.2203, "num_input_tokens_seen": 11059656, "step": 11015 }, { "epoch": 5.84305408271474, "grad_norm": 8.909262657165527, "learning_rate": 8.917098838857925e-06, "loss": 0.3425, "num_input_tokens_seen": 11064232, "step": 11020 }, { "epoch": 5.845705196182396, "grad_norm": 4.858097076416016, "learning_rate": 8.915660575211601e-06, "loss": 0.1624, "num_input_tokens_seen": 11069192, "step": 11025 }, { "epoch": 5.848356309650053, "grad_norm": 7.679974555969238, "learning_rate": 8.914221473232594e-06, "loss": 0.3699, "num_input_tokens_seen": 11073960, "step": 11030 }, { "epoch": 5.851007423117709, "grad_norm": 6.604353427886963, "learning_rate": 8.912781533229013e-06, "loss": 0.2864, "num_input_tokens_seen": 11078664, "step": 11035 }, { "epoch": 5.853658536585366, "grad_norm": 4.470583438873291, "learning_rate": 8.911340755509143e-06, "loss": 0.2011, "num_input_tokens_seen": 11083176, "step": 11040 }, { "epoch": 5.8563096500530225, "grad_norm": 4.115246295928955, "learning_rate": 8.909899140381454e-06, "loss": 0.2521, "num_input_tokens_seen": 11087624, "step": 11045 }, { "epoch": 5.858960763520678, "grad_norm": 6.699958801269531, "learning_rate": 8.908456688154587e-06, "loss": 0.2156, "num_input_tokens_seen": 11093800, "step": 11050 }, { "epoch": 5.861611876988335, "grad_norm": 4.388214111328125, "learning_rate": 8.907013399137375e-06, "loss": 0.3778, "num_input_tokens_seen": 11099080, "step": 11055 }, { "epoch": 5.864262990455991, "grad_norm": 3.313542366027832, "learning_rate": 8.905569273638815e-06, "loss": 0.2065, "num_input_tokens_seen": 11104200, "step": 11060 }, { "epoch": 5.866914103923648, "grad_norm": 4.740728378295898, "learning_rate": 8.904124311968092e-06, "loss": 0.2309, "num_input_tokens_seen": 11108840, "step": 11065 }, { "epoch": 5.869565217391305, "grad_norm": 4.15968656539917, "learning_rate": 8.902678514434573e-06, "loss": 0.3158, "num_input_tokens_seen": 11113832, "step": 11070 }, { "epoch": 5.8722163308589606, "grad_norm": 2.985301971435547, "learning_rate": 8.901231881347794e-06, "loss": 0.2414, "num_input_tokens_seen": 11118760, "step": 11075 }, { "epoch": 5.874867444326617, "grad_norm": 4.651431083679199, "learning_rate": 8.899784413017479e-06, "loss": 0.3319, "num_input_tokens_seen": 11122824, "step": 11080 }, { "epoch": 5.877518557794273, "grad_norm": 6.250791072845459, "learning_rate": 8.898336109753524e-06, "loss": 0.3835, "num_input_tokens_seen": 11127560, "step": 11085 }, { "epoch": 5.88016967126193, "grad_norm": 8.727822303771973, "learning_rate": 8.896886971866013e-06, "loss": 0.2709, "num_input_tokens_seen": 11132424, "step": 11090 }, { "epoch": 5.882820784729587, "grad_norm": 7.654016494750977, "learning_rate": 8.895436999665194e-06, "loss": 0.2461, "num_input_tokens_seen": 11136680, "step": 11095 }, { "epoch": 5.885471898197243, "grad_norm": 8.246970176696777, "learning_rate": 8.893986193461509e-06, "loss": 0.2617, "num_input_tokens_seen": 11141608, "step": 11100 }, { "epoch": 5.8881230116648995, "grad_norm": 7.104486465454102, "learning_rate": 8.89253455356557e-06, "loss": 0.2774, "num_input_tokens_seen": 11147976, "step": 11105 }, { "epoch": 5.890774125132555, "grad_norm": 4.552871227264404, "learning_rate": 8.891082080288167e-06, "loss": 0.2667, "num_input_tokens_seen": 11154152, "step": 11110 }, { "epoch": 5.893425238600212, "grad_norm": 9.070976257324219, "learning_rate": 8.889628773940272e-06, "loss": 0.448, "num_input_tokens_seen": 11159048, "step": 11115 }, { "epoch": 5.896076352067869, "grad_norm": 6.327695369720459, "learning_rate": 8.888174634833038e-06, "loss": 0.2661, "num_input_tokens_seen": 11163240, "step": 11120 }, { "epoch": 5.898727465535525, "grad_norm": 3.095820665359497, "learning_rate": 8.886719663277787e-06, "loss": 0.2902, "num_input_tokens_seen": 11168168, "step": 11125 }, { "epoch": 5.901378579003182, "grad_norm": 4.0928778648376465, "learning_rate": 8.885263859586026e-06, "loss": 0.2357, "num_input_tokens_seen": 11172744, "step": 11130 }, { "epoch": 5.9040296924708375, "grad_norm": 9.390090942382812, "learning_rate": 8.883807224069441e-06, "loss": 0.3496, "num_input_tokens_seen": 11177352, "step": 11135 }, { "epoch": 5.906680805938494, "grad_norm": 3.002382278442383, "learning_rate": 8.882349757039891e-06, "loss": 0.2334, "num_input_tokens_seen": 11182024, "step": 11140 }, { "epoch": 5.90933191940615, "grad_norm": 2.2167341709136963, "learning_rate": 8.880891458809418e-06, "loss": 0.2183, "num_input_tokens_seen": 11187112, "step": 11145 }, { "epoch": 5.911983032873807, "grad_norm": 5.161385536193848, "learning_rate": 8.879432329690237e-06, "loss": 0.2959, "num_input_tokens_seen": 11191368, "step": 11150 }, { "epoch": 5.914634146341464, "grad_norm": 8.647144317626953, "learning_rate": 8.877972369994747e-06, "loss": 0.3139, "num_input_tokens_seen": 11197704, "step": 11155 }, { "epoch": 5.91728525980912, "grad_norm": 3.631382703781128, "learning_rate": 8.87651158003552e-06, "loss": 0.2295, "num_input_tokens_seen": 11202856, "step": 11160 }, { "epoch": 5.9199363732767765, "grad_norm": 6.647576332092285, "learning_rate": 8.875049960125308e-06, "loss": 0.3094, "num_input_tokens_seen": 11208264, "step": 11165 }, { "epoch": 5.922587486744432, "grad_norm": 5.360782146453857, "learning_rate": 8.87358751057704e-06, "loss": 0.388, "num_input_tokens_seen": 11212712, "step": 11170 }, { "epoch": 5.925238600212089, "grad_norm": 5.413026332855225, "learning_rate": 8.87212423170382e-06, "loss": 0.1886, "num_input_tokens_seen": 11217512, "step": 11175 }, { "epoch": 5.927889713679745, "grad_norm": 4.157095909118652, "learning_rate": 8.870660123818938e-06, "loss": 0.2328, "num_input_tokens_seen": 11224872, "step": 11180 }, { "epoch": 5.930540827147402, "grad_norm": 5.556674957275391, "learning_rate": 8.869195187235848e-06, "loss": 0.2524, "num_input_tokens_seen": 11229544, "step": 11185 }, { "epoch": 5.933191940615059, "grad_norm": 5.11501932144165, "learning_rate": 8.867729422268195e-06, "loss": 0.2852, "num_input_tokens_seen": 11234600, "step": 11190 }, { "epoch": 5.9358430540827145, "grad_norm": 4.4932708740234375, "learning_rate": 8.866262829229793e-06, "loss": 0.2807, "num_input_tokens_seen": 11239720, "step": 11195 }, { "epoch": 5.938494167550371, "grad_norm": 5.110126972198486, "learning_rate": 8.864795408434637e-06, "loss": 0.2094, "num_input_tokens_seen": 11244200, "step": 11200 }, { "epoch": 5.941145281018027, "grad_norm": 7.113142967224121, "learning_rate": 8.863327160196898e-06, "loss": 0.2499, "num_input_tokens_seen": 11249640, "step": 11205 }, { "epoch": 5.943796394485684, "grad_norm": 2.944547653198242, "learning_rate": 8.861858084830921e-06, "loss": 0.1634, "num_input_tokens_seen": 11254664, "step": 11210 }, { "epoch": 5.946447507953341, "grad_norm": 7.930516242980957, "learning_rate": 8.860388182651233e-06, "loss": 0.3131, "num_input_tokens_seen": 11260616, "step": 11215 }, { "epoch": 5.949098621420997, "grad_norm": 7.227898120880127, "learning_rate": 8.85891745397254e-06, "loss": 0.2763, "num_input_tokens_seen": 11266760, "step": 11220 }, { "epoch": 5.951749734888653, "grad_norm": 6.8189191818237305, "learning_rate": 8.857445899109716e-06, "loss": 0.4342, "num_input_tokens_seen": 11271624, "step": 11225 }, { "epoch": 5.954400848356309, "grad_norm": 6.637192726135254, "learning_rate": 8.855973518377818e-06, "loss": 0.3205, "num_input_tokens_seen": 11276904, "step": 11230 }, { "epoch": 5.957051961823966, "grad_norm": 2.395667314529419, "learning_rate": 8.854500312092081e-06, "loss": 0.2393, "num_input_tokens_seen": 11281416, "step": 11235 }, { "epoch": 5.959703075291623, "grad_norm": 6.065169334411621, "learning_rate": 8.853026280567913e-06, "loss": 0.3831, "num_input_tokens_seen": 11286056, "step": 11240 }, { "epoch": 5.962354188759279, "grad_norm": 6.453847885131836, "learning_rate": 8.8515514241209e-06, "loss": 0.2182, "num_input_tokens_seen": 11291816, "step": 11245 }, { "epoch": 5.965005302226936, "grad_norm": 5.190361976623535, "learning_rate": 8.850075743066809e-06, "loss": 0.2038, "num_input_tokens_seen": 11296552, "step": 11250 }, { "epoch": 5.9676564156945915, "grad_norm": 3.2407572269439697, "learning_rate": 8.848599237721573e-06, "loss": 0.213, "num_input_tokens_seen": 11301832, "step": 11255 }, { "epoch": 5.970307529162248, "grad_norm": 6.571632385253906, "learning_rate": 8.84712190840131e-06, "loss": 0.2686, "num_input_tokens_seen": 11306888, "step": 11260 }, { "epoch": 5.972958642629905, "grad_norm": 3.8724515438079834, "learning_rate": 8.845643755422315e-06, "loss": 0.2674, "num_input_tokens_seen": 11312232, "step": 11265 }, { "epoch": 5.975609756097561, "grad_norm": 6.199052810668945, "learning_rate": 8.844164779101055e-06, "loss": 0.3121, "num_input_tokens_seen": 11316680, "step": 11270 }, { "epoch": 5.978260869565218, "grad_norm": 6.158906936645508, "learning_rate": 8.842684979754174e-06, "loss": 0.3674, "num_input_tokens_seen": 11320968, "step": 11275 }, { "epoch": 5.980911983032874, "grad_norm": 5.596436977386475, "learning_rate": 8.841204357698497e-06, "loss": 0.3005, "num_input_tokens_seen": 11325672, "step": 11280 }, { "epoch": 5.98356309650053, "grad_norm": 8.423563957214355, "learning_rate": 8.839722913251016e-06, "loss": 0.3467, "num_input_tokens_seen": 11330856, "step": 11285 }, { "epoch": 5.986214209968186, "grad_norm": 8.546692848205566, "learning_rate": 8.838240646728906e-06, "loss": 0.3723, "num_input_tokens_seen": 11336264, "step": 11290 }, { "epoch": 5.988865323435843, "grad_norm": 6.739644527435303, "learning_rate": 8.836757558449519e-06, "loss": 0.3551, "num_input_tokens_seen": 11341384, "step": 11295 }, { "epoch": 5.991516436903499, "grad_norm": 3.5600411891937256, "learning_rate": 8.835273648730379e-06, "loss": 0.363, "num_input_tokens_seen": 11346344, "step": 11300 }, { "epoch": 5.994167550371156, "grad_norm": 5.763348579406738, "learning_rate": 8.833788917889185e-06, "loss": 0.2633, "num_input_tokens_seen": 11351112, "step": 11305 }, { "epoch": 5.996818663838813, "grad_norm": 4.570155143737793, "learning_rate": 8.832303366243816e-06, "loss": 0.2927, "num_input_tokens_seen": 11355336, "step": 11310 }, { "epoch": 5.9994697773064685, "grad_norm": 4.007883548736572, "learning_rate": 8.830816994112325e-06, "loss": 0.2307, "num_input_tokens_seen": 11360456, "step": 11315 }, { "epoch": 6.0, "eval_loss": 0.3415689170360565, "eval_runtime": 29.1774, "eval_samples_per_second": 64.639, "eval_steps_per_second": 16.177, "num_input_tokens_seen": 11360672, "step": 11316 }, { "epoch": 6.002120890774125, "grad_norm": 4.997966766357422, "learning_rate": 8.82932980181294e-06, "loss": 0.2003, "num_input_tokens_seen": 11364608, "step": 11320 }, { "epoch": 6.004772004241781, "grad_norm": 1.1549510955810547, "learning_rate": 8.827841789664064e-06, "loss": 0.2555, "num_input_tokens_seen": 11370112, "step": 11325 }, { "epoch": 6.007423117709438, "grad_norm": 3.3571603298187256, "learning_rate": 8.826352957984276e-06, "loss": 0.2493, "num_input_tokens_seen": 11374464, "step": 11330 }, { "epoch": 6.010074231177095, "grad_norm": 5.284893035888672, "learning_rate": 8.824863307092335e-06, "loss": 0.2429, "num_input_tokens_seen": 11378560, "step": 11335 }, { "epoch": 6.012725344644751, "grad_norm": 6.838332653045654, "learning_rate": 8.823372837307167e-06, "loss": 0.3163, "num_input_tokens_seen": 11383776, "step": 11340 }, { "epoch": 6.015376458112407, "grad_norm": 8.69005298614502, "learning_rate": 8.82188154894788e-06, "loss": 0.3685, "num_input_tokens_seen": 11389248, "step": 11345 }, { "epoch": 6.018027571580063, "grad_norm": 3.8459126949310303, "learning_rate": 8.820389442333754e-06, "loss": 0.2957, "num_input_tokens_seen": 11395200, "step": 11350 }, { "epoch": 6.02067868504772, "grad_norm": 6.357746124267578, "learning_rate": 8.818896517784246e-06, "loss": 0.2879, "num_input_tokens_seen": 11400672, "step": 11355 }, { "epoch": 6.023329798515377, "grad_norm": 6.851383686065674, "learning_rate": 8.817402775618985e-06, "loss": 0.3657, "num_input_tokens_seen": 11405568, "step": 11360 }, { "epoch": 6.025980911983033, "grad_norm": 2.0769026279449463, "learning_rate": 8.81590821615778e-06, "loss": 0.1958, "num_input_tokens_seen": 11410880, "step": 11365 }, { "epoch": 6.0286320254506895, "grad_norm": 6.710625171661377, "learning_rate": 8.814412839720611e-06, "loss": 0.2465, "num_input_tokens_seen": 11415840, "step": 11370 }, { "epoch": 6.031283138918345, "grad_norm": 4.944804668426514, "learning_rate": 8.812916646627634e-06, "loss": 0.2376, "num_input_tokens_seen": 11420960, "step": 11375 }, { "epoch": 6.033934252386002, "grad_norm": 3.659499406814575, "learning_rate": 8.81141963719918e-06, "loss": 0.2029, "num_input_tokens_seen": 11426880, "step": 11380 }, { "epoch": 6.036585365853658, "grad_norm": 7.524994373321533, "learning_rate": 8.809921811755756e-06, "loss": 0.2313, "num_input_tokens_seen": 11431840, "step": 11385 }, { "epoch": 6.039236479321315, "grad_norm": 4.099300861358643, "learning_rate": 8.808423170618041e-06, "loss": 0.1737, "num_input_tokens_seen": 11436352, "step": 11390 }, { "epoch": 6.041887592788972, "grad_norm": 6.534377574920654, "learning_rate": 8.806923714106893e-06, "loss": 0.2197, "num_input_tokens_seen": 11442400, "step": 11395 }, { "epoch": 6.044538706256628, "grad_norm": 1.8588981628417969, "learning_rate": 8.805423442543336e-06, "loss": 0.2186, "num_input_tokens_seen": 11448224, "step": 11400 }, { "epoch": 6.047189819724284, "grad_norm": 5.5265116691589355, "learning_rate": 8.80392235624858e-06, "loss": 0.2398, "num_input_tokens_seen": 11453376, "step": 11405 }, { "epoch": 6.04984093319194, "grad_norm": 4.108645915985107, "learning_rate": 8.802420455544001e-06, "loss": 0.2139, "num_input_tokens_seen": 11458336, "step": 11410 }, { "epoch": 6.052492046659597, "grad_norm": 4.403648853302002, "learning_rate": 8.800917740751152e-06, "loss": 0.1597, "num_input_tokens_seen": 11462880, "step": 11415 }, { "epoch": 6.055143160127254, "grad_norm": 6.458092212677002, "learning_rate": 8.799414212191761e-06, "loss": 0.1778, "num_input_tokens_seen": 11467776, "step": 11420 }, { "epoch": 6.05779427359491, "grad_norm": 10.071319580078125, "learning_rate": 8.797909870187728e-06, "loss": 0.2576, "num_input_tokens_seen": 11473088, "step": 11425 }, { "epoch": 6.0604453870625665, "grad_norm": 11.966411590576172, "learning_rate": 8.796404715061132e-06, "loss": 0.2905, "num_input_tokens_seen": 11477856, "step": 11430 }, { "epoch": 6.063096500530222, "grad_norm": 7.610185146331787, "learning_rate": 8.79489874713422e-06, "loss": 0.3113, "num_input_tokens_seen": 11482816, "step": 11435 }, { "epoch": 6.065747613997879, "grad_norm": 4.92258882522583, "learning_rate": 8.793391966729416e-06, "loss": 0.2518, "num_input_tokens_seen": 11488480, "step": 11440 }, { "epoch": 6.068398727465535, "grad_norm": 10.959676742553711, "learning_rate": 8.79188437416932e-06, "loss": 0.2184, "num_input_tokens_seen": 11492544, "step": 11445 }, { "epoch": 6.071049840933192, "grad_norm": 5.696274280548096, "learning_rate": 8.790375969776698e-06, "loss": 0.1757, "num_input_tokens_seen": 11497312, "step": 11450 }, { "epoch": 6.073700954400849, "grad_norm": 5.9292826652526855, "learning_rate": 8.788866753874504e-06, "loss": 0.3425, "num_input_tokens_seen": 11502592, "step": 11455 }, { "epoch": 6.076352067868505, "grad_norm": 9.199110984802246, "learning_rate": 8.787356726785849e-06, "loss": 0.2639, "num_input_tokens_seen": 11507232, "step": 11460 }, { "epoch": 6.079003181336161, "grad_norm": 10.068378448486328, "learning_rate": 8.78584588883403e-06, "loss": 0.2188, "num_input_tokens_seen": 11513024, "step": 11465 }, { "epoch": 6.081654294803817, "grad_norm": 11.71622085571289, "learning_rate": 8.784334240342513e-06, "loss": 0.1757, "num_input_tokens_seen": 11518272, "step": 11470 }, { "epoch": 6.084305408271474, "grad_norm": 11.216760635375977, "learning_rate": 8.782821781634939e-06, "loss": 0.2075, "num_input_tokens_seen": 11523168, "step": 11475 }, { "epoch": 6.086956521739131, "grad_norm": 3.4992640018463135, "learning_rate": 8.781308513035119e-06, "loss": 0.3519, "num_input_tokens_seen": 11527872, "step": 11480 }, { "epoch": 6.089607635206787, "grad_norm": 8.052397727966309, "learning_rate": 8.77979443486704e-06, "loss": 0.2259, "num_input_tokens_seen": 11533312, "step": 11485 }, { "epoch": 6.0922587486744435, "grad_norm": 3.9631385803222656, "learning_rate": 8.778279547454866e-06, "loss": 0.3598, "num_input_tokens_seen": 11538624, "step": 11490 }, { "epoch": 6.094909862142099, "grad_norm": 15.017508506774902, "learning_rate": 8.776763851122925e-06, "loss": 0.3781, "num_input_tokens_seen": 11543840, "step": 11495 }, { "epoch": 6.097560975609756, "grad_norm": 9.97733211517334, "learning_rate": 8.775247346195727e-06, "loss": 0.2971, "num_input_tokens_seen": 11549184, "step": 11500 }, { "epoch": 6.100212089077412, "grad_norm": 2.3019168376922607, "learning_rate": 8.773730032997951e-06, "loss": 0.1573, "num_input_tokens_seen": 11554816, "step": 11505 }, { "epoch": 6.102863202545069, "grad_norm": 8.820889472961426, "learning_rate": 8.772211911854451e-06, "loss": 0.3294, "num_input_tokens_seen": 11561952, "step": 11510 }, { "epoch": 6.105514316012726, "grad_norm": 13.13704776763916, "learning_rate": 8.77069298309025e-06, "loss": 0.44, "num_input_tokens_seen": 11566112, "step": 11515 }, { "epoch": 6.1081654294803815, "grad_norm": 9.091946601867676, "learning_rate": 8.769173247030549e-06, "loss": 0.2608, "num_input_tokens_seen": 11570912, "step": 11520 }, { "epoch": 6.110816542948038, "grad_norm": 9.927864074707031, "learning_rate": 8.76765270400072e-06, "loss": 0.2429, "num_input_tokens_seen": 11576320, "step": 11525 }, { "epoch": 6.113467656415694, "grad_norm": 2.36055064201355, "learning_rate": 8.766131354326303e-06, "loss": 0.4068, "num_input_tokens_seen": 11582624, "step": 11530 }, { "epoch": 6.116118769883351, "grad_norm": 7.027505397796631, "learning_rate": 8.76460919833302e-06, "loss": 0.3106, "num_input_tokens_seen": 11587360, "step": 11535 }, { "epoch": 6.118769883351008, "grad_norm": 6.708158493041992, "learning_rate": 8.763086236346759e-06, "loss": 0.2068, "num_input_tokens_seen": 11592352, "step": 11540 }, { "epoch": 6.121420996818664, "grad_norm": 4.007078170776367, "learning_rate": 8.761562468693581e-06, "loss": 0.2768, "num_input_tokens_seen": 11596928, "step": 11545 }, { "epoch": 6.1240721102863205, "grad_norm": 9.35508918762207, "learning_rate": 8.760037895699722e-06, "loss": 0.2849, "num_input_tokens_seen": 11602688, "step": 11550 }, { "epoch": 6.126723223753976, "grad_norm": 10.402884483337402, "learning_rate": 8.758512517691588e-06, "loss": 0.2537, "num_input_tokens_seen": 11607872, "step": 11555 }, { "epoch": 6.129374337221633, "grad_norm": 9.749900817871094, "learning_rate": 8.756986334995758e-06, "loss": 0.2925, "num_input_tokens_seen": 11612352, "step": 11560 }, { "epoch": 6.132025450689289, "grad_norm": 2.685708522796631, "learning_rate": 8.755459347938985e-06, "loss": 0.1507, "num_input_tokens_seen": 11618016, "step": 11565 }, { "epoch": 6.134676564156946, "grad_norm": 15.364954948425293, "learning_rate": 8.753931556848195e-06, "loss": 0.2202, "num_input_tokens_seen": 11622944, "step": 11570 }, { "epoch": 6.137327677624603, "grad_norm": 4.603734970092773, "learning_rate": 8.75240296205048e-06, "loss": 0.1399, "num_input_tokens_seen": 11628288, "step": 11575 }, { "epoch": 6.1399787910922585, "grad_norm": 6.316588878631592, "learning_rate": 8.75087356387311e-06, "loss": 0.2941, "num_input_tokens_seen": 11632864, "step": 11580 }, { "epoch": 6.142629904559915, "grad_norm": 3.993574619293213, "learning_rate": 8.749343362643525e-06, "loss": 0.2437, "num_input_tokens_seen": 11637440, "step": 11585 }, { "epoch": 6.145281018027571, "grad_norm": 9.530635833740234, "learning_rate": 8.747812358689338e-06, "loss": 0.4939, "num_input_tokens_seen": 11642368, "step": 11590 }, { "epoch": 6.147932131495228, "grad_norm": 5.235676288604736, "learning_rate": 8.746280552338331e-06, "loss": 0.2152, "num_input_tokens_seen": 11647936, "step": 11595 }, { "epoch": 6.150583244962885, "grad_norm": 5.555519104003906, "learning_rate": 8.744747943918461e-06, "loss": 0.3322, "num_input_tokens_seen": 11652704, "step": 11600 }, { "epoch": 6.153234358430541, "grad_norm": 8.68060302734375, "learning_rate": 8.743214533757855e-06, "loss": 0.2778, "num_input_tokens_seen": 11657408, "step": 11605 }, { "epoch": 6.1558854718981975, "grad_norm": 5.509481430053711, "learning_rate": 8.741680322184814e-06, "loss": 0.2102, "num_input_tokens_seen": 11662400, "step": 11610 }, { "epoch": 6.158536585365853, "grad_norm": 7.710084915161133, "learning_rate": 8.740145309527805e-06, "loss": 0.3452, "num_input_tokens_seen": 11668448, "step": 11615 }, { "epoch": 6.16118769883351, "grad_norm": 6.737957000732422, "learning_rate": 8.738609496115474e-06, "loss": 0.186, "num_input_tokens_seen": 11673472, "step": 11620 }, { "epoch": 6.163838812301167, "grad_norm": 3.718493938446045, "learning_rate": 8.737072882276631e-06, "loss": 0.2549, "num_input_tokens_seen": 11678784, "step": 11625 }, { "epoch": 6.166489925768823, "grad_norm": 10.50370979309082, "learning_rate": 8.735535468340263e-06, "loss": 0.3383, "num_input_tokens_seen": 11683808, "step": 11630 }, { "epoch": 6.16914103923648, "grad_norm": 4.512977600097656, "learning_rate": 8.733997254635527e-06, "loss": 0.2993, "num_input_tokens_seen": 11688672, "step": 11635 }, { "epoch": 6.1717921527041355, "grad_norm": 9.11897087097168, "learning_rate": 8.732458241491748e-06, "loss": 0.3136, "num_input_tokens_seen": 11693920, "step": 11640 }, { "epoch": 6.174443266171792, "grad_norm": 6.145852088928223, "learning_rate": 8.730918429238429e-06, "loss": 0.2218, "num_input_tokens_seen": 11698944, "step": 11645 }, { "epoch": 6.177094379639448, "grad_norm": 4.528614521026611, "learning_rate": 8.729377818205237e-06, "loss": 0.1337, "num_input_tokens_seen": 11703424, "step": 11650 }, { "epoch": 6.179745493107105, "grad_norm": 4.025517463684082, "learning_rate": 8.72783640872201e-06, "loss": 0.3167, "num_input_tokens_seen": 11707104, "step": 11655 }, { "epoch": 6.182396606574762, "grad_norm": 9.88759994506836, "learning_rate": 8.726294201118763e-06, "loss": 0.2787, "num_input_tokens_seen": 11711136, "step": 11660 }, { "epoch": 6.185047720042418, "grad_norm": 8.696203231811523, "learning_rate": 8.724751195725681e-06, "loss": 0.2809, "num_input_tokens_seen": 11715584, "step": 11665 }, { "epoch": 6.187698833510074, "grad_norm": 6.326916694641113, "learning_rate": 8.723207392873113e-06, "loss": 0.2495, "num_input_tokens_seen": 11720800, "step": 11670 }, { "epoch": 6.19034994697773, "grad_norm": 2.4635729789733887, "learning_rate": 8.721662792891585e-06, "loss": 0.1691, "num_input_tokens_seen": 11725504, "step": 11675 }, { "epoch": 6.193001060445387, "grad_norm": 8.743454933166504, "learning_rate": 8.720117396111793e-06, "loss": 0.2431, "num_input_tokens_seen": 11730496, "step": 11680 }, { "epoch": 6.195652173913044, "grad_norm": 4.228477954864502, "learning_rate": 8.718571202864598e-06, "loss": 0.2376, "num_input_tokens_seen": 11737856, "step": 11685 }, { "epoch": 6.1983032873807, "grad_norm": 9.473057746887207, "learning_rate": 8.717024213481043e-06, "loss": 0.422, "num_input_tokens_seen": 11743136, "step": 11690 }, { "epoch": 6.200954400848357, "grad_norm": 9.257393836975098, "learning_rate": 8.715476428292326e-06, "loss": 0.4586, "num_input_tokens_seen": 11751136, "step": 11695 }, { "epoch": 6.2036055143160125, "grad_norm": 7.840014457702637, "learning_rate": 8.713927847629831e-06, "loss": 0.2371, "num_input_tokens_seen": 11756544, "step": 11700 }, { "epoch": 6.206256627783669, "grad_norm": 5.58125114440918, "learning_rate": 8.712378471825102e-06, "loss": 0.1908, "num_input_tokens_seen": 11762400, "step": 11705 }, { "epoch": 6.208907741251325, "grad_norm": 6.424323558807373, "learning_rate": 8.710828301209856e-06, "loss": 0.2914, "num_input_tokens_seen": 11769088, "step": 11710 }, { "epoch": 6.211558854718982, "grad_norm": 15.714802742004395, "learning_rate": 8.70927733611598e-06, "loss": 0.2594, "num_input_tokens_seen": 11774368, "step": 11715 }, { "epoch": 6.214209968186639, "grad_norm": 3.6790106296539307, "learning_rate": 8.707725576875534e-06, "loss": 0.1571, "num_input_tokens_seen": 11780672, "step": 11720 }, { "epoch": 6.216861081654295, "grad_norm": 5.9190449714660645, "learning_rate": 8.706173023820743e-06, "loss": 0.3529, "num_input_tokens_seen": 11785216, "step": 11725 }, { "epoch": 6.219512195121951, "grad_norm": 5.536769866943359, "learning_rate": 8.704619677284007e-06, "loss": 0.1942, "num_input_tokens_seen": 11790176, "step": 11730 }, { "epoch": 6.222163308589607, "grad_norm": 3.7911245822906494, "learning_rate": 8.703065537597892e-06, "loss": 0.2013, "num_input_tokens_seen": 11796032, "step": 11735 }, { "epoch": 6.224814422057264, "grad_norm": 8.122847557067871, "learning_rate": 8.701510605095135e-06, "loss": 0.271, "num_input_tokens_seen": 11801088, "step": 11740 }, { "epoch": 6.227465535524921, "grad_norm": 9.692156791687012, "learning_rate": 8.699954880108643e-06, "loss": 0.4186, "num_input_tokens_seen": 11806464, "step": 11745 }, { "epoch": 6.230116648992577, "grad_norm": 2.928589344024658, "learning_rate": 8.698398362971493e-06, "loss": 0.2306, "num_input_tokens_seen": 11811040, "step": 11750 }, { "epoch": 6.232767762460234, "grad_norm": 2.206166982650757, "learning_rate": 8.696841054016931e-06, "loss": 0.3057, "num_input_tokens_seen": 11816768, "step": 11755 }, { "epoch": 6.2354188759278895, "grad_norm": 5.652599811553955, "learning_rate": 8.695282953578374e-06, "loss": 0.3154, "num_input_tokens_seen": 11821056, "step": 11760 }, { "epoch": 6.238069989395546, "grad_norm": 8.636248588562012, "learning_rate": 8.693724061989407e-06, "loss": 0.261, "num_input_tokens_seen": 11826592, "step": 11765 }, { "epoch": 6.240721102863202, "grad_norm": 5.691495895385742, "learning_rate": 8.692164379583781e-06, "loss": 0.1875, "num_input_tokens_seen": 11830976, "step": 11770 }, { "epoch": 6.243372216330859, "grad_norm": 7.525954723358154, "learning_rate": 8.690603906695423e-06, "loss": 0.2044, "num_input_tokens_seen": 11835808, "step": 11775 }, { "epoch": 6.246023329798516, "grad_norm": 6.267578125, "learning_rate": 8.689042643658427e-06, "loss": 0.2309, "num_input_tokens_seen": 11841536, "step": 11780 }, { "epoch": 6.248674443266172, "grad_norm": 10.361918449401855, "learning_rate": 8.687480590807054e-06, "loss": 0.3855, "num_input_tokens_seen": 11846656, "step": 11785 }, { "epoch": 6.251325556733828, "grad_norm": 9.202146530151367, "learning_rate": 8.685917748475737e-06, "loss": 0.3292, "num_input_tokens_seen": 11851520, "step": 11790 }, { "epoch": 6.253976670201484, "grad_norm": 4.7485504150390625, "learning_rate": 8.684354116999074e-06, "loss": 0.1562, "num_input_tokens_seen": 11857152, "step": 11795 }, { "epoch": 6.256627783669141, "grad_norm": 4.932651519775391, "learning_rate": 8.682789696711835e-06, "loss": 0.1523, "num_input_tokens_seen": 11862496, "step": 11800 }, { "epoch": 6.259278897136798, "grad_norm": 1.4917500019073486, "learning_rate": 8.68122448794896e-06, "loss": 0.2926, "num_input_tokens_seen": 11866592, "step": 11805 }, { "epoch": 6.261930010604454, "grad_norm": 5.284179210662842, "learning_rate": 8.679658491045555e-06, "loss": 0.3339, "num_input_tokens_seen": 11872544, "step": 11810 }, { "epoch": 6.2645811240721105, "grad_norm": 7.829977035522461, "learning_rate": 8.678091706336895e-06, "loss": 0.3425, "num_input_tokens_seen": 11876480, "step": 11815 }, { "epoch": 6.267232237539766, "grad_norm": 6.261636734008789, "learning_rate": 8.676524134158424e-06, "loss": 0.185, "num_input_tokens_seen": 11881536, "step": 11820 }, { "epoch": 6.269883351007423, "grad_norm": 9.473296165466309, "learning_rate": 8.67495577484576e-06, "loss": 0.429, "num_input_tokens_seen": 11886368, "step": 11825 }, { "epoch": 6.272534464475079, "grad_norm": 9.094475746154785, "learning_rate": 8.673386628734677e-06, "loss": 0.2183, "num_input_tokens_seen": 11890688, "step": 11830 }, { "epoch": 6.275185577942736, "grad_norm": 2.4511330127716064, "learning_rate": 8.67181669616113e-06, "loss": 0.1813, "num_input_tokens_seen": 11896768, "step": 11835 }, { "epoch": 6.277836691410393, "grad_norm": 5.752602577209473, "learning_rate": 8.670245977461237e-06, "loss": 0.2052, "num_input_tokens_seen": 11902208, "step": 11840 }, { "epoch": 6.280487804878049, "grad_norm": 5.364969253540039, "learning_rate": 8.668674472971286e-06, "loss": 0.2192, "num_input_tokens_seen": 11906816, "step": 11845 }, { "epoch": 6.283138918345705, "grad_norm": 5.264271259307861, "learning_rate": 8.667102183027729e-06, "loss": 0.1969, "num_input_tokens_seen": 11913824, "step": 11850 }, { "epoch": 6.285790031813361, "grad_norm": 5.197211265563965, "learning_rate": 8.66552910796719e-06, "loss": 0.3184, "num_input_tokens_seen": 11919072, "step": 11855 }, { "epoch": 6.288441145281018, "grad_norm": 2.246143102645874, "learning_rate": 8.663955248126461e-06, "loss": 0.2312, "num_input_tokens_seen": 11923808, "step": 11860 }, { "epoch": 6.291092258748675, "grad_norm": 9.036803245544434, "learning_rate": 8.662380603842502e-06, "loss": 0.3371, "num_input_tokens_seen": 11929056, "step": 11865 }, { "epoch": 6.293743372216331, "grad_norm": 12.116767883300781, "learning_rate": 8.660805175452438e-06, "loss": 0.34, "num_input_tokens_seen": 11933952, "step": 11870 }, { "epoch": 6.2963944856839875, "grad_norm": 10.468807220458984, "learning_rate": 8.659228963293566e-06, "loss": 0.2943, "num_input_tokens_seen": 11938528, "step": 11875 }, { "epoch": 6.299045599151643, "grad_norm": 6.554555892944336, "learning_rate": 8.657651967703347e-06, "loss": 0.1812, "num_input_tokens_seen": 11943584, "step": 11880 }, { "epoch": 6.3016967126193, "grad_norm": 6.22694206237793, "learning_rate": 8.656074189019414e-06, "loss": 0.2251, "num_input_tokens_seen": 11948352, "step": 11885 }, { "epoch": 6.304347826086957, "grad_norm": 12.15489387512207, "learning_rate": 8.654495627579563e-06, "loss": 0.3061, "num_input_tokens_seen": 11953888, "step": 11890 }, { "epoch": 6.306998939554613, "grad_norm": 5.899701118469238, "learning_rate": 8.652916283721762e-06, "loss": 0.3461, "num_input_tokens_seen": 11958112, "step": 11895 }, { "epoch": 6.30965005302227, "grad_norm": 7.254705905914307, "learning_rate": 8.651336157784142e-06, "loss": 0.2014, "num_input_tokens_seen": 11962560, "step": 11900 }, { "epoch": 6.312301166489926, "grad_norm": 6.9805588722229, "learning_rate": 8.649755250105007e-06, "loss": 0.2233, "num_input_tokens_seen": 11967168, "step": 11905 }, { "epoch": 6.314952279957582, "grad_norm": 5.093031406402588, "learning_rate": 8.648173561022824e-06, "loss": 0.2293, "num_input_tokens_seen": 11973312, "step": 11910 }, { "epoch": 6.317603393425238, "grad_norm": 3.768261432647705, "learning_rate": 8.646591090876225e-06, "loss": 0.235, "num_input_tokens_seen": 11977504, "step": 11915 }, { "epoch": 6.320254506892895, "grad_norm": 10.204911231994629, "learning_rate": 8.645007840004019e-06, "loss": 0.3584, "num_input_tokens_seen": 11983072, "step": 11920 }, { "epoch": 6.322905620360552, "grad_norm": 7.726245880126953, "learning_rate": 8.64342380874517e-06, "loss": 0.3448, "num_input_tokens_seen": 11988320, "step": 11925 }, { "epoch": 6.325556733828208, "grad_norm": 7.541950225830078, "learning_rate": 8.641838997438817e-06, "loss": 0.2758, "num_input_tokens_seen": 11992640, "step": 11930 }, { "epoch": 6.3282078472958645, "grad_norm": 4.090737819671631, "learning_rate": 8.640253406424266e-06, "loss": 0.1862, "num_input_tokens_seen": 11997184, "step": 11935 }, { "epoch": 6.33085896076352, "grad_norm": 7.65837287902832, "learning_rate": 8.638667036040986e-06, "loss": 0.2863, "num_input_tokens_seen": 12002240, "step": 11940 }, { "epoch": 6.333510074231177, "grad_norm": 8.939868927001953, "learning_rate": 8.637079886628614e-06, "loss": 0.2906, "num_input_tokens_seen": 12006496, "step": 11945 }, { "epoch": 6.336161187698833, "grad_norm": 3.952402353286743, "learning_rate": 8.635491958526957e-06, "loss": 0.293, "num_input_tokens_seen": 12012640, "step": 11950 }, { "epoch": 6.33881230116649, "grad_norm": 9.706802368164062, "learning_rate": 8.633903252075984e-06, "loss": 0.3481, "num_input_tokens_seen": 12016512, "step": 11955 }, { "epoch": 6.341463414634147, "grad_norm": 8.925323486328125, "learning_rate": 8.632313767615834e-06, "loss": 0.295, "num_input_tokens_seen": 12021472, "step": 11960 }, { "epoch": 6.3441145281018025, "grad_norm": 8.077963829040527, "learning_rate": 8.630723505486811e-06, "loss": 0.3289, "num_input_tokens_seen": 12026400, "step": 11965 }, { "epoch": 6.346765641569459, "grad_norm": 8.534379005432129, "learning_rate": 8.629132466029387e-06, "loss": 0.2081, "num_input_tokens_seen": 12032064, "step": 11970 }, { "epoch": 6.349416755037115, "grad_norm": 6.307270050048828, "learning_rate": 8.627540649584196e-06, "loss": 0.2595, "num_input_tokens_seen": 12037824, "step": 11975 }, { "epoch": 6.352067868504772, "grad_norm": 5.505382537841797, "learning_rate": 8.625948056492045e-06, "loss": 0.2342, "num_input_tokens_seen": 12042208, "step": 11980 }, { "epoch": 6.354718981972429, "grad_norm": 6.512608051300049, "learning_rate": 8.624354687093905e-06, "loss": 0.1775, "num_input_tokens_seen": 12046912, "step": 11985 }, { "epoch": 6.357370095440085, "grad_norm": 2.8555524349212646, "learning_rate": 8.62276054173091e-06, "loss": 0.2895, "num_input_tokens_seen": 12051264, "step": 11990 }, { "epoch": 6.3600212089077415, "grad_norm": 9.815994262695312, "learning_rate": 8.621165620744362e-06, "loss": 0.2881, "num_input_tokens_seen": 12055360, "step": 11995 }, { "epoch": 6.362672322375397, "grad_norm": 3.9747653007507324, "learning_rate": 8.61956992447573e-06, "loss": 0.1181, "num_input_tokens_seen": 12060544, "step": 12000 }, { "epoch": 6.365323435843054, "grad_norm": 7.311384201049805, "learning_rate": 8.61797345326665e-06, "loss": 0.3109, "num_input_tokens_seen": 12064864, "step": 12005 }, { "epoch": 6.367974549310711, "grad_norm": 4.765517711639404, "learning_rate": 8.61637620745892e-06, "loss": 0.2567, "num_input_tokens_seen": 12070048, "step": 12010 }, { "epoch": 6.370625662778367, "grad_norm": 9.143599510192871, "learning_rate": 8.614778187394507e-06, "loss": 0.3534, "num_input_tokens_seen": 12074624, "step": 12015 }, { "epoch": 6.373276776246024, "grad_norm": 4.325976371765137, "learning_rate": 8.613179393415543e-06, "loss": 0.2685, "num_input_tokens_seen": 12078816, "step": 12020 }, { "epoch": 6.3759278897136795, "grad_norm": 4.309521675109863, "learning_rate": 8.611579825864327e-06, "loss": 0.1192, "num_input_tokens_seen": 12083296, "step": 12025 }, { "epoch": 6.378579003181336, "grad_norm": 3.61370587348938, "learning_rate": 8.609979485083319e-06, "loss": 0.2243, "num_input_tokens_seen": 12088992, "step": 12030 }, { "epoch": 6.381230116648992, "grad_norm": 5.951462268829346, "learning_rate": 8.60837837141515e-06, "loss": 0.3284, "num_input_tokens_seen": 12093920, "step": 12035 }, { "epoch": 6.383881230116649, "grad_norm": 3.6313908100128174, "learning_rate": 8.606776485202615e-06, "loss": 0.227, "num_input_tokens_seen": 12100256, "step": 12040 }, { "epoch": 6.386532343584306, "grad_norm": 6.043890953063965, "learning_rate": 8.605173826788671e-06, "loss": 0.1792, "num_input_tokens_seen": 12105472, "step": 12045 }, { "epoch": 6.389183457051962, "grad_norm": 6.175252437591553, "learning_rate": 8.603570396516446e-06, "loss": 0.2128, "num_input_tokens_seen": 12110144, "step": 12050 }, { "epoch": 6.3918345705196185, "grad_norm": 8.517328262329102, "learning_rate": 8.601966194729228e-06, "loss": 0.3473, "num_input_tokens_seen": 12115296, "step": 12055 }, { "epoch": 6.394485683987274, "grad_norm": 11.063161849975586, "learning_rate": 8.600361221770473e-06, "loss": 0.4964, "num_input_tokens_seen": 12121280, "step": 12060 }, { "epoch": 6.397136797454931, "grad_norm": 13.20518970489502, "learning_rate": 8.5987554779838e-06, "loss": 0.291, "num_input_tokens_seen": 12125536, "step": 12065 }, { "epoch": 6.399787910922587, "grad_norm": 4.340097427368164, "learning_rate": 8.597148963712999e-06, "loss": 0.3154, "num_input_tokens_seen": 12129760, "step": 12070 }, { "epoch": 6.402439024390244, "grad_norm": 9.082359313964844, "learning_rate": 8.595541679302016e-06, "loss": 0.2044, "num_input_tokens_seen": 12135392, "step": 12075 }, { "epoch": 6.405090137857901, "grad_norm": 7.000458717346191, "learning_rate": 8.593933625094968e-06, "loss": 0.2509, "num_input_tokens_seen": 12139872, "step": 12080 }, { "epoch": 6.4077412513255565, "grad_norm": 5.753509044647217, "learning_rate": 8.592324801436137e-06, "loss": 0.1396, "num_input_tokens_seen": 12144032, "step": 12085 }, { "epoch": 6.410392364793213, "grad_norm": 11.573004722595215, "learning_rate": 8.590715208669965e-06, "loss": 0.2879, "num_input_tokens_seen": 12148096, "step": 12090 }, { "epoch": 6.413043478260869, "grad_norm": 8.155421257019043, "learning_rate": 8.58910484714106e-06, "loss": 0.2747, "num_input_tokens_seen": 12152352, "step": 12095 }, { "epoch": 6.415694591728526, "grad_norm": 10.299715995788574, "learning_rate": 8.587493717194204e-06, "loss": 0.2642, "num_input_tokens_seen": 12156576, "step": 12100 }, { "epoch": 6.418345705196183, "grad_norm": 15.709223747253418, "learning_rate": 8.585881819174327e-06, "loss": 0.4698, "num_input_tokens_seen": 12161504, "step": 12105 }, { "epoch": 6.420996818663839, "grad_norm": 2.4308576583862305, "learning_rate": 8.584269153426537e-06, "loss": 0.1432, "num_input_tokens_seen": 12166208, "step": 12110 }, { "epoch": 6.423647932131495, "grad_norm": 13.253597259521484, "learning_rate": 8.5826557202961e-06, "loss": 0.2169, "num_input_tokens_seen": 12171488, "step": 12115 }, { "epoch": 6.426299045599151, "grad_norm": 12.316726684570312, "learning_rate": 8.581041520128449e-06, "loss": 0.3205, "num_input_tokens_seen": 12175296, "step": 12120 }, { "epoch": 6.428950159066808, "grad_norm": 13.536665916442871, "learning_rate": 8.579426553269179e-06, "loss": 0.4658, "num_input_tokens_seen": 12180608, "step": 12125 }, { "epoch": 6.431601272534465, "grad_norm": 3.8319809436798096, "learning_rate": 8.57781082006405e-06, "loss": 0.3752, "num_input_tokens_seen": 12186272, "step": 12130 }, { "epoch": 6.434252386002121, "grad_norm": 8.575493812561035, "learning_rate": 8.576194320858985e-06, "loss": 0.2312, "num_input_tokens_seen": 12191168, "step": 12135 }, { "epoch": 6.436903499469778, "grad_norm": 10.9705228805542, "learning_rate": 8.574577056000077e-06, "loss": 0.2518, "num_input_tokens_seen": 12197216, "step": 12140 }, { "epoch": 6.4395546129374335, "grad_norm": 7.008718013763428, "learning_rate": 8.572959025833573e-06, "loss": 0.4107, "num_input_tokens_seen": 12201792, "step": 12145 }, { "epoch": 6.44220572640509, "grad_norm": 5.024021625518799, "learning_rate": 8.571340230705893e-06, "loss": 0.2551, "num_input_tokens_seen": 12206752, "step": 12150 }, { "epoch": 6.444856839872746, "grad_norm": 8.306498527526855, "learning_rate": 8.569720670963612e-06, "loss": 0.3235, "num_input_tokens_seen": 12212352, "step": 12155 }, { "epoch": 6.447507953340403, "grad_norm": 11.738794326782227, "learning_rate": 8.568100346953479e-06, "loss": 0.3513, "num_input_tokens_seen": 12216960, "step": 12160 }, { "epoch": 6.45015906680806, "grad_norm": 6.473093032836914, "learning_rate": 8.566479259022398e-06, "loss": 0.2661, "num_input_tokens_seen": 12223072, "step": 12165 }, { "epoch": 6.452810180275716, "grad_norm": 9.445636749267578, "learning_rate": 8.564857407517438e-06, "loss": 0.3067, "num_input_tokens_seen": 12227776, "step": 12170 }, { "epoch": 6.455461293743372, "grad_norm": 11.96917724609375, "learning_rate": 8.563234792785838e-06, "loss": 0.3338, "num_input_tokens_seen": 12231904, "step": 12175 }, { "epoch": 6.458112407211028, "grad_norm": 10.159355163574219, "learning_rate": 8.561611415174991e-06, "loss": 0.3107, "num_input_tokens_seen": 12236736, "step": 12180 }, { "epoch": 6.460763520678685, "grad_norm": 5.5395917892456055, "learning_rate": 8.55998727503246e-06, "loss": 0.1818, "num_input_tokens_seen": 12241824, "step": 12185 }, { "epoch": 6.463414634146342, "grad_norm": 9.549460411071777, "learning_rate": 8.558362372705971e-06, "loss": 0.2769, "num_input_tokens_seen": 12246144, "step": 12190 }, { "epoch": 6.466065747613998, "grad_norm": 6.117812156677246, "learning_rate": 8.556736708543407e-06, "loss": 0.195, "num_input_tokens_seen": 12252384, "step": 12195 }, { "epoch": 6.468716861081655, "grad_norm": 8.891854286193848, "learning_rate": 8.555110282892822e-06, "loss": 0.2757, "num_input_tokens_seen": 12257632, "step": 12200 }, { "epoch": 6.4713679745493105, "grad_norm": 9.329900741577148, "learning_rate": 8.55348309610243e-06, "loss": 0.3161, "num_input_tokens_seen": 12262432, "step": 12205 }, { "epoch": 6.474019088016967, "grad_norm": 6.305452346801758, "learning_rate": 8.5518551485206e-06, "loss": 0.2469, "num_input_tokens_seen": 12267008, "step": 12210 }, { "epoch": 6.476670201484623, "grad_norm": 3.643178939819336, "learning_rate": 8.550226440495882e-06, "loss": 0.1475, "num_input_tokens_seen": 12271040, "step": 12215 }, { "epoch": 6.47932131495228, "grad_norm": 5.389724254608154, "learning_rate": 8.548596972376971e-06, "loss": 0.2563, "num_input_tokens_seen": 12275872, "step": 12220 }, { "epoch": 6.481972428419937, "grad_norm": 4.8844075202941895, "learning_rate": 8.546966744512735e-06, "loss": 0.1987, "num_input_tokens_seen": 12280608, "step": 12225 }, { "epoch": 6.484623541887593, "grad_norm": 8.926965713500977, "learning_rate": 8.5453357572522e-06, "loss": 0.2694, "num_input_tokens_seen": 12286624, "step": 12230 }, { "epoch": 6.487274655355249, "grad_norm": 9.936827659606934, "learning_rate": 8.543704010944557e-06, "loss": 0.2264, "num_input_tokens_seen": 12291488, "step": 12235 }, { "epoch": 6.489925768822905, "grad_norm": 6.616876602172852, "learning_rate": 8.542071505939157e-06, "loss": 0.2812, "num_input_tokens_seen": 12296928, "step": 12240 }, { "epoch": 6.492576882290562, "grad_norm": 2.7821972370147705, "learning_rate": 8.540438242585517e-06, "loss": 0.2026, "num_input_tokens_seen": 12301600, "step": 12245 }, { "epoch": 6.495227995758219, "grad_norm": 4.020016670227051, "learning_rate": 8.538804221233315e-06, "loss": 0.1959, "num_input_tokens_seen": 12306368, "step": 12250 }, { "epoch": 6.497879109225875, "grad_norm": 12.994318008422852, "learning_rate": 8.537169442232387e-06, "loss": 0.2988, "num_input_tokens_seen": 12311552, "step": 12255 }, { "epoch": 6.5005302226935315, "grad_norm": 11.597657203674316, "learning_rate": 8.535533905932739e-06, "loss": 0.2598, "num_input_tokens_seen": 12316768, "step": 12260 }, { "epoch": 6.503181336161187, "grad_norm": 5.604786396026611, "learning_rate": 8.533897612684531e-06, "loss": 0.336, "num_input_tokens_seen": 12323296, "step": 12265 }, { "epoch": 6.505832449628844, "grad_norm": 4.944756984710693, "learning_rate": 8.532260562838095e-06, "loss": 0.23, "num_input_tokens_seen": 12328480, "step": 12270 }, { "epoch": 6.508483563096501, "grad_norm": 13.407609939575195, "learning_rate": 8.530622756743912e-06, "loss": 0.3057, "num_input_tokens_seen": 12332608, "step": 12275 }, { "epoch": 6.511134676564157, "grad_norm": 2.9946086406707764, "learning_rate": 8.528984194752638e-06, "loss": 0.1291, "num_input_tokens_seen": 12338944, "step": 12280 }, { "epoch": 6.513785790031814, "grad_norm": 8.561816215515137, "learning_rate": 8.52734487721508e-06, "loss": 0.2193, "num_input_tokens_seen": 12343936, "step": 12285 }, { "epoch": 6.51643690349947, "grad_norm": 11.567367553710938, "learning_rate": 8.525704804482216e-06, "loss": 0.3745, "num_input_tokens_seen": 12349056, "step": 12290 }, { "epoch": 6.519088016967126, "grad_norm": 8.830004692077637, "learning_rate": 8.524063976905178e-06, "loss": 0.2109, "num_input_tokens_seen": 12353984, "step": 12295 }, { "epoch": 6.521739130434782, "grad_norm": 14.799060821533203, "learning_rate": 8.522422394835264e-06, "loss": 0.4285, "num_input_tokens_seen": 12360352, "step": 12300 }, { "epoch": 6.524390243902439, "grad_norm": 6.1318559646606445, "learning_rate": 8.520780058623935e-06, "loss": 0.2996, "num_input_tokens_seen": 12365504, "step": 12305 }, { "epoch": 6.527041357370095, "grad_norm": 2.8554205894470215, "learning_rate": 8.519136968622805e-06, "loss": 0.2762, "num_input_tokens_seen": 12369600, "step": 12310 }, { "epoch": 6.529692470837752, "grad_norm": 7.547086238861084, "learning_rate": 8.51749312518366e-06, "loss": 0.3174, "num_input_tokens_seen": 12373984, "step": 12315 }, { "epoch": 6.5323435843054085, "grad_norm": 10.193721771240234, "learning_rate": 8.515848528658442e-06, "loss": 0.3258, "num_input_tokens_seen": 12378336, "step": 12320 }, { "epoch": 6.534994697773064, "grad_norm": 11.839421272277832, "learning_rate": 8.514203179399253e-06, "loss": 0.3215, "num_input_tokens_seen": 12383552, "step": 12325 }, { "epoch": 6.537645811240721, "grad_norm": 8.473052978515625, "learning_rate": 8.51255707775836e-06, "loss": 0.3135, "num_input_tokens_seen": 12388704, "step": 12330 }, { "epoch": 6.540296924708377, "grad_norm": 8.192155838012695, "learning_rate": 8.510910224088186e-06, "loss": 0.3017, "num_input_tokens_seen": 12394368, "step": 12335 }, { "epoch": 6.542948038176034, "grad_norm": 4.3262128829956055, "learning_rate": 8.509262618741323e-06, "loss": 0.2622, "num_input_tokens_seen": 12399040, "step": 12340 }, { "epoch": 6.545599151643691, "grad_norm": 3.7567954063415527, "learning_rate": 8.507614262070515e-06, "loss": 0.233, "num_input_tokens_seen": 12403872, "step": 12345 }, { "epoch": 6.548250265111347, "grad_norm": 4.143430709838867, "learning_rate": 8.50596515442867e-06, "loss": 0.2068, "num_input_tokens_seen": 12409280, "step": 12350 }, { "epoch": 6.550901378579003, "grad_norm": 8.912625312805176, "learning_rate": 8.504315296168863e-06, "loss": 0.3415, "num_input_tokens_seen": 12414496, "step": 12355 }, { "epoch": 6.553552492046659, "grad_norm": 9.343994140625, "learning_rate": 8.50266468764432e-06, "loss": 0.3037, "num_input_tokens_seen": 12419936, "step": 12360 }, { "epoch": 6.556203605514316, "grad_norm": 6.910220623016357, "learning_rate": 8.501013329208431e-06, "loss": 0.2552, "num_input_tokens_seen": 12425184, "step": 12365 }, { "epoch": 6.558854718981973, "grad_norm": 5.560060977935791, "learning_rate": 8.499361221214752e-06, "loss": 0.2685, "num_input_tokens_seen": 12429664, "step": 12370 }, { "epoch": 6.561505832449629, "grad_norm": 7.219608306884766, "learning_rate": 8.49770836401699e-06, "loss": 0.2242, "num_input_tokens_seen": 12434304, "step": 12375 }, { "epoch": 6.5641569459172855, "grad_norm": 6.0087890625, "learning_rate": 8.496054757969021e-06, "loss": 0.2224, "num_input_tokens_seen": 12438720, "step": 12380 }, { "epoch": 6.566808059384941, "grad_norm": 7.433694362640381, "learning_rate": 8.494400403424877e-06, "loss": 0.2142, "num_input_tokens_seen": 12443584, "step": 12385 }, { "epoch": 6.569459172852598, "grad_norm": 7.069397449493408, "learning_rate": 8.492745300738749e-06, "loss": 0.2561, "num_input_tokens_seen": 12448320, "step": 12390 }, { "epoch": 6.572110286320255, "grad_norm": 1.3931199312210083, "learning_rate": 8.491089450264994e-06, "loss": 0.298, "num_input_tokens_seen": 12452576, "step": 12395 }, { "epoch": 6.574761399787911, "grad_norm": 8.305237770080566, "learning_rate": 8.489432852358121e-06, "loss": 0.2455, "num_input_tokens_seen": 12458752, "step": 12400 }, { "epoch": 6.577412513255568, "grad_norm": 10.422895431518555, "learning_rate": 8.487775507372807e-06, "loss": 0.3899, "num_input_tokens_seen": 12463840, "step": 12405 }, { "epoch": 6.5800636267232235, "grad_norm": 9.383157730102539, "learning_rate": 8.486117415663883e-06, "loss": 0.3927, "num_input_tokens_seen": 12468640, "step": 12410 }, { "epoch": 6.58271474019088, "grad_norm": 3.719179630279541, "learning_rate": 8.484458577586342e-06, "loss": 0.2026, "num_input_tokens_seen": 12474112, "step": 12415 }, { "epoch": 6.585365853658536, "grad_norm": 8.14809513092041, "learning_rate": 8.482798993495338e-06, "loss": 0.1957, "num_input_tokens_seen": 12478592, "step": 12420 }, { "epoch": 6.588016967126193, "grad_norm": 5.938578128814697, "learning_rate": 8.481138663746183e-06, "loss": 0.3282, "num_input_tokens_seen": 12483392, "step": 12425 }, { "epoch": 6.59066808059385, "grad_norm": 6.220433712005615, "learning_rate": 8.479477588694347e-06, "loss": 0.389, "num_input_tokens_seen": 12487648, "step": 12430 }, { "epoch": 6.593319194061506, "grad_norm": 0.860947847366333, "learning_rate": 8.477815768695466e-06, "loss": 0.1428, "num_input_tokens_seen": 12493152, "step": 12435 }, { "epoch": 6.5959703075291625, "grad_norm": 13.872446060180664, "learning_rate": 8.476153204105331e-06, "loss": 0.4248, "num_input_tokens_seen": 12497184, "step": 12440 }, { "epoch": 6.598621420996818, "grad_norm": 9.424049377441406, "learning_rate": 8.47448989527989e-06, "loss": 0.2738, "num_input_tokens_seen": 12502208, "step": 12445 }, { "epoch": 6.601272534464475, "grad_norm": 10.065286636352539, "learning_rate": 8.472825842575251e-06, "loss": 0.3199, "num_input_tokens_seen": 12506752, "step": 12450 }, { "epoch": 6.603923647932131, "grad_norm": 2.278794527053833, "learning_rate": 8.47116104634769e-06, "loss": 0.2233, "num_input_tokens_seen": 12511264, "step": 12455 }, { "epoch": 6.606574761399788, "grad_norm": 7.871973037719727, "learning_rate": 8.469495506953634e-06, "loss": 0.2393, "num_input_tokens_seen": 12516128, "step": 12460 }, { "epoch": 6.609225874867445, "grad_norm": 4.914002895355225, "learning_rate": 8.467829224749665e-06, "loss": 0.177, "num_input_tokens_seen": 12520640, "step": 12465 }, { "epoch": 6.6118769883351005, "grad_norm": 6.4317626953125, "learning_rate": 8.466162200092536e-06, "loss": 0.3585, "num_input_tokens_seen": 12524832, "step": 12470 }, { "epoch": 6.614528101802757, "grad_norm": 8.001032829284668, "learning_rate": 8.464494433339147e-06, "loss": 0.3291, "num_input_tokens_seen": 12529728, "step": 12475 }, { "epoch": 6.617179215270413, "grad_norm": 5.864368915557861, "learning_rate": 8.462825924846569e-06, "loss": 0.2573, "num_input_tokens_seen": 12534464, "step": 12480 }, { "epoch": 6.61983032873807, "grad_norm": 7.126362323760986, "learning_rate": 8.461156674972018e-06, "loss": 0.2327, "num_input_tokens_seen": 12539392, "step": 12485 }, { "epoch": 6.622481442205727, "grad_norm": 7.634777069091797, "learning_rate": 8.459486684072883e-06, "loss": 0.2335, "num_input_tokens_seen": 12543616, "step": 12490 }, { "epoch": 6.625132555673383, "grad_norm": 5.069000720977783, "learning_rate": 8.457815952506701e-06, "loss": 0.3159, "num_input_tokens_seen": 12547808, "step": 12495 }, { "epoch": 6.6277836691410394, "grad_norm": 10.907478332519531, "learning_rate": 8.45614448063117e-06, "loss": 0.2948, "num_input_tokens_seen": 12552704, "step": 12500 }, { "epoch": 6.630434782608695, "grad_norm": 5.654964923858643, "learning_rate": 8.454472268804151e-06, "loss": 0.1692, "num_input_tokens_seen": 12557376, "step": 12505 }, { "epoch": 6.633085896076352, "grad_norm": 3.4314095973968506, "learning_rate": 8.452799317383657e-06, "loss": 0.2819, "num_input_tokens_seen": 12561920, "step": 12510 }, { "epoch": 6.635737009544009, "grad_norm": 7.976579666137695, "learning_rate": 8.451125626727865e-06, "loss": 0.329, "num_input_tokens_seen": 12567872, "step": 12515 }, { "epoch": 6.638388123011665, "grad_norm": 9.321878433227539, "learning_rate": 8.449451197195108e-06, "loss": 0.2619, "num_input_tokens_seen": 12572384, "step": 12520 }, { "epoch": 6.641039236479322, "grad_norm": 9.208358764648438, "learning_rate": 8.447776029143876e-06, "loss": 0.2358, "num_input_tokens_seen": 12576576, "step": 12525 }, { "epoch": 6.6436903499469775, "grad_norm": 4.790472984313965, "learning_rate": 8.446100122932818e-06, "loss": 0.224, "num_input_tokens_seen": 12581248, "step": 12530 }, { "epoch": 6.646341463414634, "grad_norm": 9.975404739379883, "learning_rate": 8.44442347892074e-06, "loss": 0.2289, "num_input_tokens_seen": 12585952, "step": 12535 }, { "epoch": 6.648992576882291, "grad_norm": 6.111028671264648, "learning_rate": 8.44274609746661e-06, "loss": 0.2714, "num_input_tokens_seen": 12592000, "step": 12540 }, { "epoch": 6.651643690349947, "grad_norm": 6.2693023681640625, "learning_rate": 8.441067978929548e-06, "loss": 0.1968, "num_input_tokens_seen": 12596448, "step": 12545 }, { "epoch": 6.654294803817604, "grad_norm": 11.929044723510742, "learning_rate": 8.439389123668836e-06, "loss": 0.3075, "num_input_tokens_seen": 12600512, "step": 12550 }, { "epoch": 6.65694591728526, "grad_norm": 9.477872848510742, "learning_rate": 8.437709532043914e-06, "loss": 0.2032, "num_input_tokens_seen": 12606400, "step": 12555 }, { "epoch": 6.659597030752916, "grad_norm": 3.1576614379882812, "learning_rate": 8.436029204414374e-06, "loss": 0.1822, "num_input_tokens_seen": 12611488, "step": 12560 }, { "epoch": 6.662248144220572, "grad_norm": 5.0093207359313965, "learning_rate": 8.434348141139975e-06, "loss": 0.1566, "num_input_tokens_seen": 12616640, "step": 12565 }, { "epoch": 6.664899257688229, "grad_norm": 8.93660831451416, "learning_rate": 8.432666342580624e-06, "loss": 0.3738, "num_input_tokens_seen": 12620384, "step": 12570 }, { "epoch": 6.667550371155885, "grad_norm": 7.280547618865967, "learning_rate": 8.430983809096394e-06, "loss": 0.337, "num_input_tokens_seen": 12627232, "step": 12575 }, { "epoch": 6.670201484623542, "grad_norm": 1.9841912984848022, "learning_rate": 8.429300541047507e-06, "loss": 0.2963, "num_input_tokens_seen": 12632288, "step": 12580 }, { "epoch": 6.672852598091199, "grad_norm": 9.959922790527344, "learning_rate": 8.427616538794348e-06, "loss": 0.2621, "num_input_tokens_seen": 12638368, "step": 12585 }, { "epoch": 6.6755037115588545, "grad_norm": 7.4336771965026855, "learning_rate": 8.425931802697455e-06, "loss": 0.2284, "num_input_tokens_seen": 12642176, "step": 12590 }, { "epoch": 6.678154825026511, "grad_norm": 2.9322333335876465, "learning_rate": 8.42424633311753e-06, "loss": 0.2468, "num_input_tokens_seen": 12647392, "step": 12595 }, { "epoch": 6.680805938494167, "grad_norm": 8.049104690551758, "learning_rate": 8.422560130415425e-06, "loss": 0.3446, "num_input_tokens_seen": 12652800, "step": 12600 }, { "epoch": 6.683457051961824, "grad_norm": 5.332820892333984, "learning_rate": 8.420873194952153e-06, "loss": 0.2256, "num_input_tokens_seen": 12658464, "step": 12605 }, { "epoch": 6.686108165429481, "grad_norm": 10.887274742126465, "learning_rate": 8.41918552708888e-06, "loss": 0.2355, "num_input_tokens_seen": 12663648, "step": 12610 }, { "epoch": 6.688759278897137, "grad_norm": 10.616301536560059, "learning_rate": 8.417497127186934e-06, "loss": 0.3943, "num_input_tokens_seen": 12668480, "step": 12615 }, { "epoch": 6.691410392364793, "grad_norm": 3.124382257461548, "learning_rate": 8.415807995607796e-06, "loss": 0.2138, "num_input_tokens_seen": 12672992, "step": 12620 }, { "epoch": 6.694061505832449, "grad_norm": 4.393032550811768, "learning_rate": 8.414118132713102e-06, "loss": 0.1627, "num_input_tokens_seen": 12678048, "step": 12625 }, { "epoch": 6.696712619300106, "grad_norm": 6.731339931488037, "learning_rate": 8.412427538864651e-06, "loss": 0.2155, "num_input_tokens_seen": 12682528, "step": 12630 }, { "epoch": 6.699363732767763, "grad_norm": 6.648188591003418, "learning_rate": 8.410736214424394e-06, "loss": 0.2997, "num_input_tokens_seen": 12687200, "step": 12635 }, { "epoch": 6.702014846235419, "grad_norm": 2.974381446838379, "learning_rate": 8.409044159754438e-06, "loss": 0.2073, "num_input_tokens_seen": 12692096, "step": 12640 }, { "epoch": 6.7046659597030756, "grad_norm": 13.256300926208496, "learning_rate": 8.407351375217048e-06, "loss": 0.3427, "num_input_tokens_seen": 12697056, "step": 12645 }, { "epoch": 6.7073170731707314, "grad_norm": 8.988642692565918, "learning_rate": 8.405657861174646e-06, "loss": 0.2353, "num_input_tokens_seen": 12702368, "step": 12650 }, { "epoch": 6.709968186638388, "grad_norm": 5.419163227081299, "learning_rate": 8.403963617989805e-06, "loss": 0.2609, "num_input_tokens_seen": 12707392, "step": 12655 }, { "epoch": 6.712619300106045, "grad_norm": 4.253873348236084, "learning_rate": 8.402268646025263e-06, "loss": 0.1923, "num_input_tokens_seen": 12712768, "step": 12660 }, { "epoch": 6.715270413573701, "grad_norm": 7.773209571838379, "learning_rate": 8.400572945643905e-06, "loss": 0.2347, "num_input_tokens_seen": 12716960, "step": 12665 }, { "epoch": 6.717921527041358, "grad_norm": 7.9715166091918945, "learning_rate": 8.398876517208778e-06, "loss": 0.3062, "num_input_tokens_seen": 12721216, "step": 12670 }, { "epoch": 6.720572640509014, "grad_norm": 6.4285359382629395, "learning_rate": 8.397179361083084e-06, "loss": 0.2664, "num_input_tokens_seen": 12725920, "step": 12675 }, { "epoch": 6.72322375397667, "grad_norm": 6.058890342712402, "learning_rate": 8.395481477630176e-06, "loss": 0.2528, "num_input_tokens_seen": 12730688, "step": 12680 }, { "epoch": 6.725874867444326, "grad_norm": 4.84157133102417, "learning_rate": 8.393782867213572e-06, "loss": 0.194, "num_input_tokens_seen": 12736640, "step": 12685 }, { "epoch": 6.728525980911983, "grad_norm": 9.351757049560547, "learning_rate": 8.392083530196933e-06, "loss": 0.2796, "num_input_tokens_seen": 12740832, "step": 12690 }, { "epoch": 6.731177094379639, "grad_norm": 6.586358547210693, "learning_rate": 8.39038346694409e-06, "loss": 0.2462, "num_input_tokens_seen": 12745408, "step": 12695 }, { "epoch": 6.733828207847296, "grad_norm": 3.823989152908325, "learning_rate": 8.388682677819016e-06, "loss": 0.3148, "num_input_tokens_seen": 12749536, "step": 12700 }, { "epoch": 6.7364793213149525, "grad_norm": 4.269595146179199, "learning_rate": 8.386981163185848e-06, "loss": 0.2119, "num_input_tokens_seen": 12754912, "step": 12705 }, { "epoch": 6.739130434782608, "grad_norm": 16.42567253112793, "learning_rate": 8.385278923408877e-06, "loss": 0.2706, "num_input_tokens_seen": 12759616, "step": 12710 }, { "epoch": 6.741781548250265, "grad_norm": 10.181418418884277, "learning_rate": 8.383575958852545e-06, "loss": 0.3669, "num_input_tokens_seen": 12764576, "step": 12715 }, { "epoch": 6.744432661717921, "grad_norm": 5.184785842895508, "learning_rate": 8.381872269881457e-06, "loss": 0.3409, "num_input_tokens_seen": 12769856, "step": 12720 }, { "epoch": 6.747083775185578, "grad_norm": 4.199455261230469, "learning_rate": 8.380167856860364e-06, "loss": 0.3071, "num_input_tokens_seen": 12776320, "step": 12725 }, { "epoch": 6.749734888653235, "grad_norm": 4.307294845581055, "learning_rate": 8.378462720154178e-06, "loss": 0.2623, "num_input_tokens_seen": 12780704, "step": 12730 }, { "epoch": 6.752386002120891, "grad_norm": 6.947247505187988, "learning_rate": 8.376756860127964e-06, "loss": 0.2997, "num_input_tokens_seen": 12786976, "step": 12735 }, { "epoch": 6.755037115588547, "grad_norm": 7.639339447021484, "learning_rate": 8.375050277146943e-06, "loss": 0.2844, "num_input_tokens_seen": 12791520, "step": 12740 }, { "epoch": 6.757688229056203, "grad_norm": 7.440855503082275, "learning_rate": 8.373342971576487e-06, "loss": 0.3354, "num_input_tokens_seen": 12796736, "step": 12745 }, { "epoch": 6.76033934252386, "grad_norm": 5.457198143005371, "learning_rate": 8.37163494378213e-06, "loss": 0.1432, "num_input_tokens_seen": 12800960, "step": 12750 }, { "epoch": 6.762990455991517, "grad_norm": 9.682514190673828, "learning_rate": 8.369926194129554e-06, "loss": 0.3468, "num_input_tokens_seen": 12806272, "step": 12755 }, { "epoch": 6.765641569459173, "grad_norm": 2.764298915863037, "learning_rate": 8.368216722984597e-06, "loss": 0.2862, "num_input_tokens_seen": 12811552, "step": 12760 }, { "epoch": 6.7682926829268295, "grad_norm": 5.397490978240967, "learning_rate": 8.366506530713255e-06, "loss": 0.2864, "num_input_tokens_seen": 12819552, "step": 12765 }, { "epoch": 6.770943796394485, "grad_norm": 5.063434600830078, "learning_rate": 8.364795617681673e-06, "loss": 0.2714, "num_input_tokens_seen": 12824736, "step": 12770 }, { "epoch": 6.773594909862142, "grad_norm": 7.380745887756348, "learning_rate": 8.363083984256155e-06, "loss": 0.2354, "num_input_tokens_seen": 12828000, "step": 12775 }, { "epoch": 6.776246023329799, "grad_norm": 7.170220375061035, "learning_rate": 8.361371630803154e-06, "loss": 0.2211, "num_input_tokens_seen": 12832320, "step": 12780 }, { "epoch": 6.778897136797455, "grad_norm": 7.507736682891846, "learning_rate": 8.359658557689285e-06, "loss": 0.3022, "num_input_tokens_seen": 12837760, "step": 12785 }, { "epoch": 6.781548250265112, "grad_norm": 3.8545684814453125, "learning_rate": 8.357944765281309e-06, "loss": 0.2219, "num_input_tokens_seen": 12842048, "step": 12790 }, { "epoch": 6.7841993637327676, "grad_norm": 10.006948471069336, "learning_rate": 8.356230253946148e-06, "loss": 0.3466, "num_input_tokens_seen": 12848064, "step": 12795 }, { "epoch": 6.786850477200424, "grad_norm": 6.524844169616699, "learning_rate": 8.354515024050869e-06, "loss": 0.3263, "num_input_tokens_seen": 12853280, "step": 12800 }, { "epoch": 6.78950159066808, "grad_norm": 5.082912445068359, "learning_rate": 8.3527990759627e-06, "loss": 0.2423, "num_input_tokens_seen": 12858592, "step": 12805 }, { "epoch": 6.792152704135737, "grad_norm": 7.014626979827881, "learning_rate": 8.351082410049025e-06, "loss": 0.1858, "num_input_tokens_seen": 12864416, "step": 12810 }, { "epoch": 6.794803817603394, "grad_norm": 5.159501075744629, "learning_rate": 8.349365026677375e-06, "loss": 0.2353, "num_input_tokens_seen": 12869504, "step": 12815 }, { "epoch": 6.79745493107105, "grad_norm": 5.733631134033203, "learning_rate": 8.347646926215436e-06, "loss": 0.4015, "num_input_tokens_seen": 12874176, "step": 12820 }, { "epoch": 6.8001060445387065, "grad_norm": 6.931877613067627, "learning_rate": 8.345928109031052e-06, "loss": 0.2381, "num_input_tokens_seen": 12878720, "step": 12825 }, { "epoch": 6.802757158006362, "grad_norm": 6.34628963470459, "learning_rate": 8.344208575492212e-06, "loss": 0.1607, "num_input_tokens_seen": 12883584, "step": 12830 }, { "epoch": 6.805408271474019, "grad_norm": 1.7929788827896118, "learning_rate": 8.342488325967068e-06, "loss": 0.2119, "num_input_tokens_seen": 12888096, "step": 12835 }, { "epoch": 6.808059384941675, "grad_norm": 4.01327657699585, "learning_rate": 8.34076736082392e-06, "loss": 0.2225, "num_input_tokens_seen": 12893696, "step": 12840 }, { "epoch": 6.810710498409332, "grad_norm": 5.795009613037109, "learning_rate": 8.339045680431223e-06, "loss": 0.2446, "num_input_tokens_seen": 12898848, "step": 12845 }, { "epoch": 6.813361611876989, "grad_norm": 7.438446998596191, "learning_rate": 8.337323285157584e-06, "loss": 0.2769, "num_input_tokens_seen": 12904448, "step": 12850 }, { "epoch": 6.8160127253446445, "grad_norm": 9.497791290283203, "learning_rate": 8.33560017537176e-06, "loss": 0.3718, "num_input_tokens_seen": 12908928, "step": 12855 }, { "epoch": 6.818663838812301, "grad_norm": 7.788669109344482, "learning_rate": 8.33387635144267e-06, "loss": 0.3326, "num_input_tokens_seen": 12915232, "step": 12860 }, { "epoch": 6.821314952279957, "grad_norm": 6.024324893951416, "learning_rate": 8.332151813739373e-06, "loss": 0.146, "num_input_tokens_seen": 12919680, "step": 12865 }, { "epoch": 6.823966065747614, "grad_norm": 5.027156829833984, "learning_rate": 8.330426562631095e-06, "loss": 0.2667, "num_input_tokens_seen": 12924384, "step": 12870 }, { "epoch": 6.826617179215271, "grad_norm": 6.795717716217041, "learning_rate": 8.328700598487203e-06, "loss": 0.2445, "num_input_tokens_seen": 12929568, "step": 12875 }, { "epoch": 6.829268292682927, "grad_norm": 7.298725605010986, "learning_rate": 8.326973921677226e-06, "loss": 0.217, "num_input_tokens_seen": 12935296, "step": 12880 }, { "epoch": 6.8319194061505835, "grad_norm": 5.488876819610596, "learning_rate": 8.325246532570836e-06, "loss": 0.2324, "num_input_tokens_seen": 12940640, "step": 12885 }, { "epoch": 6.834570519618239, "grad_norm": 6.48073673248291, "learning_rate": 8.323518431537866e-06, "loss": 0.3054, "num_input_tokens_seen": 12944960, "step": 12890 }, { "epoch": 6.837221633085896, "grad_norm": 12.881953239440918, "learning_rate": 8.321789618948293e-06, "loss": 0.349, "num_input_tokens_seen": 12949856, "step": 12895 }, { "epoch": 6.839872746553553, "grad_norm": 11.416360855102539, "learning_rate": 8.320060095172259e-06, "loss": 0.3399, "num_input_tokens_seen": 12954272, "step": 12900 }, { "epoch": 6.842523860021209, "grad_norm": 7.277920246124268, "learning_rate": 8.318329860580044e-06, "loss": 0.4338, "num_input_tokens_seen": 12959552, "step": 12905 }, { "epoch": 6.845174973488866, "grad_norm": 4.332813262939453, "learning_rate": 8.316598915542089e-06, "loss": 0.2619, "num_input_tokens_seen": 12963904, "step": 12910 }, { "epoch": 6.8478260869565215, "grad_norm": 9.898289680480957, "learning_rate": 8.314867260428987e-06, "loss": 0.334, "num_input_tokens_seen": 12969312, "step": 12915 }, { "epoch": 6.850477200424178, "grad_norm": 9.674580574035645, "learning_rate": 8.313134895611476e-06, "loss": 0.3905, "num_input_tokens_seen": 12974016, "step": 12920 }, { "epoch": 6.853128313891834, "grad_norm": 6.8445143699646, "learning_rate": 8.311401821460456e-06, "loss": 0.2131, "num_input_tokens_seen": 12978208, "step": 12925 }, { "epoch": 6.855779427359491, "grad_norm": 5.7612223625183105, "learning_rate": 8.309668038346969e-06, "loss": 0.4526, "num_input_tokens_seen": 12985504, "step": 12930 }, { "epoch": 6.858430540827148, "grad_norm": 6.033755302429199, "learning_rate": 8.307933546642217e-06, "loss": 0.2428, "num_input_tokens_seen": 12989760, "step": 12935 }, { "epoch": 6.861081654294804, "grad_norm": 5.903223991394043, "learning_rate": 8.306198346717546e-06, "loss": 0.3336, "num_input_tokens_seen": 12993952, "step": 12940 }, { "epoch": 6.86373276776246, "grad_norm": 8.759936332702637, "learning_rate": 8.304462438944463e-06, "loss": 0.1978, "num_input_tokens_seen": 12998848, "step": 12945 }, { "epoch": 6.866383881230116, "grad_norm": 5.921856880187988, "learning_rate": 8.302725823694619e-06, "loss": 0.305, "num_input_tokens_seen": 13004160, "step": 12950 }, { "epoch": 6.869034994697773, "grad_norm": 6.008667469024658, "learning_rate": 8.300988501339817e-06, "loss": 0.3142, "num_input_tokens_seen": 13010304, "step": 12955 }, { "epoch": 6.871686108165429, "grad_norm": 11.420303344726562, "learning_rate": 8.299250472252015e-06, "loss": 0.2811, "num_input_tokens_seen": 13014656, "step": 12960 }, { "epoch": 6.874337221633086, "grad_norm": 9.747218132019043, "learning_rate": 8.29751173680332e-06, "loss": 0.2736, "num_input_tokens_seen": 13019968, "step": 12965 }, { "epoch": 6.876988335100743, "grad_norm": 6.609870433807373, "learning_rate": 8.295772295365992e-06, "loss": 0.2621, "num_input_tokens_seen": 13025056, "step": 12970 }, { "epoch": 6.8796394485683985, "grad_norm": 7.243781089782715, "learning_rate": 8.294032148312438e-06, "loss": 0.2453, "num_input_tokens_seen": 13030176, "step": 12975 }, { "epoch": 6.882290562036055, "grad_norm": 9.135292053222656, "learning_rate": 8.292291296015222e-06, "loss": 0.2356, "num_input_tokens_seen": 13034656, "step": 12980 }, { "epoch": 6.884941675503711, "grad_norm": 7.064098834991455, "learning_rate": 8.290549738847055e-06, "loss": 0.289, "num_input_tokens_seen": 13039392, "step": 12985 }, { "epoch": 6.887592788971368, "grad_norm": 9.673505783081055, "learning_rate": 8.288807477180798e-06, "loss": 0.365, "num_input_tokens_seen": 13043904, "step": 12990 }, { "epoch": 6.890243902439025, "grad_norm": 5.007436752319336, "learning_rate": 8.287064511389468e-06, "loss": 0.2261, "num_input_tokens_seen": 13048736, "step": 12995 }, { "epoch": 6.892895015906681, "grad_norm": 9.096644401550293, "learning_rate": 8.285320841846226e-06, "loss": 0.5004, "num_input_tokens_seen": 13054048, "step": 13000 }, { "epoch": 6.895546129374337, "grad_norm": 5.054959774017334, "learning_rate": 8.283576468924388e-06, "loss": 0.4144, "num_input_tokens_seen": 13059072, "step": 13005 }, { "epoch": 6.898197242841993, "grad_norm": 5.610023498535156, "learning_rate": 8.281831392997422e-06, "loss": 0.4037, "num_input_tokens_seen": 13063808, "step": 13010 }, { "epoch": 6.90084835630965, "grad_norm": 10.195147514343262, "learning_rate": 8.280085614438941e-06, "loss": 0.2167, "num_input_tokens_seen": 13068768, "step": 13015 }, { "epoch": 6.903499469777307, "grad_norm": 2.4921491146087646, "learning_rate": 8.278339133622714e-06, "loss": 0.1539, "num_input_tokens_seen": 13073824, "step": 13020 }, { "epoch": 6.906150583244963, "grad_norm": 8.519942283630371, "learning_rate": 8.276591950922656e-06, "loss": 0.1713, "num_input_tokens_seen": 13078752, "step": 13025 }, { "epoch": 6.90880169671262, "grad_norm": 7.642221450805664, "learning_rate": 8.274844066712838e-06, "loss": 0.2929, "num_input_tokens_seen": 13083616, "step": 13030 }, { "epoch": 6.9114528101802755, "grad_norm": 9.492920875549316, "learning_rate": 8.27309548136747e-06, "loss": 0.194, "num_input_tokens_seen": 13088224, "step": 13035 }, { "epoch": 6.914103923647932, "grad_norm": 10.786645889282227, "learning_rate": 8.27134619526093e-06, "loss": 0.2856, "num_input_tokens_seen": 13093344, "step": 13040 }, { "epoch": 6.916755037115589, "grad_norm": 9.872527122497559, "learning_rate": 8.269596208767725e-06, "loss": 0.316, "num_input_tokens_seen": 13099904, "step": 13045 }, { "epoch": 6.919406150583245, "grad_norm": 7.615466594696045, "learning_rate": 8.26784552226253e-06, "loss": 0.2324, "num_input_tokens_seen": 13103712, "step": 13050 }, { "epoch": 6.922057264050902, "grad_norm": 8.810908317565918, "learning_rate": 8.266094136120157e-06, "loss": 0.3553, "num_input_tokens_seen": 13108736, "step": 13055 }, { "epoch": 6.924708377518558, "grad_norm": 10.842129707336426, "learning_rate": 8.264342050715579e-06, "loss": 0.2966, "num_input_tokens_seen": 13113824, "step": 13060 }, { "epoch": 6.927359490986214, "grad_norm": 8.526153564453125, "learning_rate": 8.262589266423908e-06, "loss": 0.2903, "num_input_tokens_seen": 13118432, "step": 13065 }, { "epoch": 6.93001060445387, "grad_norm": 6.036864757537842, "learning_rate": 8.260835783620413e-06, "loss": 0.2234, "num_input_tokens_seen": 13122944, "step": 13070 }, { "epoch": 6.932661717921527, "grad_norm": 8.00378704071045, "learning_rate": 8.25908160268051e-06, "loss": 0.2772, "num_input_tokens_seen": 13127616, "step": 13075 }, { "epoch": 6.935312831389183, "grad_norm": 6.958002090454102, "learning_rate": 8.257326723979764e-06, "loss": 0.2334, "num_input_tokens_seen": 13132128, "step": 13080 }, { "epoch": 6.93796394485684, "grad_norm": 12.510337829589844, "learning_rate": 8.25557114789389e-06, "loss": 0.3239, "num_input_tokens_seen": 13136576, "step": 13085 }, { "epoch": 6.9406150583244965, "grad_norm": 4.919431209564209, "learning_rate": 8.25381487479875e-06, "loss": 0.1386, "num_input_tokens_seen": 13141056, "step": 13090 }, { "epoch": 6.943266171792152, "grad_norm": 1.750768780708313, "learning_rate": 8.252057905070363e-06, "loss": 0.2621, "num_input_tokens_seen": 13145760, "step": 13095 }, { "epoch": 6.945917285259809, "grad_norm": 5.559804916381836, "learning_rate": 8.250300239084888e-06, "loss": 0.2099, "num_input_tokens_seen": 13150080, "step": 13100 }, { "epoch": 6.948568398727465, "grad_norm": 11.075575828552246, "learning_rate": 8.248541877218635e-06, "loss": 0.3275, "num_input_tokens_seen": 13154336, "step": 13105 }, { "epoch": 6.951219512195122, "grad_norm": 14.614340782165527, "learning_rate": 8.246782819848066e-06, "loss": 0.2729, "num_input_tokens_seen": 13157952, "step": 13110 }, { "epoch": 6.953870625662779, "grad_norm": 9.185013771057129, "learning_rate": 8.245023067349793e-06, "loss": 0.263, "num_input_tokens_seen": 13162048, "step": 13115 }, { "epoch": 6.956521739130435, "grad_norm": 5.96730375289917, "learning_rate": 8.243262620100573e-06, "loss": 0.2109, "num_input_tokens_seen": 13166816, "step": 13120 }, { "epoch": 6.959172852598091, "grad_norm": 2.3252854347229004, "learning_rate": 8.241501478477311e-06, "loss": 0.2548, "num_input_tokens_seen": 13172064, "step": 13125 }, { "epoch": 6.961823966065747, "grad_norm": 6.811888694763184, "learning_rate": 8.239739642857067e-06, "loss": 0.1754, "num_input_tokens_seen": 13177152, "step": 13130 }, { "epoch": 6.964475079533404, "grad_norm": 8.168339729309082, "learning_rate": 8.237977113617042e-06, "loss": 0.314, "num_input_tokens_seen": 13181472, "step": 13135 }, { "epoch": 6.967126193001061, "grad_norm": 7.766022682189941, "learning_rate": 8.23621389113459e-06, "loss": 0.2314, "num_input_tokens_seen": 13187648, "step": 13140 }, { "epoch": 6.969777306468717, "grad_norm": 7.180837154388428, "learning_rate": 8.234449975787211e-06, "loss": 0.2607, "num_input_tokens_seen": 13192192, "step": 13145 }, { "epoch": 6.9724284199363735, "grad_norm": 7.524232387542725, "learning_rate": 8.232685367952556e-06, "loss": 0.3223, "num_input_tokens_seen": 13196800, "step": 13150 }, { "epoch": 6.975079533404029, "grad_norm": 11.3347806930542, "learning_rate": 8.230920068008425e-06, "loss": 0.2374, "num_input_tokens_seen": 13201792, "step": 13155 }, { "epoch": 6.977730646871686, "grad_norm": 4.697988510131836, "learning_rate": 8.229154076332761e-06, "loss": 0.2741, "num_input_tokens_seen": 13206304, "step": 13160 }, { "epoch": 6.980381760339343, "grad_norm": 2.601287364959717, "learning_rate": 8.227387393303658e-06, "loss": 0.205, "num_input_tokens_seen": 13210848, "step": 13165 }, { "epoch": 6.983032873806999, "grad_norm": 7.892158031463623, "learning_rate": 8.22562001929936e-06, "loss": 0.3682, "num_input_tokens_seen": 13216000, "step": 13170 }, { "epoch": 6.985683987274656, "grad_norm": 8.45634651184082, "learning_rate": 8.223851954698257e-06, "loss": 0.1991, "num_input_tokens_seen": 13220768, "step": 13175 }, { "epoch": 6.988335100742312, "grad_norm": 7.439050197601318, "learning_rate": 8.222083199878885e-06, "loss": 0.3536, "num_input_tokens_seen": 13225696, "step": 13180 }, { "epoch": 6.990986214209968, "grad_norm": 3.2655482292175293, "learning_rate": 8.220313755219931e-06, "loss": 0.2477, "num_input_tokens_seen": 13231040, "step": 13185 }, { "epoch": 6.993637327677624, "grad_norm": 6.093419551849365, "learning_rate": 8.218543621100231e-06, "loss": 0.2031, "num_input_tokens_seen": 13235808, "step": 13190 }, { "epoch": 6.996288441145281, "grad_norm": 1.2966173887252808, "learning_rate": 8.216772797898762e-06, "loss": 0.1891, "num_input_tokens_seen": 13240192, "step": 13195 }, { "epoch": 6.998939554612938, "grad_norm": 8.512990951538086, "learning_rate": 8.215001285994655e-06, "loss": 0.4094, "num_input_tokens_seen": 13245696, "step": 13200 }, { "epoch": 7.001590668080594, "grad_norm": 8.684820175170898, "learning_rate": 8.213229085767186e-06, "loss": 0.2469, "num_input_tokens_seen": 13249360, "step": 13205 }, { "epoch": 7.0042417815482505, "grad_norm": 14.752063751220703, "learning_rate": 8.211456197595777e-06, "loss": 0.2179, "num_input_tokens_seen": 13254128, "step": 13210 }, { "epoch": 7.006892895015906, "grad_norm": 5.647249221801758, "learning_rate": 8.20968262186e-06, "loss": 0.1588, "num_input_tokens_seen": 13258768, "step": 13215 }, { "epoch": 7.009544008483563, "grad_norm": 14.642680168151855, "learning_rate": 8.207908358939573e-06, "loss": 0.2333, "num_input_tokens_seen": 13262928, "step": 13220 }, { "epoch": 7.012195121951219, "grad_norm": 4.403076171875, "learning_rate": 8.20613340921436e-06, "loss": 0.344, "num_input_tokens_seen": 13268400, "step": 13225 }, { "epoch": 7.014846235418876, "grad_norm": 7.198399543762207, "learning_rate": 8.204357773064373e-06, "loss": 0.1723, "num_input_tokens_seen": 13272784, "step": 13230 }, { "epoch": 7.017497348886533, "grad_norm": 11.40893840789795, "learning_rate": 8.202581450869773e-06, "loss": 0.2342, "num_input_tokens_seen": 13277488, "step": 13235 }, { "epoch": 7.0201484623541885, "grad_norm": 9.547435760498047, "learning_rate": 8.200804443010866e-06, "loss": 0.3394, "num_input_tokens_seen": 13283216, "step": 13240 }, { "epoch": 7.022799575821845, "grad_norm": 9.030611038208008, "learning_rate": 8.1990267498681e-06, "loss": 0.2209, "num_input_tokens_seen": 13288112, "step": 13245 }, { "epoch": 7.025450689289501, "grad_norm": 12.952179908752441, "learning_rate": 8.19724837182208e-06, "loss": 0.2367, "num_input_tokens_seen": 13292784, "step": 13250 }, { "epoch": 7.028101802757158, "grad_norm": 1.8933223485946655, "learning_rate": 8.19546930925355e-06, "loss": 0.1628, "num_input_tokens_seen": 13297936, "step": 13255 }, { "epoch": 7.030752916224815, "grad_norm": 12.93525218963623, "learning_rate": 8.193689562543402e-06, "loss": 0.2249, "num_input_tokens_seen": 13302512, "step": 13260 }, { "epoch": 7.033404029692471, "grad_norm": 9.003774642944336, "learning_rate": 8.191909132072675e-06, "loss": 0.172, "num_input_tokens_seen": 13306896, "step": 13265 }, { "epoch": 7.0360551431601275, "grad_norm": 2.4450817108154297, "learning_rate": 8.190128018222557e-06, "loss": 0.1011, "num_input_tokens_seen": 13312144, "step": 13270 }, { "epoch": 7.038706256627783, "grad_norm": 11.095242500305176, "learning_rate": 8.188346221374377e-06, "loss": 0.2925, "num_input_tokens_seen": 13316528, "step": 13275 }, { "epoch": 7.04135737009544, "grad_norm": 12.84444522857666, "learning_rate": 8.186563741909614e-06, "loss": 0.2607, "num_input_tokens_seen": 13321392, "step": 13280 }, { "epoch": 7.044008483563097, "grad_norm": 6.855418682098389, "learning_rate": 8.184780580209892e-06, "loss": 0.2918, "num_input_tokens_seen": 13326640, "step": 13285 }, { "epoch": 7.046659597030753, "grad_norm": 6.027645587921143, "learning_rate": 8.182996736656984e-06, "loss": 0.3018, "num_input_tokens_seen": 13331664, "step": 13290 }, { "epoch": 7.04931071049841, "grad_norm": 4.708706855773926, "learning_rate": 8.1812122116328e-06, "loss": 0.1841, "num_input_tokens_seen": 13335920, "step": 13295 }, { "epoch": 7.0519618239660655, "grad_norm": 4.117886066436768, "learning_rate": 8.179427005519406e-06, "loss": 0.2416, "num_input_tokens_seen": 13340272, "step": 13300 }, { "epoch": 7.054612937433722, "grad_norm": 4.88102388381958, "learning_rate": 8.177641118699013e-06, "loss": 0.2804, "num_input_tokens_seen": 13344848, "step": 13305 }, { "epoch": 7.057264050901378, "grad_norm": 11.745429992675781, "learning_rate": 8.17585455155397e-06, "loss": 0.2718, "num_input_tokens_seen": 13349840, "step": 13310 }, { "epoch": 7.059915164369035, "grad_norm": 14.820808410644531, "learning_rate": 8.174067304466777e-06, "loss": 0.3956, "num_input_tokens_seen": 13354128, "step": 13315 }, { "epoch": 7.062566277836692, "grad_norm": 10.940666198730469, "learning_rate": 8.17227937782008e-06, "loss": 0.3082, "num_input_tokens_seen": 13359568, "step": 13320 }, { "epoch": 7.065217391304348, "grad_norm": 3.7315361499786377, "learning_rate": 8.17049077199667e-06, "loss": 0.2119, "num_input_tokens_seen": 13365168, "step": 13325 }, { "epoch": 7.0678685047720045, "grad_norm": 9.871184349060059, "learning_rate": 8.168701487379483e-06, "loss": 0.199, "num_input_tokens_seen": 13370352, "step": 13330 }, { "epoch": 7.07051961823966, "grad_norm": 7.413448333740234, "learning_rate": 8.166911524351599e-06, "loss": 0.2316, "num_input_tokens_seen": 13374096, "step": 13335 }, { "epoch": 7.073170731707317, "grad_norm": 9.494588851928711, "learning_rate": 8.165120883296243e-06, "loss": 0.1955, "num_input_tokens_seen": 13378992, "step": 13340 }, { "epoch": 7.075821845174974, "grad_norm": 9.116533279418945, "learning_rate": 8.163329564596788e-06, "loss": 0.1963, "num_input_tokens_seen": 13384240, "step": 13345 }, { "epoch": 7.07847295864263, "grad_norm": 7.450631141662598, "learning_rate": 8.161537568636752e-06, "loss": 0.206, "num_input_tokens_seen": 13389680, "step": 13350 }, { "epoch": 7.081124072110287, "grad_norm": 2.2289438247680664, "learning_rate": 8.159744895799795e-06, "loss": 0.2569, "num_input_tokens_seen": 13394672, "step": 13355 }, { "epoch": 7.0837751855779425, "grad_norm": 14.295639991760254, "learning_rate": 8.157951546469725e-06, "loss": 0.3347, "num_input_tokens_seen": 13399696, "step": 13360 }, { "epoch": 7.086426299045599, "grad_norm": 7.017465114593506, "learning_rate": 8.156157521030492e-06, "loss": 0.194, "num_input_tokens_seen": 13405520, "step": 13365 }, { "epoch": 7.089077412513255, "grad_norm": 11.982068061828613, "learning_rate": 8.15436281986619e-06, "loss": 0.2132, "num_input_tokens_seen": 13411312, "step": 13370 }, { "epoch": 7.091728525980912, "grad_norm": 14.297021865844727, "learning_rate": 8.152567443361065e-06, "loss": 0.2948, "num_input_tokens_seen": 13416528, "step": 13375 }, { "epoch": 7.094379639448569, "grad_norm": 9.820178985595703, "learning_rate": 8.1507713918995e-06, "loss": 0.2416, "num_input_tokens_seen": 13422768, "step": 13380 }, { "epoch": 7.097030752916225, "grad_norm": 5.246513843536377, "learning_rate": 8.148974665866024e-06, "loss": 0.3332, "num_input_tokens_seen": 13428816, "step": 13385 }, { "epoch": 7.099681866383881, "grad_norm": 12.698258399963379, "learning_rate": 8.147177265645312e-06, "loss": 0.1601, "num_input_tokens_seen": 13434352, "step": 13390 }, { "epoch": 7.102332979851537, "grad_norm": 2.9504318237304688, "learning_rate": 8.145379191622184e-06, "loss": 0.21, "num_input_tokens_seen": 13439504, "step": 13395 }, { "epoch": 7.104984093319194, "grad_norm": 7.120417594909668, "learning_rate": 8.1435804441816e-06, "loss": 0.1426, "num_input_tokens_seen": 13444304, "step": 13400 }, { "epoch": 7.107635206786851, "grad_norm": 11.726065635681152, "learning_rate": 8.141781023708671e-06, "loss": 0.2642, "num_input_tokens_seen": 13449680, "step": 13405 }, { "epoch": 7.110286320254507, "grad_norm": 7.904376983642578, "learning_rate": 8.139980930588643e-06, "loss": 0.1872, "num_input_tokens_seen": 13454832, "step": 13410 }, { "epoch": 7.112937433722164, "grad_norm": 12.435731887817383, "learning_rate": 8.138180165206915e-06, "loss": 0.1924, "num_input_tokens_seen": 13458672, "step": 13415 }, { "epoch": 7.1155885471898195, "grad_norm": 5.360986232757568, "learning_rate": 8.136378727949028e-06, "loss": 0.1571, "num_input_tokens_seen": 13463728, "step": 13420 }, { "epoch": 7.118239660657476, "grad_norm": 14.153614044189453, "learning_rate": 8.134576619200659e-06, "loss": 0.3861, "num_input_tokens_seen": 13468400, "step": 13425 }, { "epoch": 7.120890774125132, "grad_norm": 8.617693901062012, "learning_rate": 8.13277383934764e-06, "loss": 0.2408, "num_input_tokens_seen": 13472432, "step": 13430 }, { "epoch": 7.123541887592789, "grad_norm": 1.2544751167297363, "learning_rate": 8.130970388775941e-06, "loss": 0.1131, "num_input_tokens_seen": 13477392, "step": 13435 }, { "epoch": 7.126193001060446, "grad_norm": 10.808786392211914, "learning_rate": 8.129166267871673e-06, "loss": 0.2926, "num_input_tokens_seen": 13482640, "step": 13440 }, { "epoch": 7.128844114528102, "grad_norm": 10.045589447021484, "learning_rate": 8.127361477021095e-06, "loss": 0.2233, "num_input_tokens_seen": 13487024, "step": 13445 }, { "epoch": 7.131495227995758, "grad_norm": 4.827729225158691, "learning_rate": 8.125556016610608e-06, "loss": 0.1849, "num_input_tokens_seen": 13492304, "step": 13450 }, { "epoch": 7.134146341463414, "grad_norm": 13.588348388671875, "learning_rate": 8.123749887026757e-06, "loss": 0.2105, "num_input_tokens_seen": 13496912, "step": 13455 }, { "epoch": 7.136797454931071, "grad_norm": 3.977264881134033, "learning_rate": 8.12194308865623e-06, "loss": 0.1431, "num_input_tokens_seen": 13501232, "step": 13460 }, { "epoch": 7.139448568398728, "grad_norm": 12.780898094177246, "learning_rate": 8.120135621885857e-06, "loss": 0.3741, "num_input_tokens_seen": 13506384, "step": 13465 }, { "epoch": 7.142099681866384, "grad_norm": 11.072029113769531, "learning_rate": 8.11832748710261e-06, "loss": 0.2654, "num_input_tokens_seen": 13512016, "step": 13470 }, { "epoch": 7.144750795334041, "grad_norm": 4.9308085441589355, "learning_rate": 8.116518684693611e-06, "loss": 0.2507, "num_input_tokens_seen": 13517584, "step": 13475 }, { "epoch": 7.1474019088016965, "grad_norm": 7.351563453674316, "learning_rate": 8.114709215046114e-06, "loss": 0.2374, "num_input_tokens_seen": 13522736, "step": 13480 }, { "epoch": 7.150053022269353, "grad_norm": 9.574237823486328, "learning_rate": 8.112899078547526e-06, "loss": 0.1957, "num_input_tokens_seen": 13527440, "step": 13485 }, { "epoch": 7.152704135737009, "grad_norm": 2.5298988819122314, "learning_rate": 8.11108827558539e-06, "loss": 0.2876, "num_input_tokens_seen": 13533008, "step": 13490 }, { "epoch": 7.155355249204666, "grad_norm": 5.774722576141357, "learning_rate": 8.109276806547395e-06, "loss": 0.1758, "num_input_tokens_seen": 13538000, "step": 13495 }, { "epoch": 7.158006362672323, "grad_norm": 8.873866081237793, "learning_rate": 8.107464671821373e-06, "loss": 0.2781, "num_input_tokens_seen": 13543184, "step": 13500 }, { "epoch": 7.160657476139979, "grad_norm": 3.8930532932281494, "learning_rate": 8.105651871795295e-06, "loss": 0.1494, "num_input_tokens_seen": 13548816, "step": 13505 }, { "epoch": 7.163308589607635, "grad_norm": 9.23452091217041, "learning_rate": 8.103838406857278e-06, "loss": 0.1823, "num_input_tokens_seen": 13553808, "step": 13510 }, { "epoch": 7.165959703075291, "grad_norm": 16.095993041992188, "learning_rate": 8.102024277395581e-06, "loss": 0.3806, "num_input_tokens_seen": 13558448, "step": 13515 }, { "epoch": 7.168610816542948, "grad_norm": 4.6601691246032715, "learning_rate": 8.100209483798603e-06, "loss": 0.2012, "num_input_tokens_seen": 13564304, "step": 13520 }, { "epoch": 7.171261930010605, "grad_norm": 7.283492565155029, "learning_rate": 8.098394026454886e-06, "loss": 0.2299, "num_input_tokens_seen": 13569072, "step": 13525 }, { "epoch": 7.173913043478261, "grad_norm": 10.472453117370605, "learning_rate": 8.096577905753115e-06, "loss": 0.2022, "num_input_tokens_seen": 13574160, "step": 13530 }, { "epoch": 7.1765641569459175, "grad_norm": 9.85716438293457, "learning_rate": 8.09476112208212e-06, "loss": 0.2174, "num_input_tokens_seen": 13579312, "step": 13535 }, { "epoch": 7.179215270413573, "grad_norm": 5.762080669403076, "learning_rate": 8.092943675830864e-06, "loss": 0.2371, "num_input_tokens_seen": 13585008, "step": 13540 }, { "epoch": 7.18186638388123, "grad_norm": 8.44367504119873, "learning_rate": 8.091125567388463e-06, "loss": 0.2611, "num_input_tokens_seen": 13589808, "step": 13545 }, { "epoch": 7.184517497348886, "grad_norm": 6.6154985427856445, "learning_rate": 8.089306797144166e-06, "loss": 0.3997, "num_input_tokens_seen": 13594288, "step": 13550 }, { "epoch": 7.187168610816543, "grad_norm": 9.791804313659668, "learning_rate": 8.087487365487367e-06, "loss": 0.0986, "num_input_tokens_seen": 13598736, "step": 13555 }, { "epoch": 7.1898197242842, "grad_norm": 7.619038105010986, "learning_rate": 8.085667272807602e-06, "loss": 0.2938, "num_input_tokens_seen": 13604752, "step": 13560 }, { "epoch": 7.192470837751856, "grad_norm": 9.484749794006348, "learning_rate": 8.083846519494549e-06, "loss": 0.2077, "num_input_tokens_seen": 13611120, "step": 13565 }, { "epoch": 7.195121951219512, "grad_norm": 11.3390531539917, "learning_rate": 8.082025105938026e-06, "loss": 0.2154, "num_input_tokens_seen": 13615536, "step": 13570 }, { "epoch": 7.197773064687168, "grad_norm": 9.832498550415039, "learning_rate": 8.080203032527994e-06, "loss": 0.154, "num_input_tokens_seen": 13620496, "step": 13575 }, { "epoch": 7.200424178154825, "grad_norm": 9.193833351135254, "learning_rate": 8.07838029965455e-06, "loss": 0.2525, "num_input_tokens_seen": 13625264, "step": 13580 }, { "epoch": 7.203075291622482, "grad_norm": 8.810144424438477, "learning_rate": 8.07655690770794e-06, "loss": 0.3875, "num_input_tokens_seen": 13629840, "step": 13585 }, { "epoch": 7.205726405090138, "grad_norm": 4.845849514007568, "learning_rate": 8.074732857078546e-06, "loss": 0.1307, "num_input_tokens_seen": 13634736, "step": 13590 }, { "epoch": 7.2083775185577945, "grad_norm": 12.114426612854004, "learning_rate": 8.072908148156895e-06, "loss": 0.247, "num_input_tokens_seen": 13640144, "step": 13595 }, { "epoch": 7.21102863202545, "grad_norm": 8.519432067871094, "learning_rate": 8.07108278133365e-06, "loss": 0.1357, "num_input_tokens_seen": 13645584, "step": 13600 }, { "epoch": 7.213679745493107, "grad_norm": 6.936097621917725, "learning_rate": 8.069256756999617e-06, "loss": 0.2033, "num_input_tokens_seen": 13651024, "step": 13605 }, { "epoch": 7.216330858960763, "grad_norm": 14.793664932250977, "learning_rate": 8.067430075545744e-06, "loss": 0.2343, "num_input_tokens_seen": 13656304, "step": 13610 }, { "epoch": 7.21898197242842, "grad_norm": 18.28289031982422, "learning_rate": 8.065602737363118e-06, "loss": 0.5104, "num_input_tokens_seen": 13660784, "step": 13615 }, { "epoch": 7.221633085896077, "grad_norm": 6.98325252532959, "learning_rate": 8.063774742842966e-06, "loss": 0.1283, "num_input_tokens_seen": 13665776, "step": 13620 }, { "epoch": 7.224284199363733, "grad_norm": 9.574357986450195, "learning_rate": 8.061946092376662e-06, "loss": 0.2012, "num_input_tokens_seen": 13670608, "step": 13625 }, { "epoch": 7.226935312831389, "grad_norm": 11.65880012512207, "learning_rate": 8.060116786355711e-06, "loss": 0.2175, "num_input_tokens_seen": 13675344, "step": 13630 }, { "epoch": 7.229586426299045, "grad_norm": 9.897520065307617, "learning_rate": 8.058286825171763e-06, "loss": 0.1778, "num_input_tokens_seen": 13680496, "step": 13635 }, { "epoch": 7.232237539766702, "grad_norm": 13.708151817321777, "learning_rate": 8.056456209216609e-06, "loss": 0.288, "num_input_tokens_seen": 13685520, "step": 13640 }, { "epoch": 7.234888653234359, "grad_norm": 2.784550428390503, "learning_rate": 8.05462493888218e-06, "loss": 0.0785, "num_input_tokens_seen": 13690768, "step": 13645 }, { "epoch": 7.237539766702015, "grad_norm": 6.821785926818848, "learning_rate": 8.05279301456054e-06, "loss": 0.4107, "num_input_tokens_seen": 13697040, "step": 13650 }, { "epoch": 7.2401908801696715, "grad_norm": 10.240964889526367, "learning_rate": 8.050960436643907e-06, "loss": 0.3607, "num_input_tokens_seen": 13701584, "step": 13655 }, { "epoch": 7.242841993637327, "grad_norm": 10.193622589111328, "learning_rate": 8.04912720552463e-06, "loss": 0.2446, "num_input_tokens_seen": 13707376, "step": 13660 }, { "epoch": 7.245493107104984, "grad_norm": 6.624356746673584, "learning_rate": 8.047293321595194e-06, "loss": 0.2333, "num_input_tokens_seen": 13712272, "step": 13665 }, { "epoch": 7.248144220572641, "grad_norm": 16.284503936767578, "learning_rate": 8.045458785248233e-06, "loss": 0.258, "num_input_tokens_seen": 13717008, "step": 13670 }, { "epoch": 7.250795334040297, "grad_norm": 7.929874420166016, "learning_rate": 8.043623596876515e-06, "loss": 0.0952, "num_input_tokens_seen": 13721488, "step": 13675 }, { "epoch": 7.253446447507954, "grad_norm": 4.023780345916748, "learning_rate": 8.04178775687295e-06, "loss": 0.1223, "num_input_tokens_seen": 13727024, "step": 13680 }, { "epoch": 7.2560975609756095, "grad_norm": 7.5262627601623535, "learning_rate": 8.039951265630585e-06, "loss": 0.1464, "num_input_tokens_seen": 13733072, "step": 13685 }, { "epoch": 7.258748674443266, "grad_norm": 10.16340160369873, "learning_rate": 8.038114123542608e-06, "loss": 0.2593, "num_input_tokens_seen": 13738512, "step": 13690 }, { "epoch": 7.261399787910922, "grad_norm": 15.74844741821289, "learning_rate": 8.036276331002348e-06, "loss": 0.3872, "num_input_tokens_seen": 13742800, "step": 13695 }, { "epoch": 7.264050901378579, "grad_norm": 4.107804775238037, "learning_rate": 8.03443788840327e-06, "loss": 0.1728, "num_input_tokens_seen": 13747376, "step": 13700 }, { "epoch": 7.266702014846236, "grad_norm": 19.4306583404541, "learning_rate": 8.032598796138982e-06, "loss": 0.4456, "num_input_tokens_seen": 13753168, "step": 13705 }, { "epoch": 7.269353128313892, "grad_norm": 4.150245666503906, "learning_rate": 8.030759054603227e-06, "loss": 0.2157, "num_input_tokens_seen": 13757872, "step": 13710 }, { "epoch": 7.2720042417815485, "grad_norm": 2.0322980880737305, "learning_rate": 8.028918664189889e-06, "loss": 0.1884, "num_input_tokens_seen": 13762224, "step": 13715 }, { "epoch": 7.274655355249204, "grad_norm": 10.877799987792969, "learning_rate": 8.02707762529299e-06, "loss": 0.157, "num_input_tokens_seen": 13767024, "step": 13720 }, { "epoch": 7.277306468716861, "grad_norm": 14.741116523742676, "learning_rate": 8.025235938306693e-06, "loss": 0.2863, "num_input_tokens_seen": 13772976, "step": 13725 }, { "epoch": 7.279957582184517, "grad_norm": 7.509084701538086, "learning_rate": 8.023393603625298e-06, "loss": 0.2218, "num_input_tokens_seen": 13777488, "step": 13730 }, { "epoch": 7.282608695652174, "grad_norm": 17.194196701049805, "learning_rate": 8.021550621643244e-06, "loss": 0.3031, "num_input_tokens_seen": 13782992, "step": 13735 }, { "epoch": 7.285259809119831, "grad_norm": 3.9118003845214844, "learning_rate": 8.019706992755108e-06, "loss": 0.1636, "num_input_tokens_seen": 13789424, "step": 13740 }, { "epoch": 7.2879109225874865, "grad_norm": 4.343299388885498, "learning_rate": 8.017862717355606e-06, "loss": 0.2062, "num_input_tokens_seen": 13794576, "step": 13745 }, { "epoch": 7.290562036055143, "grad_norm": 9.15234375, "learning_rate": 8.016017795839595e-06, "loss": 0.1787, "num_input_tokens_seen": 13799152, "step": 13750 }, { "epoch": 7.293213149522799, "grad_norm": 10.432428359985352, "learning_rate": 8.014172228602063e-06, "loss": 0.2209, "num_input_tokens_seen": 13804304, "step": 13755 }, { "epoch": 7.295864262990456, "grad_norm": 9.903457641601562, "learning_rate": 8.012326016038148e-06, "loss": 0.1872, "num_input_tokens_seen": 13810992, "step": 13760 }, { "epoch": 7.298515376458113, "grad_norm": 11.24250602722168, "learning_rate": 8.010479158543112e-06, "loss": 0.247, "num_input_tokens_seen": 13817264, "step": 13765 }, { "epoch": 7.301166489925769, "grad_norm": 3.920891523361206, "learning_rate": 8.008631656512367e-06, "loss": 0.2343, "num_input_tokens_seen": 13822320, "step": 13770 }, { "epoch": 7.3038176033934255, "grad_norm": 8.252533912658691, "learning_rate": 8.006783510341458e-06, "loss": 0.255, "num_input_tokens_seen": 13828496, "step": 13775 }, { "epoch": 7.306468716861081, "grad_norm": 11.78640365600586, "learning_rate": 8.004934720426067e-06, "loss": 0.2448, "num_input_tokens_seen": 13833392, "step": 13780 }, { "epoch": 7.309119830328738, "grad_norm": 5.26625919342041, "learning_rate": 8.003085287162016e-06, "loss": 0.1802, "num_input_tokens_seen": 13838416, "step": 13785 }, { "epoch": 7.311770943796395, "grad_norm": 12.035536766052246, "learning_rate": 8.001235210945261e-06, "loss": 0.4807, "num_input_tokens_seen": 13842736, "step": 13790 }, { "epoch": 7.314422057264051, "grad_norm": 7.550615310668945, "learning_rate": 7.999384492171904e-06, "loss": 0.1732, "num_input_tokens_seen": 13848304, "step": 13795 }, { "epoch": 7.317073170731708, "grad_norm": 13.172983169555664, "learning_rate": 7.997533131238173e-06, "loss": 0.306, "num_input_tokens_seen": 13853424, "step": 13800 }, { "epoch": 7.3197242841993635, "grad_norm": 11.286768913269043, "learning_rate": 7.995681128540445e-06, "loss": 0.2746, "num_input_tokens_seen": 13857616, "step": 13805 }, { "epoch": 7.32237539766702, "grad_norm": 11.146530151367188, "learning_rate": 7.993828484475224e-06, "loss": 0.1986, "num_input_tokens_seen": 13861840, "step": 13810 }, { "epoch": 7.325026511134676, "grad_norm": 2.1385674476623535, "learning_rate": 7.99197519943916e-06, "loss": 0.15, "num_input_tokens_seen": 13867216, "step": 13815 }, { "epoch": 7.327677624602333, "grad_norm": 2.569535732269287, "learning_rate": 7.990121273829034e-06, "loss": 0.1934, "num_input_tokens_seen": 13871888, "step": 13820 }, { "epoch": 7.33032873806999, "grad_norm": 12.223865509033203, "learning_rate": 7.988266708041767e-06, "loss": 0.2291, "num_input_tokens_seen": 13877840, "step": 13825 }, { "epoch": 7.332979851537646, "grad_norm": 10.344450950622559, "learning_rate": 7.986411502474419e-06, "loss": 0.2452, "num_input_tokens_seen": 13884016, "step": 13830 }, { "epoch": 7.335630965005302, "grad_norm": 10.555510520935059, "learning_rate": 7.984555657524182e-06, "loss": 0.4584, "num_input_tokens_seen": 13888656, "step": 13835 }, { "epoch": 7.338282078472958, "grad_norm": 10.872383117675781, "learning_rate": 7.982699173588389e-06, "loss": 0.3448, "num_input_tokens_seen": 13893904, "step": 13840 }, { "epoch": 7.340933191940615, "grad_norm": 11.781347274780273, "learning_rate": 7.980842051064508e-06, "loss": 0.3469, "num_input_tokens_seen": 13898608, "step": 13845 }, { "epoch": 7.343584305408271, "grad_norm": 10.798192977905273, "learning_rate": 7.978984290350145e-06, "loss": 0.2584, "num_input_tokens_seen": 13903664, "step": 13850 }, { "epoch": 7.346235418875928, "grad_norm": 8.072697639465332, "learning_rate": 7.977125891843039e-06, "loss": 0.3562, "num_input_tokens_seen": 13909680, "step": 13855 }, { "epoch": 7.348886532343585, "grad_norm": 10.223252296447754, "learning_rate": 7.975266855941069e-06, "loss": 0.2368, "num_input_tokens_seen": 13914256, "step": 13860 }, { "epoch": 7.3515376458112405, "grad_norm": 10.517594337463379, "learning_rate": 7.973407183042253e-06, "loss": 0.19, "num_input_tokens_seen": 13920176, "step": 13865 }, { "epoch": 7.354188759278897, "grad_norm": 6.159595012664795, "learning_rate": 7.971546873544737e-06, "loss": 0.1166, "num_input_tokens_seen": 13925552, "step": 13870 }, { "epoch": 7.356839872746553, "grad_norm": 5.120851993560791, "learning_rate": 7.969685927846812e-06, "loss": 0.2208, "num_input_tokens_seen": 13930160, "step": 13875 }, { "epoch": 7.35949098621421, "grad_norm": 14.060461044311523, "learning_rate": 7.967824346346903e-06, "loss": 0.277, "num_input_tokens_seen": 13935440, "step": 13880 }, { "epoch": 7.362142099681867, "grad_norm": 4.518609523773193, "learning_rate": 7.965962129443566e-06, "loss": 0.2234, "num_input_tokens_seen": 13940240, "step": 13885 }, { "epoch": 7.364793213149523, "grad_norm": 5.378170490264893, "learning_rate": 7.964099277535498e-06, "loss": 0.3115, "num_input_tokens_seen": 13944976, "step": 13890 }, { "epoch": 7.367444326617179, "grad_norm": 3.20119309425354, "learning_rate": 7.96223579102153e-06, "loss": 0.2933, "num_input_tokens_seen": 13949968, "step": 13895 }, { "epoch": 7.370095440084835, "grad_norm": 16.035642623901367, "learning_rate": 7.960371670300632e-06, "loss": 0.2906, "num_input_tokens_seen": 13956048, "step": 13900 }, { "epoch": 7.372746553552492, "grad_norm": 17.205135345458984, "learning_rate": 7.958506915771907e-06, "loss": 0.2225, "num_input_tokens_seen": 13961136, "step": 13905 }, { "epoch": 7.375397667020149, "grad_norm": 4.545478343963623, "learning_rate": 7.95664152783459e-06, "loss": 0.2656, "num_input_tokens_seen": 13966320, "step": 13910 }, { "epoch": 7.378048780487805, "grad_norm": 7.864436626434326, "learning_rate": 7.954775506888062e-06, "loss": 0.2588, "num_input_tokens_seen": 13972048, "step": 13915 }, { "epoch": 7.3806998939554616, "grad_norm": 9.606200218200684, "learning_rate": 7.952908853331826e-06, "loss": 0.2092, "num_input_tokens_seen": 13977872, "step": 13920 }, { "epoch": 7.3833510074231175, "grad_norm": 9.606341361999512, "learning_rate": 7.951041567565534e-06, "loss": 0.1902, "num_input_tokens_seen": 13982224, "step": 13925 }, { "epoch": 7.386002120890774, "grad_norm": 11.019192695617676, "learning_rate": 7.949173649988963e-06, "loss": 0.1623, "num_input_tokens_seen": 13986608, "step": 13930 }, { "epoch": 7.38865323435843, "grad_norm": 13.197929382324219, "learning_rate": 7.947305101002033e-06, "loss": 0.2113, "num_input_tokens_seen": 13991792, "step": 13935 }, { "epoch": 7.391304347826087, "grad_norm": 6.464648246765137, "learning_rate": 7.945435921004791e-06, "loss": 0.2091, "num_input_tokens_seen": 13996784, "step": 13940 }, { "epoch": 7.393955461293744, "grad_norm": 4.491862773895264, "learning_rate": 7.943566110397428e-06, "loss": 0.3251, "num_input_tokens_seen": 14001872, "step": 13945 }, { "epoch": 7.3966065747614, "grad_norm": 11.519577980041504, "learning_rate": 7.94169566958026e-06, "loss": 0.377, "num_input_tokens_seen": 14006704, "step": 13950 }, { "epoch": 7.399257688229056, "grad_norm": 8.006918907165527, "learning_rate": 7.93982459895375e-06, "loss": 0.2397, "num_input_tokens_seen": 14012240, "step": 13955 }, { "epoch": 7.401908801696712, "grad_norm": 11.50145149230957, "learning_rate": 7.937952898918484e-06, "loss": 0.2771, "num_input_tokens_seen": 14017360, "step": 13960 }, { "epoch": 7.404559915164369, "grad_norm": 6.806387424468994, "learning_rate": 7.936080569875192e-06, "loss": 0.2048, "num_input_tokens_seen": 14021232, "step": 13965 }, { "epoch": 7.407211028632026, "grad_norm": 3.8249623775482178, "learning_rate": 7.934207612224732e-06, "loss": 0.1496, "num_input_tokens_seen": 14026704, "step": 13970 }, { "epoch": 7.409862142099682, "grad_norm": 5.255777359008789, "learning_rate": 7.9323340263681e-06, "loss": 0.2423, "num_input_tokens_seen": 14032080, "step": 13975 }, { "epoch": 7.4125132555673385, "grad_norm": 13.447945594787598, "learning_rate": 7.930459812706426e-06, "loss": 0.2668, "num_input_tokens_seen": 14036816, "step": 13980 }, { "epoch": 7.415164369034994, "grad_norm": 13.60453987121582, "learning_rate": 7.928584971640974e-06, "loss": 0.172, "num_input_tokens_seen": 14041872, "step": 13985 }, { "epoch": 7.417815482502651, "grad_norm": 13.102509498596191, "learning_rate": 7.926709503573145e-06, "loss": 0.2345, "num_input_tokens_seen": 14047504, "step": 13990 }, { "epoch": 7.420466595970307, "grad_norm": 17.455842971801758, "learning_rate": 7.924833408904469e-06, "loss": 0.2879, "num_input_tokens_seen": 14053264, "step": 13995 }, { "epoch": 7.423117709437964, "grad_norm": 3.9859275817871094, "learning_rate": 7.922956688036611e-06, "loss": 0.2659, "num_input_tokens_seen": 14058480, "step": 14000 }, { "epoch": 7.425768822905621, "grad_norm": 2.4739675521850586, "learning_rate": 7.921079341371378e-06, "loss": 0.1695, "num_input_tokens_seen": 14063568, "step": 14005 }, { "epoch": 7.428419936373277, "grad_norm": 22.38566780090332, "learning_rate": 7.9192013693107e-06, "loss": 0.3706, "num_input_tokens_seen": 14068368, "step": 14010 }, { "epoch": 7.431071049840933, "grad_norm": 11.228109359741211, "learning_rate": 7.917322772256648e-06, "loss": 0.2171, "num_input_tokens_seen": 14073072, "step": 14015 }, { "epoch": 7.433722163308589, "grad_norm": 15.088156700134277, "learning_rate": 7.915443550611424e-06, "loss": 0.3234, "num_input_tokens_seen": 14078096, "step": 14020 }, { "epoch": 7.436373276776246, "grad_norm": 10.417135238647461, "learning_rate": 7.913563704777366e-06, "loss": 0.2459, "num_input_tokens_seen": 14083472, "step": 14025 }, { "epoch": 7.439024390243903, "grad_norm": 8.421095848083496, "learning_rate": 7.91168323515694e-06, "loss": 0.3866, "num_input_tokens_seen": 14088848, "step": 14030 }, { "epoch": 7.441675503711559, "grad_norm": 11.556909561157227, "learning_rate": 7.909802142152756e-06, "loss": 0.223, "num_input_tokens_seen": 14093680, "step": 14035 }, { "epoch": 7.4443266171792155, "grad_norm": 1.6839277744293213, "learning_rate": 7.907920426167545e-06, "loss": 0.2529, "num_input_tokens_seen": 14098288, "step": 14040 }, { "epoch": 7.446977730646871, "grad_norm": 7.728382587432861, "learning_rate": 7.90603808760418e-06, "loss": 0.2092, "num_input_tokens_seen": 14104464, "step": 14045 }, { "epoch": 7.449628844114528, "grad_norm": 10.474410057067871, "learning_rate": 7.904155126865667e-06, "loss": 0.2107, "num_input_tokens_seen": 14110064, "step": 14050 }, { "epoch": 7.452279957582185, "grad_norm": 15.528153419494629, "learning_rate": 7.902271544355139e-06, "loss": 0.2512, "num_input_tokens_seen": 14114032, "step": 14055 }, { "epoch": 7.454931071049841, "grad_norm": 3.8452367782592773, "learning_rate": 7.900387340475868e-06, "loss": 0.1391, "num_input_tokens_seen": 14118928, "step": 14060 }, { "epoch": 7.457582184517498, "grad_norm": 4.72935676574707, "learning_rate": 7.898502515631257e-06, "loss": 0.1547, "num_input_tokens_seen": 14123632, "step": 14065 }, { "epoch": 7.4602332979851536, "grad_norm": 6.038636207580566, "learning_rate": 7.896617070224842e-06, "loss": 0.3037, "num_input_tokens_seen": 14128752, "step": 14070 }, { "epoch": 7.46288441145281, "grad_norm": 19.555252075195312, "learning_rate": 7.894731004660293e-06, "loss": 0.1298, "num_input_tokens_seen": 14133232, "step": 14075 }, { "epoch": 7.465535524920466, "grad_norm": 19.26848602294922, "learning_rate": 7.89284431934141e-06, "loss": 0.3336, "num_input_tokens_seen": 14138416, "step": 14080 }, { "epoch": 7.468186638388123, "grad_norm": 10.941492080688477, "learning_rate": 7.890957014672127e-06, "loss": 0.3809, "num_input_tokens_seen": 14143184, "step": 14085 }, { "epoch": 7.47083775185578, "grad_norm": 14.170256614685059, "learning_rate": 7.889069091056513e-06, "loss": 0.2159, "num_input_tokens_seen": 14147120, "step": 14090 }, { "epoch": 7.473488865323436, "grad_norm": 10.777109146118164, "learning_rate": 7.887180548898768e-06, "loss": 0.2443, "num_input_tokens_seen": 14151216, "step": 14095 }, { "epoch": 7.4761399787910925, "grad_norm": 9.362993240356445, "learning_rate": 7.88529138860322e-06, "loss": 0.0899, "num_input_tokens_seen": 14156048, "step": 14100 }, { "epoch": 7.478791092258748, "grad_norm": 8.167807579040527, "learning_rate": 7.883401610574338e-06, "loss": 0.1619, "num_input_tokens_seen": 14160624, "step": 14105 }, { "epoch": 7.481442205726405, "grad_norm": 26.927982330322266, "learning_rate": 7.881511215216712e-06, "loss": 0.3023, "num_input_tokens_seen": 14166576, "step": 14110 }, { "epoch": 7.484093319194061, "grad_norm": 12.5407075881958, "learning_rate": 7.879620202935079e-06, "loss": 0.2574, "num_input_tokens_seen": 14170992, "step": 14115 }, { "epoch": 7.486744432661718, "grad_norm": 12.182599067687988, "learning_rate": 7.877728574134294e-06, "loss": 0.2597, "num_input_tokens_seen": 14175344, "step": 14120 }, { "epoch": 7.489395546129375, "grad_norm": 6.185372829437256, "learning_rate": 7.875836329219354e-06, "loss": 0.2331, "num_input_tokens_seen": 14179952, "step": 14125 }, { "epoch": 7.4920466595970305, "grad_norm": 2.42684006690979, "learning_rate": 7.87394346859538e-06, "loss": 0.266, "num_input_tokens_seen": 14185520, "step": 14130 }, { "epoch": 7.494697773064687, "grad_norm": 7.191001892089844, "learning_rate": 7.872049992667627e-06, "loss": 0.1889, "num_input_tokens_seen": 14190608, "step": 14135 }, { "epoch": 7.497348886532343, "grad_norm": 16.74580192565918, "learning_rate": 7.87015590184149e-06, "loss": 0.2756, "num_input_tokens_seen": 14195280, "step": 14140 }, { "epoch": 7.5, "grad_norm": 3.449141502380371, "learning_rate": 7.868261196522481e-06, "loss": 0.1862, "num_input_tokens_seen": 14200112, "step": 14145 }, { "epoch": 7.502651113467657, "grad_norm": 2.771008014678955, "learning_rate": 7.866365877116258e-06, "loss": 0.1421, "num_input_tokens_seen": 14205264, "step": 14150 }, { "epoch": 7.505302226935313, "grad_norm": 5.215397834777832, "learning_rate": 7.8644699440286e-06, "loss": 0.3514, "num_input_tokens_seen": 14209680, "step": 14155 }, { "epoch": 7.5079533404029695, "grad_norm": 10.0499849319458, "learning_rate": 7.862573397665424e-06, "loss": 0.1488, "num_input_tokens_seen": 14214992, "step": 14160 }, { "epoch": 7.510604453870625, "grad_norm": 7.058725833892822, "learning_rate": 7.860676238432773e-06, "loss": 0.2127, "num_input_tokens_seen": 14220720, "step": 14165 }, { "epoch": 7.513255567338282, "grad_norm": 9.589844703674316, "learning_rate": 7.858778466736824e-06, "loss": 0.1615, "num_input_tokens_seen": 14224880, "step": 14170 }, { "epoch": 7.515906680805939, "grad_norm": 14.393291473388672, "learning_rate": 7.856880082983889e-06, "loss": 0.3643, "num_input_tokens_seen": 14229744, "step": 14175 }, { "epoch": 7.518557794273595, "grad_norm": 6.028214454650879, "learning_rate": 7.854981087580403e-06, "loss": 0.171, "num_input_tokens_seen": 14236144, "step": 14180 }, { "epoch": 7.521208907741252, "grad_norm": 19.20905303955078, "learning_rate": 7.853081480932933e-06, "loss": 0.3282, "num_input_tokens_seen": 14241168, "step": 14185 }, { "epoch": 7.5238600212089075, "grad_norm": 4.166212558746338, "learning_rate": 7.851181263448189e-06, "loss": 0.1623, "num_input_tokens_seen": 14245488, "step": 14190 }, { "epoch": 7.526511134676564, "grad_norm": 10.89203929901123, "learning_rate": 7.849280435532993e-06, "loss": 0.1916, "num_input_tokens_seen": 14251664, "step": 14195 }, { "epoch": 7.52916224814422, "grad_norm": 2.8260996341705322, "learning_rate": 7.847378997594314e-06, "loss": 0.2728, "num_input_tokens_seen": 14256208, "step": 14200 }, { "epoch": 7.531813361611877, "grad_norm": 11.150747299194336, "learning_rate": 7.845476950039242e-06, "loss": 0.2283, "num_input_tokens_seen": 14264208, "step": 14205 }, { "epoch": 7.534464475079534, "grad_norm": 13.364361763000488, "learning_rate": 7.843574293275002e-06, "loss": 0.2355, "num_input_tokens_seen": 14268656, "step": 14210 }, { "epoch": 7.53711558854719, "grad_norm": 1.89669930934906, "learning_rate": 7.841671027708945e-06, "loss": 0.1562, "num_input_tokens_seen": 14272752, "step": 14215 }, { "epoch": 7.5397667020148464, "grad_norm": 5.648697376251221, "learning_rate": 7.839767153748556e-06, "loss": 0.1438, "num_input_tokens_seen": 14277712, "step": 14220 }, { "epoch": 7.542417815482502, "grad_norm": 14.927568435668945, "learning_rate": 7.837862671801452e-06, "loss": 0.2896, "num_input_tokens_seen": 14282128, "step": 14225 }, { "epoch": 7.545068928950159, "grad_norm": 14.681204795837402, "learning_rate": 7.835957582275376e-06, "loss": 0.3474, "num_input_tokens_seen": 14287472, "step": 14230 }, { "epoch": 7.547720042417815, "grad_norm": 6.266261100769043, "learning_rate": 7.834051885578202e-06, "loss": 0.2382, "num_input_tokens_seen": 14293040, "step": 14235 }, { "epoch": 7.550371155885472, "grad_norm": 21.338430404663086, "learning_rate": 7.832145582117935e-06, "loss": 0.3265, "num_input_tokens_seen": 14298000, "step": 14240 }, { "epoch": 7.553022269353129, "grad_norm": 8.876811981201172, "learning_rate": 7.83023867230271e-06, "loss": 0.3989, "num_input_tokens_seen": 14302576, "step": 14245 }, { "epoch": 7.5556733828207845, "grad_norm": 9.007966041564941, "learning_rate": 7.828331156540793e-06, "loss": 0.1466, "num_input_tokens_seen": 14307184, "step": 14250 }, { "epoch": 7.558324496288441, "grad_norm": 4.905295372009277, "learning_rate": 7.826423035240575e-06, "loss": 0.2094, "num_input_tokens_seen": 14312368, "step": 14255 }, { "epoch": 7.560975609756097, "grad_norm": 7.141935348510742, "learning_rate": 7.82451430881058e-06, "loss": 0.3205, "num_input_tokens_seen": 14316912, "step": 14260 }, { "epoch": 7.563626723223754, "grad_norm": 14.0625638961792, "learning_rate": 7.822604977659464e-06, "loss": 0.3364, "num_input_tokens_seen": 14321136, "step": 14265 }, { "epoch": 7.566277836691411, "grad_norm": 9.034582138061523, "learning_rate": 7.82069504219601e-06, "loss": 0.2518, "num_input_tokens_seen": 14326256, "step": 14270 }, { "epoch": 7.568928950159067, "grad_norm": 2.9659948348999023, "learning_rate": 7.818784502829124e-06, "loss": 0.1828, "num_input_tokens_seen": 14332176, "step": 14275 }, { "epoch": 7.571580063626723, "grad_norm": 10.501785278320312, "learning_rate": 7.816873359967856e-06, "loss": 0.3381, "num_input_tokens_seen": 14338320, "step": 14280 }, { "epoch": 7.574231177094379, "grad_norm": 7.678043842315674, "learning_rate": 7.81496161402137e-06, "loss": 0.17, "num_input_tokens_seen": 14344208, "step": 14285 }, { "epoch": 7.576882290562036, "grad_norm": 16.936033248901367, "learning_rate": 7.813049265398968e-06, "loss": 0.2365, "num_input_tokens_seen": 14349584, "step": 14290 }, { "epoch": 7.579533404029693, "grad_norm": 9.282112121582031, "learning_rate": 7.811136314510081e-06, "loss": 0.1669, "num_input_tokens_seen": 14353552, "step": 14295 }, { "epoch": 7.582184517497349, "grad_norm": 5.422773361206055, "learning_rate": 7.809222761764266e-06, "loss": 0.3091, "num_input_tokens_seen": 14358320, "step": 14300 }, { "epoch": 7.584835630965006, "grad_norm": 13.756950378417969, "learning_rate": 7.807308607571206e-06, "loss": 0.1811, "num_input_tokens_seen": 14362960, "step": 14305 }, { "epoch": 7.5874867444326615, "grad_norm": 15.089065551757812, "learning_rate": 7.80539385234072e-06, "loss": 0.278, "num_input_tokens_seen": 14367152, "step": 14310 }, { "epoch": 7.590137857900318, "grad_norm": 21.555030822753906, "learning_rate": 7.803478496482752e-06, "loss": 0.3635, "num_input_tokens_seen": 14371760, "step": 14315 }, { "epoch": 7.592788971367975, "grad_norm": 14.921504974365234, "learning_rate": 7.801562540407375e-06, "loss": 0.3258, "num_input_tokens_seen": 14376336, "step": 14320 }, { "epoch": 7.595440084835631, "grad_norm": 9.374452590942383, "learning_rate": 7.799645984524788e-06, "loss": 0.3156, "num_input_tokens_seen": 14381008, "step": 14325 }, { "epoch": 7.598091198303288, "grad_norm": 1.906618356704712, "learning_rate": 7.797728829245321e-06, "loss": 0.2674, "num_input_tokens_seen": 14385200, "step": 14330 }, { "epoch": 7.600742311770944, "grad_norm": 1.4732677936553955, "learning_rate": 7.795811074979432e-06, "loss": 0.1854, "num_input_tokens_seen": 14390896, "step": 14335 }, { "epoch": 7.6033934252386, "grad_norm": 7.040542125701904, "learning_rate": 7.79389272213771e-06, "loss": 0.2779, "num_input_tokens_seen": 14395664, "step": 14340 }, { "epoch": 7.606044538706256, "grad_norm": 14.461620330810547, "learning_rate": 7.791973771130866e-06, "loss": 0.3527, "num_input_tokens_seen": 14400144, "step": 14345 }, { "epoch": 7.608695652173913, "grad_norm": 10.902091026306152, "learning_rate": 7.790054222369743e-06, "loss": 0.2007, "num_input_tokens_seen": 14405008, "step": 14350 }, { "epoch": 7.611346765641569, "grad_norm": 10.444306373596191, "learning_rate": 7.78813407626531e-06, "loss": 0.1914, "num_input_tokens_seen": 14409488, "step": 14355 }, { "epoch": 7.613997879109226, "grad_norm": 3.215210437774658, "learning_rate": 7.786213333228671e-06, "loss": 0.1692, "num_input_tokens_seen": 14414480, "step": 14360 }, { "epoch": 7.6166489925768825, "grad_norm": 6.0938401222229, "learning_rate": 7.784291993671045e-06, "loss": 0.3035, "num_input_tokens_seen": 14419952, "step": 14365 }, { "epoch": 7.6193001060445384, "grad_norm": 9.175562858581543, "learning_rate": 7.782370058003791e-06, "loss": 0.2601, "num_input_tokens_seen": 14424176, "step": 14370 }, { "epoch": 7.621951219512195, "grad_norm": 16.126619338989258, "learning_rate": 7.780447526638386e-06, "loss": 0.3554, "num_input_tokens_seen": 14429488, "step": 14375 }, { "epoch": 7.624602332979851, "grad_norm": 8.602303504943848, "learning_rate": 7.778524399986445e-06, "loss": 0.1923, "num_input_tokens_seen": 14433584, "step": 14380 }, { "epoch": 7.627253446447508, "grad_norm": 12.28893756866455, "learning_rate": 7.776600678459697e-06, "loss": 0.2216, "num_input_tokens_seen": 14438416, "step": 14385 }, { "epoch": 7.629904559915165, "grad_norm": 3.8835954666137695, "learning_rate": 7.77467636247001e-06, "loss": 0.1753, "num_input_tokens_seen": 14442896, "step": 14390 }, { "epoch": 7.632555673382821, "grad_norm": 13.524107933044434, "learning_rate": 7.772751452429376e-06, "loss": 0.2168, "num_input_tokens_seen": 14450256, "step": 14395 }, { "epoch": 7.635206786850477, "grad_norm": 5.373943328857422, "learning_rate": 7.77082594874991e-06, "loss": 0.1512, "num_input_tokens_seen": 14455920, "step": 14400 }, { "epoch": 7.637857900318133, "grad_norm": 6.445881366729736, "learning_rate": 7.76889985184386e-06, "loss": 0.3158, "num_input_tokens_seen": 14460368, "step": 14405 }, { "epoch": 7.64050901378579, "grad_norm": 8.606791496276855, "learning_rate": 7.766973162123597e-06, "loss": 0.2619, "num_input_tokens_seen": 14465520, "step": 14410 }, { "epoch": 7.643160127253447, "grad_norm": 3.2384819984436035, "learning_rate": 7.76504588000162e-06, "loss": 0.1969, "num_input_tokens_seen": 14469968, "step": 14415 }, { "epoch": 7.645811240721103, "grad_norm": 4.281896591186523, "learning_rate": 7.763118005890555e-06, "loss": 0.151, "num_input_tokens_seen": 14474512, "step": 14420 }, { "epoch": 7.6484623541887595, "grad_norm": 7.9893293380737305, "learning_rate": 7.761189540203158e-06, "loss": 0.1584, "num_input_tokens_seen": 14479280, "step": 14425 }, { "epoch": 7.651113467656415, "grad_norm": 6.680288791656494, "learning_rate": 7.759260483352302e-06, "loss": 0.1813, "num_input_tokens_seen": 14483600, "step": 14430 }, { "epoch": 7.653764581124072, "grad_norm": 14.417469024658203, "learning_rate": 7.757330835751e-06, "loss": 0.3046, "num_input_tokens_seen": 14490608, "step": 14435 }, { "epoch": 7.656415694591729, "grad_norm": 17.415996551513672, "learning_rate": 7.75540059781238e-06, "loss": 0.2209, "num_input_tokens_seen": 14494512, "step": 14440 }, { "epoch": 7.659066808059385, "grad_norm": 7.033243656158447, "learning_rate": 7.753469769949701e-06, "loss": 0.2313, "num_input_tokens_seen": 14498800, "step": 14445 }, { "epoch": 7.661717921527042, "grad_norm": 11.181540489196777, "learning_rate": 7.75153835257635e-06, "loss": 0.232, "num_input_tokens_seen": 14502704, "step": 14450 }, { "epoch": 7.664369034994698, "grad_norm": 1.8412429094314575, "learning_rate": 7.749606346105839e-06, "loss": 0.1932, "num_input_tokens_seen": 14506864, "step": 14455 }, { "epoch": 7.667020148462354, "grad_norm": 8.358061790466309, "learning_rate": 7.747673750951803e-06, "loss": 0.2341, "num_input_tokens_seen": 14511152, "step": 14460 }, { "epoch": 7.66967126193001, "grad_norm": 17.470300674438477, "learning_rate": 7.745740567528006e-06, "loss": 0.2578, "num_input_tokens_seen": 14516464, "step": 14465 }, { "epoch": 7.672322375397667, "grad_norm": 15.7529296875, "learning_rate": 7.743806796248342e-06, "loss": 0.2379, "num_input_tokens_seen": 14521104, "step": 14470 }, { "epoch": 7.674973488865324, "grad_norm": 5.2612996101379395, "learning_rate": 7.741872437526818e-06, "loss": 0.3889, "num_input_tokens_seen": 14528080, "step": 14475 }, { "epoch": 7.67762460233298, "grad_norm": 7.781287670135498, "learning_rate": 7.739937491777583e-06, "loss": 0.3082, "num_input_tokens_seen": 14532752, "step": 14480 }, { "epoch": 7.6802757158006365, "grad_norm": 16.107545852661133, "learning_rate": 7.7380019594149e-06, "loss": 0.3354, "num_input_tokens_seen": 14537744, "step": 14485 }, { "epoch": 7.682926829268292, "grad_norm": 7.255747318267822, "learning_rate": 7.736065840853162e-06, "loss": 0.2653, "num_input_tokens_seen": 14542480, "step": 14490 }, { "epoch": 7.685577942735949, "grad_norm": 13.631837844848633, "learning_rate": 7.734129136506887e-06, "loss": 0.2189, "num_input_tokens_seen": 14547984, "step": 14495 }, { "epoch": 7.688229056203605, "grad_norm": 10.915251731872559, "learning_rate": 7.732191846790717e-06, "loss": 0.1593, "num_input_tokens_seen": 14553904, "step": 14500 }, { "epoch": 7.690880169671262, "grad_norm": 14.293415069580078, "learning_rate": 7.730253972119425e-06, "loss": 0.3111, "num_input_tokens_seen": 14559696, "step": 14505 }, { "epoch": 7.693531283138919, "grad_norm": 7.539002418518066, "learning_rate": 7.7283155129079e-06, "loss": 0.21, "num_input_tokens_seen": 14566224, "step": 14510 }, { "epoch": 7.6961823966065745, "grad_norm": 6.891801357269287, "learning_rate": 7.726376469571165e-06, "loss": 0.1989, "num_input_tokens_seen": 14571152, "step": 14515 }, { "epoch": 7.698833510074231, "grad_norm": 8.116697311401367, "learning_rate": 7.724436842524359e-06, "loss": 0.1759, "num_input_tokens_seen": 14577040, "step": 14520 }, { "epoch": 7.701484623541887, "grad_norm": 5.379748344421387, "learning_rate": 7.722496632182756e-06, "loss": 0.206, "num_input_tokens_seen": 14582096, "step": 14525 }, { "epoch": 7.704135737009544, "grad_norm": 10.253973007202148, "learning_rate": 7.720555838961747e-06, "loss": 0.1693, "num_input_tokens_seen": 14587440, "step": 14530 }, { "epoch": 7.706786850477201, "grad_norm": 13.040416717529297, "learning_rate": 7.718614463276852e-06, "loss": 0.3068, "num_input_tokens_seen": 14592304, "step": 14535 }, { "epoch": 7.709437963944857, "grad_norm": 13.03803825378418, "learning_rate": 7.716672505543714e-06, "loss": 0.2919, "num_input_tokens_seen": 14596688, "step": 14540 }, { "epoch": 7.7120890774125135, "grad_norm": 7.514241695404053, "learning_rate": 7.714729966178104e-06, "loss": 0.1674, "num_input_tokens_seen": 14602256, "step": 14545 }, { "epoch": 7.714740190880169, "grad_norm": 7.978521347045898, "learning_rate": 7.71278684559591e-06, "loss": 0.204, "num_input_tokens_seen": 14608688, "step": 14550 }, { "epoch": 7.717391304347826, "grad_norm": 1.4202221632003784, "learning_rate": 7.710843144213151e-06, "loss": 0.1436, "num_input_tokens_seen": 14614160, "step": 14555 }, { "epoch": 7.720042417815483, "grad_norm": 14.593958854675293, "learning_rate": 7.708898862445968e-06, "loss": 0.1606, "num_input_tokens_seen": 14618288, "step": 14560 }, { "epoch": 7.722693531283139, "grad_norm": 11.721251487731934, "learning_rate": 7.706954000710629e-06, "loss": 0.1746, "num_input_tokens_seen": 14623824, "step": 14565 }, { "epoch": 7.725344644750796, "grad_norm": 13.194318771362305, "learning_rate": 7.70500855942352e-06, "loss": 0.3293, "num_input_tokens_seen": 14628912, "step": 14570 }, { "epoch": 7.7279957582184515, "grad_norm": 6.213490009307861, "learning_rate": 7.703062539001158e-06, "loss": 0.1447, "num_input_tokens_seen": 14634928, "step": 14575 }, { "epoch": 7.730646871686108, "grad_norm": 15.125750541687012, "learning_rate": 7.701115939860178e-06, "loss": 0.3562, "num_input_tokens_seen": 14640944, "step": 14580 }, { "epoch": 7.733297985153764, "grad_norm": 12.396320343017578, "learning_rate": 7.699168762417344e-06, "loss": 0.1633, "num_input_tokens_seen": 14646256, "step": 14585 }, { "epoch": 7.735949098621421, "grad_norm": 16.6790714263916, "learning_rate": 7.697221007089541e-06, "loss": 0.277, "num_input_tokens_seen": 14650896, "step": 14590 }, { "epoch": 7.738600212089078, "grad_norm": 7.588809490203857, "learning_rate": 7.695272674293779e-06, "loss": 0.1856, "num_input_tokens_seen": 14655888, "step": 14595 }, { "epoch": 7.741251325556734, "grad_norm": 12.478729248046875, "learning_rate": 7.693323764447188e-06, "loss": 0.3054, "num_input_tokens_seen": 14660048, "step": 14600 }, { "epoch": 7.7439024390243905, "grad_norm": 6.882058143615723, "learning_rate": 7.691374277967029e-06, "loss": 0.1434, "num_input_tokens_seen": 14665008, "step": 14605 }, { "epoch": 7.746553552492046, "grad_norm": 4.654575824737549, "learning_rate": 7.68942421527068e-06, "loss": 0.2747, "num_input_tokens_seen": 14671440, "step": 14610 }, { "epoch": 7.749204665959703, "grad_norm": 7.275289058685303, "learning_rate": 7.687473576775642e-06, "loss": 0.2193, "num_input_tokens_seen": 14676752, "step": 14615 }, { "epoch": 7.751855779427359, "grad_norm": 10.494091033935547, "learning_rate": 7.685522362899546e-06, "loss": 0.2124, "num_input_tokens_seen": 14681296, "step": 14620 }, { "epoch": 7.754506892895016, "grad_norm": 12.983842849731445, "learning_rate": 7.683570574060137e-06, "loss": 0.1343, "num_input_tokens_seen": 14685232, "step": 14625 }, { "epoch": 7.757158006362673, "grad_norm": 5.359222888946533, "learning_rate": 7.681618210675292e-06, "loss": 0.2713, "num_input_tokens_seen": 14690032, "step": 14630 }, { "epoch": 7.7598091198303285, "grad_norm": 10.873294830322266, "learning_rate": 7.679665273163004e-06, "loss": 0.2374, "num_input_tokens_seen": 14694768, "step": 14635 }, { "epoch": 7.762460233297985, "grad_norm": 14.763404846191406, "learning_rate": 7.677711761941394e-06, "loss": 0.4127, "num_input_tokens_seen": 14699408, "step": 14640 }, { "epoch": 7.765111346765641, "grad_norm": 6.376373291015625, "learning_rate": 7.675757677428702e-06, "loss": 0.1097, "num_input_tokens_seen": 14704784, "step": 14645 }, { "epoch": 7.767762460233298, "grad_norm": 11.730305671691895, "learning_rate": 7.673803020043294e-06, "loss": 0.3282, "num_input_tokens_seen": 14708752, "step": 14650 }, { "epoch": 7.770413573700955, "grad_norm": 12.906397819519043, "learning_rate": 7.671847790203655e-06, "loss": 0.2709, "num_input_tokens_seen": 14714032, "step": 14655 }, { "epoch": 7.773064687168611, "grad_norm": 8.024007797241211, "learning_rate": 7.669891988328397e-06, "loss": 0.1385, "num_input_tokens_seen": 14719312, "step": 14660 }, { "epoch": 7.775715800636267, "grad_norm": 4.785192489624023, "learning_rate": 7.667935614836248e-06, "loss": 0.2756, "num_input_tokens_seen": 14724048, "step": 14665 }, { "epoch": 7.778366914103923, "grad_norm": 11.695919036865234, "learning_rate": 7.665978670146066e-06, "loss": 0.382, "num_input_tokens_seen": 14729360, "step": 14670 }, { "epoch": 7.78101802757158, "grad_norm": 4.760873317718506, "learning_rate": 7.664021154676828e-06, "loss": 0.1624, "num_input_tokens_seen": 14733584, "step": 14675 }, { "epoch": 7.783669141039237, "grad_norm": 7.437098026275635, "learning_rate": 7.662063068847632e-06, "loss": 0.3057, "num_input_tokens_seen": 14737904, "step": 14680 }, { "epoch": 7.786320254506893, "grad_norm": 9.757353782653809, "learning_rate": 7.6601044130777e-06, "loss": 0.3087, "num_input_tokens_seen": 14743024, "step": 14685 }, { "epoch": 7.78897136797455, "grad_norm": 11.724640846252441, "learning_rate": 7.658145187786373e-06, "loss": 0.3142, "num_input_tokens_seen": 14747888, "step": 14690 }, { "epoch": 7.7916224814422055, "grad_norm": 2.618208408355713, "learning_rate": 7.656185393393116e-06, "loss": 0.2232, "num_input_tokens_seen": 14752816, "step": 14695 }, { "epoch": 7.794273594909862, "grad_norm": 11.362000465393066, "learning_rate": 7.654225030317515e-06, "loss": 0.2803, "num_input_tokens_seen": 14758096, "step": 14700 }, { "epoch": 7.796924708377519, "grad_norm": 7.920212268829346, "learning_rate": 7.652264098979282e-06, "loss": 0.1299, "num_input_tokens_seen": 14762960, "step": 14705 }, { "epoch": 7.799575821845175, "grad_norm": 11.826669692993164, "learning_rate": 7.650302599798249e-06, "loss": 0.3029, "num_input_tokens_seen": 14767504, "step": 14710 }, { "epoch": 7.802226935312832, "grad_norm": 3.8927812576293945, "learning_rate": 7.64834053319436e-06, "loss": 0.2666, "num_input_tokens_seen": 14772496, "step": 14715 }, { "epoch": 7.804878048780488, "grad_norm": 10.172598838806152, "learning_rate": 7.646377899587695e-06, "loss": 0.2498, "num_input_tokens_seen": 14777712, "step": 14720 }, { "epoch": 7.807529162248144, "grad_norm": 8.96540355682373, "learning_rate": 7.644414699398446e-06, "loss": 0.3501, "num_input_tokens_seen": 14782800, "step": 14725 }, { "epoch": 7.8101802757158, "grad_norm": 7.532724857330322, "learning_rate": 7.64245093304693e-06, "loss": 0.1659, "num_input_tokens_seen": 14787568, "step": 14730 }, { "epoch": 7.812831389183457, "grad_norm": 3.7872164249420166, "learning_rate": 7.640486600953585e-06, "loss": 0.1671, "num_input_tokens_seen": 14792464, "step": 14735 }, { "epoch": 7.815482502651113, "grad_norm": 2.258837938308716, "learning_rate": 7.638521703538966e-06, "loss": 0.2419, "num_input_tokens_seen": 14797520, "step": 14740 }, { "epoch": 7.81813361611877, "grad_norm": 6.581565856933594, "learning_rate": 7.636556241223755e-06, "loss": 0.387, "num_input_tokens_seen": 14802192, "step": 14745 }, { "epoch": 7.820784729586427, "grad_norm": 11.367372512817383, "learning_rate": 7.634590214428753e-06, "loss": 0.224, "num_input_tokens_seen": 14806896, "step": 14750 }, { "epoch": 7.8234358430540825, "grad_norm": 5.488363265991211, "learning_rate": 7.63262362357488e-06, "loss": 0.2843, "num_input_tokens_seen": 14811760, "step": 14755 }, { "epoch": 7.826086956521739, "grad_norm": 2.8908486366271973, "learning_rate": 7.630656469083174e-06, "loss": 0.1677, "num_input_tokens_seen": 14816336, "step": 14760 }, { "epoch": 7.828738069989395, "grad_norm": 11.27586555480957, "learning_rate": 7.628688751374807e-06, "loss": 0.2593, "num_input_tokens_seen": 14820752, "step": 14765 }, { "epoch": 7.831389183457052, "grad_norm": 10.237471580505371, "learning_rate": 7.626720470871057e-06, "loss": 0.2816, "num_input_tokens_seen": 14825360, "step": 14770 }, { "epoch": 7.834040296924709, "grad_norm": 13.711464881896973, "learning_rate": 7.624751627993324e-06, "loss": 0.1765, "num_input_tokens_seen": 14830736, "step": 14775 }, { "epoch": 7.836691410392365, "grad_norm": 6.731103420257568, "learning_rate": 7.6227822231631365e-06, "loss": 0.1963, "num_input_tokens_seen": 14837328, "step": 14780 }, { "epoch": 7.839342523860021, "grad_norm": 7.456413269042969, "learning_rate": 7.6208122568021406e-06, "loss": 0.215, "num_input_tokens_seen": 14843056, "step": 14785 }, { "epoch": 7.841993637327677, "grad_norm": 20.093448638916016, "learning_rate": 7.618841729332096e-06, "loss": 0.2955, "num_input_tokens_seen": 14847536, "step": 14790 }, { "epoch": 7.844644750795334, "grad_norm": 12.92017936706543, "learning_rate": 7.6168706411748915e-06, "loss": 0.284, "num_input_tokens_seen": 14853552, "step": 14795 }, { "epoch": 7.847295864262991, "grad_norm": 8.883831977844238, "learning_rate": 7.61489899275253e-06, "loss": 0.2307, "num_input_tokens_seen": 14859376, "step": 14800 }, { "epoch": 7.849946977730647, "grad_norm": 7.587650775909424, "learning_rate": 7.612926784487136e-06, "loss": 0.204, "num_input_tokens_seen": 14863504, "step": 14805 }, { "epoch": 7.8525980911983035, "grad_norm": 13.415180206298828, "learning_rate": 7.610954016800956e-06, "loss": 0.2808, "num_input_tokens_seen": 14867760, "step": 14810 }, { "epoch": 7.855249204665959, "grad_norm": 2.6827282905578613, "learning_rate": 7.608980690116352e-06, "loss": 0.1962, "num_input_tokens_seen": 14873968, "step": 14815 }, { "epoch": 7.857900318133616, "grad_norm": 7.5223493576049805, "learning_rate": 7.60700680485581e-06, "loss": 0.2951, "num_input_tokens_seen": 14879856, "step": 14820 }, { "epoch": 7.860551431601273, "grad_norm": 9.430492401123047, "learning_rate": 7.605032361441933e-06, "loss": 0.216, "num_input_tokens_seen": 14885008, "step": 14825 }, { "epoch": 7.863202545068929, "grad_norm": 9.0667085647583, "learning_rate": 7.6030573602974445e-06, "loss": 0.28, "num_input_tokens_seen": 14890544, "step": 14830 }, { "epoch": 7.865853658536586, "grad_norm": 7.722686767578125, "learning_rate": 7.601081801845185e-06, "loss": 0.2471, "num_input_tokens_seen": 14897968, "step": 14835 }, { "epoch": 7.868504772004242, "grad_norm": 5.063793659210205, "learning_rate": 7.599105686508119e-06, "loss": 0.207, "num_input_tokens_seen": 14902928, "step": 14840 }, { "epoch": 7.871155885471898, "grad_norm": 5.892356872558594, "learning_rate": 7.597129014709328e-06, "loss": 0.1837, "num_input_tokens_seen": 14909168, "step": 14845 }, { "epoch": 7.873806998939554, "grad_norm": 8.2039213180542, "learning_rate": 7.595151786872009e-06, "loss": 0.2904, "num_input_tokens_seen": 14913776, "step": 14850 }, { "epoch": 7.876458112407211, "grad_norm": 6.687108039855957, "learning_rate": 7.593174003419483e-06, "loss": 0.2087, "num_input_tokens_seen": 14919344, "step": 14855 }, { "epoch": 7.879109225874867, "grad_norm": 6.30843448638916, "learning_rate": 7.591195664775191e-06, "loss": 0.2013, "num_input_tokens_seen": 14924592, "step": 14860 }, { "epoch": 7.881760339342524, "grad_norm": 12.427035331726074, "learning_rate": 7.589216771362685e-06, "loss": 0.2017, "num_input_tokens_seen": 14929424, "step": 14865 }, { "epoch": 7.8844114528101805, "grad_norm": 11.408576965332031, "learning_rate": 7.5872373236056454e-06, "loss": 0.2736, "num_input_tokens_seen": 14935312, "step": 14870 }, { "epoch": 7.887062566277836, "grad_norm": 9.743927001953125, "learning_rate": 7.585257321927863e-06, "loss": 0.29, "num_input_tokens_seen": 14939856, "step": 14875 }, { "epoch": 7.889713679745493, "grad_norm": 16.998695373535156, "learning_rate": 7.583276766753253e-06, "loss": 0.3794, "num_input_tokens_seen": 14946736, "step": 14880 }, { "epoch": 7.892364793213149, "grad_norm": 13.417512893676758, "learning_rate": 7.58129565850585e-06, "loss": 0.2354, "num_input_tokens_seen": 14950928, "step": 14885 }, { "epoch": 7.895015906680806, "grad_norm": 12.413893699645996, "learning_rate": 7.579313997609799e-06, "loss": 0.1861, "num_input_tokens_seen": 14955088, "step": 14890 }, { "epoch": 7.897667020148463, "grad_norm": 7.7516961097717285, "learning_rate": 7.57733178448937e-06, "loss": 0.2807, "num_input_tokens_seen": 14960368, "step": 14895 }, { "epoch": 7.900318133616119, "grad_norm": 6.027243137359619, "learning_rate": 7.575349019568951e-06, "loss": 0.225, "num_input_tokens_seen": 14965232, "step": 14900 }, { "epoch": 7.902969247083775, "grad_norm": 8.161507606506348, "learning_rate": 7.573365703273045e-06, "loss": 0.2118, "num_input_tokens_seen": 14969648, "step": 14905 }, { "epoch": 7.905620360551431, "grad_norm": 7.9280500411987305, "learning_rate": 7.571381836026277e-06, "loss": 0.1765, "num_input_tokens_seen": 14974096, "step": 14910 }, { "epoch": 7.908271474019088, "grad_norm": 21.24817657470703, "learning_rate": 7.569397418253386e-06, "loss": 0.3265, "num_input_tokens_seen": 14977904, "step": 14915 }, { "epoch": 7.910922587486745, "grad_norm": 3.876397132873535, "learning_rate": 7.567412450379231e-06, "loss": 0.1665, "num_input_tokens_seen": 14983824, "step": 14920 }, { "epoch": 7.913573700954401, "grad_norm": 6.844945907592773, "learning_rate": 7.56542693282879e-06, "loss": 0.2126, "num_input_tokens_seen": 14989200, "step": 14925 }, { "epoch": 7.9162248144220575, "grad_norm": 11.16896915435791, "learning_rate": 7.563440866027153e-06, "loss": 0.2412, "num_input_tokens_seen": 14994096, "step": 14930 }, { "epoch": 7.918875927889713, "grad_norm": 12.425985336303711, "learning_rate": 7.5614542503995355e-06, "loss": 0.3642, "num_input_tokens_seen": 14999248, "step": 14935 }, { "epoch": 7.92152704135737, "grad_norm": 15.246292114257812, "learning_rate": 7.559467086371267e-06, "loss": 0.2078, "num_input_tokens_seen": 15003472, "step": 14940 }, { "epoch": 7.924178154825027, "grad_norm": 10.973480224609375, "learning_rate": 7.557479374367792e-06, "loss": 0.3045, "num_input_tokens_seen": 15007856, "step": 14945 }, { "epoch": 7.926829268292683, "grad_norm": 3.5589773654937744, "learning_rate": 7.555491114814675e-06, "loss": 0.1882, "num_input_tokens_seen": 15013392, "step": 14950 }, { "epoch": 7.92948038176034, "grad_norm": 9.811996459960938, "learning_rate": 7.553502308137597e-06, "loss": 0.2086, "num_input_tokens_seen": 15017584, "step": 14955 }, { "epoch": 7.9321314952279955, "grad_norm": 11.124063491821289, "learning_rate": 7.551512954762359e-06, "loss": 0.2079, "num_input_tokens_seen": 15022256, "step": 14960 }, { "epoch": 7.934782608695652, "grad_norm": 9.960487365722656, "learning_rate": 7.549523055114871e-06, "loss": 0.2525, "num_input_tokens_seen": 15026960, "step": 14965 }, { "epoch": 7.937433722163308, "grad_norm": 14.38268756866455, "learning_rate": 7.547532609621168e-06, "loss": 0.2792, "num_input_tokens_seen": 15031952, "step": 14970 }, { "epoch": 7.940084835630965, "grad_norm": 7.6719560623168945, "learning_rate": 7.545541618707403e-06, "loss": 0.2036, "num_input_tokens_seen": 15037040, "step": 14975 }, { "epoch": 7.942735949098622, "grad_norm": 3.000035047531128, "learning_rate": 7.5435500827998355e-06, "loss": 0.1051, "num_input_tokens_seen": 15041104, "step": 14980 }, { "epoch": 7.945387062566278, "grad_norm": 14.435423851013184, "learning_rate": 7.541558002324851e-06, "loss": 0.2004, "num_input_tokens_seen": 15045584, "step": 14985 }, { "epoch": 7.9480381760339345, "grad_norm": 7.024955749511719, "learning_rate": 7.53956537770895e-06, "loss": 0.2235, "num_input_tokens_seen": 15050576, "step": 14990 }, { "epoch": 7.95068928950159, "grad_norm": 19.559932708740234, "learning_rate": 7.537572209378747e-06, "loss": 0.4679, "num_input_tokens_seen": 15055184, "step": 14995 }, { "epoch": 7.953340402969247, "grad_norm": 12.233707427978516, "learning_rate": 7.535578497760975e-06, "loss": 0.2272, "num_input_tokens_seen": 15060592, "step": 15000 }, { "epoch": 7.955991516436903, "grad_norm": 16.96025276184082, "learning_rate": 7.5335842432824794e-06, "loss": 0.2882, "num_input_tokens_seen": 15066768, "step": 15005 }, { "epoch": 7.95864262990456, "grad_norm": 5.380483150482178, "learning_rate": 7.531589446370229e-06, "loss": 0.1827, "num_input_tokens_seen": 15072016, "step": 15010 }, { "epoch": 7.961293743372217, "grad_norm": 4.049888610839844, "learning_rate": 7.5295941074513015e-06, "loss": 0.2169, "num_input_tokens_seen": 15077552, "step": 15015 }, { "epoch": 7.9639448568398725, "grad_norm": 11.519939422607422, "learning_rate": 7.527598226952895e-06, "loss": 0.3743, "num_input_tokens_seen": 15082896, "step": 15020 }, { "epoch": 7.966595970307529, "grad_norm": 8.859800338745117, "learning_rate": 7.525601805302321e-06, "loss": 0.3458, "num_input_tokens_seen": 15087472, "step": 15025 }, { "epoch": 7.969247083775185, "grad_norm": 6.680647850036621, "learning_rate": 7.523604842927011e-06, "loss": 0.2001, "num_input_tokens_seen": 15092784, "step": 15030 }, { "epoch": 7.971898197242842, "grad_norm": 5.8622870445251465, "learning_rate": 7.521607340254509e-06, "loss": 0.2196, "num_input_tokens_seen": 15097520, "step": 15035 }, { "epoch": 7.974549310710499, "grad_norm": 8.788570404052734, "learning_rate": 7.519609297712471e-06, "loss": 0.3431, "num_input_tokens_seen": 15102032, "step": 15040 }, { "epoch": 7.977200424178155, "grad_norm": 6.399003505706787, "learning_rate": 7.517610715728676e-06, "loss": 0.2095, "num_input_tokens_seen": 15107472, "step": 15045 }, { "epoch": 7.9798515376458115, "grad_norm": 8.517690658569336, "learning_rate": 7.515611594731016e-06, "loss": 0.2205, "num_input_tokens_seen": 15112624, "step": 15050 }, { "epoch": 7.982502651113467, "grad_norm": 5.323696613311768, "learning_rate": 7.513611935147496e-06, "loss": 0.1697, "num_input_tokens_seen": 15117840, "step": 15055 }, { "epoch": 7.985153764581124, "grad_norm": 12.254754066467285, "learning_rate": 7.511611737406237e-06, "loss": 0.2865, "num_input_tokens_seen": 15122320, "step": 15060 }, { "epoch": 7.987804878048781, "grad_norm": 8.165031433105469, "learning_rate": 7.509611001935477e-06, "loss": 0.1921, "num_input_tokens_seen": 15127664, "step": 15065 }, { "epoch": 7.990455991516437, "grad_norm": 12.120317459106445, "learning_rate": 7.5076097291635705e-06, "loss": 0.2023, "num_input_tokens_seen": 15132464, "step": 15070 }, { "epoch": 7.993107104984094, "grad_norm": 8.06482219696045, "learning_rate": 7.505607919518982e-06, "loss": 0.1988, "num_input_tokens_seen": 15138384, "step": 15075 }, { "epoch": 7.9957582184517495, "grad_norm": 8.511544227600098, "learning_rate": 7.503605573430295e-06, "loss": 0.3045, "num_input_tokens_seen": 15143760, "step": 15080 }, { "epoch": 7.998409331919406, "grad_norm": 6.550633907318115, "learning_rate": 7.501602691326204e-06, "loss": 0.2267, "num_input_tokens_seen": 15149008, "step": 15085 }, { "epoch": 8.0, "eval_loss": 0.3523215353488922, "eval_runtime": 29.3272, "eval_samples_per_second": 64.309, "eval_steps_per_second": 16.094, "num_input_tokens_seen": 15150888, "step": 15088 }, { "epoch": 8.001060445387063, "grad_norm": 13.894933700561523, "learning_rate": 7.499599273635524e-06, "loss": 0.4056, "num_input_tokens_seen": 15152584, "step": 15090 }, { "epoch": 8.00371155885472, "grad_norm": 6.996933460235596, "learning_rate": 7.497595320787181e-06, "loss": 0.1485, "num_input_tokens_seen": 15157384, "step": 15095 }, { "epoch": 8.006362672322375, "grad_norm": 9.032930374145508, "learning_rate": 7.495590833210215e-06, "loss": 0.2239, "num_input_tokens_seen": 15161896, "step": 15100 }, { "epoch": 8.009013785790032, "grad_norm": 3.8180079460144043, "learning_rate": 7.493585811333781e-06, "loss": 0.1605, "num_input_tokens_seen": 15166856, "step": 15105 }, { "epoch": 8.011664899257688, "grad_norm": 8.019796371459961, "learning_rate": 7.491580255587151e-06, "loss": 0.1662, "num_input_tokens_seen": 15172360, "step": 15110 }, { "epoch": 8.014316012725345, "grad_norm": 7.891743183135986, "learning_rate": 7.489574166399708e-06, "loss": 0.1975, "num_input_tokens_seen": 15177032, "step": 15115 }, { "epoch": 8.016967126193, "grad_norm": 2.888944149017334, "learning_rate": 7.4875675442009485e-06, "loss": 0.1114, "num_input_tokens_seen": 15181256, "step": 15120 }, { "epoch": 8.019618239660657, "grad_norm": 11.466632843017578, "learning_rate": 7.48556038942049e-06, "loss": 0.231, "num_input_tokens_seen": 15187304, "step": 15125 }, { "epoch": 8.022269353128314, "grad_norm": 9.787368774414062, "learning_rate": 7.483552702488054e-06, "loss": 0.0754, "num_input_tokens_seen": 15191464, "step": 15130 }, { "epoch": 8.02492046659597, "grad_norm": 9.404413223266602, "learning_rate": 7.481544483833485e-06, "loss": 0.1259, "num_input_tokens_seen": 15196008, "step": 15135 }, { "epoch": 8.027571580063627, "grad_norm": 12.238271713256836, "learning_rate": 7.4795357338867334e-06, "loss": 0.1793, "num_input_tokens_seen": 15200520, "step": 15140 }, { "epoch": 8.030222693531282, "grad_norm": 12.716482162475586, "learning_rate": 7.47752645307787e-06, "loss": 0.2075, "num_input_tokens_seen": 15204264, "step": 15145 }, { "epoch": 8.03287380699894, "grad_norm": 4.202263355255127, "learning_rate": 7.475516641837077e-06, "loss": 0.1563, "num_input_tokens_seen": 15208168, "step": 15150 }, { "epoch": 8.035524920466596, "grad_norm": 11.964009284973145, "learning_rate": 7.473506300594649e-06, "loss": 0.1944, "num_input_tokens_seen": 15212968, "step": 15155 }, { "epoch": 8.038176033934253, "grad_norm": 7.331450462341309, "learning_rate": 7.471495429780994e-06, "loss": 0.1404, "num_input_tokens_seen": 15218504, "step": 15160 }, { "epoch": 8.04082714740191, "grad_norm": 9.436690330505371, "learning_rate": 7.4694840298266345e-06, "loss": 0.1517, "num_input_tokens_seen": 15222760, "step": 15165 }, { "epoch": 8.043478260869565, "grad_norm": 18.949750900268555, "learning_rate": 7.467472101162206e-06, "loss": 0.2242, "num_input_tokens_seen": 15228616, "step": 15170 }, { "epoch": 8.046129374337221, "grad_norm": 9.619037628173828, "learning_rate": 7.465459644218458e-06, "loss": 0.1768, "num_input_tokens_seen": 15233512, "step": 15175 }, { "epoch": 8.048780487804878, "grad_norm": 15.780121803283691, "learning_rate": 7.463446659426251e-06, "loss": 0.1961, "num_input_tokens_seen": 15238760, "step": 15180 }, { "epoch": 8.051431601272535, "grad_norm": 9.255861282348633, "learning_rate": 7.461433147216561e-06, "loss": 0.184, "num_input_tokens_seen": 15244136, "step": 15185 }, { "epoch": 8.054082714740192, "grad_norm": 18.087940216064453, "learning_rate": 7.459419108020476e-06, "loss": 0.2968, "num_input_tokens_seen": 15249128, "step": 15190 }, { "epoch": 8.056733828207847, "grad_norm": 8.681056022644043, "learning_rate": 7.4574045422691935e-06, "loss": 0.1098, "num_input_tokens_seen": 15253512, "step": 15195 }, { "epoch": 8.059384941675503, "grad_norm": 16.298093795776367, "learning_rate": 7.455389450394031e-06, "loss": 0.2541, "num_input_tokens_seen": 15257896, "step": 15200 }, { "epoch": 8.06203605514316, "grad_norm": 10.658269882202148, "learning_rate": 7.453373832826411e-06, "loss": 0.2668, "num_input_tokens_seen": 15262952, "step": 15205 }, { "epoch": 8.064687168610817, "grad_norm": 7.622495174407959, "learning_rate": 7.451357689997873e-06, "loss": 0.0641, "num_input_tokens_seen": 15267528, "step": 15210 }, { "epoch": 8.067338282078474, "grad_norm": 9.874733924865723, "learning_rate": 7.4493410223400676e-06, "loss": 0.1622, "num_input_tokens_seen": 15273320, "step": 15215 }, { "epoch": 8.069989395546129, "grad_norm": 13.92279052734375, "learning_rate": 7.447323830284761e-06, "loss": 0.2644, "num_input_tokens_seen": 15278344, "step": 15220 }, { "epoch": 8.072640509013786, "grad_norm": 3.1815216541290283, "learning_rate": 7.445306114263824e-06, "loss": 0.1799, "num_input_tokens_seen": 15284488, "step": 15225 }, { "epoch": 8.075291622481442, "grad_norm": 1.4054831266403198, "learning_rate": 7.443287874709247e-06, "loss": 0.1403, "num_input_tokens_seen": 15289032, "step": 15230 }, { "epoch": 8.0779427359491, "grad_norm": 17.127140045166016, "learning_rate": 7.441269112053131e-06, "loss": 0.3517, "num_input_tokens_seen": 15293832, "step": 15235 }, { "epoch": 8.080593849416754, "grad_norm": 4.858790874481201, "learning_rate": 7.439249826727686e-06, "loss": 0.1279, "num_input_tokens_seen": 15299304, "step": 15240 }, { "epoch": 8.083244962884411, "grad_norm": 16.06317901611328, "learning_rate": 7.437230019165234e-06, "loss": 0.2365, "num_input_tokens_seen": 15304360, "step": 15245 }, { "epoch": 8.085896076352068, "grad_norm": 10.19212532043457, "learning_rate": 7.435209689798214e-06, "loss": 0.1551, "num_input_tokens_seen": 15309640, "step": 15250 }, { "epoch": 8.088547189819725, "grad_norm": 5.630555152893066, "learning_rate": 7.433188839059171e-06, "loss": 0.163, "num_input_tokens_seen": 15314824, "step": 15255 }, { "epoch": 8.091198303287381, "grad_norm": 11.42951774597168, "learning_rate": 7.431167467380767e-06, "loss": 0.2727, "num_input_tokens_seen": 15322536, "step": 15260 }, { "epoch": 8.093849416755036, "grad_norm": 12.291396141052246, "learning_rate": 7.429145575195767e-06, "loss": 0.2191, "num_input_tokens_seen": 15326792, "step": 15265 }, { "epoch": 8.096500530222693, "grad_norm": 18.114404678344727, "learning_rate": 7.427123162937055e-06, "loss": 0.2264, "num_input_tokens_seen": 15331304, "step": 15270 }, { "epoch": 8.09915164369035, "grad_norm": 6.972524166107178, "learning_rate": 7.425100231037628e-06, "loss": 0.3213, "num_input_tokens_seen": 15336872, "step": 15275 }, { "epoch": 8.101802757158007, "grad_norm": 17.49612808227539, "learning_rate": 7.423076779930587e-06, "loss": 0.2328, "num_input_tokens_seen": 15341288, "step": 15280 }, { "epoch": 8.104453870625663, "grad_norm": 12.982752799987793, "learning_rate": 7.421052810049146e-06, "loss": 0.1499, "num_input_tokens_seen": 15346024, "step": 15285 }, { "epoch": 8.107104984093318, "grad_norm": 3.6196155548095703, "learning_rate": 7.419028321826635e-06, "loss": 0.1646, "num_input_tokens_seen": 15351496, "step": 15290 }, { "epoch": 8.109756097560975, "grad_norm": 9.594738960266113, "learning_rate": 7.4170033156964895e-06, "loss": 0.134, "num_input_tokens_seen": 15357480, "step": 15295 }, { "epoch": 8.112407211028632, "grad_norm": 5.5157318115234375, "learning_rate": 7.41497779209226e-06, "loss": 0.1162, "num_input_tokens_seen": 15361800, "step": 15300 }, { "epoch": 8.115058324496289, "grad_norm": 9.486688613891602, "learning_rate": 7.4129517514476036e-06, "loss": 0.1313, "num_input_tokens_seen": 15366440, "step": 15305 }, { "epoch": 8.117709437963946, "grad_norm": 4.754476547241211, "learning_rate": 7.410925194196292e-06, "loss": 0.1831, "num_input_tokens_seen": 15372104, "step": 15310 }, { "epoch": 8.1203605514316, "grad_norm": 9.012309074401855, "learning_rate": 7.408898120772203e-06, "loss": 0.3217, "num_input_tokens_seen": 15376168, "step": 15315 }, { "epoch": 8.123011664899257, "grad_norm": 11.676976203918457, "learning_rate": 7.406870531609331e-06, "loss": 0.1559, "num_input_tokens_seen": 15380488, "step": 15320 }, { "epoch": 8.125662778366914, "grad_norm": 6.724965572357178, "learning_rate": 7.404842427141779e-06, "loss": 0.1553, "num_input_tokens_seen": 15384552, "step": 15325 }, { "epoch": 8.128313891834571, "grad_norm": 6.778855323791504, "learning_rate": 7.402813807803752e-06, "loss": 0.2576, "num_input_tokens_seen": 15388488, "step": 15330 }, { "epoch": 8.130965005302228, "grad_norm": 7.829508304595947, "learning_rate": 7.400784674029579e-06, "loss": 0.104, "num_input_tokens_seen": 15393128, "step": 15335 }, { "epoch": 8.133616118769883, "grad_norm": 20.86159324645996, "learning_rate": 7.3987550262536875e-06, "loss": 0.2769, "num_input_tokens_seen": 15397736, "step": 15340 }, { "epoch": 8.13626723223754, "grad_norm": 7.748720645904541, "learning_rate": 7.396724864910624e-06, "loss": 0.2496, "num_input_tokens_seen": 15402664, "step": 15345 }, { "epoch": 8.138918345705196, "grad_norm": 0.7800996899604797, "learning_rate": 7.394694190435037e-06, "loss": 0.2117, "num_input_tokens_seen": 15407624, "step": 15350 }, { "epoch": 8.141569459172853, "grad_norm": 17.806421279907227, "learning_rate": 7.392663003261691e-06, "loss": 0.222, "num_input_tokens_seen": 15413224, "step": 15355 }, { "epoch": 8.14422057264051, "grad_norm": 9.187397003173828, "learning_rate": 7.390631303825456e-06, "loss": 0.1409, "num_input_tokens_seen": 15419080, "step": 15360 }, { "epoch": 8.146871686108165, "grad_norm": 3.190589427947998, "learning_rate": 7.388599092561315e-06, "loss": 0.1517, "num_input_tokens_seen": 15424136, "step": 15365 }, { "epoch": 8.149522799575822, "grad_norm": 11.28255844116211, "learning_rate": 7.386566369904358e-06, "loss": 0.1802, "num_input_tokens_seen": 15428520, "step": 15370 }, { "epoch": 8.152173913043478, "grad_norm": 15.849255561828613, "learning_rate": 7.384533136289789e-06, "loss": 0.1307, "num_input_tokens_seen": 15433320, "step": 15375 }, { "epoch": 8.154825026511135, "grad_norm": 5.351956844329834, "learning_rate": 7.382499392152913e-06, "loss": 0.145, "num_input_tokens_seen": 15438120, "step": 15380 }, { "epoch": 8.15747613997879, "grad_norm": 11.882217407226562, "learning_rate": 7.38046513792915e-06, "loss": 0.1736, "num_input_tokens_seen": 15444872, "step": 15385 }, { "epoch": 8.160127253446447, "grad_norm": 8.674811363220215, "learning_rate": 7.378430374054033e-06, "loss": 0.4242, "num_input_tokens_seen": 15450376, "step": 15390 }, { "epoch": 8.162778366914104, "grad_norm": 11.327913284301758, "learning_rate": 7.376395100963195e-06, "loss": 0.1928, "num_input_tokens_seen": 15454856, "step": 15395 }, { "epoch": 8.16542948038176, "grad_norm": 16.26247215270996, "learning_rate": 7.374359319092385e-06, "loss": 0.2895, "num_input_tokens_seen": 15458984, "step": 15400 }, { "epoch": 8.168080593849417, "grad_norm": 5.207380771636963, "learning_rate": 7.3723230288774565e-06, "loss": 0.2043, "num_input_tokens_seen": 15464040, "step": 15405 }, { "epoch": 8.170731707317072, "grad_norm": 14.487906455993652, "learning_rate": 7.370286230754377e-06, "loss": 0.2628, "num_input_tokens_seen": 15468136, "step": 15410 }, { "epoch": 8.17338282078473, "grad_norm": 11.471050262451172, "learning_rate": 7.368248925159216e-06, "loss": 0.2406, "num_input_tokens_seen": 15472616, "step": 15415 }, { "epoch": 8.176033934252386, "grad_norm": 13.519198417663574, "learning_rate": 7.3662111125281585e-06, "loss": 0.1641, "num_input_tokens_seen": 15478312, "step": 15420 }, { "epoch": 8.178685047720043, "grad_norm": 22.28777503967285, "learning_rate": 7.364172793297493e-06, "loss": 0.2223, "num_input_tokens_seen": 15482792, "step": 15425 }, { "epoch": 8.1813361611877, "grad_norm": 18.568742752075195, "learning_rate": 7.362133967903619e-06, "loss": 0.2197, "num_input_tokens_seen": 15487400, "step": 15430 }, { "epoch": 8.183987274655355, "grad_norm": 6.181830883026123, "learning_rate": 7.3600946367830435e-06, "loss": 0.1502, "num_input_tokens_seen": 15493544, "step": 15435 }, { "epoch": 8.186638388123011, "grad_norm": 7.932883262634277, "learning_rate": 7.358054800372383e-06, "loss": 0.2738, "num_input_tokens_seen": 15499016, "step": 15440 }, { "epoch": 8.189289501590668, "grad_norm": 4.70739221572876, "learning_rate": 7.35601445910836e-06, "loss": 0.1083, "num_input_tokens_seen": 15502824, "step": 15445 }, { "epoch": 8.191940615058325, "grad_norm": 9.744409561157227, "learning_rate": 7.353973613427805e-06, "loss": 0.2379, "num_input_tokens_seen": 15507880, "step": 15450 }, { "epoch": 8.194591728525982, "grad_norm": 25.889678955078125, "learning_rate": 7.3519322637676606e-06, "loss": 0.3291, "num_input_tokens_seen": 15512904, "step": 15455 }, { "epoch": 8.197242841993637, "grad_norm": 16.665081024169922, "learning_rate": 7.34989041056497e-06, "loss": 0.278, "num_input_tokens_seen": 15518184, "step": 15460 }, { "epoch": 8.199893955461294, "grad_norm": 11.262672424316406, "learning_rate": 7.347848054256895e-06, "loss": 0.3, "num_input_tokens_seen": 15523976, "step": 15465 }, { "epoch": 8.20254506892895, "grad_norm": 7.534533500671387, "learning_rate": 7.345805195280694e-06, "loss": 0.0976, "num_input_tokens_seen": 15528136, "step": 15470 }, { "epoch": 8.205196182396607, "grad_norm": 10.733166694641113, "learning_rate": 7.3437618340737394e-06, "loss": 0.2353, "num_input_tokens_seen": 15533608, "step": 15475 }, { "epoch": 8.207847295864262, "grad_norm": 21.14507293701172, "learning_rate": 7.341717971073508e-06, "loss": 0.2689, "num_input_tokens_seen": 15538568, "step": 15480 }, { "epoch": 8.210498409331919, "grad_norm": 15.433460235595703, "learning_rate": 7.339673606717588e-06, "loss": 0.2623, "num_input_tokens_seen": 15543432, "step": 15485 }, { "epoch": 8.213149522799576, "grad_norm": 5.669189453125, "learning_rate": 7.337628741443671e-06, "loss": 0.1775, "num_input_tokens_seen": 15548904, "step": 15490 }, { "epoch": 8.215800636267232, "grad_norm": 20.79669761657715, "learning_rate": 7.335583375689557e-06, "loss": 0.2703, "num_input_tokens_seen": 15553320, "step": 15495 }, { "epoch": 8.21845174973489, "grad_norm": 15.016453742980957, "learning_rate": 7.333537509893153e-06, "loss": 0.245, "num_input_tokens_seen": 15557192, "step": 15500 }, { "epoch": 8.221102863202544, "grad_norm": 19.398393630981445, "learning_rate": 7.331491144492475e-06, "loss": 0.2042, "num_input_tokens_seen": 15561960, "step": 15505 }, { "epoch": 8.223753976670201, "grad_norm": 6.240278244018555, "learning_rate": 7.329444279925646e-06, "loss": 0.1377, "num_input_tokens_seen": 15567592, "step": 15510 }, { "epoch": 8.226405090137858, "grad_norm": 2.9809153079986572, "learning_rate": 7.3273969166308885e-06, "loss": 0.0856, "num_input_tokens_seen": 15571784, "step": 15515 }, { "epoch": 8.229056203605515, "grad_norm": 11.95522689819336, "learning_rate": 7.325349055046544e-06, "loss": 0.3651, "num_input_tokens_seen": 15576616, "step": 15520 }, { "epoch": 8.231707317073171, "grad_norm": 19.901329040527344, "learning_rate": 7.323300695611051e-06, "loss": 0.1838, "num_input_tokens_seen": 15583080, "step": 15525 }, { "epoch": 8.234358430540826, "grad_norm": 18.978097915649414, "learning_rate": 7.321251838762959e-06, "loss": 0.1804, "num_input_tokens_seen": 15589992, "step": 15530 }, { "epoch": 8.237009544008483, "grad_norm": 10.472136497497559, "learning_rate": 7.31920248494092e-06, "loss": 0.1106, "num_input_tokens_seen": 15594888, "step": 15535 }, { "epoch": 8.23966065747614, "grad_norm": 15.187904357910156, "learning_rate": 7.3171526345837e-06, "loss": 0.2978, "num_input_tokens_seen": 15600552, "step": 15540 }, { "epoch": 8.242311770943797, "grad_norm": 11.217083930969238, "learning_rate": 7.315102288130164e-06, "loss": 0.0893, "num_input_tokens_seen": 15604872, "step": 15545 }, { "epoch": 8.244962884411454, "grad_norm": 12.845727920532227, "learning_rate": 7.313051446019286e-06, "loss": 0.1948, "num_input_tokens_seen": 15609608, "step": 15550 }, { "epoch": 8.247613997879109, "grad_norm": 4.39459228515625, "learning_rate": 7.311000108690145e-06, "loss": 0.2063, "num_input_tokens_seen": 15615560, "step": 15555 }, { "epoch": 8.250265111346765, "grad_norm": 8.204325675964355, "learning_rate": 7.30894827658193e-06, "loss": 0.2738, "num_input_tokens_seen": 15621352, "step": 15560 }, { "epoch": 8.252916224814422, "grad_norm": 16.055389404296875, "learning_rate": 7.30689595013393e-06, "loss": 0.2509, "num_input_tokens_seen": 15625544, "step": 15565 }, { "epoch": 8.255567338282079, "grad_norm": 16.223344802856445, "learning_rate": 7.304843129785543e-06, "loss": 0.1986, "num_input_tokens_seen": 15630952, "step": 15570 }, { "epoch": 8.258218451749736, "grad_norm": 11.42915153503418, "learning_rate": 7.302789815976272e-06, "loss": 0.215, "num_input_tokens_seen": 15635336, "step": 15575 }, { "epoch": 8.26086956521739, "grad_norm": 16.17356300354004, "learning_rate": 7.30073600914573e-06, "loss": 0.2288, "num_input_tokens_seen": 15640392, "step": 15580 }, { "epoch": 8.263520678685047, "grad_norm": 10.97680377960205, "learning_rate": 7.2986817097336285e-06, "loss": 0.2008, "num_input_tokens_seen": 15646376, "step": 15585 }, { "epoch": 8.266171792152704, "grad_norm": 5.332220077514648, "learning_rate": 7.296626918179786e-06, "loss": 0.1956, "num_input_tokens_seen": 15651816, "step": 15590 }, { "epoch": 8.268822905620361, "grad_norm": 4.399108409881592, "learning_rate": 7.2945716349241305e-06, "loss": 0.2124, "num_input_tokens_seen": 15656552, "step": 15595 }, { "epoch": 8.271474019088018, "grad_norm": 11.887110710144043, "learning_rate": 7.292515860406692e-06, "loss": 0.1911, "num_input_tokens_seen": 15661608, "step": 15600 }, { "epoch": 8.274125132555673, "grad_norm": 14.071810722351074, "learning_rate": 7.290459595067609e-06, "loss": 0.2803, "num_input_tokens_seen": 15667208, "step": 15605 }, { "epoch": 8.27677624602333, "grad_norm": 11.622577667236328, "learning_rate": 7.2884028393471185e-06, "loss": 0.2405, "num_input_tokens_seen": 15672232, "step": 15610 }, { "epoch": 8.279427359490986, "grad_norm": 8.59852409362793, "learning_rate": 7.286345593685568e-06, "loss": 0.1934, "num_input_tokens_seen": 15677576, "step": 15615 }, { "epoch": 8.282078472958643, "grad_norm": 4.596238613128662, "learning_rate": 7.284287858523409e-06, "loss": 0.1222, "num_input_tokens_seen": 15684200, "step": 15620 }, { "epoch": 8.284729586426298, "grad_norm": 13.711301803588867, "learning_rate": 7.282229634301198e-06, "loss": 0.3335, "num_input_tokens_seen": 15689000, "step": 15625 }, { "epoch": 8.287380699893955, "grad_norm": 16.15086555480957, "learning_rate": 7.2801709214595925e-06, "loss": 0.268, "num_input_tokens_seen": 15693736, "step": 15630 }, { "epoch": 8.290031813361612, "grad_norm": 12.504789352416992, "learning_rate": 7.27811172043936e-06, "loss": 0.1824, "num_input_tokens_seen": 15698408, "step": 15635 }, { "epoch": 8.292682926829269, "grad_norm": 6.642853260040283, "learning_rate": 7.27605203168137e-06, "loss": 0.1253, "num_input_tokens_seen": 15702696, "step": 15640 }, { "epoch": 8.295334040296925, "grad_norm": 3.661724805831909, "learning_rate": 7.273991855626595e-06, "loss": 0.3171, "num_input_tokens_seen": 15707496, "step": 15645 }, { "epoch": 8.29798515376458, "grad_norm": 18.89192008972168, "learning_rate": 7.271931192716115e-06, "loss": 0.2159, "num_input_tokens_seen": 15712616, "step": 15650 }, { "epoch": 8.300636267232237, "grad_norm": 17.738046646118164, "learning_rate": 7.269870043391112e-06, "loss": 0.2482, "num_input_tokens_seen": 15716968, "step": 15655 }, { "epoch": 8.303287380699894, "grad_norm": 15.496298789978027, "learning_rate": 7.267808408092871e-06, "loss": 0.2346, "num_input_tokens_seen": 15720968, "step": 15660 }, { "epoch": 8.30593849416755, "grad_norm": 12.660447120666504, "learning_rate": 7.265746287262785e-06, "loss": 0.1604, "num_input_tokens_seen": 15726536, "step": 15665 }, { "epoch": 8.308589607635207, "grad_norm": 18.510656356811523, "learning_rate": 7.263683681342346e-06, "loss": 0.1521, "num_input_tokens_seen": 15730760, "step": 15670 }, { "epoch": 8.311240721102862, "grad_norm": 14.079751968383789, "learning_rate": 7.261620590773156e-06, "loss": 0.1694, "num_input_tokens_seen": 15735304, "step": 15675 }, { "epoch": 8.31389183457052, "grad_norm": 16.69282341003418, "learning_rate": 7.259557015996915e-06, "loss": 0.1958, "num_input_tokens_seen": 15739592, "step": 15680 }, { "epoch": 8.316542948038176, "grad_norm": 0.28560686111450195, "learning_rate": 7.257492957455431e-06, "loss": 0.1067, "num_input_tokens_seen": 15744200, "step": 15685 }, { "epoch": 8.319194061505833, "grad_norm": 6.207469940185547, "learning_rate": 7.255428415590609e-06, "loss": 0.3895, "num_input_tokens_seen": 15749384, "step": 15690 }, { "epoch": 8.32184517497349, "grad_norm": 1.914132833480835, "learning_rate": 7.2533633908444676e-06, "loss": 0.2293, "num_input_tokens_seen": 15754120, "step": 15695 }, { "epoch": 8.324496288441145, "grad_norm": 13.11181354522705, "learning_rate": 7.251297883659121e-06, "loss": 0.2115, "num_input_tokens_seen": 15760168, "step": 15700 }, { "epoch": 8.327147401908801, "grad_norm": 21.150362014770508, "learning_rate": 7.249231894476787e-06, "loss": 0.2288, "num_input_tokens_seen": 15765512, "step": 15705 }, { "epoch": 8.329798515376458, "grad_norm": 16.51285171508789, "learning_rate": 7.24716542373979e-06, "loss": 0.231, "num_input_tokens_seen": 15770536, "step": 15710 }, { "epoch": 8.332449628844115, "grad_norm": 13.750916481018066, "learning_rate": 7.245098471890557e-06, "loss": 0.1893, "num_input_tokens_seen": 15775560, "step": 15715 }, { "epoch": 8.335100742311772, "grad_norm": 5.538909435272217, "learning_rate": 7.243031039371615e-06, "loss": 0.2373, "num_input_tokens_seen": 15780776, "step": 15720 }, { "epoch": 8.337751855779427, "grad_norm": 10.509048461914062, "learning_rate": 7.240963126625598e-06, "loss": 0.3068, "num_input_tokens_seen": 15786888, "step": 15725 }, { "epoch": 8.340402969247084, "grad_norm": 8.872994422912598, "learning_rate": 7.238894734095239e-06, "loss": 0.2036, "num_input_tokens_seen": 15791720, "step": 15730 }, { "epoch": 8.34305408271474, "grad_norm": 8.622492790222168, "learning_rate": 7.236825862223375e-06, "loss": 0.137, "num_input_tokens_seen": 15796200, "step": 15735 }, { "epoch": 8.345705196182397, "grad_norm": 11.444801330566406, "learning_rate": 7.234756511452949e-06, "loss": 0.2452, "num_input_tokens_seen": 15800392, "step": 15740 }, { "epoch": 8.348356309650054, "grad_norm": 17.480426788330078, "learning_rate": 7.232686682227001e-06, "loss": 0.2604, "num_input_tokens_seen": 15805800, "step": 15745 }, { "epoch": 8.351007423117709, "grad_norm": 5.036227703094482, "learning_rate": 7.230616374988676e-06, "loss": 0.2118, "num_input_tokens_seen": 15811080, "step": 15750 }, { "epoch": 8.353658536585366, "grad_norm": 16.559776306152344, "learning_rate": 7.228545590181223e-06, "loss": 0.3253, "num_input_tokens_seen": 15817768, "step": 15755 }, { "epoch": 8.356309650053023, "grad_norm": 8.84748649597168, "learning_rate": 7.226474328247991e-06, "loss": 0.2352, "num_input_tokens_seen": 15822824, "step": 15760 }, { "epoch": 8.35896076352068, "grad_norm": 16.790321350097656, "learning_rate": 7.22440258963243e-06, "loss": 0.2391, "num_input_tokens_seen": 15828648, "step": 15765 }, { "epoch": 8.361611876988334, "grad_norm": 9.579214096069336, "learning_rate": 7.222330374778096e-06, "loss": 0.1727, "num_input_tokens_seen": 15832968, "step": 15770 }, { "epoch": 8.364262990455991, "grad_norm": 7.516337871551514, "learning_rate": 7.220257684128644e-06, "loss": 0.2102, "num_input_tokens_seen": 15837672, "step": 15775 }, { "epoch": 8.366914103923648, "grad_norm": 3.039585828781128, "learning_rate": 7.2181845181278306e-06, "loss": 0.1644, "num_input_tokens_seen": 15843176, "step": 15780 }, { "epoch": 8.369565217391305, "grad_norm": 15.1897611618042, "learning_rate": 7.2161108772195185e-06, "loss": 0.1532, "num_input_tokens_seen": 15847752, "step": 15785 }, { "epoch": 8.372216330858961, "grad_norm": 4.142912864685059, "learning_rate": 7.214036761847666e-06, "loss": 0.248, "num_input_tokens_seen": 15852520, "step": 15790 }, { "epoch": 8.374867444326616, "grad_norm": 1.3442010879516602, "learning_rate": 7.211962172456335e-06, "loss": 0.0608, "num_input_tokens_seen": 15858120, "step": 15795 }, { "epoch": 8.377518557794273, "grad_norm": 6.753003120422363, "learning_rate": 7.209887109489692e-06, "loss": 0.1727, "num_input_tokens_seen": 15863560, "step": 15800 }, { "epoch": 8.38016967126193, "grad_norm": 9.053764343261719, "learning_rate": 7.207811573392e-06, "loss": 0.2692, "num_input_tokens_seen": 15868552, "step": 15805 }, { "epoch": 8.382820784729587, "grad_norm": 5.520711898803711, "learning_rate": 7.20573556460763e-06, "loss": 0.102, "num_input_tokens_seen": 15874184, "step": 15810 }, { "epoch": 8.385471898197244, "grad_norm": 7.589677333831787, "learning_rate": 7.203659083581046e-06, "loss": 0.1818, "num_input_tokens_seen": 15880520, "step": 15815 }, { "epoch": 8.388123011664899, "grad_norm": 10.67928695678711, "learning_rate": 7.201582130756818e-06, "loss": 0.2111, "num_input_tokens_seen": 15885160, "step": 15820 }, { "epoch": 8.390774125132555, "grad_norm": 2.256465435028076, "learning_rate": 7.199504706579617e-06, "loss": 0.202, "num_input_tokens_seen": 15890216, "step": 15825 }, { "epoch": 8.393425238600212, "grad_norm": 19.059553146362305, "learning_rate": 7.197426811494215e-06, "loss": 0.2053, "num_input_tokens_seen": 15894472, "step": 15830 }, { "epoch": 8.396076352067869, "grad_norm": 5.1282572746276855, "learning_rate": 7.1953484459454826e-06, "loss": 0.1163, "num_input_tokens_seen": 15898696, "step": 15835 }, { "epoch": 8.398727465535526, "grad_norm": 10.468833923339844, "learning_rate": 7.193269610378391e-06, "loss": 0.246, "num_input_tokens_seen": 15903848, "step": 15840 }, { "epoch": 8.40137857900318, "grad_norm": 9.439517974853516, "learning_rate": 7.191190305238016e-06, "loss": 0.1936, "num_input_tokens_seen": 15908712, "step": 15845 }, { "epoch": 8.404029692470838, "grad_norm": 2.9340248107910156, "learning_rate": 7.189110530969531e-06, "loss": 0.1741, "num_input_tokens_seen": 15914472, "step": 15850 }, { "epoch": 8.406680805938494, "grad_norm": 15.475702285766602, "learning_rate": 7.187030288018208e-06, "loss": 0.3648, "num_input_tokens_seen": 15918984, "step": 15855 }, { "epoch": 8.409331919406151, "grad_norm": 5.931075096130371, "learning_rate": 7.184949576829424e-06, "loss": 0.1649, "num_input_tokens_seen": 15924008, "step": 15860 }, { "epoch": 8.411983032873806, "grad_norm": 14.261041641235352, "learning_rate": 7.182868397848653e-06, "loss": 0.3524, "num_input_tokens_seen": 15928776, "step": 15865 }, { "epoch": 8.414634146341463, "grad_norm": 10.474635124206543, "learning_rate": 7.1807867515214714e-06, "loss": 0.2277, "num_input_tokens_seen": 15934120, "step": 15870 }, { "epoch": 8.41728525980912, "grad_norm": 17.631628036499023, "learning_rate": 7.1787046382935525e-06, "loss": 0.2182, "num_input_tokens_seen": 15939208, "step": 15875 }, { "epoch": 8.419936373276776, "grad_norm": 19.550508499145508, "learning_rate": 7.1766220586106695e-06, "loss": 0.2628, "num_input_tokens_seen": 15944392, "step": 15880 }, { "epoch": 8.422587486744433, "grad_norm": 18.41204833984375, "learning_rate": 7.1745390129187006e-06, "loss": 0.2594, "num_input_tokens_seen": 15950440, "step": 15885 }, { "epoch": 8.425238600212088, "grad_norm": 5.972517013549805, "learning_rate": 7.17245550166362e-06, "loss": 0.3629, "num_input_tokens_seen": 15956680, "step": 15890 }, { "epoch": 8.427889713679745, "grad_norm": 16.18816375732422, "learning_rate": 7.170371525291502e-06, "loss": 0.2677, "num_input_tokens_seen": 15961576, "step": 15895 }, { "epoch": 8.430540827147402, "grad_norm": 6.221053123474121, "learning_rate": 7.1682870842485176e-06, "loss": 0.1659, "num_input_tokens_seen": 15966536, "step": 15900 }, { "epoch": 8.433191940615059, "grad_norm": 7.7844719886779785, "learning_rate": 7.166202178980942e-06, "loss": 0.2038, "num_input_tokens_seen": 15971272, "step": 15905 }, { "epoch": 8.435843054082715, "grad_norm": 15.215343475341797, "learning_rate": 7.16411680993515e-06, "loss": 0.2353, "num_input_tokens_seen": 15976680, "step": 15910 }, { "epoch": 8.43849416755037, "grad_norm": 19.558908462524414, "learning_rate": 7.162030977557611e-06, "loss": 0.2166, "num_input_tokens_seen": 15981640, "step": 15915 }, { "epoch": 8.441145281018027, "grad_norm": 10.244382858276367, "learning_rate": 7.159944682294897e-06, "loss": 0.1619, "num_input_tokens_seen": 15985608, "step": 15920 }, { "epoch": 8.443796394485684, "grad_norm": 8.467784881591797, "learning_rate": 7.157857924593681e-06, "loss": 0.2363, "num_input_tokens_seen": 15991048, "step": 15925 }, { "epoch": 8.44644750795334, "grad_norm": 8.103511810302734, "learning_rate": 7.155770704900728e-06, "loss": 0.1326, "num_input_tokens_seen": 15996104, "step": 15930 }, { "epoch": 8.449098621420998, "grad_norm": 11.625608444213867, "learning_rate": 7.153683023662907e-06, "loss": 0.2088, "num_input_tokens_seen": 16001128, "step": 15935 }, { "epoch": 8.451749734888653, "grad_norm": 9.698564529418945, "learning_rate": 7.1515948813271875e-06, "loss": 0.1008, "num_input_tokens_seen": 16005832, "step": 15940 }, { "epoch": 8.45440084835631, "grad_norm": 16.882131576538086, "learning_rate": 7.149506278340634e-06, "loss": 0.2194, "num_input_tokens_seen": 16010920, "step": 15945 }, { "epoch": 8.457051961823966, "grad_norm": 2.4776015281677246, "learning_rate": 7.147417215150411e-06, "loss": 0.1517, "num_input_tokens_seen": 16015208, "step": 15950 }, { "epoch": 8.459703075291623, "grad_norm": 6.787606239318848, "learning_rate": 7.145327692203781e-06, "loss": 0.1899, "num_input_tokens_seen": 16019848, "step": 15955 }, { "epoch": 8.46235418875928, "grad_norm": 11.879033088684082, "learning_rate": 7.143237709948105e-06, "loss": 0.1376, "num_input_tokens_seen": 16024840, "step": 15960 }, { "epoch": 8.465005302226935, "grad_norm": 12.310953140258789, "learning_rate": 7.141147268830846e-06, "loss": 0.2552, "num_input_tokens_seen": 16029608, "step": 15965 }, { "epoch": 8.467656415694591, "grad_norm": 16.38657569885254, "learning_rate": 7.139056369299559e-06, "loss": 0.2044, "num_input_tokens_seen": 16035144, "step": 15970 }, { "epoch": 8.470307529162248, "grad_norm": 8.060364723205566, "learning_rate": 7.136965011801899e-06, "loss": 0.1052, "num_input_tokens_seen": 16039784, "step": 15975 }, { "epoch": 8.472958642629905, "grad_norm": 11.667742729187012, "learning_rate": 7.134873196785622e-06, "loss": 0.133, "num_input_tokens_seen": 16044456, "step": 15980 }, { "epoch": 8.475609756097562, "grad_norm": 11.551279067993164, "learning_rate": 7.1327809246985815e-06, "loss": 0.2211, "num_input_tokens_seen": 16048552, "step": 15985 }, { "epoch": 8.478260869565217, "grad_norm": 21.437082290649414, "learning_rate": 7.130688195988725e-06, "loss": 0.1883, "num_input_tokens_seen": 16053512, "step": 15990 }, { "epoch": 8.480911983032874, "grad_norm": 5.227959156036377, "learning_rate": 7.128595011104102e-06, "loss": 0.1318, "num_input_tokens_seen": 16058536, "step": 15995 }, { "epoch": 8.48356309650053, "grad_norm": 15.400530815124512, "learning_rate": 7.126501370492855e-06, "loss": 0.2699, "num_input_tokens_seen": 16063496, "step": 16000 }, { "epoch": 8.486214209968187, "grad_norm": 3.9432613849639893, "learning_rate": 7.124407274603232e-06, "loss": 0.1954, "num_input_tokens_seen": 16067912, "step": 16005 }, { "epoch": 8.488865323435842, "grad_norm": 16.629478454589844, "learning_rate": 7.122312723883569e-06, "loss": 0.1718, "num_input_tokens_seen": 16073704, "step": 16010 }, { "epoch": 8.491516436903499, "grad_norm": 7.182583808898926, "learning_rate": 7.120217718782305e-06, "loss": 0.1977, "num_input_tokens_seen": 16077800, "step": 16015 }, { "epoch": 8.494167550371156, "grad_norm": 14.11965274810791, "learning_rate": 7.118122259747976e-06, "loss": 0.2437, "num_input_tokens_seen": 16081832, "step": 16020 }, { "epoch": 8.496818663838813, "grad_norm": 9.395030975341797, "learning_rate": 7.116026347229215e-06, "loss": 0.2808, "num_input_tokens_seen": 16086792, "step": 16025 }, { "epoch": 8.49946977730647, "grad_norm": 16.3671875, "learning_rate": 7.113929981674747e-06, "loss": 0.2178, "num_input_tokens_seen": 16091848, "step": 16030 }, { "epoch": 8.502120890774124, "grad_norm": 20.18716812133789, "learning_rate": 7.111833163533403e-06, "loss": 0.2438, "num_input_tokens_seen": 16097064, "step": 16035 }, { "epoch": 8.504772004241781, "grad_norm": 4.552320957183838, "learning_rate": 7.109735893254106e-06, "loss": 0.3364, "num_input_tokens_seen": 16103560, "step": 16040 }, { "epoch": 8.507423117709438, "grad_norm": 15.221504211425781, "learning_rate": 7.1076381712858745e-06, "loss": 0.1861, "num_input_tokens_seen": 16107624, "step": 16045 }, { "epoch": 8.510074231177095, "grad_norm": 12.12710952758789, "learning_rate": 7.105539998077824e-06, "loss": 0.2399, "num_input_tokens_seen": 16112808, "step": 16050 }, { "epoch": 8.512725344644752, "grad_norm": 2.1812877655029297, "learning_rate": 7.1034413740791705e-06, "loss": 0.2147, "num_input_tokens_seen": 16119304, "step": 16055 }, { "epoch": 8.515376458112407, "grad_norm": 7.678706169128418, "learning_rate": 7.101342299739225e-06, "loss": 0.2393, "num_input_tokens_seen": 16124232, "step": 16060 }, { "epoch": 8.518027571580063, "grad_norm": 4.252578258514404, "learning_rate": 7.099242775507389e-06, "loss": 0.0787, "num_input_tokens_seen": 16129864, "step": 16065 }, { "epoch": 8.52067868504772, "grad_norm": 3.218536376953125, "learning_rate": 7.097142801833169e-06, "loss": 0.1152, "num_input_tokens_seen": 16135304, "step": 16070 }, { "epoch": 8.523329798515377, "grad_norm": 7.116354465484619, "learning_rate": 7.095042379166164e-06, "loss": 0.1331, "num_input_tokens_seen": 16140904, "step": 16075 }, { "epoch": 8.525980911983034, "grad_norm": 7.857928276062012, "learning_rate": 7.092941507956066e-06, "loss": 0.1828, "num_input_tokens_seen": 16146088, "step": 16080 }, { "epoch": 8.528632025450689, "grad_norm": 10.393987655639648, "learning_rate": 7.090840188652668e-06, "loss": 0.3109, "num_input_tokens_seen": 16151432, "step": 16085 }, { "epoch": 8.531283138918345, "grad_norm": 16.143306732177734, "learning_rate": 7.088738421705856e-06, "loss": 0.2598, "num_input_tokens_seen": 16155720, "step": 16090 }, { "epoch": 8.533934252386002, "grad_norm": 16.059303283691406, "learning_rate": 7.086636207565616e-06, "loss": 0.2483, "num_input_tokens_seen": 16160200, "step": 16095 }, { "epoch": 8.536585365853659, "grad_norm": 2.227260112762451, "learning_rate": 7.084533546682023e-06, "loss": 0.2111, "num_input_tokens_seen": 16166184, "step": 16100 }, { "epoch": 8.539236479321314, "grad_norm": 0.4789220690727234, "learning_rate": 7.082430439505251e-06, "loss": 0.1913, "num_input_tokens_seen": 16170792, "step": 16105 }, { "epoch": 8.54188759278897, "grad_norm": 6.901888847351074, "learning_rate": 7.080326886485572e-06, "loss": 0.3895, "num_input_tokens_seen": 16175880, "step": 16110 }, { "epoch": 8.544538706256628, "grad_norm": 2.8948585987091064, "learning_rate": 7.078222888073352e-06, "loss": 0.1828, "num_input_tokens_seen": 16184520, "step": 16115 }, { "epoch": 8.547189819724284, "grad_norm": 9.932499885559082, "learning_rate": 7.076118444719048e-06, "loss": 0.3071, "num_input_tokens_seen": 16190856, "step": 16120 }, { "epoch": 8.549840933191941, "grad_norm": 19.769332885742188, "learning_rate": 7.074013556873217e-06, "loss": 0.2218, "num_input_tokens_seen": 16195432, "step": 16125 }, { "epoch": 8.552492046659598, "grad_norm": 20.249225616455078, "learning_rate": 7.071908224986512e-06, "loss": 0.2949, "num_input_tokens_seen": 16199816, "step": 16130 }, { "epoch": 8.555143160127253, "grad_norm": 15.064894676208496, "learning_rate": 7.069802449509677e-06, "loss": 0.1654, "num_input_tokens_seen": 16205640, "step": 16135 }, { "epoch": 8.55779427359491, "grad_norm": 3.6290769577026367, "learning_rate": 7.067696230893555e-06, "loss": 0.172, "num_input_tokens_seen": 16210344, "step": 16140 }, { "epoch": 8.560445387062567, "grad_norm": 7.303335666656494, "learning_rate": 7.065589569589079e-06, "loss": 0.1241, "num_input_tokens_seen": 16215464, "step": 16145 }, { "epoch": 8.563096500530223, "grad_norm": 9.428692817687988, "learning_rate": 7.063482466047281e-06, "loss": 0.1251, "num_input_tokens_seen": 16220296, "step": 16150 }, { "epoch": 8.565747613997878, "grad_norm": 16.048494338989258, "learning_rate": 7.061374920719288e-06, "loss": 0.2973, "num_input_tokens_seen": 16225608, "step": 16155 }, { "epoch": 8.568398727465535, "grad_norm": 1.7627975940704346, "learning_rate": 7.059266934056318e-06, "loss": 0.1478, "num_input_tokens_seen": 16231432, "step": 16160 }, { "epoch": 8.571049840933192, "grad_norm": 4.7320051193237305, "learning_rate": 7.057158506509685e-06, "loss": 0.2094, "num_input_tokens_seen": 16236072, "step": 16165 }, { "epoch": 8.573700954400849, "grad_norm": 16.902368545532227, "learning_rate": 7.0550496385308e-06, "loss": 0.2429, "num_input_tokens_seen": 16240840, "step": 16170 }, { "epoch": 8.576352067868505, "grad_norm": 12.973830223083496, "learning_rate": 7.0529403305711656e-06, "loss": 0.2009, "num_input_tokens_seen": 16246792, "step": 16175 }, { "epoch": 8.57900318133616, "grad_norm": 9.53050708770752, "learning_rate": 7.050830583082377e-06, "loss": 0.1524, "num_input_tokens_seen": 16251496, "step": 16180 }, { "epoch": 8.581654294803817, "grad_norm": 2.739938497543335, "learning_rate": 7.048720396516127e-06, "loss": 0.0888, "num_input_tokens_seen": 16257064, "step": 16185 }, { "epoch": 8.584305408271474, "grad_norm": 13.137852668762207, "learning_rate": 7.046609771324202e-06, "loss": 0.2589, "num_input_tokens_seen": 16261832, "step": 16190 }, { "epoch": 8.58695652173913, "grad_norm": 10.730831146240234, "learning_rate": 7.04449870795848e-06, "loss": 0.2252, "num_input_tokens_seen": 16266664, "step": 16195 }, { "epoch": 8.589607635206788, "grad_norm": 9.314947128295898, "learning_rate": 7.042387206870935e-06, "loss": 0.2239, "num_input_tokens_seen": 16271784, "step": 16200 }, { "epoch": 8.592258748674443, "grad_norm": 9.384381294250488, "learning_rate": 7.040275268513632e-06, "loss": 0.1313, "num_input_tokens_seen": 16277448, "step": 16205 }, { "epoch": 8.5949098621421, "grad_norm": 16.33513069152832, "learning_rate": 7.038162893338735e-06, "loss": 0.2538, "num_input_tokens_seen": 16281832, "step": 16210 }, { "epoch": 8.597560975609756, "grad_norm": 18.986671447753906, "learning_rate": 7.0360500817984945e-06, "loss": 0.3004, "num_input_tokens_seen": 16287752, "step": 16215 }, { "epoch": 8.600212089077413, "grad_norm": 14.620759010314941, "learning_rate": 7.033936834345258e-06, "loss": 0.2255, "num_input_tokens_seen": 16292392, "step": 16220 }, { "epoch": 8.60286320254507, "grad_norm": 6.017468452453613, "learning_rate": 7.031823151431469e-06, "loss": 0.2325, "num_input_tokens_seen": 16298536, "step": 16225 }, { "epoch": 8.605514316012725, "grad_norm": 14.778165817260742, "learning_rate": 7.02970903350966e-06, "loss": 0.2354, "num_input_tokens_seen": 16303336, "step": 16230 }, { "epoch": 8.608165429480382, "grad_norm": 15.62004566192627, "learning_rate": 7.027594481032459e-06, "loss": 0.2247, "num_input_tokens_seen": 16307688, "step": 16235 }, { "epoch": 8.610816542948038, "grad_norm": 8.513693809509277, "learning_rate": 7.0254794944525835e-06, "loss": 0.2596, "num_input_tokens_seen": 16313288, "step": 16240 }, { "epoch": 8.613467656415695, "grad_norm": 10.356107711791992, "learning_rate": 7.0233640742228504e-06, "loss": 0.207, "num_input_tokens_seen": 16317448, "step": 16245 }, { "epoch": 8.61611876988335, "grad_norm": 7.790833950042725, "learning_rate": 7.021248220796162e-06, "loss": 0.2123, "num_input_tokens_seen": 16322792, "step": 16250 }, { "epoch": 8.618769883351007, "grad_norm": 13.633440017700195, "learning_rate": 7.0191319346255206e-06, "loss": 0.1586, "num_input_tokens_seen": 16327464, "step": 16255 }, { "epoch": 8.621420996818664, "grad_norm": 21.058584213256836, "learning_rate": 7.017015216164013e-06, "loss": 0.2587, "num_input_tokens_seen": 16332040, "step": 16260 }, { "epoch": 8.62407211028632, "grad_norm": 9.566964149475098, "learning_rate": 7.014898065864826e-06, "loss": 0.2125, "num_input_tokens_seen": 16337448, "step": 16265 }, { "epoch": 8.626723223753977, "grad_norm": 2.201434373855591, "learning_rate": 7.012780484181236e-06, "loss": 0.26, "num_input_tokens_seen": 16342280, "step": 16270 }, { "epoch": 8.629374337221632, "grad_norm": 5.495865821838379, "learning_rate": 7.010662471566613e-06, "loss": 0.2179, "num_input_tokens_seen": 16347688, "step": 16275 }, { "epoch": 8.632025450689289, "grad_norm": 15.066656112670898, "learning_rate": 7.008544028474413e-06, "loss": 0.1832, "num_input_tokens_seen": 16352808, "step": 16280 }, { "epoch": 8.634676564156946, "grad_norm": 10.629133224487305, "learning_rate": 7.006425155358195e-06, "loss": 0.1753, "num_input_tokens_seen": 16356872, "step": 16285 }, { "epoch": 8.637327677624603, "grad_norm": 13.335100173950195, "learning_rate": 7.0043058526716e-06, "loss": 0.2881, "num_input_tokens_seen": 16362152, "step": 16290 }, { "epoch": 8.63997879109226, "grad_norm": 7.847588539123535, "learning_rate": 7.002186120868368e-06, "loss": 0.1497, "num_input_tokens_seen": 16367560, "step": 16295 }, { "epoch": 8.642629904559914, "grad_norm": 9.151130676269531, "learning_rate": 7.000065960402325e-06, "loss": 0.1973, "num_input_tokens_seen": 16372072, "step": 16300 }, { "epoch": 8.645281018027571, "grad_norm": 13.724231719970703, "learning_rate": 6.9979453717273945e-06, "loss": 0.2063, "num_input_tokens_seen": 16376840, "step": 16305 }, { "epoch": 8.647932131495228, "grad_norm": 5.9473981857299805, "learning_rate": 6.995824355297589e-06, "loss": 0.2125, "num_input_tokens_seen": 16382600, "step": 16310 }, { "epoch": 8.650583244962885, "grad_norm": 3.729773759841919, "learning_rate": 6.99370291156701e-06, "loss": 0.115, "num_input_tokens_seen": 16387432, "step": 16315 }, { "epoch": 8.653234358430542, "grad_norm": 15.47608470916748, "learning_rate": 6.9915810409898545e-06, "loss": 0.35, "num_input_tokens_seen": 16392904, "step": 16320 }, { "epoch": 8.655885471898197, "grad_norm": 20.478635787963867, "learning_rate": 6.989458744020411e-06, "loss": 0.3221, "num_input_tokens_seen": 16397992, "step": 16325 }, { "epoch": 8.658536585365853, "grad_norm": 11.925193786621094, "learning_rate": 6.987336021113056e-06, "loss": 0.1969, "num_input_tokens_seen": 16402920, "step": 16330 }, { "epoch": 8.66118769883351, "grad_norm": 15.622661590576172, "learning_rate": 6.985212872722258e-06, "loss": 0.2103, "num_input_tokens_seen": 16408328, "step": 16335 }, { "epoch": 8.663838812301167, "grad_norm": 10.107172012329102, "learning_rate": 6.98308929930258e-06, "loss": 0.1804, "num_input_tokens_seen": 16416392, "step": 16340 }, { "epoch": 8.666489925768824, "grad_norm": 16.436460494995117, "learning_rate": 6.980965301308672e-06, "loss": 0.3075, "num_input_tokens_seen": 16421064, "step": 16345 }, { "epoch": 8.669141039236479, "grad_norm": 16.35237693786621, "learning_rate": 6.978840879195276e-06, "loss": 0.252, "num_input_tokens_seen": 16426024, "step": 16350 }, { "epoch": 8.671792152704136, "grad_norm": 6.093367099761963, "learning_rate": 6.976716033417227e-06, "loss": 0.1458, "num_input_tokens_seen": 16431880, "step": 16355 }, { "epoch": 8.674443266171792, "grad_norm": 3.1677920818328857, "learning_rate": 6.974590764429447e-06, "loss": 0.1576, "num_input_tokens_seen": 16436680, "step": 16360 }, { "epoch": 8.677094379639449, "grad_norm": 8.662678718566895, "learning_rate": 6.9724650726869514e-06, "loss": 0.1903, "num_input_tokens_seen": 16441160, "step": 16365 }, { "epoch": 8.679745493107106, "grad_norm": 2.731201171875, "learning_rate": 6.9703389586448446e-06, "loss": 0.2861, "num_input_tokens_seen": 16445832, "step": 16370 }, { "epoch": 8.68239660657476, "grad_norm": 10.484996795654297, "learning_rate": 6.968212422758322e-06, "loss": 0.1885, "num_input_tokens_seen": 16450088, "step": 16375 }, { "epoch": 8.685047720042418, "grad_norm": 19.98200225830078, "learning_rate": 6.966085465482673e-06, "loss": 0.2871, "num_input_tokens_seen": 16455176, "step": 16380 }, { "epoch": 8.687698833510074, "grad_norm": 1.746923804283142, "learning_rate": 6.963958087273268e-06, "loss": 0.118, "num_input_tokens_seen": 16460072, "step": 16385 }, { "epoch": 8.690349946977731, "grad_norm": 21.163183212280273, "learning_rate": 6.961830288585576e-06, "loss": 0.2333, "num_input_tokens_seen": 16464328, "step": 16390 }, { "epoch": 8.693001060445386, "grad_norm": 18.555177688598633, "learning_rate": 6.959702069875154e-06, "loss": 0.2309, "num_input_tokens_seen": 16468616, "step": 16395 }, { "epoch": 8.695652173913043, "grad_norm": 11.02647590637207, "learning_rate": 6.957573431597646e-06, "loss": 0.1235, "num_input_tokens_seen": 16474376, "step": 16400 }, { "epoch": 8.6983032873807, "grad_norm": 7.549180507659912, "learning_rate": 6.955444374208788e-06, "loss": 0.1791, "num_input_tokens_seen": 16480520, "step": 16405 }, { "epoch": 8.700954400848357, "grad_norm": 12.505334854125977, "learning_rate": 6.953314898164407e-06, "loss": 0.1656, "num_input_tokens_seen": 16485960, "step": 16410 }, { "epoch": 8.703605514316013, "grad_norm": 10.542779922485352, "learning_rate": 6.951185003920418e-06, "loss": 0.1463, "num_input_tokens_seen": 16490888, "step": 16415 }, { "epoch": 8.706256627783668, "grad_norm": 6.060083866119385, "learning_rate": 6.949054691932825e-06, "loss": 0.2328, "num_input_tokens_seen": 16495464, "step": 16420 }, { "epoch": 8.708907741251325, "grad_norm": 12.645034790039062, "learning_rate": 6.9469239626577234e-06, "loss": 0.303, "num_input_tokens_seen": 16500712, "step": 16425 }, { "epoch": 8.711558854718982, "grad_norm": 13.473114967346191, "learning_rate": 6.944792816551295e-06, "loss": 0.2971, "num_input_tokens_seen": 16506312, "step": 16430 }, { "epoch": 8.714209968186639, "grad_norm": 8.474678993225098, "learning_rate": 6.942661254069813e-06, "loss": 0.2106, "num_input_tokens_seen": 16511400, "step": 16435 }, { "epoch": 8.716861081654296, "grad_norm": 13.650873184204102, "learning_rate": 6.940529275669642e-06, "loss": 0.206, "num_input_tokens_seen": 16516200, "step": 16440 }, { "epoch": 8.71951219512195, "grad_norm": 11.969503402709961, "learning_rate": 6.93839688180723e-06, "loss": 0.3346, "num_input_tokens_seen": 16520456, "step": 16445 }, { "epoch": 8.722163308589607, "grad_norm": 14.585583686828613, "learning_rate": 6.9362640729391165e-06, "loss": 0.3004, "num_input_tokens_seen": 16525512, "step": 16450 }, { "epoch": 8.724814422057264, "grad_norm": 7.594461917877197, "learning_rate": 6.934130849521933e-06, "loss": 0.1635, "num_input_tokens_seen": 16530312, "step": 16455 }, { "epoch": 8.72746553552492, "grad_norm": 16.14742088317871, "learning_rate": 6.931997212012396e-06, "loss": 0.2563, "num_input_tokens_seen": 16536712, "step": 16460 }, { "epoch": 8.730116648992578, "grad_norm": 10.819111824035645, "learning_rate": 6.9298631608673115e-06, "loss": 0.2129, "num_input_tokens_seen": 16540936, "step": 16465 }, { "epoch": 8.732767762460233, "grad_norm": 11.97216510772705, "learning_rate": 6.927728696543573e-06, "loss": 0.1886, "num_input_tokens_seen": 16546376, "step": 16470 }, { "epoch": 8.73541887592789, "grad_norm": 12.981947898864746, "learning_rate": 6.9255938194981654e-06, "loss": 0.155, "num_input_tokens_seen": 16552168, "step": 16475 }, { "epoch": 8.738069989395546, "grad_norm": 18.908653259277344, "learning_rate": 6.923458530188162e-06, "loss": 0.2194, "num_input_tokens_seen": 16556680, "step": 16480 }, { "epoch": 8.740721102863203, "grad_norm": 3.076826333999634, "learning_rate": 6.921322829070718e-06, "loss": 0.1154, "num_input_tokens_seen": 16561128, "step": 16485 }, { "epoch": 8.743372216330858, "grad_norm": 15.175497055053711, "learning_rate": 6.9191867166030835e-06, "loss": 0.2074, "num_input_tokens_seen": 16566056, "step": 16490 }, { "epoch": 8.746023329798515, "grad_norm": 6.642989158630371, "learning_rate": 6.917050193242596e-06, "loss": 0.1765, "num_input_tokens_seen": 16571112, "step": 16495 }, { "epoch": 8.748674443266172, "grad_norm": 10.002790451049805, "learning_rate": 6.91491325944668e-06, "loss": 0.2287, "num_input_tokens_seen": 16576552, "step": 16500 }, { "epoch": 8.751325556733828, "grad_norm": 6.547449588775635, "learning_rate": 6.912775915672842e-06, "loss": 0.2574, "num_input_tokens_seen": 16581608, "step": 16505 }, { "epoch": 8.753976670201485, "grad_norm": 1.8603906631469727, "learning_rate": 6.910638162378689e-06, "loss": 0.1531, "num_input_tokens_seen": 16585832, "step": 16510 }, { "epoch": 8.756627783669142, "grad_norm": 6.386181354522705, "learning_rate": 6.908500000021905e-06, "loss": 0.1443, "num_input_tokens_seen": 16590600, "step": 16515 }, { "epoch": 8.759278897136797, "grad_norm": 19.514633178710938, "learning_rate": 6.906361429060264e-06, "loss": 0.2654, "num_input_tokens_seen": 16596968, "step": 16520 }, { "epoch": 8.761930010604454, "grad_norm": 12.598747253417969, "learning_rate": 6.90422244995163e-06, "loss": 0.2206, "num_input_tokens_seen": 16601864, "step": 16525 }, { "epoch": 8.76458112407211, "grad_norm": 9.135684967041016, "learning_rate": 6.902083063153951e-06, "loss": 0.3192, "num_input_tokens_seen": 16606664, "step": 16530 }, { "epoch": 8.767232237539767, "grad_norm": 6.075790882110596, "learning_rate": 6.899943269125267e-06, "loss": 0.1953, "num_input_tokens_seen": 16611752, "step": 16535 }, { "epoch": 8.769883351007422, "grad_norm": 16.086366653442383, "learning_rate": 6.8978030683237e-06, "loss": 0.2234, "num_input_tokens_seen": 16616776, "step": 16540 }, { "epoch": 8.77253446447508, "grad_norm": 3.88708758354187, "learning_rate": 6.895662461207462e-06, "loss": 0.2771, "num_input_tokens_seen": 16621704, "step": 16545 }, { "epoch": 8.775185577942736, "grad_norm": 17.255266189575195, "learning_rate": 6.893521448234853e-06, "loss": 0.2971, "num_input_tokens_seen": 16626536, "step": 16550 }, { "epoch": 8.777836691410393, "grad_norm": 8.799355506896973, "learning_rate": 6.891380029864254e-06, "loss": 0.1119, "num_input_tokens_seen": 16631656, "step": 16555 }, { "epoch": 8.78048780487805, "grad_norm": 11.5503568649292, "learning_rate": 6.889238206554143e-06, "loss": 0.2142, "num_input_tokens_seen": 16636040, "step": 16560 }, { "epoch": 8.783138918345704, "grad_norm": 6.194159030914307, "learning_rate": 6.887095978763072e-06, "loss": 0.1335, "num_input_tokens_seen": 16642696, "step": 16565 }, { "epoch": 8.785790031813361, "grad_norm": 11.637984275817871, "learning_rate": 6.8849533469496934e-06, "loss": 0.3673, "num_input_tokens_seen": 16647208, "step": 16570 }, { "epoch": 8.788441145281018, "grad_norm": 2.840888023376465, "learning_rate": 6.882810311572734e-06, "loss": 0.1167, "num_input_tokens_seen": 16651912, "step": 16575 }, { "epoch": 8.791092258748675, "grad_norm": 3.173252582550049, "learning_rate": 6.880666873091014e-06, "loss": 0.1618, "num_input_tokens_seen": 16656744, "step": 16580 }, { "epoch": 8.793743372216332, "grad_norm": 7.675191402435303, "learning_rate": 6.878523031963438e-06, "loss": 0.1052, "num_input_tokens_seen": 16661640, "step": 16585 }, { "epoch": 8.796394485683987, "grad_norm": 6.870221138000488, "learning_rate": 6.8763787886489954e-06, "loss": 0.1993, "num_input_tokens_seen": 16666856, "step": 16590 }, { "epoch": 8.799045599151643, "grad_norm": 6.087161064147949, "learning_rate": 6.874234143606765e-06, "loss": 0.0742, "num_input_tokens_seen": 16672072, "step": 16595 }, { "epoch": 8.8016967126193, "grad_norm": 9.041096687316895, "learning_rate": 6.872089097295909e-06, "loss": 0.275, "num_input_tokens_seen": 16676648, "step": 16600 }, { "epoch": 8.804347826086957, "grad_norm": 13.829833984375, "learning_rate": 6.8699436501756744e-06, "loss": 0.2734, "num_input_tokens_seen": 16682504, "step": 16605 }, { "epoch": 8.806998939554614, "grad_norm": 20.534034729003906, "learning_rate": 6.8677978027054e-06, "loss": 0.2974, "num_input_tokens_seen": 16686408, "step": 16610 }, { "epoch": 8.809650053022269, "grad_norm": 12.968006134033203, "learning_rate": 6.8656515553445034e-06, "loss": 0.2857, "num_input_tokens_seen": 16691624, "step": 16615 }, { "epoch": 8.812301166489926, "grad_norm": 12.090718269348145, "learning_rate": 6.863504908552488e-06, "loss": 0.1988, "num_input_tokens_seen": 16695816, "step": 16620 }, { "epoch": 8.814952279957582, "grad_norm": 12.426926612854004, "learning_rate": 6.861357862788951e-06, "loss": 0.1846, "num_input_tokens_seen": 16701192, "step": 16625 }, { "epoch": 8.81760339342524, "grad_norm": 19.106367111206055, "learning_rate": 6.859210418513564e-06, "loss": 0.2403, "num_input_tokens_seen": 16706888, "step": 16630 }, { "epoch": 8.820254506892894, "grad_norm": 3.015529155731201, "learning_rate": 6.857062576186092e-06, "loss": 0.2927, "num_input_tokens_seen": 16711624, "step": 16635 }, { "epoch": 8.822905620360551, "grad_norm": 8.589150428771973, "learning_rate": 6.854914336266383e-06, "loss": 0.2254, "num_input_tokens_seen": 16716552, "step": 16640 }, { "epoch": 8.825556733828208, "grad_norm": 10.534332275390625, "learning_rate": 6.852765699214368e-06, "loss": 0.1211, "num_input_tokens_seen": 16721448, "step": 16645 }, { "epoch": 8.828207847295864, "grad_norm": 5.987590789794922, "learning_rate": 6.850616665490064e-06, "loss": 0.0997, "num_input_tokens_seen": 16726440, "step": 16650 }, { "epoch": 8.830858960763521, "grad_norm": 11.510811805725098, "learning_rate": 6.848467235553575e-06, "loss": 0.2281, "num_input_tokens_seen": 16731528, "step": 16655 }, { "epoch": 8.833510074231176, "grad_norm": 11.786620140075684, "learning_rate": 6.846317409865087e-06, "loss": 0.2815, "num_input_tokens_seen": 16739400, "step": 16660 }, { "epoch": 8.836161187698833, "grad_norm": 2.430131673812866, "learning_rate": 6.844167188884875e-06, "loss": 0.0659, "num_input_tokens_seen": 16743816, "step": 16665 }, { "epoch": 8.83881230116649, "grad_norm": 2.951690435409546, "learning_rate": 6.842016573073293e-06, "loss": 0.1388, "num_input_tokens_seen": 16748744, "step": 16670 }, { "epoch": 8.841463414634147, "grad_norm": 11.60821533203125, "learning_rate": 6.83986556289078e-06, "loss": 0.2826, "num_input_tokens_seen": 16754600, "step": 16675 }, { "epoch": 8.844114528101803, "grad_norm": 11.725674629211426, "learning_rate": 6.837714158797868e-06, "loss": 0.1426, "num_input_tokens_seen": 16760360, "step": 16680 }, { "epoch": 8.846765641569458, "grad_norm": 16.712078094482422, "learning_rate": 6.835562361255162e-06, "loss": 0.3128, "num_input_tokens_seen": 16764872, "step": 16685 }, { "epoch": 8.849416755037115, "grad_norm": 8.840873718261719, "learning_rate": 6.833410170723358e-06, "loss": 0.165, "num_input_tokens_seen": 16769384, "step": 16690 }, { "epoch": 8.852067868504772, "grad_norm": 8.257488250732422, "learning_rate": 6.8312575876632324e-06, "loss": 0.218, "num_input_tokens_seen": 16774152, "step": 16695 }, { "epoch": 8.854718981972429, "grad_norm": 8.065561294555664, "learning_rate": 6.829104612535651e-06, "loss": 0.0705, "num_input_tokens_seen": 16779848, "step": 16700 }, { "epoch": 8.857370095440086, "grad_norm": 9.987449645996094, "learning_rate": 6.826951245801558e-06, "loss": 0.2583, "num_input_tokens_seen": 16784072, "step": 16705 }, { "epoch": 8.86002120890774, "grad_norm": 6.952293872833252, "learning_rate": 6.824797487921984e-06, "loss": 0.1809, "num_input_tokens_seen": 16789800, "step": 16710 }, { "epoch": 8.862672322375397, "grad_norm": 2.5947725772857666, "learning_rate": 6.822643339358042e-06, "loss": 0.0762, "num_input_tokens_seen": 16794440, "step": 16715 }, { "epoch": 8.865323435843054, "grad_norm": 12.075912475585938, "learning_rate": 6.820488800570931e-06, "loss": 0.3238, "num_input_tokens_seen": 16798536, "step": 16720 }, { "epoch": 8.867974549310711, "grad_norm": 13.238699913024902, "learning_rate": 6.818333872021932e-06, "loss": 0.3361, "num_input_tokens_seen": 16803944, "step": 16725 }, { "epoch": 8.870625662778368, "grad_norm": 13.634347915649414, "learning_rate": 6.816178554172409e-06, "loss": 0.2805, "num_input_tokens_seen": 16809704, "step": 16730 }, { "epoch": 8.873276776246023, "grad_norm": 3.8790950775146484, "learning_rate": 6.814022847483809e-06, "loss": 0.1624, "num_input_tokens_seen": 16813864, "step": 16735 }, { "epoch": 8.87592788971368, "grad_norm": 7.462247371673584, "learning_rate": 6.811866752417664e-06, "loss": 0.2387, "num_input_tokens_seen": 16819048, "step": 16740 }, { "epoch": 8.878579003181336, "grad_norm": 7.656747341156006, "learning_rate": 6.80971026943559e-06, "loss": 0.1628, "num_input_tokens_seen": 16824008, "step": 16745 }, { "epoch": 8.881230116648993, "grad_norm": 19.782760620117188, "learning_rate": 6.8075533989992824e-06, "loss": 0.2521, "num_input_tokens_seen": 16829576, "step": 16750 }, { "epoch": 8.88388123011665, "grad_norm": 11.2128324508667, "learning_rate": 6.805396141570522e-06, "loss": 0.1282, "num_input_tokens_seen": 16835240, "step": 16755 }, { "epoch": 8.886532343584305, "grad_norm": 6.197446823120117, "learning_rate": 6.803238497611172e-06, "loss": 0.1417, "num_input_tokens_seen": 16839944, "step": 16760 }, { "epoch": 8.889183457051962, "grad_norm": 7.40912389755249, "learning_rate": 6.801080467583178e-06, "loss": 0.2251, "num_input_tokens_seen": 16845768, "step": 16765 }, { "epoch": 8.891834570519618, "grad_norm": 2.341864824295044, "learning_rate": 6.798922051948569e-06, "loss": 0.1939, "num_input_tokens_seen": 16850632, "step": 16770 }, { "epoch": 8.894485683987275, "grad_norm": 4.33017635345459, "learning_rate": 6.7967632511694545e-06, "loss": 0.139, "num_input_tokens_seen": 16855240, "step": 16775 }, { "epoch": 8.89713679745493, "grad_norm": 23.2880916595459, "learning_rate": 6.794604065708031e-06, "loss": 0.2208, "num_input_tokens_seen": 16859880, "step": 16780 }, { "epoch": 8.899787910922587, "grad_norm": 2.338548183441162, "learning_rate": 6.792444496026573e-06, "loss": 0.2849, "num_input_tokens_seen": 16864392, "step": 16785 }, { "epoch": 8.902439024390244, "grad_norm": 2.325596570968628, "learning_rate": 6.790284542587439e-06, "loss": 0.2436, "num_input_tokens_seen": 16869352, "step": 16790 }, { "epoch": 8.9050901378579, "grad_norm": 11.023849487304688, "learning_rate": 6.788124205853068e-06, "loss": 0.1324, "num_input_tokens_seen": 16874024, "step": 16795 }, { "epoch": 8.907741251325557, "grad_norm": 13.248861312866211, "learning_rate": 6.785963486285984e-06, "loss": 0.3698, "num_input_tokens_seen": 16879720, "step": 16800 }, { "epoch": 8.910392364793212, "grad_norm": 16.490684509277344, "learning_rate": 6.783802384348792e-06, "loss": 0.3197, "num_input_tokens_seen": 16884168, "step": 16805 }, { "epoch": 8.91304347826087, "grad_norm": 8.115852355957031, "learning_rate": 6.781640900504176e-06, "loss": 0.1083, "num_input_tokens_seen": 16887496, "step": 16810 }, { "epoch": 8.915694591728526, "grad_norm": 6.149453639984131, "learning_rate": 6.779479035214906e-06, "loss": 0.2386, "num_input_tokens_seen": 16892488, "step": 16815 }, { "epoch": 8.918345705196183, "grad_norm": 5.021007537841797, "learning_rate": 6.777316788943832e-06, "loss": 0.1354, "num_input_tokens_seen": 16896392, "step": 16820 }, { "epoch": 8.92099681866384, "grad_norm": 7.014645576477051, "learning_rate": 6.775154162153883e-06, "loss": 0.0867, "num_input_tokens_seen": 16901896, "step": 16825 }, { "epoch": 8.923647932131495, "grad_norm": 8.012094497680664, "learning_rate": 6.772991155308072e-06, "loss": 0.2526, "num_input_tokens_seen": 16907528, "step": 16830 }, { "epoch": 8.926299045599151, "grad_norm": 1.5558668375015259, "learning_rate": 6.770827768869496e-06, "loss": 0.2733, "num_input_tokens_seen": 16912520, "step": 16835 }, { "epoch": 8.928950159066808, "grad_norm": 1.379557490348816, "learning_rate": 6.768664003301327e-06, "loss": 0.1539, "num_input_tokens_seen": 16917288, "step": 16840 }, { "epoch": 8.931601272534465, "grad_norm": 5.039188385009766, "learning_rate": 6.766499859066824e-06, "loss": 0.1758, "num_input_tokens_seen": 16922280, "step": 16845 }, { "epoch": 8.934252386002122, "grad_norm": 17.976829528808594, "learning_rate": 6.764335336629323e-06, "loss": 0.1074, "num_input_tokens_seen": 16927368, "step": 16850 }, { "epoch": 8.936903499469777, "grad_norm": 7.930082321166992, "learning_rate": 6.762170436452244e-06, "loss": 0.2527, "num_input_tokens_seen": 16932072, "step": 16855 }, { "epoch": 8.939554612937433, "grad_norm": 17.04059600830078, "learning_rate": 6.7600051589990855e-06, "loss": 0.2026, "num_input_tokens_seen": 16937864, "step": 16860 }, { "epoch": 8.94220572640509, "grad_norm": 20.992862701416016, "learning_rate": 6.757839504733428e-06, "loss": 0.2886, "num_input_tokens_seen": 16942472, "step": 16865 }, { "epoch": 8.944856839872747, "grad_norm": 20.514280319213867, "learning_rate": 6.7556734741189316e-06, "loss": 0.2689, "num_input_tokens_seen": 16947016, "step": 16870 }, { "epoch": 8.947507953340402, "grad_norm": 8.470490455627441, "learning_rate": 6.753507067619338e-06, "loss": 0.2088, "num_input_tokens_seen": 16951144, "step": 16875 }, { "epoch": 8.950159066808059, "grad_norm": 11.379754066467285, "learning_rate": 6.751340285698471e-06, "loss": 0.1416, "num_input_tokens_seen": 16955144, "step": 16880 }, { "epoch": 8.952810180275716, "grad_norm": 13.431331634521484, "learning_rate": 6.749173128820231e-06, "loss": 0.1061, "num_input_tokens_seen": 16959784, "step": 16885 }, { "epoch": 8.955461293743372, "grad_norm": 10.413552284240723, "learning_rate": 6.747005597448601e-06, "loss": 0.1821, "num_input_tokens_seen": 16964712, "step": 16890 }, { "epoch": 8.95811240721103, "grad_norm": 6.264420509338379, "learning_rate": 6.744837692047646e-06, "loss": 0.1651, "num_input_tokens_seen": 16969672, "step": 16895 }, { "epoch": 8.960763520678686, "grad_norm": 8.907979011535645, "learning_rate": 6.7426694130815055e-06, "loss": 0.1598, "num_input_tokens_seen": 16974120, "step": 16900 }, { "epoch": 8.963414634146341, "grad_norm": 7.883851528167725, "learning_rate": 6.740500761014404e-06, "loss": 0.2434, "num_input_tokens_seen": 16980040, "step": 16905 }, { "epoch": 8.966065747613998, "grad_norm": 17.405508041381836, "learning_rate": 6.738331736310643e-06, "loss": 0.3002, "num_input_tokens_seen": 16984936, "step": 16910 }, { "epoch": 8.968716861081655, "grad_norm": 8.602653503417969, "learning_rate": 6.7361623394346065e-06, "loss": 0.1051, "num_input_tokens_seen": 16989704, "step": 16915 }, { "epoch": 8.971367974549311, "grad_norm": 11.453965187072754, "learning_rate": 6.733992570850757e-06, "loss": 0.0918, "num_input_tokens_seen": 16995592, "step": 16920 }, { "epoch": 8.974019088016966, "grad_norm": 20.789955139160156, "learning_rate": 6.731822431023633e-06, "loss": 0.2384, "num_input_tokens_seen": 17000904, "step": 16925 }, { "epoch": 8.976670201484623, "grad_norm": 8.311341285705566, "learning_rate": 6.729651920417861e-06, "loss": 0.142, "num_input_tokens_seen": 17005992, "step": 16930 }, { "epoch": 8.97932131495228, "grad_norm": 5.894685745239258, "learning_rate": 6.727481039498138e-06, "loss": 0.1297, "num_input_tokens_seen": 17010248, "step": 16935 }, { "epoch": 8.981972428419937, "grad_norm": 12.953635215759277, "learning_rate": 6.725309788729244e-06, "loss": 0.1689, "num_input_tokens_seen": 17015464, "step": 16940 }, { "epoch": 8.984623541887593, "grad_norm": 4.755133628845215, "learning_rate": 6.72313816857604e-06, "loss": 0.1538, "num_input_tokens_seen": 17020296, "step": 16945 }, { "epoch": 8.987274655355248, "grad_norm": 4.577930450439453, "learning_rate": 6.720966179503463e-06, "loss": 0.2266, "num_input_tokens_seen": 17024872, "step": 16950 }, { "epoch": 8.989925768822905, "grad_norm": 4.9945855140686035, "learning_rate": 6.718793821976531e-06, "loss": 0.27, "num_input_tokens_seen": 17029608, "step": 16955 }, { "epoch": 8.992576882290562, "grad_norm": 15.50662899017334, "learning_rate": 6.7166210964603375e-06, "loss": 0.1532, "num_input_tokens_seen": 17036712, "step": 16960 }, { "epoch": 8.995227995758219, "grad_norm": 7.622265815734863, "learning_rate": 6.714448003420061e-06, "loss": 0.1277, "num_input_tokens_seen": 17041640, "step": 16965 }, { "epoch": 8.997879109225876, "grad_norm": 24.629730224609375, "learning_rate": 6.712274543320952e-06, "loss": 0.2523, "num_input_tokens_seen": 17045960, "step": 16970 }, { "epoch": 9.00053022269353, "grad_norm": 2.2695751190185547, "learning_rate": 6.710100716628345e-06, "loss": 0.1335, "num_input_tokens_seen": 17050744, "step": 16975 }, { "epoch": 9.003181336161187, "grad_norm": 18.085636138916016, "learning_rate": 6.707926523807647e-06, "loss": 0.1712, "num_input_tokens_seen": 17055576, "step": 16980 }, { "epoch": 9.005832449628844, "grad_norm": 7.698113918304443, "learning_rate": 6.705751965324352e-06, "loss": 0.1146, "num_input_tokens_seen": 17059992, "step": 16985 }, { "epoch": 9.008483563096501, "grad_norm": 21.612537384033203, "learning_rate": 6.703577041644023e-06, "loss": 0.2521, "num_input_tokens_seen": 17064920, "step": 16990 }, { "epoch": 9.011134676564158, "grad_norm": 11.395997047424316, "learning_rate": 6.70140175323231e-06, "loss": 0.162, "num_input_tokens_seen": 17070008, "step": 16995 }, { "epoch": 9.013785790031813, "grad_norm": 11.288150787353516, "learning_rate": 6.69922610055493e-06, "loss": 0.2561, "num_input_tokens_seen": 17074968, "step": 17000 }, { "epoch": 9.01643690349947, "grad_norm": 0.6561773419380188, "learning_rate": 6.697050084077691e-06, "loss": 0.1274, "num_input_tokens_seen": 17079704, "step": 17005 }, { "epoch": 9.019088016967126, "grad_norm": 13.622669219970703, "learning_rate": 6.694873704266469e-06, "loss": 0.1587, "num_input_tokens_seen": 17084408, "step": 17010 }, { "epoch": 9.021739130434783, "grad_norm": 4.519171714782715, "learning_rate": 6.6926969615872215e-06, "loss": 0.1351, "num_input_tokens_seen": 17088344, "step": 17015 }, { "epoch": 9.024390243902438, "grad_norm": 6.024263858795166, "learning_rate": 6.690519856505983e-06, "loss": 0.0735, "num_input_tokens_seen": 17093336, "step": 17020 }, { "epoch": 9.027041357370095, "grad_norm": 14.199169158935547, "learning_rate": 6.688342389488869e-06, "loss": 0.163, "num_input_tokens_seen": 17097912, "step": 17025 }, { "epoch": 9.029692470837752, "grad_norm": 9.354988098144531, "learning_rate": 6.686164561002068e-06, "loss": 0.0727, "num_input_tokens_seen": 17103992, "step": 17030 }, { "epoch": 9.032343584305409, "grad_norm": 18.466339111328125, "learning_rate": 6.683986371511844e-06, "loss": 0.0917, "num_input_tokens_seen": 17110168, "step": 17035 }, { "epoch": 9.034994697773065, "grad_norm": 5.7319865226745605, "learning_rate": 6.681807821484545e-06, "loss": 0.2541, "num_input_tokens_seen": 17114520, "step": 17040 }, { "epoch": 9.03764581124072, "grad_norm": 8.348085403442383, "learning_rate": 6.679628911386593e-06, "loss": 0.1692, "num_input_tokens_seen": 17119160, "step": 17045 }, { "epoch": 9.040296924708377, "grad_norm": 22.052675247192383, "learning_rate": 6.677449641684487e-06, "loss": 0.1632, "num_input_tokens_seen": 17124088, "step": 17050 }, { "epoch": 9.042948038176034, "grad_norm": 5.328943729400635, "learning_rate": 6.6752700128448e-06, "loss": 0.2361, "num_input_tokens_seen": 17129336, "step": 17055 }, { "epoch": 9.04559915164369, "grad_norm": 9.694482803344727, "learning_rate": 6.673090025334188e-06, "loss": 0.1915, "num_input_tokens_seen": 17133912, "step": 17060 }, { "epoch": 9.048250265111347, "grad_norm": 21.86893081665039, "learning_rate": 6.670909679619381e-06, "loss": 0.1608, "num_input_tokens_seen": 17138872, "step": 17065 }, { "epoch": 9.050901378579002, "grad_norm": 16.24037742614746, "learning_rate": 6.6687289761671826e-06, "loss": 0.0957, "num_input_tokens_seen": 17143640, "step": 17070 }, { "epoch": 9.05355249204666, "grad_norm": 3.440319776535034, "learning_rate": 6.666547915444477e-06, "loss": 0.0686, "num_input_tokens_seen": 17148728, "step": 17075 }, { "epoch": 9.056203605514316, "grad_norm": 18.56546401977539, "learning_rate": 6.6643664979182235e-06, "loss": 0.0923, "num_input_tokens_seen": 17152856, "step": 17080 }, { "epoch": 9.058854718981973, "grad_norm": 15.81460189819336, "learning_rate": 6.662184724055458e-06, "loss": 0.1031, "num_input_tokens_seen": 17156984, "step": 17085 }, { "epoch": 9.06150583244963, "grad_norm": 20.515331268310547, "learning_rate": 6.6600025943232935e-06, "loss": 0.2247, "num_input_tokens_seen": 17161688, "step": 17090 }, { "epoch": 9.064156945917285, "grad_norm": 24.06011962890625, "learning_rate": 6.657820109188915e-06, "loss": 0.1732, "num_input_tokens_seen": 17165976, "step": 17095 }, { "epoch": 9.066808059384941, "grad_norm": 9.151001930236816, "learning_rate": 6.6556372691195916e-06, "loss": 0.1616, "num_input_tokens_seen": 17171192, "step": 17100 }, { "epoch": 9.069459172852598, "grad_norm": 5.919821739196777, "learning_rate": 6.65345407458266e-06, "loss": 0.1383, "num_input_tokens_seen": 17176408, "step": 17105 }, { "epoch": 9.072110286320255, "grad_norm": 20.204448699951172, "learning_rate": 6.651270526045539e-06, "loss": 0.2424, "num_input_tokens_seen": 17182712, "step": 17110 }, { "epoch": 9.074761399787912, "grad_norm": 17.863079071044922, "learning_rate": 6.649086623975718e-06, "loss": 0.1135, "num_input_tokens_seen": 17187128, "step": 17115 }, { "epoch": 9.077412513255567, "grad_norm": 19.16132164001465, "learning_rate": 6.6469023688407666e-06, "loss": 0.3708, "num_input_tokens_seen": 17192600, "step": 17120 }, { "epoch": 9.080063626723224, "grad_norm": 12.037595748901367, "learning_rate": 6.644717761108328e-06, "loss": 0.1064, "num_input_tokens_seen": 17197528, "step": 17125 }, { "epoch": 9.08271474019088, "grad_norm": 1.5959441661834717, "learning_rate": 6.64253280124612e-06, "loss": 0.0856, "num_input_tokens_seen": 17201816, "step": 17130 }, { "epoch": 9.085365853658537, "grad_norm": 1.5485416650772095, "learning_rate": 6.640347489721937e-06, "loss": 0.0614, "num_input_tokens_seen": 17206456, "step": 17135 }, { "epoch": 9.088016967126194, "grad_norm": 16.246002197265625, "learning_rate": 6.638161827003648e-06, "loss": 0.2696, "num_input_tokens_seen": 17211736, "step": 17140 }, { "epoch": 9.090668080593849, "grad_norm": 15.03096866607666, "learning_rate": 6.635975813559201e-06, "loss": 0.177, "num_input_tokens_seen": 17216600, "step": 17145 }, { "epoch": 9.093319194061506, "grad_norm": 26.31279182434082, "learning_rate": 6.633789449856611e-06, "loss": 0.2671, "num_input_tokens_seen": 17220984, "step": 17150 }, { "epoch": 9.095970307529162, "grad_norm": 10.122415542602539, "learning_rate": 6.631602736363976e-06, "loss": 0.1644, "num_input_tokens_seen": 17225592, "step": 17155 }, { "epoch": 9.09862142099682, "grad_norm": 10.456079483032227, "learning_rate": 6.629415673549464e-06, "loss": 0.2238, "num_input_tokens_seen": 17229208, "step": 17160 }, { "epoch": 9.101272534464474, "grad_norm": 27.293302536010742, "learning_rate": 6.6272282618813214e-06, "loss": 0.2641, "num_input_tokens_seen": 17234744, "step": 17165 }, { "epoch": 9.103923647932131, "grad_norm": 3.6173553466796875, "learning_rate": 6.625040501827865e-06, "loss": 0.2331, "num_input_tokens_seen": 17238808, "step": 17170 }, { "epoch": 9.106574761399788, "grad_norm": 19.202287673950195, "learning_rate": 6.6228523938574906e-06, "loss": 0.1892, "num_input_tokens_seen": 17245112, "step": 17175 }, { "epoch": 9.109225874867445, "grad_norm": 8.63238525390625, "learning_rate": 6.620663938438664e-06, "loss": 0.2373, "num_input_tokens_seen": 17249912, "step": 17180 }, { "epoch": 9.111876988335101, "grad_norm": 12.081741333007812, "learning_rate": 6.618475136039929e-06, "loss": 0.1574, "num_input_tokens_seen": 17254296, "step": 17185 }, { "epoch": 9.114528101802756, "grad_norm": 18.165979385375977, "learning_rate": 6.616285987129902e-06, "loss": 0.1986, "num_input_tokens_seen": 17260472, "step": 17190 }, { "epoch": 9.117179215270413, "grad_norm": 3.0389950275421143, "learning_rate": 6.614096492177276e-06, "loss": 0.2563, "num_input_tokens_seen": 17266104, "step": 17195 }, { "epoch": 9.11983032873807, "grad_norm": 3.571742057800293, "learning_rate": 6.611906651650814e-06, "loss": 0.1265, "num_input_tokens_seen": 17271896, "step": 17200 }, { "epoch": 9.122481442205727, "grad_norm": 16.870283126831055, "learning_rate": 6.609716466019356e-06, "loss": 0.1941, "num_input_tokens_seen": 17277912, "step": 17205 }, { "epoch": 9.125132555673384, "grad_norm": 2.358524799346924, "learning_rate": 6.607525935751814e-06, "loss": 0.0901, "num_input_tokens_seen": 17282968, "step": 17210 }, { "epoch": 9.127783669141039, "grad_norm": 17.88125991821289, "learning_rate": 6.605335061317177e-06, "loss": 0.2113, "num_input_tokens_seen": 17287704, "step": 17215 }, { "epoch": 9.130434782608695, "grad_norm": 4.249388694763184, "learning_rate": 6.603143843184503e-06, "loss": 0.1824, "num_input_tokens_seen": 17294104, "step": 17220 }, { "epoch": 9.133085896076352, "grad_norm": 12.555737495422363, "learning_rate": 6.600952281822926e-06, "loss": 0.2391, "num_input_tokens_seen": 17298392, "step": 17225 }, { "epoch": 9.135737009544009, "grad_norm": 2.0564332008361816, "learning_rate": 6.598760377701656e-06, "loss": 0.0645, "num_input_tokens_seen": 17303160, "step": 17230 }, { "epoch": 9.138388123011666, "grad_norm": 12.899239540100098, "learning_rate": 6.596568131289974e-06, "loss": 0.2484, "num_input_tokens_seen": 17308280, "step": 17235 }, { "epoch": 9.14103923647932, "grad_norm": 20.18808364868164, "learning_rate": 6.594375543057232e-06, "loss": 0.1844, "num_input_tokens_seen": 17314968, "step": 17240 }, { "epoch": 9.143690349946977, "grad_norm": 26.662113189697266, "learning_rate": 6.5921826134728575e-06, "loss": 0.3293, "num_input_tokens_seen": 17320504, "step": 17245 }, { "epoch": 9.146341463414634, "grad_norm": 10.091729164123535, "learning_rate": 6.589989343006352e-06, "loss": 0.1102, "num_input_tokens_seen": 17324920, "step": 17250 }, { "epoch": 9.148992576882291, "grad_norm": 7.593074798583984, "learning_rate": 6.5877957321272904e-06, "loss": 0.1102, "num_input_tokens_seen": 17330040, "step": 17255 }, { "epoch": 9.151643690349948, "grad_norm": 3.2834208011627197, "learning_rate": 6.585601781305315e-06, "loss": 0.2964, "num_input_tokens_seen": 17336792, "step": 17260 }, { "epoch": 9.154294803817603, "grad_norm": 3.0207831859588623, "learning_rate": 6.583407491010149e-06, "loss": 0.0894, "num_input_tokens_seen": 17341496, "step": 17265 }, { "epoch": 9.15694591728526, "grad_norm": 8.392488479614258, "learning_rate": 6.581212861711581e-06, "loss": 0.2362, "num_input_tokens_seen": 17346072, "step": 17270 }, { "epoch": 9.159597030752916, "grad_norm": 16.98826026916504, "learning_rate": 6.579017893879478e-06, "loss": 0.2456, "num_input_tokens_seen": 17350936, "step": 17275 }, { "epoch": 9.162248144220573, "grad_norm": 2.5893056392669678, "learning_rate": 6.576822587983776e-06, "loss": 0.0635, "num_input_tokens_seen": 17355896, "step": 17280 }, { "epoch": 9.164899257688228, "grad_norm": 21.08008575439453, "learning_rate": 6.5746269444944835e-06, "loss": 0.1492, "num_input_tokens_seen": 17360472, "step": 17285 }, { "epoch": 9.167550371155885, "grad_norm": 8.771710395812988, "learning_rate": 6.572430963881683e-06, "loss": 0.174, "num_input_tokens_seen": 17364248, "step": 17290 }, { "epoch": 9.170201484623542, "grad_norm": 12.736270904541016, "learning_rate": 6.570234646615528e-06, "loss": 0.1405, "num_input_tokens_seen": 17368696, "step": 17295 }, { "epoch": 9.172852598091199, "grad_norm": 26.836193084716797, "learning_rate": 6.568037993166243e-06, "loss": 0.2427, "num_input_tokens_seen": 17373208, "step": 17300 }, { "epoch": 9.175503711558855, "grad_norm": 22.257047653198242, "learning_rate": 6.5658410040041276e-06, "loss": 0.1301, "num_input_tokens_seen": 17377400, "step": 17305 }, { "epoch": 9.17815482502651, "grad_norm": 1.8934491872787476, "learning_rate": 6.563643679599552e-06, "loss": 0.1374, "num_input_tokens_seen": 17382200, "step": 17310 }, { "epoch": 9.180805938494167, "grad_norm": 10.468643188476562, "learning_rate": 6.561446020422955e-06, "loss": 0.1193, "num_input_tokens_seen": 17386488, "step": 17315 }, { "epoch": 9.183457051961824, "grad_norm": 9.089741706848145, "learning_rate": 6.5592480269448485e-06, "loss": 0.1008, "num_input_tokens_seen": 17391640, "step": 17320 }, { "epoch": 9.18610816542948, "grad_norm": 3.3297739028930664, "learning_rate": 6.557049699635823e-06, "loss": 0.1758, "num_input_tokens_seen": 17397368, "step": 17325 }, { "epoch": 9.188759278897138, "grad_norm": 4.845911502838135, "learning_rate": 6.55485103896653e-06, "loss": 0.0919, "num_input_tokens_seen": 17401656, "step": 17330 }, { "epoch": 9.191410392364793, "grad_norm": 7.562836647033691, "learning_rate": 6.552652045407698e-06, "loss": 0.1265, "num_input_tokens_seen": 17406552, "step": 17335 }, { "epoch": 9.19406150583245, "grad_norm": 9.437161445617676, "learning_rate": 6.5504527194301245e-06, "loss": 0.1246, "num_input_tokens_seen": 17411704, "step": 17340 }, { "epoch": 9.196712619300106, "grad_norm": 4.171730995178223, "learning_rate": 6.548253061504684e-06, "loss": 0.1051, "num_input_tokens_seen": 17417464, "step": 17345 }, { "epoch": 9.199363732767763, "grad_norm": 22.50580596923828, "learning_rate": 6.546053072102312e-06, "loss": 0.1397, "num_input_tokens_seen": 17422392, "step": 17350 }, { "epoch": 9.20201484623542, "grad_norm": 10.9346923828125, "learning_rate": 6.5438527516940244e-06, "loss": 0.3147, "num_input_tokens_seen": 17427160, "step": 17355 }, { "epoch": 9.204665959703075, "grad_norm": 14.201148986816406, "learning_rate": 6.541652100750901e-06, "loss": 0.1885, "num_input_tokens_seen": 17431224, "step": 17360 }, { "epoch": 9.207317073170731, "grad_norm": 9.541899681091309, "learning_rate": 6.539451119744098e-06, "loss": 0.1689, "num_input_tokens_seen": 17436408, "step": 17365 }, { "epoch": 9.209968186638388, "grad_norm": 10.455002784729004, "learning_rate": 6.537249809144839e-06, "loss": 0.1412, "num_input_tokens_seen": 17441464, "step": 17370 }, { "epoch": 9.212619300106045, "grad_norm": 2.1265788078308105, "learning_rate": 6.5350481694244175e-06, "loss": 0.1865, "num_input_tokens_seen": 17447224, "step": 17375 }, { "epoch": 9.215270413573702, "grad_norm": 29.876691818237305, "learning_rate": 6.5328462010542e-06, "loss": 0.2926, "num_input_tokens_seen": 17452984, "step": 17380 }, { "epoch": 9.217921527041357, "grad_norm": 2.6715097427368164, "learning_rate": 6.530643904505622e-06, "loss": 0.0669, "num_input_tokens_seen": 17458488, "step": 17385 }, { "epoch": 9.220572640509014, "grad_norm": 30.904396057128906, "learning_rate": 6.528441280250189e-06, "loss": 0.3113, "num_input_tokens_seen": 17463768, "step": 17390 }, { "epoch": 9.22322375397667, "grad_norm": 23.409217834472656, "learning_rate": 6.526238328759478e-06, "loss": 0.1068, "num_input_tokens_seen": 17468504, "step": 17395 }, { "epoch": 9.225874867444327, "grad_norm": 26.836475372314453, "learning_rate": 6.524035050505133e-06, "loss": 0.2848, "num_input_tokens_seen": 17473336, "step": 17400 }, { "epoch": 9.228525980911982, "grad_norm": 7.352761745452881, "learning_rate": 6.521831445958874e-06, "loss": 0.1466, "num_input_tokens_seen": 17477944, "step": 17405 }, { "epoch": 9.231177094379639, "grad_norm": 14.039148330688477, "learning_rate": 6.519627515592484e-06, "loss": 0.1352, "num_input_tokens_seen": 17481880, "step": 17410 }, { "epoch": 9.233828207847296, "grad_norm": 1.7655620574951172, "learning_rate": 6.517423259877819e-06, "loss": 0.0705, "num_input_tokens_seen": 17486520, "step": 17415 }, { "epoch": 9.236479321314953, "grad_norm": 1.306114673614502, "learning_rate": 6.515218679286803e-06, "loss": 0.0608, "num_input_tokens_seen": 17494392, "step": 17420 }, { "epoch": 9.23913043478261, "grad_norm": 20.48309898376465, "learning_rate": 6.513013774291435e-06, "loss": 0.1339, "num_input_tokens_seen": 17499544, "step": 17425 }, { "epoch": 9.241781548250264, "grad_norm": 18.443817138671875, "learning_rate": 6.510808545363777e-06, "loss": 0.1461, "num_input_tokens_seen": 17505368, "step": 17430 }, { "epoch": 9.244432661717921, "grad_norm": 12.859796524047852, "learning_rate": 6.508602992975963e-06, "loss": 0.2176, "num_input_tokens_seen": 17510488, "step": 17435 }, { "epoch": 9.247083775185578, "grad_norm": 13.354915618896484, "learning_rate": 6.506397117600194e-06, "loss": 0.181, "num_input_tokens_seen": 17515096, "step": 17440 }, { "epoch": 9.249734888653235, "grad_norm": 2.4363083839416504, "learning_rate": 6.5041909197087465e-06, "loss": 0.2185, "num_input_tokens_seen": 17520184, "step": 17445 }, { "epoch": 9.252386002120891, "grad_norm": 20.780187606811523, "learning_rate": 6.501984399773957e-06, "loss": 0.249, "num_input_tokens_seen": 17525528, "step": 17450 }, { "epoch": 9.255037115588546, "grad_norm": 15.314559936523438, "learning_rate": 6.4997775582682385e-06, "loss": 0.213, "num_input_tokens_seen": 17532472, "step": 17455 }, { "epoch": 9.257688229056203, "grad_norm": 25.92035675048828, "learning_rate": 6.49757039566407e-06, "loss": 0.2072, "num_input_tokens_seen": 17537592, "step": 17460 }, { "epoch": 9.26033934252386, "grad_norm": 7.6363983154296875, "learning_rate": 6.4953629124339975e-06, "loss": 0.1377, "num_input_tokens_seen": 17541944, "step": 17465 }, { "epoch": 9.262990455991517, "grad_norm": 2.23244309425354, "learning_rate": 6.4931551090506395e-06, "loss": 0.1029, "num_input_tokens_seen": 17545880, "step": 17470 }, { "epoch": 9.265641569459174, "grad_norm": 7.393557548522949, "learning_rate": 6.490946985986678e-06, "loss": 0.1624, "num_input_tokens_seen": 17552152, "step": 17475 }, { "epoch": 9.268292682926829, "grad_norm": 6.413189888000488, "learning_rate": 6.4887385437148695e-06, "loss": 0.2043, "num_input_tokens_seen": 17556888, "step": 17480 }, { "epoch": 9.270943796394485, "grad_norm": 9.329233169555664, "learning_rate": 6.486529782708035e-06, "loss": 0.2108, "num_input_tokens_seen": 17561976, "step": 17485 }, { "epoch": 9.273594909862142, "grad_norm": 20.31235694885254, "learning_rate": 6.484320703439063e-06, "loss": 0.1859, "num_input_tokens_seen": 17567448, "step": 17490 }, { "epoch": 9.276246023329799, "grad_norm": 5.443185329437256, "learning_rate": 6.482111306380913e-06, "loss": 0.1416, "num_input_tokens_seen": 17572504, "step": 17495 }, { "epoch": 9.278897136797456, "grad_norm": 33.643211364746094, "learning_rate": 6.47990159200661e-06, "loss": 0.2699, "num_input_tokens_seen": 17577240, "step": 17500 }, { "epoch": 9.28154825026511, "grad_norm": 16.63841438293457, "learning_rate": 6.477691560789249e-06, "loss": 0.2669, "num_input_tokens_seen": 17581752, "step": 17505 }, { "epoch": 9.284199363732768, "grad_norm": 20.26548194885254, "learning_rate": 6.475481213201991e-06, "loss": 0.1081, "num_input_tokens_seen": 17586968, "step": 17510 }, { "epoch": 9.286850477200424, "grad_norm": 32.79441452026367, "learning_rate": 6.473270549718067e-06, "loss": 0.2329, "num_input_tokens_seen": 17591288, "step": 17515 }, { "epoch": 9.289501590668081, "grad_norm": 16.080163955688477, "learning_rate": 6.4710595708107734e-06, "loss": 0.0806, "num_input_tokens_seen": 17595992, "step": 17520 }, { "epoch": 9.292152704135738, "grad_norm": 9.06263256072998, "learning_rate": 6.4688482769534745e-06, "loss": 0.2385, "num_input_tokens_seen": 17601944, "step": 17525 }, { "epoch": 9.294803817603393, "grad_norm": 18.432767868041992, "learning_rate": 6.4666366686196026e-06, "loss": 0.1758, "num_input_tokens_seen": 17607352, "step": 17530 }, { "epoch": 9.29745493107105, "grad_norm": 32.21429443359375, "learning_rate": 6.464424746282659e-06, "loss": 0.2143, "num_input_tokens_seen": 17611800, "step": 17535 }, { "epoch": 9.300106044538706, "grad_norm": 11.501421928405762, "learning_rate": 6.4622125104162095e-06, "loss": 0.1047, "num_input_tokens_seen": 17616312, "step": 17540 }, { "epoch": 9.302757158006363, "grad_norm": 6.819126129150391, "learning_rate": 6.459999961493887e-06, "loss": 0.1061, "num_input_tokens_seen": 17621368, "step": 17545 }, { "epoch": 9.305408271474018, "grad_norm": 17.948787689208984, "learning_rate": 6.457787099989392e-06, "loss": 0.3129, "num_input_tokens_seen": 17626648, "step": 17550 }, { "epoch": 9.308059384941675, "grad_norm": 8.460654258728027, "learning_rate": 6.455573926376495e-06, "loss": 0.0843, "num_input_tokens_seen": 17631192, "step": 17555 }, { "epoch": 9.310710498409332, "grad_norm": 19.932865142822266, "learning_rate": 6.453360441129029e-06, "loss": 0.1351, "num_input_tokens_seen": 17636600, "step": 17560 }, { "epoch": 9.313361611876989, "grad_norm": 7.96771240234375, "learning_rate": 6.4511466447208955e-06, "loss": 0.0615, "num_input_tokens_seen": 17641784, "step": 17565 }, { "epoch": 9.316012725344645, "grad_norm": 23.924457550048828, "learning_rate": 6.448932537626062e-06, "loss": 0.2509, "num_input_tokens_seen": 17646520, "step": 17570 }, { "epoch": 9.3186638388123, "grad_norm": 12.83343505859375, "learning_rate": 6.446718120318566e-06, "loss": 0.0851, "num_input_tokens_seen": 17651256, "step": 17575 }, { "epoch": 9.321314952279957, "grad_norm": 20.383251190185547, "learning_rate": 6.444503393272505e-06, "loss": 0.1674, "num_input_tokens_seen": 17656728, "step": 17580 }, { "epoch": 9.323966065747614, "grad_norm": 17.057523727416992, "learning_rate": 6.442288356962046e-06, "loss": 0.1837, "num_input_tokens_seen": 17664184, "step": 17585 }, { "epoch": 9.32661717921527, "grad_norm": 17.715866088867188, "learning_rate": 6.440073011861425e-06, "loss": 0.165, "num_input_tokens_seen": 17669528, "step": 17590 }, { "epoch": 9.329268292682928, "grad_norm": 15.009793281555176, "learning_rate": 6.437857358444941e-06, "loss": 0.1285, "num_input_tokens_seen": 17674488, "step": 17595 }, { "epoch": 9.331919406150583, "grad_norm": 4.277724266052246, "learning_rate": 6.435641397186958e-06, "loss": 0.1328, "num_input_tokens_seen": 17679672, "step": 17600 }, { "epoch": 9.33457051961824, "grad_norm": 4.308593273162842, "learning_rate": 6.433425128561909e-06, "loss": 0.1135, "num_input_tokens_seen": 17684408, "step": 17605 }, { "epoch": 9.337221633085896, "grad_norm": 4.247529983520508, "learning_rate": 6.43120855304429e-06, "loss": 0.0405, "num_input_tokens_seen": 17688536, "step": 17610 }, { "epoch": 9.339872746553553, "grad_norm": 2.0135209560394287, "learning_rate": 6.4289916711086664e-06, "loss": 0.1282, "num_input_tokens_seen": 17694232, "step": 17615 }, { "epoch": 9.34252386002121, "grad_norm": 26.874374389648438, "learning_rate": 6.426774483229664e-06, "loss": 0.2016, "num_input_tokens_seen": 17699352, "step": 17620 }, { "epoch": 9.345174973488865, "grad_norm": 22.77092170715332, "learning_rate": 6.424556989881978e-06, "loss": 0.1403, "num_input_tokens_seen": 17704216, "step": 17625 }, { "epoch": 9.347826086956522, "grad_norm": 19.018451690673828, "learning_rate": 6.42233919154037e-06, "loss": 0.2745, "num_input_tokens_seen": 17710456, "step": 17630 }, { "epoch": 9.350477200424178, "grad_norm": 3.6252880096435547, "learning_rate": 6.420121088679662e-06, "loss": 0.1412, "num_input_tokens_seen": 17717240, "step": 17635 }, { "epoch": 9.353128313891835, "grad_norm": 13.038777351379395, "learning_rate": 6.4179026817747445e-06, "loss": 0.0795, "num_input_tokens_seen": 17721496, "step": 17640 }, { "epoch": 9.35577942735949, "grad_norm": 2.7955427169799805, "learning_rate": 6.415683971300573e-06, "loss": 0.0994, "num_input_tokens_seen": 17726840, "step": 17645 }, { "epoch": 9.358430540827147, "grad_norm": 11.337196350097656, "learning_rate": 6.413464957732168e-06, "loss": 0.2405, "num_input_tokens_seen": 17732120, "step": 17650 }, { "epoch": 9.361081654294804, "grad_norm": 28.208152770996094, "learning_rate": 6.411245641544615e-06, "loss": 0.3389, "num_input_tokens_seen": 17736408, "step": 17655 }, { "epoch": 9.36373276776246, "grad_norm": 26.518918991088867, "learning_rate": 6.409026023213063e-06, "loss": 0.3202, "num_input_tokens_seen": 17741336, "step": 17660 }, { "epoch": 9.366383881230117, "grad_norm": 30.768056869506836, "learning_rate": 6.406806103212725e-06, "loss": 0.2094, "num_input_tokens_seen": 17747288, "step": 17665 }, { "epoch": 9.369034994697772, "grad_norm": 2.8867528438568115, "learning_rate": 6.404585882018882e-06, "loss": 0.1581, "num_input_tokens_seen": 17752984, "step": 17670 }, { "epoch": 9.371686108165429, "grad_norm": 4.779358386993408, "learning_rate": 6.402365360106878e-06, "loss": 0.1037, "num_input_tokens_seen": 17757720, "step": 17675 }, { "epoch": 9.374337221633086, "grad_norm": 12.677481651306152, "learning_rate": 6.40014453795212e-06, "loss": 0.1115, "num_input_tokens_seen": 17763320, "step": 17680 }, { "epoch": 9.376988335100743, "grad_norm": 5.623738765716553, "learning_rate": 6.3979234160300786e-06, "loss": 0.0641, "num_input_tokens_seen": 17768408, "step": 17685 }, { "epoch": 9.3796394485684, "grad_norm": 11.767419815063477, "learning_rate": 6.395701994816293e-06, "loss": 0.1219, "num_input_tokens_seen": 17772536, "step": 17690 }, { "epoch": 9.382290562036054, "grad_norm": 26.611120223999023, "learning_rate": 6.3934802747863635e-06, "loss": 0.4572, "num_input_tokens_seen": 17778520, "step": 17695 }, { "epoch": 9.384941675503711, "grad_norm": 9.261659622192383, "learning_rate": 6.391258256415953e-06, "loss": 0.0811, "num_input_tokens_seen": 17783352, "step": 17700 }, { "epoch": 9.387592788971368, "grad_norm": 7.97147274017334, "learning_rate": 6.389035940180789e-06, "loss": 0.1257, "num_input_tokens_seen": 17788856, "step": 17705 }, { "epoch": 9.390243902439025, "grad_norm": 7.398244857788086, "learning_rate": 6.386813326556666e-06, "loss": 0.2538, "num_input_tokens_seen": 17794072, "step": 17710 }, { "epoch": 9.392895015906682, "grad_norm": 3.1395833492279053, "learning_rate": 6.384590416019438e-06, "loss": 0.2335, "num_input_tokens_seen": 17800248, "step": 17715 }, { "epoch": 9.395546129374337, "grad_norm": 14.704669952392578, "learning_rate": 6.382367209045026e-06, "loss": 0.2395, "num_input_tokens_seen": 17805368, "step": 17720 }, { "epoch": 9.398197242841993, "grad_norm": 21.765426635742188, "learning_rate": 6.380143706109412e-06, "loss": 0.1872, "num_input_tokens_seen": 17810392, "step": 17725 }, { "epoch": 9.40084835630965, "grad_norm": 2.111689567565918, "learning_rate": 6.377919907688641e-06, "loss": 0.0562, "num_input_tokens_seen": 17814904, "step": 17730 }, { "epoch": 9.403499469777307, "grad_norm": 12.361485481262207, "learning_rate": 6.3756958142588245e-06, "loss": 0.1417, "num_input_tokens_seen": 17820632, "step": 17735 }, { "epoch": 9.406150583244964, "grad_norm": 26.75429916381836, "learning_rate": 6.373471426296132e-06, "loss": 0.3827, "num_input_tokens_seen": 17826904, "step": 17740 }, { "epoch": 9.408801696712619, "grad_norm": 6.67063570022583, "learning_rate": 6.371246744276803e-06, "loss": 0.2503, "num_input_tokens_seen": 17833144, "step": 17745 }, { "epoch": 9.411452810180275, "grad_norm": 22.582361221313477, "learning_rate": 6.369021768677134e-06, "loss": 0.1509, "num_input_tokens_seen": 17838136, "step": 17750 }, { "epoch": 9.414103923647932, "grad_norm": 18.10053253173828, "learning_rate": 6.366796499973486e-06, "loss": 0.172, "num_input_tokens_seen": 17843064, "step": 17755 }, { "epoch": 9.416755037115589, "grad_norm": 6.451843738555908, "learning_rate": 6.364570938642285e-06, "loss": 0.1226, "num_input_tokens_seen": 17848120, "step": 17760 }, { "epoch": 9.419406150583246, "grad_norm": 21.425188064575195, "learning_rate": 6.3623450851600156e-06, "loss": 0.2875, "num_input_tokens_seen": 17852760, "step": 17765 }, { "epoch": 9.4220572640509, "grad_norm": 16.051006317138672, "learning_rate": 6.360118940003229e-06, "loss": 0.1611, "num_input_tokens_seen": 17857784, "step": 17770 }, { "epoch": 9.424708377518558, "grad_norm": 19.442371368408203, "learning_rate": 6.357892503648537e-06, "loss": 0.166, "num_input_tokens_seen": 17862488, "step": 17775 }, { "epoch": 9.427359490986214, "grad_norm": 6.184763431549072, "learning_rate": 6.3556657765726116e-06, "loss": 0.2173, "num_input_tokens_seen": 17866712, "step": 17780 }, { "epoch": 9.430010604453871, "grad_norm": 1.566176176071167, "learning_rate": 6.353438759252192e-06, "loss": 0.0959, "num_input_tokens_seen": 17871096, "step": 17785 }, { "epoch": 9.432661717921526, "grad_norm": 3.292165756225586, "learning_rate": 6.351211452164075e-06, "loss": 0.1542, "num_input_tokens_seen": 17877912, "step": 17790 }, { "epoch": 9.435312831389183, "grad_norm": 14.025659561157227, "learning_rate": 6.348983855785122e-06, "loss": 0.2144, "num_input_tokens_seen": 17883768, "step": 17795 }, { "epoch": 9.43796394485684, "grad_norm": 20.87649154663086, "learning_rate": 6.346755970592256e-06, "loss": 0.194, "num_input_tokens_seen": 17888984, "step": 17800 }, { "epoch": 9.440615058324497, "grad_norm": 7.6847991943359375, "learning_rate": 6.344527797062459e-06, "loss": 0.2171, "num_input_tokens_seen": 17894488, "step": 17805 }, { "epoch": 9.443266171792153, "grad_norm": 8.737561225891113, "learning_rate": 6.342299335672781e-06, "loss": 0.137, "num_input_tokens_seen": 17899256, "step": 17810 }, { "epoch": 9.445917285259808, "grad_norm": 11.239274024963379, "learning_rate": 6.340070586900327e-06, "loss": 0.1198, "num_input_tokens_seen": 17904664, "step": 17815 }, { "epoch": 9.448568398727465, "grad_norm": 8.26764965057373, "learning_rate": 6.337841551222267e-06, "loss": 0.177, "num_input_tokens_seen": 17909304, "step": 17820 }, { "epoch": 9.451219512195122, "grad_norm": 3.198253631591797, "learning_rate": 6.335612229115832e-06, "loss": 0.1027, "num_input_tokens_seen": 17914104, "step": 17825 }, { "epoch": 9.453870625662779, "grad_norm": 1.3381352424621582, "learning_rate": 6.333382621058314e-06, "loss": 0.1248, "num_input_tokens_seen": 17919672, "step": 17830 }, { "epoch": 9.456521739130435, "grad_norm": 19.377408981323242, "learning_rate": 6.3311527275270635e-06, "loss": 0.1251, "num_input_tokens_seen": 17924184, "step": 17835 }, { "epoch": 9.45917285259809, "grad_norm": 8.080642700195312, "learning_rate": 6.3289225489995e-06, "loss": 0.0748, "num_input_tokens_seen": 17929336, "step": 17840 }, { "epoch": 9.461823966065747, "grad_norm": 6.757633686065674, "learning_rate": 6.326692085953096e-06, "loss": 0.1899, "num_input_tokens_seen": 17933816, "step": 17845 }, { "epoch": 9.464475079533404, "grad_norm": 27.36014747619629, "learning_rate": 6.324461338865387e-06, "loss": 0.307, "num_input_tokens_seen": 17939896, "step": 17850 }, { "epoch": 9.46712619300106, "grad_norm": 3.691497564315796, "learning_rate": 6.32223030821397e-06, "loss": 0.1542, "num_input_tokens_seen": 17944536, "step": 17855 }, { "epoch": 9.469777306468718, "grad_norm": 18.67241096496582, "learning_rate": 6.319998994476507e-06, "loss": 0.2381, "num_input_tokens_seen": 17948664, "step": 17860 }, { "epoch": 9.472428419936373, "grad_norm": 26.529544830322266, "learning_rate": 6.317767398130712e-06, "loss": 0.3121, "num_input_tokens_seen": 17954072, "step": 17865 }, { "epoch": 9.47507953340403, "grad_norm": 21.16661262512207, "learning_rate": 6.315535519654364e-06, "loss": 0.2524, "num_input_tokens_seen": 17959256, "step": 17870 }, { "epoch": 9.477730646871686, "grad_norm": 3.2655415534973145, "learning_rate": 6.313303359525305e-06, "loss": 0.1498, "num_input_tokens_seen": 17963992, "step": 17875 }, { "epoch": 9.480381760339343, "grad_norm": 12.742919921875, "learning_rate": 6.311070918221433e-06, "loss": 0.1704, "num_input_tokens_seen": 17969624, "step": 17880 }, { "epoch": 9.483032873807, "grad_norm": 13.773459434509277, "learning_rate": 6.308838196220709e-06, "loss": 0.106, "num_input_tokens_seen": 17974072, "step": 17885 }, { "epoch": 9.485683987274655, "grad_norm": 5.783457279205322, "learning_rate": 6.306605194001149e-06, "loss": 0.1368, "num_input_tokens_seen": 17978456, "step": 17890 }, { "epoch": 9.488335100742312, "grad_norm": 4.165128231048584, "learning_rate": 6.30437191204084e-06, "loss": 0.1801, "num_input_tokens_seen": 17983384, "step": 17895 }, { "epoch": 9.490986214209968, "grad_norm": 8.306781768798828, "learning_rate": 6.302138350817916e-06, "loss": 0.2738, "num_input_tokens_seen": 17988984, "step": 17900 }, { "epoch": 9.493637327677625, "grad_norm": 17.426036834716797, "learning_rate": 6.299904510810578e-06, "loss": 0.1543, "num_input_tokens_seen": 17993304, "step": 17905 }, { "epoch": 9.496288441145282, "grad_norm": 10.40703010559082, "learning_rate": 6.297670392497086e-06, "loss": 0.3897, "num_input_tokens_seen": 17998328, "step": 17910 }, { "epoch": 9.498939554612937, "grad_norm": 21.411184310913086, "learning_rate": 6.2954359963557585e-06, "loss": 0.2758, "num_input_tokens_seen": 18002680, "step": 17915 }, { "epoch": 9.501590668080594, "grad_norm": 18.368467330932617, "learning_rate": 6.2932013228649745e-06, "loss": 0.1165, "num_input_tokens_seen": 18007192, "step": 17920 }, { "epoch": 9.50424178154825, "grad_norm": 16.287273406982422, "learning_rate": 6.2909663725031714e-06, "loss": 0.1203, "num_input_tokens_seen": 18011096, "step": 17925 }, { "epoch": 9.506892895015907, "grad_norm": 1.716687798500061, "learning_rate": 6.288731145748845e-06, "loss": 0.199, "num_input_tokens_seen": 18015608, "step": 17930 }, { "epoch": 9.509544008483562, "grad_norm": 9.98215389251709, "learning_rate": 6.286495643080553e-06, "loss": 0.372, "num_input_tokens_seen": 18020888, "step": 17935 }, { "epoch": 9.512195121951219, "grad_norm": 11.664313316345215, "learning_rate": 6.284259864976911e-06, "loss": 0.175, "num_input_tokens_seen": 18025464, "step": 17940 }, { "epoch": 9.514846235418876, "grad_norm": 12.966170310974121, "learning_rate": 6.282023811916593e-06, "loss": 0.0808, "num_input_tokens_seen": 18029912, "step": 17945 }, { "epoch": 9.517497348886533, "grad_norm": 23.13735008239746, "learning_rate": 6.27978748437833e-06, "loss": 0.2114, "num_input_tokens_seen": 18034488, "step": 17950 }, { "epoch": 9.52014846235419, "grad_norm": 2.271491765975952, "learning_rate": 6.277550882840916e-06, "loss": 0.0971, "num_input_tokens_seen": 18040056, "step": 17955 }, { "epoch": 9.522799575821844, "grad_norm": 25.817514419555664, "learning_rate": 6.275314007783201e-06, "loss": 0.1388, "num_input_tokens_seen": 18044536, "step": 17960 }, { "epoch": 9.525450689289501, "grad_norm": 19.656206130981445, "learning_rate": 6.2730768596840944e-06, "loss": 0.257, "num_input_tokens_seen": 18049656, "step": 17965 }, { "epoch": 9.528101802757158, "grad_norm": 16.48688507080078, "learning_rate": 6.270839439022562e-06, "loss": 0.1834, "num_input_tokens_seen": 18054616, "step": 17970 }, { "epoch": 9.530752916224815, "grad_norm": 2.605739116668701, "learning_rate": 6.268601746277633e-06, "loss": 0.1316, "num_input_tokens_seen": 18059288, "step": 17975 }, { "epoch": 9.533404029692472, "grad_norm": 12.293506622314453, "learning_rate": 6.266363781928389e-06, "loss": 0.0875, "num_input_tokens_seen": 18064216, "step": 17980 }, { "epoch": 9.536055143160127, "grad_norm": 5.387440204620361, "learning_rate": 6.264125546453974e-06, "loss": 0.1412, "num_input_tokens_seen": 18068408, "step": 17985 }, { "epoch": 9.538706256627783, "grad_norm": 24.502805709838867, "learning_rate": 6.261887040333585e-06, "loss": 0.1608, "num_input_tokens_seen": 18072568, "step": 17990 }, { "epoch": 9.54135737009544, "grad_norm": 9.364672660827637, "learning_rate": 6.259648264046485e-06, "loss": 0.3919, "num_input_tokens_seen": 18078136, "step": 17995 }, { "epoch": 9.544008483563097, "grad_norm": 7.144257545471191, "learning_rate": 6.257409218071988e-06, "loss": 0.1167, "num_input_tokens_seen": 18082648, "step": 18000 }, { "epoch": 9.546659597030754, "grad_norm": 7.723264217376709, "learning_rate": 6.255169902889466e-06, "loss": 0.1787, "num_input_tokens_seen": 18088216, "step": 18005 }, { "epoch": 9.549310710498409, "grad_norm": 15.116156578063965, "learning_rate": 6.252930318978353e-06, "loss": 0.1263, "num_input_tokens_seen": 18092920, "step": 18010 }, { "epoch": 9.551961823966066, "grad_norm": 5.532839775085449, "learning_rate": 6.250690466818138e-06, "loss": 0.1373, "num_input_tokens_seen": 18097528, "step": 18015 }, { "epoch": 9.554612937433722, "grad_norm": 10.795331001281738, "learning_rate": 6.248450346888366e-06, "loss": 0.1843, "num_input_tokens_seen": 18102744, "step": 18020 }, { "epoch": 9.557264050901379, "grad_norm": 14.78930950164795, "learning_rate": 6.246209959668641e-06, "loss": 0.2247, "num_input_tokens_seen": 18108184, "step": 18025 }, { "epoch": 9.559915164369034, "grad_norm": 15.21864128112793, "learning_rate": 6.243969305638625e-06, "loss": 0.1999, "num_input_tokens_seen": 18112824, "step": 18030 }, { "epoch": 9.56256627783669, "grad_norm": 22.580724716186523, "learning_rate": 6.241728385278037e-06, "loss": 0.2263, "num_input_tokens_seen": 18117592, "step": 18035 }, { "epoch": 9.565217391304348, "grad_norm": 8.782367706298828, "learning_rate": 6.23948719906665e-06, "loss": 0.2166, "num_input_tokens_seen": 18123032, "step": 18040 }, { "epoch": 9.567868504772004, "grad_norm": 6.667860984802246, "learning_rate": 6.237245747484296e-06, "loss": 0.1567, "num_input_tokens_seen": 18127320, "step": 18045 }, { "epoch": 9.570519618239661, "grad_norm": 4.37400484085083, "learning_rate": 6.235004031010866e-06, "loss": 0.1443, "num_input_tokens_seen": 18131480, "step": 18050 }, { "epoch": 9.573170731707316, "grad_norm": 15.832245826721191, "learning_rate": 6.232762050126305e-06, "loss": 0.1866, "num_input_tokens_seen": 18136440, "step": 18055 }, { "epoch": 9.575821845174973, "grad_norm": 17.554832458496094, "learning_rate": 6.230519805310615e-06, "loss": 0.1689, "num_input_tokens_seen": 18141656, "step": 18060 }, { "epoch": 9.57847295864263, "grad_norm": 18.682653427124023, "learning_rate": 6.2282772970438546e-06, "loss": 0.1866, "num_input_tokens_seen": 18147192, "step": 18065 }, { "epoch": 9.581124072110287, "grad_norm": 21.977100372314453, "learning_rate": 6.226034525806138e-06, "loss": 0.2375, "num_input_tokens_seen": 18153144, "step": 18070 }, { "epoch": 9.583775185577943, "grad_norm": 4.457308769226074, "learning_rate": 6.223791492077639e-06, "loss": 0.149, "num_input_tokens_seen": 18157080, "step": 18075 }, { "epoch": 9.586426299045598, "grad_norm": 7.4049763679504395, "learning_rate": 6.221548196338582e-06, "loss": 0.1468, "num_input_tokens_seen": 18163256, "step": 18080 }, { "epoch": 9.589077412513255, "grad_norm": 25.684303283691406, "learning_rate": 6.2193046390692546e-06, "loss": 0.3094, "num_input_tokens_seen": 18169368, "step": 18085 }, { "epoch": 9.591728525980912, "grad_norm": 2.6573712825775146, "learning_rate": 6.217060820749994e-06, "loss": 0.1316, "num_input_tokens_seen": 18173880, "step": 18090 }, { "epoch": 9.594379639448569, "grad_norm": 25.569337844848633, "learning_rate": 6.214816741861196e-06, "loss": 0.187, "num_input_tokens_seen": 18178808, "step": 18095 }, { "epoch": 9.597030752916226, "grad_norm": 11.916346549987793, "learning_rate": 6.212572402883312e-06, "loss": 0.1727, "num_input_tokens_seen": 18185080, "step": 18100 }, { "epoch": 9.59968186638388, "grad_norm": 25.679508209228516, "learning_rate": 6.2103278042968505e-06, "loss": 0.2504, "num_input_tokens_seen": 18189080, "step": 18105 }, { "epoch": 9.602332979851537, "grad_norm": 16.94690704345703, "learning_rate": 6.208082946582373e-06, "loss": 0.1876, "num_input_tokens_seen": 18194968, "step": 18110 }, { "epoch": 9.604984093319194, "grad_norm": 16.76116943359375, "learning_rate": 6.205837830220498e-06, "loss": 0.2, "num_input_tokens_seen": 18199608, "step": 18115 }, { "epoch": 9.607635206786851, "grad_norm": 7.804359436035156, "learning_rate": 6.203592455691898e-06, "loss": 0.2028, "num_input_tokens_seen": 18204888, "step": 18120 }, { "epoch": 9.610286320254508, "grad_norm": 1.8277140855789185, "learning_rate": 6.2013468234773034e-06, "loss": 0.2129, "num_input_tokens_seen": 18209688, "step": 18125 }, { "epoch": 9.612937433722163, "grad_norm": 7.902063369750977, "learning_rate": 6.199100934057498e-06, "loss": 0.1079, "num_input_tokens_seen": 18215096, "step": 18130 }, { "epoch": 9.61558854718982, "grad_norm": 30.037145614624023, "learning_rate": 6.19685478791332e-06, "loss": 0.2521, "num_input_tokens_seen": 18220760, "step": 18135 }, { "epoch": 9.618239660657476, "grad_norm": 3.7814524173736572, "learning_rate": 6.194608385525665e-06, "loss": 0.0888, "num_input_tokens_seen": 18225912, "step": 18140 }, { "epoch": 9.620890774125133, "grad_norm": 7.530581951141357, "learning_rate": 6.192361727375479e-06, "loss": 0.2258, "num_input_tokens_seen": 18230392, "step": 18145 }, { "epoch": 9.62354188759279, "grad_norm": 18.073190689086914, "learning_rate": 6.190114813943767e-06, "loss": 0.2285, "num_input_tokens_seen": 18235224, "step": 18150 }, { "epoch": 9.626193001060445, "grad_norm": 18.6386661529541, "learning_rate": 6.187867645711587e-06, "loss": 0.1036, "num_input_tokens_seen": 18240024, "step": 18155 }, { "epoch": 9.628844114528102, "grad_norm": 26.519824981689453, "learning_rate": 6.1856202231600535e-06, "loss": 0.2709, "num_input_tokens_seen": 18245816, "step": 18160 }, { "epoch": 9.631495227995758, "grad_norm": 7.903061866760254, "learning_rate": 6.183372546770333e-06, "loss": 0.2677, "num_input_tokens_seen": 18249976, "step": 18165 }, { "epoch": 9.634146341463415, "grad_norm": 17.780439376831055, "learning_rate": 6.181124617023647e-06, "loss": 0.141, "num_input_tokens_seen": 18254680, "step": 18170 }, { "epoch": 9.63679745493107, "grad_norm": 3.9917054176330566, "learning_rate": 6.17887643440127e-06, "loss": 0.1478, "num_input_tokens_seen": 18259320, "step": 18175 }, { "epoch": 9.639448568398727, "grad_norm": 11.991201400756836, "learning_rate": 6.176627999384533e-06, "loss": 0.152, "num_input_tokens_seen": 18264312, "step": 18180 }, { "epoch": 9.642099681866384, "grad_norm": 19.11747169494629, "learning_rate": 6.174379312454822e-06, "loss": 0.1884, "num_input_tokens_seen": 18269432, "step": 18185 }, { "epoch": 9.64475079533404, "grad_norm": 21.2005558013916, "learning_rate": 6.172130374093573e-06, "loss": 0.3156, "num_input_tokens_seen": 18274712, "step": 18190 }, { "epoch": 9.647401908801697, "grad_norm": 10.200029373168945, "learning_rate": 6.169881184782276e-06, "loss": 0.1988, "num_input_tokens_seen": 18279896, "step": 18195 }, { "epoch": 9.650053022269352, "grad_norm": 21.092426300048828, "learning_rate": 6.1676317450024804e-06, "loss": 0.3308, "num_input_tokens_seen": 18285880, "step": 18200 }, { "epoch": 9.65270413573701, "grad_norm": 22.134489059448242, "learning_rate": 6.165382055235784e-06, "loss": 0.1446, "num_input_tokens_seen": 18290680, "step": 18205 }, { "epoch": 9.655355249204666, "grad_norm": 6.638855934143066, "learning_rate": 6.163132115963838e-06, "loss": 0.1223, "num_input_tokens_seen": 18295960, "step": 18210 }, { "epoch": 9.658006362672323, "grad_norm": 8.495382308959961, "learning_rate": 6.16088192766835e-06, "loss": 0.1396, "num_input_tokens_seen": 18300824, "step": 18215 }, { "epoch": 9.66065747613998, "grad_norm": 19.36283302307129, "learning_rate": 6.1586314908310785e-06, "loss": 0.1394, "num_input_tokens_seen": 18306072, "step": 18220 }, { "epoch": 9.663308589607635, "grad_norm": 0.7509904503822327, "learning_rate": 6.156380805933837e-06, "loss": 0.0516, "num_input_tokens_seen": 18310584, "step": 18225 }, { "epoch": 9.665959703075291, "grad_norm": 17.559284210205078, "learning_rate": 6.154129873458491e-06, "loss": 0.1095, "num_input_tokens_seen": 18315480, "step": 18230 }, { "epoch": 9.668610816542948, "grad_norm": 5.200393199920654, "learning_rate": 6.151878693886958e-06, "loss": 0.0615, "num_input_tokens_seen": 18320216, "step": 18235 }, { "epoch": 9.671261930010605, "grad_norm": 1.6825188398361206, "learning_rate": 6.149627267701212e-06, "loss": 0.1627, "num_input_tokens_seen": 18325176, "step": 18240 }, { "epoch": 9.673913043478262, "grad_norm": 11.620758056640625, "learning_rate": 6.147375595383276e-06, "loss": 0.1983, "num_input_tokens_seen": 18329464, "step": 18245 }, { "epoch": 9.676564156945917, "grad_norm": 8.80762004852295, "learning_rate": 6.1451236774152275e-06, "loss": 0.3467, "num_input_tokens_seen": 18334936, "step": 18250 }, { "epoch": 9.679215270413573, "grad_norm": 23.82595443725586, "learning_rate": 6.142871514279196e-06, "loss": 0.37, "num_input_tokens_seen": 18339992, "step": 18255 }, { "epoch": 9.68186638388123, "grad_norm": 31.716907501220703, "learning_rate": 6.140619106457364e-06, "loss": 0.3071, "num_input_tokens_seen": 18344888, "step": 18260 }, { "epoch": 9.684517497348887, "grad_norm": 5.960750102996826, "learning_rate": 6.1383664544319665e-06, "loss": 0.1123, "num_input_tokens_seen": 18349944, "step": 18265 }, { "epoch": 9.687168610816542, "grad_norm": 24.459457397460938, "learning_rate": 6.136113558685289e-06, "loss": 0.3956, "num_input_tokens_seen": 18354936, "step": 18270 }, { "epoch": 9.689819724284199, "grad_norm": 17.29036521911621, "learning_rate": 6.133860419699673e-06, "loss": 0.2208, "num_input_tokens_seen": 18359896, "step": 18275 }, { "epoch": 9.692470837751856, "grad_norm": 3.639599323272705, "learning_rate": 6.131607037957508e-06, "loss": 0.1407, "num_input_tokens_seen": 18365272, "step": 18280 }, { "epoch": 9.695121951219512, "grad_norm": 20.10426902770996, "learning_rate": 6.129353413941236e-06, "loss": 0.1547, "num_input_tokens_seen": 18370200, "step": 18285 }, { "epoch": 9.69777306468717, "grad_norm": 9.36231517791748, "learning_rate": 6.1270995481333524e-06, "loss": 0.2518, "num_input_tokens_seen": 18374328, "step": 18290 }, { "epoch": 9.700424178154826, "grad_norm": 17.870384216308594, "learning_rate": 6.124845441016407e-06, "loss": 0.2047, "num_input_tokens_seen": 18379032, "step": 18295 }, { "epoch": 9.703075291622481, "grad_norm": 5.436060428619385, "learning_rate": 6.122591093072995e-06, "loss": 0.087, "num_input_tokens_seen": 18385208, "step": 18300 }, { "epoch": 9.705726405090138, "grad_norm": 14.310104370117188, "learning_rate": 6.120336504785768e-06, "loss": 0.0881, "num_input_tokens_seen": 18390200, "step": 18305 }, { "epoch": 9.708377518557795, "grad_norm": 4.7068915367126465, "learning_rate": 6.118081676637426e-06, "loss": 0.1413, "num_input_tokens_seen": 18395192, "step": 18310 }, { "epoch": 9.711028632025451, "grad_norm": 4.277151107788086, "learning_rate": 6.1158266091107235e-06, "loss": 0.1462, "num_input_tokens_seen": 18400600, "step": 18315 }, { "epoch": 9.713679745493106, "grad_norm": 28.10204315185547, "learning_rate": 6.113571302688464e-06, "loss": 0.4344, "num_input_tokens_seen": 18405240, "step": 18320 }, { "epoch": 9.716330858960763, "grad_norm": 1.5132728815078735, "learning_rate": 6.111315757853502e-06, "loss": 0.3177, "num_input_tokens_seen": 18410392, "step": 18325 }, { "epoch": 9.71898197242842, "grad_norm": 8.320651054382324, "learning_rate": 6.109059975088744e-06, "loss": 0.0866, "num_input_tokens_seen": 18414936, "step": 18330 }, { "epoch": 9.721633085896077, "grad_norm": 11.82850170135498, "learning_rate": 6.106803954877149e-06, "loss": 0.1615, "num_input_tokens_seen": 18419544, "step": 18335 }, { "epoch": 9.724284199363733, "grad_norm": 10.874860763549805, "learning_rate": 6.104547697701723e-06, "loss": 0.2209, "num_input_tokens_seen": 18425976, "step": 18340 }, { "epoch": 9.726935312831388, "grad_norm": 6.957747459411621, "learning_rate": 6.102291204045524e-06, "loss": 0.1733, "num_input_tokens_seen": 18430456, "step": 18345 }, { "epoch": 9.729586426299045, "grad_norm": 8.862494468688965, "learning_rate": 6.1000344743916625e-06, "loss": 0.2206, "num_input_tokens_seen": 18435736, "step": 18350 }, { "epoch": 9.732237539766702, "grad_norm": 13.592695236206055, "learning_rate": 6.097777509223299e-06, "loss": 0.3003, "num_input_tokens_seen": 18443064, "step": 18355 }, { "epoch": 9.734888653234359, "grad_norm": 8.592730522155762, "learning_rate": 6.095520309023645e-06, "loss": 0.1859, "num_input_tokens_seen": 18448536, "step": 18360 }, { "epoch": 9.737539766702016, "grad_norm": 3.350093364715576, "learning_rate": 6.093262874275957e-06, "loss": 0.1603, "num_input_tokens_seen": 18453880, "step": 18365 }, { "epoch": 9.74019088016967, "grad_norm": 4.939136981964111, "learning_rate": 6.0910052054635506e-06, "loss": 0.1133, "num_input_tokens_seen": 18458456, "step": 18370 }, { "epoch": 9.742841993637327, "grad_norm": 7.037093162536621, "learning_rate": 6.088747303069783e-06, "loss": 0.1465, "num_input_tokens_seen": 18463736, "step": 18375 }, { "epoch": 9.745493107104984, "grad_norm": 15.863027572631836, "learning_rate": 6.086489167578068e-06, "loss": 0.2578, "num_input_tokens_seen": 18467960, "step": 18380 }, { "epoch": 9.748144220572641, "grad_norm": 8.51325511932373, "learning_rate": 6.084230799471864e-06, "loss": 0.0797, "num_input_tokens_seen": 18472056, "step": 18385 }, { "epoch": 9.750795334040298, "grad_norm": 9.557209968566895, "learning_rate": 6.081972199234684e-06, "loss": 0.1631, "num_input_tokens_seen": 18476184, "step": 18390 }, { "epoch": 9.753446447507953, "grad_norm": 1.35293447971344, "learning_rate": 6.0797133673500875e-06, "loss": 0.0693, "num_input_tokens_seen": 18481528, "step": 18395 }, { "epoch": 9.75609756097561, "grad_norm": 16.276050567626953, "learning_rate": 6.0774543043016844e-06, "loss": 0.1624, "num_input_tokens_seen": 18486296, "step": 18400 }, { "epoch": 9.758748674443266, "grad_norm": 3.6266191005706787, "learning_rate": 6.0751950105731315e-06, "loss": 0.0962, "num_input_tokens_seen": 18491064, "step": 18405 }, { "epoch": 9.761399787910923, "grad_norm": 18.411746978759766, "learning_rate": 6.072935486648144e-06, "loss": 0.1296, "num_input_tokens_seen": 18495832, "step": 18410 }, { "epoch": 9.764050901378578, "grad_norm": 5.2963948249816895, "learning_rate": 6.070675733010476e-06, "loss": 0.068, "num_input_tokens_seen": 18500120, "step": 18415 }, { "epoch": 9.766702014846235, "grad_norm": 11.485118865966797, "learning_rate": 6.068415750143934e-06, "loss": 0.175, "num_input_tokens_seen": 18505528, "step": 18420 }, { "epoch": 9.769353128313892, "grad_norm": 9.448915481567383, "learning_rate": 6.0661555385323745e-06, "loss": 0.1709, "num_input_tokens_seen": 18510968, "step": 18425 }, { "epoch": 9.772004241781548, "grad_norm": 15.150205612182617, "learning_rate": 6.063895098659706e-06, "loss": 0.2065, "num_input_tokens_seen": 18515896, "step": 18430 }, { "epoch": 9.774655355249205, "grad_norm": 28.791847229003906, "learning_rate": 6.06163443100988e-06, "loss": 0.2846, "num_input_tokens_seen": 18519992, "step": 18435 }, { "epoch": 9.77730646871686, "grad_norm": 14.477248191833496, "learning_rate": 6.0593735360669e-06, "loss": 0.1616, "num_input_tokens_seen": 18524376, "step": 18440 }, { "epoch": 9.779957582184517, "grad_norm": 15.967047691345215, "learning_rate": 6.057112414314819e-06, "loss": 0.2369, "num_input_tokens_seen": 18528952, "step": 18445 }, { "epoch": 9.782608695652174, "grad_norm": 5.028298377990723, "learning_rate": 6.0548510662377355e-06, "loss": 0.1087, "num_input_tokens_seen": 18533368, "step": 18450 }, { "epoch": 9.78525980911983, "grad_norm": 32.195411682128906, "learning_rate": 6.0525894923198005e-06, "loss": 0.2183, "num_input_tokens_seen": 18538616, "step": 18455 }, { "epoch": 9.787910922587487, "grad_norm": 16.46666145324707, "learning_rate": 6.050327693045207e-06, "loss": 0.1035, "num_input_tokens_seen": 18543928, "step": 18460 }, { "epoch": 9.790562036055142, "grad_norm": 13.417084693908691, "learning_rate": 6.048065668898204e-06, "loss": 0.1315, "num_input_tokens_seen": 18548376, "step": 18465 }, { "epoch": 9.7932131495228, "grad_norm": 19.55624008178711, "learning_rate": 6.045803420363085e-06, "loss": 0.1843, "num_input_tokens_seen": 18553112, "step": 18470 }, { "epoch": 9.795864262990456, "grad_norm": 10.262194633483887, "learning_rate": 6.043540947924189e-06, "loss": 0.1537, "num_input_tokens_seen": 18558008, "step": 18475 }, { "epoch": 9.798515376458113, "grad_norm": 2.4531211853027344, "learning_rate": 6.041278252065906e-06, "loss": 0.1173, "num_input_tokens_seen": 18564504, "step": 18480 }, { "epoch": 9.80116648992577, "grad_norm": 17.51598358154297, "learning_rate": 6.039015333272676e-06, "loss": 0.1819, "num_input_tokens_seen": 18569240, "step": 18485 }, { "epoch": 9.803817603393425, "grad_norm": 19.700023651123047, "learning_rate": 6.036752192028981e-06, "loss": 0.0697, "num_input_tokens_seen": 18574072, "step": 18490 }, { "epoch": 9.806468716861081, "grad_norm": 7.820878028869629, "learning_rate": 6.0344888288193546e-06, "loss": 0.1703, "num_input_tokens_seen": 18579128, "step": 18495 }, { "epoch": 9.809119830328738, "grad_norm": 2.3581268787384033, "learning_rate": 6.032225244128376e-06, "loss": 0.2701, "num_input_tokens_seen": 18584024, "step": 18500 }, { "epoch": 9.811770943796395, "grad_norm": 1.1285128593444824, "learning_rate": 6.029961438440675e-06, "loss": 0.1172, "num_input_tokens_seen": 18588664, "step": 18505 }, { "epoch": 9.814422057264052, "grad_norm": 2.444871425628662, "learning_rate": 6.027697412240925e-06, "loss": 0.2509, "num_input_tokens_seen": 18593368, "step": 18510 }, { "epoch": 9.817073170731707, "grad_norm": 0.7735134363174438, "learning_rate": 6.0254331660138475e-06, "loss": 0.0733, "num_input_tokens_seen": 18597528, "step": 18515 }, { "epoch": 9.819724284199363, "grad_norm": 26.567380905151367, "learning_rate": 6.023168700244213e-06, "loss": 0.3951, "num_input_tokens_seen": 18602584, "step": 18520 }, { "epoch": 9.82237539766702, "grad_norm": 11.50928020477295, "learning_rate": 6.020904015416838e-06, "loss": 0.1878, "num_input_tokens_seen": 18608216, "step": 18525 }, { "epoch": 9.825026511134677, "grad_norm": 27.751590728759766, "learning_rate": 6.018639112016583e-06, "loss": 0.3551, "num_input_tokens_seen": 18613624, "step": 18530 }, { "epoch": 9.827677624602334, "grad_norm": 22.666465759277344, "learning_rate": 6.016373990528361e-06, "loss": 0.2156, "num_input_tokens_seen": 18617912, "step": 18535 }, { "epoch": 9.830328738069989, "grad_norm": 15.340936660766602, "learning_rate": 6.014108651437125e-06, "loss": 0.173, "num_input_tokens_seen": 18623256, "step": 18540 }, { "epoch": 9.832979851537646, "grad_norm": 4.198511123657227, "learning_rate": 6.011843095227883e-06, "loss": 0.2869, "num_input_tokens_seen": 18629080, "step": 18545 }, { "epoch": 9.835630965005302, "grad_norm": 14.759681701660156, "learning_rate": 6.009577322385683e-06, "loss": 0.1196, "num_input_tokens_seen": 18633784, "step": 18550 }, { "epoch": 9.83828207847296, "grad_norm": 13.176021575927734, "learning_rate": 6.0073113333956165e-06, "loss": 0.0989, "num_input_tokens_seen": 18638904, "step": 18555 }, { "epoch": 9.840933191940614, "grad_norm": 1.654018521308899, "learning_rate": 6.005045128742831e-06, "loss": 0.0846, "num_input_tokens_seen": 18644728, "step": 18560 }, { "epoch": 9.843584305408271, "grad_norm": 20.353015899658203, "learning_rate": 6.002778708912514e-06, "loss": 0.299, "num_input_tokens_seen": 18650296, "step": 18565 }, { "epoch": 9.846235418875928, "grad_norm": 20.790328979492188, "learning_rate": 6.000512074389899e-06, "loss": 0.1216, "num_input_tokens_seen": 18655608, "step": 18570 }, { "epoch": 9.848886532343585, "grad_norm": 11.540942192077637, "learning_rate": 5.998245225660266e-06, "loss": 0.4109, "num_input_tokens_seen": 18659576, "step": 18575 }, { "epoch": 9.851537645811241, "grad_norm": 14.777534484863281, "learning_rate": 5.995978163208943e-06, "loss": 0.1547, "num_input_tokens_seen": 18664856, "step": 18580 }, { "epoch": 9.854188759278896, "grad_norm": 4.62302303314209, "learning_rate": 5.993710887521302e-06, "loss": 0.1722, "num_input_tokens_seen": 18670904, "step": 18585 }, { "epoch": 9.856839872746553, "grad_norm": 18.965190887451172, "learning_rate": 5.9914433990827605e-06, "loss": 0.1463, "num_input_tokens_seen": 18674488, "step": 18590 }, { "epoch": 9.85949098621421, "grad_norm": 12.83149528503418, "learning_rate": 5.9891756983787795e-06, "loss": 0.1237, "num_input_tokens_seen": 18679992, "step": 18595 }, { "epoch": 9.862142099681867, "grad_norm": 2.8949368000030518, "learning_rate": 5.9869077858948725e-06, "loss": 0.1868, "num_input_tokens_seen": 18684312, "step": 18600 }, { "epoch": 9.864793213149524, "grad_norm": 15.997482299804688, "learning_rate": 5.984639662116589e-06, "loss": 0.1711, "num_input_tokens_seen": 18689688, "step": 18605 }, { "epoch": 9.867444326617179, "grad_norm": 4.272665500640869, "learning_rate": 5.982371327529532e-06, "loss": 0.1633, "num_input_tokens_seen": 18695128, "step": 18610 }, { "epoch": 9.870095440084835, "grad_norm": 24.403671264648438, "learning_rate": 5.980102782619343e-06, "loss": 0.1864, "num_input_tokens_seen": 18702040, "step": 18615 }, { "epoch": 9.872746553552492, "grad_norm": 3.723447322845459, "learning_rate": 5.977834027871714e-06, "loss": 0.1251, "num_input_tokens_seen": 18707288, "step": 18620 }, { "epoch": 9.875397667020149, "grad_norm": 19.548891067504883, "learning_rate": 5.975565063772378e-06, "loss": 0.2096, "num_input_tokens_seen": 18713304, "step": 18625 }, { "epoch": 9.878048780487806, "grad_norm": 2.143702507019043, "learning_rate": 5.973295890807115e-06, "loss": 0.146, "num_input_tokens_seen": 18718072, "step": 18630 }, { "epoch": 9.88069989395546, "grad_norm": 24.830211639404297, "learning_rate": 5.971026509461747e-06, "loss": 0.2769, "num_input_tokens_seen": 18722840, "step": 18635 }, { "epoch": 9.883351007423117, "grad_norm": 23.295330047607422, "learning_rate": 5.968756920222145e-06, "loss": 0.3212, "num_input_tokens_seen": 18726712, "step": 18640 }, { "epoch": 9.886002120890774, "grad_norm": 12.052016258239746, "learning_rate": 5.966487123574223e-06, "loss": 0.2573, "num_input_tokens_seen": 18731288, "step": 18645 }, { "epoch": 9.888653234358431, "grad_norm": 2.7260355949401855, "learning_rate": 5.964217120003934e-06, "loss": 0.225, "num_input_tokens_seen": 18736216, "step": 18650 }, { "epoch": 9.891304347826086, "grad_norm": 20.33873748779297, "learning_rate": 5.961946909997285e-06, "loss": 0.1373, "num_input_tokens_seen": 18740248, "step": 18655 }, { "epoch": 9.893955461293743, "grad_norm": 13.478219032287598, "learning_rate": 5.959676494040319e-06, "loss": 0.2405, "num_input_tokens_seen": 18746072, "step": 18660 }, { "epoch": 9.8966065747614, "grad_norm": 3.847651243209839, "learning_rate": 5.957405872619128e-06, "loss": 0.0882, "num_input_tokens_seen": 18750392, "step": 18665 }, { "epoch": 9.899257688229056, "grad_norm": 8.083761215209961, "learning_rate": 5.955135046219843e-06, "loss": 0.2274, "num_input_tokens_seen": 18755544, "step": 18670 }, { "epoch": 9.901908801696713, "grad_norm": 11.9597749710083, "learning_rate": 5.952864015328645e-06, "loss": 0.0514, "num_input_tokens_seen": 18760248, "step": 18675 }, { "epoch": 9.90455991516437, "grad_norm": 2.908517360687256, "learning_rate": 5.950592780431755e-06, "loss": 0.1255, "num_input_tokens_seen": 18764696, "step": 18680 }, { "epoch": 9.907211028632025, "grad_norm": 16.487163543701172, "learning_rate": 5.948321342015437e-06, "loss": 0.2326, "num_input_tokens_seen": 18770552, "step": 18685 }, { "epoch": 9.909862142099682, "grad_norm": 3.9444780349731445, "learning_rate": 5.946049700566003e-06, "loss": 0.1418, "num_input_tokens_seen": 18774904, "step": 18690 }, { "epoch": 9.912513255567339, "grad_norm": 8.683358192443848, "learning_rate": 5.943777856569802e-06, "loss": 0.1581, "num_input_tokens_seen": 18780056, "step": 18695 }, { "epoch": 9.915164369034995, "grad_norm": 5.3043718338012695, "learning_rate": 5.941505810513233e-06, "loss": 0.161, "num_input_tokens_seen": 18783608, "step": 18700 }, { "epoch": 9.91781548250265, "grad_norm": 1.8293861150741577, "learning_rate": 5.939233562882732e-06, "loss": 0.1139, "num_input_tokens_seen": 18788280, "step": 18705 }, { "epoch": 9.920466595970307, "grad_norm": 10.317240715026855, "learning_rate": 5.936961114164785e-06, "loss": 0.2541, "num_input_tokens_seen": 18793592, "step": 18710 }, { "epoch": 9.923117709437964, "grad_norm": 1.2293519973754883, "learning_rate": 5.934688464845914e-06, "loss": 0.2787, "num_input_tokens_seen": 18798040, "step": 18715 }, { "epoch": 9.92576882290562, "grad_norm": 15.777280807495117, "learning_rate": 5.932415615412688e-06, "loss": 0.2458, "num_input_tokens_seen": 18802712, "step": 18720 }, { "epoch": 9.928419936373277, "grad_norm": 1.847036600112915, "learning_rate": 5.9301425663517175e-06, "loss": 0.0953, "num_input_tokens_seen": 18807672, "step": 18725 }, { "epoch": 9.931071049840932, "grad_norm": 20.275590896606445, "learning_rate": 5.927869318149659e-06, "loss": 0.322, "num_input_tokens_seen": 18811512, "step": 18730 }, { "epoch": 9.93372216330859, "grad_norm": 11.885297775268555, "learning_rate": 5.9255958712932085e-06, "loss": 0.157, "num_input_tokens_seen": 18815384, "step": 18735 }, { "epoch": 9.936373276776246, "grad_norm": 1.5486198663711548, "learning_rate": 5.923322226269104e-06, "loss": 0.1894, "num_input_tokens_seen": 18820696, "step": 18740 }, { "epoch": 9.939024390243903, "grad_norm": 27.4266357421875, "learning_rate": 5.921048383564124e-06, "loss": 0.5278, "num_input_tokens_seen": 18825880, "step": 18745 }, { "epoch": 9.94167550371156, "grad_norm": 13.6989164352417, "learning_rate": 5.918774343665098e-06, "loss": 0.2544, "num_input_tokens_seen": 18830040, "step": 18750 }, { "epoch": 9.944326617179215, "grad_norm": 5.976822376251221, "learning_rate": 5.916500107058888e-06, "loss": 0.1454, "num_input_tokens_seen": 18835320, "step": 18755 }, { "epoch": 9.946977730646871, "grad_norm": 25.550556182861328, "learning_rate": 5.9142256742324045e-06, "loss": 0.2627, "num_input_tokens_seen": 18839064, "step": 18760 }, { "epoch": 9.949628844114528, "grad_norm": 16.99507713317871, "learning_rate": 5.911951045672595e-06, "loss": 0.1533, "num_input_tokens_seen": 18844024, "step": 18765 }, { "epoch": 9.952279957582185, "grad_norm": 18.18876075744629, "learning_rate": 5.909676221866454e-06, "loss": 0.1526, "num_input_tokens_seen": 18848408, "step": 18770 }, { "epoch": 9.954931071049842, "grad_norm": 15.667267799377441, "learning_rate": 5.907401203301013e-06, "loss": 0.3215, "num_input_tokens_seen": 18853048, "step": 18775 }, { "epoch": 9.957582184517497, "grad_norm": 4.59809684753418, "learning_rate": 5.905125990463351e-06, "loss": 0.0829, "num_input_tokens_seen": 18858680, "step": 18780 }, { "epoch": 9.960233297985154, "grad_norm": 10.72795581817627, "learning_rate": 5.90285058384058e-06, "loss": 0.0887, "num_input_tokens_seen": 18863320, "step": 18785 }, { "epoch": 9.96288441145281, "grad_norm": 11.297547340393066, "learning_rate": 5.900574983919864e-06, "loss": 0.1364, "num_input_tokens_seen": 18868024, "step": 18790 }, { "epoch": 9.965535524920467, "grad_norm": 13.58968448638916, "learning_rate": 5.898299191188399e-06, "loss": 0.2116, "num_input_tokens_seen": 18873304, "step": 18795 }, { "epoch": 9.968186638388122, "grad_norm": 5.010105609893799, "learning_rate": 5.8960232061334285e-06, "loss": 0.2738, "num_input_tokens_seen": 18877016, "step": 18800 }, { "epoch": 9.970837751855779, "grad_norm": 13.459622383117676, "learning_rate": 5.893747029242234e-06, "loss": 0.1161, "num_input_tokens_seen": 18883000, "step": 18805 }, { "epoch": 9.973488865323436, "grad_norm": 17.517480850219727, "learning_rate": 5.891470661002139e-06, "loss": 0.3551, "num_input_tokens_seen": 18888536, "step": 18810 }, { "epoch": 9.976139978791092, "grad_norm": 17.360309600830078, "learning_rate": 5.8891941019005095e-06, "loss": 0.1513, "num_input_tokens_seen": 18893496, "step": 18815 }, { "epoch": 9.97879109225875, "grad_norm": 8.347784042358398, "learning_rate": 5.886917352424748e-06, "loss": 0.0939, "num_input_tokens_seen": 18898520, "step": 18820 }, { "epoch": 9.981442205726404, "grad_norm": 14.774667739868164, "learning_rate": 5.884640413062302e-06, "loss": 0.2326, "num_input_tokens_seen": 18904024, "step": 18825 }, { "epoch": 9.984093319194061, "grad_norm": 12.96216106414795, "learning_rate": 5.88236328430066e-06, "loss": 0.133, "num_input_tokens_seen": 18909816, "step": 18830 }, { "epoch": 9.986744432661718, "grad_norm": 11.002775192260742, "learning_rate": 5.880085966627348e-06, "loss": 0.232, "num_input_tokens_seen": 18914072, "step": 18835 }, { "epoch": 9.989395546129375, "grad_norm": 3.3865697383880615, "learning_rate": 5.877808460529932e-06, "loss": 0.2153, "num_input_tokens_seen": 18920056, "step": 18840 }, { "epoch": 9.992046659597031, "grad_norm": 3.0739707946777344, "learning_rate": 5.875530766496022e-06, "loss": 0.2457, "num_input_tokens_seen": 18925080, "step": 18845 }, { "epoch": 9.994697773064686, "grad_norm": 21.274539947509766, "learning_rate": 5.8732528850132665e-06, "loss": 0.2949, "num_input_tokens_seen": 18929336, "step": 18850 }, { "epoch": 9.997348886532343, "grad_norm": 5.19474983215332, "learning_rate": 5.870974816569352e-06, "loss": 0.0749, "num_input_tokens_seen": 18935384, "step": 18855 }, { "epoch": 10.0, "grad_norm": 0.1630992889404297, "learning_rate": 5.8686965616520095e-06, "loss": 0.1659, "num_input_tokens_seen": 18940064, "step": 18860 }, { "epoch": 10.0, "eval_loss": 0.4134344458580017, "eval_runtime": 29.3293, "eval_samples_per_second": 64.304, "eval_steps_per_second": 16.093, "num_input_tokens_seen": 18940064, "step": 18860 }, { "epoch": 10.002651113467657, "grad_norm": 11.884035110473633, "learning_rate": 5.866418120749007e-06, "loss": 0.0831, "num_input_tokens_seen": 18945440, "step": 18865 }, { "epoch": 10.005302226935314, "grad_norm": 2.3356106281280518, "learning_rate": 5.8641394943481515e-06, "loss": 0.1214, "num_input_tokens_seen": 18949344, "step": 18870 }, { "epoch": 10.007953340402969, "grad_norm": 1.0748172998428345, "learning_rate": 5.86186068293729e-06, "loss": 0.169, "num_input_tokens_seen": 18954176, "step": 18875 }, { "epoch": 10.010604453870625, "grad_norm": 4.052696228027344, "learning_rate": 5.859581687004312e-06, "loss": 0.0477, "num_input_tokens_seen": 18959296, "step": 18880 }, { "epoch": 10.013255567338282, "grad_norm": 7.296810626983643, "learning_rate": 5.8573025070371455e-06, "loss": 0.1274, "num_input_tokens_seen": 18963904, "step": 18885 }, { "epoch": 10.015906680805939, "grad_norm": 4.643251895904541, "learning_rate": 5.855023143523755e-06, "loss": 0.137, "num_input_tokens_seen": 18969184, "step": 18890 }, { "epoch": 10.018557794273596, "grad_norm": 16.628814697265625, "learning_rate": 5.852743596952146e-06, "loss": 0.2287, "num_input_tokens_seen": 18974368, "step": 18895 }, { "epoch": 10.02120890774125, "grad_norm": 14.493392944335938, "learning_rate": 5.850463867810365e-06, "loss": 0.0676, "num_input_tokens_seen": 18979968, "step": 18900 }, { "epoch": 10.023860021208908, "grad_norm": 2.5349271297454834, "learning_rate": 5.848183956586495e-06, "loss": 0.1155, "num_input_tokens_seen": 18985088, "step": 18905 }, { "epoch": 10.026511134676564, "grad_norm": 1.714271903038025, "learning_rate": 5.845903863768661e-06, "loss": 0.1115, "num_input_tokens_seen": 18990240, "step": 18910 }, { "epoch": 10.029162248144221, "grad_norm": 17.104745864868164, "learning_rate": 5.843623589845022e-06, "loss": 0.1341, "num_input_tokens_seen": 18994720, "step": 18915 }, { "epoch": 10.031813361611878, "grad_norm": 16.636566162109375, "learning_rate": 5.841343135303779e-06, "loss": 0.0776, "num_input_tokens_seen": 19000256, "step": 18920 }, { "epoch": 10.034464475079533, "grad_norm": 3.6493825912475586, "learning_rate": 5.839062500633174e-06, "loss": 0.1469, "num_input_tokens_seen": 19003968, "step": 18925 }, { "epoch": 10.03711558854719, "grad_norm": 5.566834449768066, "learning_rate": 5.8367816863214825e-06, "loss": 0.1358, "num_input_tokens_seen": 19008448, "step": 18930 }, { "epoch": 10.039766702014846, "grad_norm": 29.343116760253906, "learning_rate": 5.8345006928570205e-06, "loss": 0.1819, "num_input_tokens_seen": 19013184, "step": 18935 }, { "epoch": 10.042417815482503, "grad_norm": 4.657991409301758, "learning_rate": 5.8322195207281454e-06, "loss": 0.0424, "num_input_tokens_seen": 19017856, "step": 18940 }, { "epoch": 10.045068928950158, "grad_norm": 5.553671836853027, "learning_rate": 5.829938170423247e-06, "loss": 0.0645, "num_input_tokens_seen": 19022944, "step": 18945 }, { "epoch": 10.047720042417815, "grad_norm": 8.997426986694336, "learning_rate": 5.8276566424307605e-06, "loss": 0.3454, "num_input_tokens_seen": 19027968, "step": 18950 }, { "epoch": 10.050371155885472, "grad_norm": 6.10443639755249, "learning_rate": 5.82537493723915e-06, "loss": 0.0961, "num_input_tokens_seen": 19032928, "step": 18955 }, { "epoch": 10.053022269353129, "grad_norm": 3.0193424224853516, "learning_rate": 5.823093055336927e-06, "loss": 0.1039, "num_input_tokens_seen": 19037248, "step": 18960 }, { "epoch": 10.055673382820785, "grad_norm": 6.37654447555542, "learning_rate": 5.820810997212635e-06, "loss": 0.127, "num_input_tokens_seen": 19042336, "step": 18965 }, { "epoch": 10.05832449628844, "grad_norm": 1.2354062795639038, "learning_rate": 5.818528763354855e-06, "loss": 0.0431, "num_input_tokens_seen": 19048288, "step": 18970 }, { "epoch": 10.060975609756097, "grad_norm": 3.4346213340759277, "learning_rate": 5.816246354252209e-06, "loss": 0.0929, "num_input_tokens_seen": 19052864, "step": 18975 }, { "epoch": 10.063626723223754, "grad_norm": 11.012407302856445, "learning_rate": 5.813963770393355e-06, "loss": 0.0674, "num_input_tokens_seen": 19057664, "step": 18980 }, { "epoch": 10.06627783669141, "grad_norm": 16.049989700317383, "learning_rate": 5.811681012266987e-06, "loss": 0.09, "num_input_tokens_seen": 19063552, "step": 18985 }, { "epoch": 10.068928950159068, "grad_norm": 26.733415603637695, "learning_rate": 5.809398080361838e-06, "loss": 0.1038, "num_input_tokens_seen": 19068320, "step": 18990 }, { "epoch": 10.071580063626723, "grad_norm": 7.014225482940674, "learning_rate": 5.807114975166679e-06, "loss": 0.167, "num_input_tokens_seen": 19073280, "step": 18995 }, { "epoch": 10.07423117709438, "grad_norm": 1.8474817276000977, "learning_rate": 5.8048316971703154e-06, "loss": 0.0566, "num_input_tokens_seen": 19077888, "step": 19000 }, { "epoch": 10.076882290562036, "grad_norm": 1.8793795108795166, "learning_rate": 5.8025482468615916e-06, "loss": 0.0792, "num_input_tokens_seen": 19082688, "step": 19005 }, { "epoch": 10.079533404029693, "grad_norm": 10.151161193847656, "learning_rate": 5.800264624729387e-06, "loss": 0.0775, "num_input_tokens_seen": 19087712, "step": 19010 }, { "epoch": 10.08218451749735, "grad_norm": 1.1952593326568604, "learning_rate": 5.7979808312626215e-06, "loss": 0.0334, "num_input_tokens_seen": 19093280, "step": 19015 }, { "epoch": 10.084835630965005, "grad_norm": 4.587573051452637, "learning_rate": 5.795696866950248e-06, "loss": 0.1859, "num_input_tokens_seen": 19098496, "step": 19020 }, { "epoch": 10.087486744432661, "grad_norm": 4.260233402252197, "learning_rate": 5.793412732281258e-06, "loss": 0.1149, "num_input_tokens_seen": 19103584, "step": 19025 }, { "epoch": 10.090137857900318, "grad_norm": 9.128663063049316, "learning_rate": 5.791128427744677e-06, "loss": 0.0733, "num_input_tokens_seen": 19107808, "step": 19030 }, { "epoch": 10.092788971367975, "grad_norm": 9.91177749633789, "learning_rate": 5.7888439538295706e-06, "loss": 0.2558, "num_input_tokens_seen": 19113696, "step": 19035 }, { "epoch": 10.095440084835632, "grad_norm": 9.61712646484375, "learning_rate": 5.7865593110250376e-06, "loss": 0.0302, "num_input_tokens_seen": 19119104, "step": 19040 }, { "epoch": 10.098091198303287, "grad_norm": 7.535170555114746, "learning_rate": 5.784274499820214e-06, "loss": 0.1732, "num_input_tokens_seen": 19123424, "step": 19045 }, { "epoch": 10.100742311770944, "grad_norm": 8.032465934753418, "learning_rate": 5.781989520704272e-06, "loss": 0.0472, "num_input_tokens_seen": 19127392, "step": 19050 }, { "epoch": 10.1033934252386, "grad_norm": 18.077617645263672, "learning_rate": 5.779704374166421e-06, "loss": 0.2381, "num_input_tokens_seen": 19131968, "step": 19055 }, { "epoch": 10.106044538706257, "grad_norm": 25.1151180267334, "learning_rate": 5.777419060695902e-06, "loss": 0.2047, "num_input_tokens_seen": 19136544, "step": 19060 }, { "epoch": 10.108695652173912, "grad_norm": 11.391676902770996, "learning_rate": 5.775133580781999e-06, "loss": 0.0608, "num_input_tokens_seen": 19140640, "step": 19065 }, { "epoch": 10.111346765641569, "grad_norm": 9.628335952758789, "learning_rate": 5.772847934914022e-06, "loss": 0.1363, "num_input_tokens_seen": 19145440, "step": 19070 }, { "epoch": 10.113997879109226, "grad_norm": 17.792545318603516, "learning_rate": 5.770562123581326e-06, "loss": 0.0808, "num_input_tokens_seen": 19151552, "step": 19075 }, { "epoch": 10.116648992576883, "grad_norm": 3.4166693687438965, "learning_rate": 5.7682761472732955e-06, "loss": 0.1945, "num_input_tokens_seen": 19156768, "step": 19080 }, { "epoch": 10.11930010604454, "grad_norm": 27.319904327392578, "learning_rate": 5.765990006479353e-06, "loss": 0.148, "num_input_tokens_seen": 19160672, "step": 19085 }, { "epoch": 10.121951219512194, "grad_norm": 6.425615310668945, "learning_rate": 5.763703701688952e-06, "loss": 0.1551, "num_input_tokens_seen": 19164896, "step": 19090 }, { "epoch": 10.124602332979851, "grad_norm": 2.988171100616455, "learning_rate": 5.76141723339159e-06, "loss": 0.0617, "num_input_tokens_seen": 19171072, "step": 19095 }, { "epoch": 10.127253446447508, "grad_norm": 18.6291446685791, "learning_rate": 5.759130602076789e-06, "loss": 0.1223, "num_input_tokens_seen": 19176256, "step": 19100 }, { "epoch": 10.129904559915165, "grad_norm": 2.3823890686035156, "learning_rate": 5.7568438082341125e-06, "loss": 0.133, "num_input_tokens_seen": 19183424, "step": 19105 }, { "epoch": 10.132555673382821, "grad_norm": 2.057077646255493, "learning_rate": 5.754556852353158e-06, "loss": 0.0495, "num_input_tokens_seen": 19190208, "step": 19110 }, { "epoch": 10.135206786850476, "grad_norm": 37.45603942871094, "learning_rate": 5.752269734923555e-06, "loss": 0.2174, "num_input_tokens_seen": 19195584, "step": 19115 }, { "epoch": 10.137857900318133, "grad_norm": 17.811548233032227, "learning_rate": 5.749982456434972e-06, "loss": 0.0759, "num_input_tokens_seen": 19200992, "step": 19120 }, { "epoch": 10.14050901378579, "grad_norm": 8.815546035766602, "learning_rate": 5.747695017377105e-06, "loss": 0.0822, "num_input_tokens_seen": 19205856, "step": 19125 }, { "epoch": 10.143160127253447, "grad_norm": 8.414329528808594, "learning_rate": 5.745407418239692e-06, "loss": 0.1061, "num_input_tokens_seen": 19211360, "step": 19130 }, { "epoch": 10.145811240721104, "grad_norm": 22.91179084777832, "learning_rate": 5.743119659512503e-06, "loss": 0.1829, "num_input_tokens_seen": 19216608, "step": 19135 }, { "epoch": 10.148462354188759, "grad_norm": 4.118380069732666, "learning_rate": 5.740831741685338e-06, "loss": 0.0277, "num_input_tokens_seen": 19220992, "step": 19140 }, { "epoch": 10.151113467656415, "grad_norm": 0.82356858253479, "learning_rate": 5.7385436652480355e-06, "loss": 0.0814, "num_input_tokens_seen": 19225184, "step": 19145 }, { "epoch": 10.153764581124072, "grad_norm": 11.40511703491211, "learning_rate": 5.736255430690467e-06, "loss": 0.1945, "num_input_tokens_seen": 19233376, "step": 19150 }, { "epoch": 10.156415694591729, "grad_norm": 1.3543798923492432, "learning_rate": 5.7339670385025384e-06, "loss": 0.0608, "num_input_tokens_seen": 19238080, "step": 19155 }, { "epoch": 10.159066808059386, "grad_norm": 15.341796875, "learning_rate": 5.731678489174186e-06, "loss": 0.1403, "num_input_tokens_seen": 19243424, "step": 19160 }, { "epoch": 10.16171792152704, "grad_norm": 23.36968421936035, "learning_rate": 5.729389783195384e-06, "loss": 0.4318, "num_input_tokens_seen": 19248544, "step": 19165 }, { "epoch": 10.164369034994698, "grad_norm": 14.30880355834961, "learning_rate": 5.727100921056139e-06, "loss": 0.0874, "num_input_tokens_seen": 19253664, "step": 19170 }, { "epoch": 10.167020148462354, "grad_norm": 21.3333740234375, "learning_rate": 5.724811903246489e-06, "loss": 0.1045, "num_input_tokens_seen": 19258240, "step": 19175 }, { "epoch": 10.169671261930011, "grad_norm": 31.598249435424805, "learning_rate": 5.722522730256507e-06, "loss": 0.2539, "num_input_tokens_seen": 19263424, "step": 19180 }, { "epoch": 10.172322375397666, "grad_norm": 4.805815696716309, "learning_rate": 5.720233402576298e-06, "loss": 0.1668, "num_input_tokens_seen": 19267840, "step": 19185 }, { "epoch": 10.174973488865323, "grad_norm": 6.041678428649902, "learning_rate": 5.7179439206960044e-06, "loss": 0.1897, "num_input_tokens_seen": 19273024, "step": 19190 }, { "epoch": 10.17762460233298, "grad_norm": 16.304887771606445, "learning_rate": 5.715654285105794e-06, "loss": 0.2062, "num_input_tokens_seen": 19278432, "step": 19195 }, { "epoch": 10.180275715800637, "grad_norm": 6.246288776397705, "learning_rate": 5.7133644962958756e-06, "loss": 0.1681, "num_input_tokens_seen": 19283904, "step": 19200 }, { "epoch": 10.182926829268293, "grad_norm": 22.743303298950195, "learning_rate": 5.7110745547564835e-06, "loss": 0.1656, "num_input_tokens_seen": 19289664, "step": 19205 }, { "epoch": 10.185577942735948, "grad_norm": 0.9769912958145142, "learning_rate": 5.7087844609778905e-06, "loss": 0.1143, "num_input_tokens_seen": 19295520, "step": 19210 }, { "epoch": 10.188229056203605, "grad_norm": 17.914653778076172, "learning_rate": 5.7064942154504e-06, "loss": 0.1277, "num_input_tokens_seen": 19300448, "step": 19215 }, { "epoch": 10.190880169671262, "grad_norm": 11.072222709655762, "learning_rate": 5.704203818664347e-06, "loss": 0.1031, "num_input_tokens_seen": 19305088, "step": 19220 }, { "epoch": 10.193531283138919, "grad_norm": 22.51119041442871, "learning_rate": 5.7019132711100965e-06, "loss": 0.1067, "num_input_tokens_seen": 19309664, "step": 19225 }, { "epoch": 10.196182396606575, "grad_norm": 31.030517578125, "learning_rate": 5.699622573278054e-06, "loss": 0.3082, "num_input_tokens_seen": 19314432, "step": 19230 }, { "epoch": 10.19883351007423, "grad_norm": 3.8415939807891846, "learning_rate": 5.697331725658648e-06, "loss": 0.1491, "num_input_tokens_seen": 19319392, "step": 19235 }, { "epoch": 10.201484623541887, "grad_norm": 1.1531460285186768, "learning_rate": 5.695040728742344e-06, "loss": 0.2464, "num_input_tokens_seen": 19324000, "step": 19240 }, { "epoch": 10.204135737009544, "grad_norm": 33.50776290893555, "learning_rate": 5.692749583019641e-06, "loss": 0.1641, "num_input_tokens_seen": 19329024, "step": 19245 }, { "epoch": 10.2067868504772, "grad_norm": 13.380941390991211, "learning_rate": 5.6904582889810635e-06, "loss": 0.0878, "num_input_tokens_seen": 19334272, "step": 19250 }, { "epoch": 10.209437963944858, "grad_norm": 6.469421863555908, "learning_rate": 5.6881668471171715e-06, "loss": 0.2162, "num_input_tokens_seen": 19339488, "step": 19255 }, { "epoch": 10.212089077412513, "grad_norm": 1.256261944770813, "learning_rate": 5.68587525791856e-06, "loss": 0.076, "num_input_tokens_seen": 19344192, "step": 19260 }, { "epoch": 10.21474019088017, "grad_norm": 9.247322082519531, "learning_rate": 5.683583521875851e-06, "loss": 0.0471, "num_input_tokens_seen": 19348864, "step": 19265 }, { "epoch": 10.217391304347826, "grad_norm": 25.275981903076172, "learning_rate": 5.681291639479698e-06, "loss": 0.1405, "num_input_tokens_seen": 19354080, "step": 19270 }, { "epoch": 10.220042417815483, "grad_norm": 13.889473915100098, "learning_rate": 5.6789996112207865e-06, "loss": 0.3687, "num_input_tokens_seen": 19359424, "step": 19275 }, { "epoch": 10.22269353128314, "grad_norm": 14.283897399902344, "learning_rate": 5.6767074375898365e-06, "loss": 0.0521, "num_input_tokens_seen": 19365152, "step": 19280 }, { "epoch": 10.225344644750795, "grad_norm": 6.54670524597168, "learning_rate": 5.6744151190775956e-06, "loss": 0.0688, "num_input_tokens_seen": 19370176, "step": 19285 }, { "epoch": 10.227995758218452, "grad_norm": 5.5056939125061035, "learning_rate": 5.672122656174841e-06, "loss": 0.0663, "num_input_tokens_seen": 19375168, "step": 19290 }, { "epoch": 10.230646871686108, "grad_norm": 12.77245807647705, "learning_rate": 5.669830049372385e-06, "loss": 0.1194, "num_input_tokens_seen": 19381472, "step": 19295 }, { "epoch": 10.233297985153765, "grad_norm": 7.833137512207031, "learning_rate": 5.667537299161068e-06, "loss": 0.0857, "num_input_tokens_seen": 19386560, "step": 19300 }, { "epoch": 10.235949098621422, "grad_norm": 15.823030471801758, "learning_rate": 5.665244406031763e-06, "loss": 0.1216, "num_input_tokens_seen": 19390624, "step": 19305 }, { "epoch": 10.238600212089077, "grad_norm": 6.543422698974609, "learning_rate": 5.66295137047537e-06, "loss": 0.255, "num_input_tokens_seen": 19395104, "step": 19310 }, { "epoch": 10.241251325556734, "grad_norm": 8.633119583129883, "learning_rate": 5.660658192982824e-06, "loss": 0.0568, "num_input_tokens_seen": 19399904, "step": 19315 }, { "epoch": 10.24390243902439, "grad_norm": 4.801278114318848, "learning_rate": 5.658364874045088e-06, "loss": 0.0667, "num_input_tokens_seen": 19404608, "step": 19320 }, { "epoch": 10.246553552492047, "grad_norm": 12.581483840942383, "learning_rate": 5.656071414153154e-06, "loss": 0.1122, "num_input_tokens_seen": 19409024, "step": 19325 }, { "epoch": 10.249204665959702, "grad_norm": 27.421384811401367, "learning_rate": 5.653777813798049e-06, "loss": 0.1626, "num_input_tokens_seen": 19414688, "step": 19330 }, { "epoch": 10.251855779427359, "grad_norm": 13.243912696838379, "learning_rate": 5.651484073470823e-06, "loss": 0.1399, "num_input_tokens_seen": 19418624, "step": 19335 }, { "epoch": 10.254506892895016, "grad_norm": 16.861650466918945, "learning_rate": 5.649190193662562e-06, "loss": 0.064, "num_input_tokens_seen": 19422912, "step": 19340 }, { "epoch": 10.257158006362673, "grad_norm": 7.277976036071777, "learning_rate": 5.64689617486438e-06, "loss": 0.212, "num_input_tokens_seen": 19427360, "step": 19345 }, { "epoch": 10.25980911983033, "grad_norm": 8.223973274230957, "learning_rate": 5.64460201756742e-06, "loss": 0.0873, "num_input_tokens_seen": 19432256, "step": 19350 }, { "epoch": 10.262460233297984, "grad_norm": 1.4507458209991455, "learning_rate": 5.642307722262853e-06, "loss": 0.0468, "num_input_tokens_seen": 19438144, "step": 19355 }, { "epoch": 10.265111346765641, "grad_norm": 37.03429412841797, "learning_rate": 5.640013289441885e-06, "loss": 0.2895, "num_input_tokens_seen": 19443616, "step": 19360 }, { "epoch": 10.267762460233298, "grad_norm": 2.918020009994507, "learning_rate": 5.637718719595746e-06, "loss": 0.0958, "num_input_tokens_seen": 19448256, "step": 19365 }, { "epoch": 10.270413573700955, "grad_norm": 1.5117034912109375, "learning_rate": 5.635424013215698e-06, "loss": 0.0257, "num_input_tokens_seen": 19453216, "step": 19370 }, { "epoch": 10.273064687168612, "grad_norm": 3.4335503578186035, "learning_rate": 5.63312917079303e-06, "loss": 0.1808, "num_input_tokens_seen": 19458016, "step": 19375 }, { "epoch": 10.275715800636267, "grad_norm": 6.317764759063721, "learning_rate": 5.630834192819067e-06, "loss": 0.098, "num_input_tokens_seen": 19463200, "step": 19380 }, { "epoch": 10.278366914103923, "grad_norm": 18.68247413635254, "learning_rate": 5.628539079785152e-06, "loss": 0.0956, "num_input_tokens_seen": 19467200, "step": 19385 }, { "epoch": 10.28101802757158, "grad_norm": 47.430171966552734, "learning_rate": 5.626243832182663e-06, "loss": 0.1808, "num_input_tokens_seen": 19472608, "step": 19390 }, { "epoch": 10.283669141039237, "grad_norm": 17.873327255249023, "learning_rate": 5.62394845050301e-06, "loss": 0.1413, "num_input_tokens_seen": 19478432, "step": 19395 }, { "epoch": 10.286320254506894, "grad_norm": 2.973151206970215, "learning_rate": 5.621652935237627e-06, "loss": 0.1884, "num_input_tokens_seen": 19482592, "step": 19400 }, { "epoch": 10.288971367974549, "grad_norm": 6.675217628479004, "learning_rate": 5.619357286877977e-06, "loss": 0.0824, "num_input_tokens_seen": 19488192, "step": 19405 }, { "epoch": 10.291622481442205, "grad_norm": 17.290260314941406, "learning_rate": 5.617061505915551e-06, "loss": 0.1205, "num_input_tokens_seen": 19492672, "step": 19410 }, { "epoch": 10.294273594909862, "grad_norm": 21.355939865112305, "learning_rate": 5.614765592841872e-06, "loss": 0.1206, "num_input_tokens_seen": 19497984, "step": 19415 }, { "epoch": 10.296924708377519, "grad_norm": 13.722570419311523, "learning_rate": 5.612469548148489e-06, "loss": 0.088, "num_input_tokens_seen": 19503712, "step": 19420 }, { "epoch": 10.299575821845174, "grad_norm": 10.361557960510254, "learning_rate": 5.610173372326978e-06, "loss": 0.1361, "num_input_tokens_seen": 19509120, "step": 19425 }, { "epoch": 10.30222693531283, "grad_norm": 27.508323669433594, "learning_rate": 5.607877065868944e-06, "loss": 0.2022, "num_input_tokens_seen": 19513600, "step": 19430 }, { "epoch": 10.304878048780488, "grad_norm": 10.102245330810547, "learning_rate": 5.605580629266021e-06, "loss": 0.193, "num_input_tokens_seen": 19520128, "step": 19435 }, { "epoch": 10.307529162248144, "grad_norm": 8.08935832977295, "learning_rate": 5.603284063009871e-06, "loss": 0.2331, "num_input_tokens_seen": 19525408, "step": 19440 }, { "epoch": 10.310180275715801, "grad_norm": 18.08251953125, "learning_rate": 5.600987367592181e-06, "loss": 0.1245, "num_input_tokens_seen": 19529216, "step": 19445 }, { "epoch": 10.312831389183456, "grad_norm": 12.922357559204102, "learning_rate": 5.598690543504668e-06, "loss": 0.1511, "num_input_tokens_seen": 19534880, "step": 19450 }, { "epoch": 10.315482502651113, "grad_norm": 22.479522705078125, "learning_rate": 5.596393591239076e-06, "loss": 0.3077, "num_input_tokens_seen": 19539136, "step": 19455 }, { "epoch": 10.31813361611877, "grad_norm": 18.215179443359375, "learning_rate": 5.594096511287178e-06, "loss": 0.1369, "num_input_tokens_seen": 19544576, "step": 19460 }, { "epoch": 10.320784729586427, "grad_norm": 2.9547791481018066, "learning_rate": 5.591799304140771e-06, "loss": 0.2703, "num_input_tokens_seen": 19550208, "step": 19465 }, { "epoch": 10.323435843054083, "grad_norm": 31.36358642578125, "learning_rate": 5.589501970291681e-06, "loss": 0.3264, "num_input_tokens_seen": 19554400, "step": 19470 }, { "epoch": 10.326086956521738, "grad_norm": 3.2713654041290283, "learning_rate": 5.587204510231762e-06, "loss": 0.1845, "num_input_tokens_seen": 19558720, "step": 19475 }, { "epoch": 10.328738069989395, "grad_norm": 10.656766891479492, "learning_rate": 5.584906924452894e-06, "loss": 0.099, "num_input_tokens_seen": 19563776, "step": 19480 }, { "epoch": 10.331389183457052, "grad_norm": 6.4508514404296875, "learning_rate": 5.582609213446984e-06, "loss": 0.0935, "num_input_tokens_seen": 19569504, "step": 19485 }, { "epoch": 10.334040296924709, "grad_norm": 20.727216720581055, "learning_rate": 5.580311377705966e-06, "loss": 0.1835, "num_input_tokens_seen": 19574624, "step": 19490 }, { "epoch": 10.336691410392365, "grad_norm": 1.4906456470489502, "learning_rate": 5.5780134177218005e-06, "loss": 0.1462, "num_input_tokens_seen": 19579488, "step": 19495 }, { "epoch": 10.33934252386002, "grad_norm": 36.23921203613281, "learning_rate": 5.575715333986475e-06, "loss": 0.3495, "num_input_tokens_seen": 19584096, "step": 19500 }, { "epoch": 10.341993637327677, "grad_norm": 31.541305541992188, "learning_rate": 5.573417126992004e-06, "loss": 0.2214, "num_input_tokens_seen": 19588704, "step": 19505 }, { "epoch": 10.344644750795334, "grad_norm": 34.99803924560547, "learning_rate": 5.571118797230424e-06, "loss": 0.2581, "num_input_tokens_seen": 19595424, "step": 19510 }, { "epoch": 10.34729586426299, "grad_norm": 30.213640213012695, "learning_rate": 5.568820345193805e-06, "loss": 0.106, "num_input_tokens_seen": 19599872, "step": 19515 }, { "epoch": 10.349946977730648, "grad_norm": 18.498432159423828, "learning_rate": 5.566521771374241e-06, "loss": 0.0766, "num_input_tokens_seen": 19604544, "step": 19520 }, { "epoch": 10.352598091198303, "grad_norm": 15.63537883758545, "learning_rate": 5.564223076263845e-06, "loss": 0.1689, "num_input_tokens_seen": 19610368, "step": 19525 }, { "epoch": 10.35524920466596, "grad_norm": 3.5020089149475098, "learning_rate": 5.561924260354767e-06, "loss": 0.0813, "num_input_tokens_seen": 19614976, "step": 19530 }, { "epoch": 10.357900318133616, "grad_norm": 22.83772087097168, "learning_rate": 5.559625324139173e-06, "loss": 0.1885, "num_input_tokens_seen": 19619808, "step": 19535 }, { "epoch": 10.360551431601273, "grad_norm": 5.8670125007629395, "learning_rate": 5.557326268109262e-06, "loss": 0.0953, "num_input_tokens_seen": 19624608, "step": 19540 }, { "epoch": 10.36320254506893, "grad_norm": 11.255960464477539, "learning_rate": 5.5550270927572565e-06, "loss": 0.1562, "num_input_tokens_seen": 19629984, "step": 19545 }, { "epoch": 10.365853658536585, "grad_norm": 19.01799774169922, "learning_rate": 5.552727798575403e-06, "loss": 0.0658, "num_input_tokens_seen": 19635616, "step": 19550 }, { "epoch": 10.368504772004242, "grad_norm": 3.8414924144744873, "learning_rate": 5.550428386055974e-06, "loss": 0.045, "num_input_tokens_seen": 19640352, "step": 19555 }, { "epoch": 10.371155885471898, "grad_norm": 5.9574408531188965, "learning_rate": 5.548128855691266e-06, "loss": 0.1707, "num_input_tokens_seen": 19644704, "step": 19560 }, { "epoch": 10.373806998939555, "grad_norm": 8.129465103149414, "learning_rate": 5.545829207973605e-06, "loss": 0.0543, "num_input_tokens_seen": 19650048, "step": 19565 }, { "epoch": 10.37645811240721, "grad_norm": 17.484758377075195, "learning_rate": 5.5435294433953405e-06, "loss": 0.19, "num_input_tokens_seen": 19654496, "step": 19570 }, { "epoch": 10.379109225874867, "grad_norm": 7.782597064971924, "learning_rate": 5.541229562448844e-06, "loss": 0.1379, "num_input_tokens_seen": 19659328, "step": 19575 }, { "epoch": 10.381760339342524, "grad_norm": 24.927310943603516, "learning_rate": 5.538929565626513e-06, "loss": 0.1266, "num_input_tokens_seen": 19663712, "step": 19580 }, { "epoch": 10.38441145281018, "grad_norm": 29.08075523376465, "learning_rate": 5.5366294534207734e-06, "loss": 0.1218, "num_input_tokens_seen": 19667520, "step": 19585 }, { "epoch": 10.387062566277837, "grad_norm": 19.773595809936523, "learning_rate": 5.534329226324072e-06, "loss": 0.2595, "num_input_tokens_seen": 19672928, "step": 19590 }, { "epoch": 10.389713679745492, "grad_norm": 2.2185964584350586, "learning_rate": 5.532028884828882e-06, "loss": 0.092, "num_input_tokens_seen": 19678112, "step": 19595 }, { "epoch": 10.392364793213149, "grad_norm": 20.68547821044922, "learning_rate": 5.529728429427698e-06, "loss": 0.1089, "num_input_tokens_seen": 19682496, "step": 19600 }, { "epoch": 10.395015906680806, "grad_norm": 8.244166374206543, "learning_rate": 5.527427860613045e-06, "loss": 0.1198, "num_input_tokens_seen": 19687968, "step": 19605 }, { "epoch": 10.397667020148463, "grad_norm": 34.368900299072266, "learning_rate": 5.525127178877466e-06, "loss": 0.1968, "num_input_tokens_seen": 19693120, "step": 19610 }, { "epoch": 10.40031813361612, "grad_norm": 3.7985289096832275, "learning_rate": 5.522826384713534e-06, "loss": 0.066, "num_input_tokens_seen": 19698112, "step": 19615 }, { "epoch": 10.402969247083774, "grad_norm": 5.054315567016602, "learning_rate": 5.520525478613838e-06, "loss": 0.2145, "num_input_tokens_seen": 19703168, "step": 19620 }, { "epoch": 10.405620360551431, "grad_norm": 23.475915908813477, "learning_rate": 5.518224461071002e-06, "loss": 0.2861, "num_input_tokens_seen": 19707744, "step": 19625 }, { "epoch": 10.408271474019088, "grad_norm": 3.1017842292785645, "learning_rate": 5.515923332577665e-06, "loss": 0.0791, "num_input_tokens_seen": 19712544, "step": 19630 }, { "epoch": 10.410922587486745, "grad_norm": 2.3008272647857666, "learning_rate": 5.5136220936264914e-06, "loss": 0.2267, "num_input_tokens_seen": 19717760, "step": 19635 }, { "epoch": 10.413573700954402, "grad_norm": 4.10191011428833, "learning_rate": 5.511320744710171e-06, "loss": 0.0816, "num_input_tokens_seen": 19722048, "step": 19640 }, { "epoch": 10.416224814422057, "grad_norm": 6.261319637298584, "learning_rate": 5.5090192863214185e-06, "loss": 0.086, "num_input_tokens_seen": 19726304, "step": 19645 }, { "epoch": 10.418875927889713, "grad_norm": 7.988612651824951, "learning_rate": 5.506717718952968e-06, "loss": 0.2057, "num_input_tokens_seen": 19731488, "step": 19650 }, { "epoch": 10.42152704135737, "grad_norm": 17.388011932373047, "learning_rate": 5.50441604309758e-06, "loss": 0.1005, "num_input_tokens_seen": 19737440, "step": 19655 }, { "epoch": 10.424178154825027, "grad_norm": 13.75885009765625, "learning_rate": 5.502114259248038e-06, "loss": 0.0594, "num_input_tokens_seen": 19744000, "step": 19660 }, { "epoch": 10.426829268292684, "grad_norm": 26.966796875, "learning_rate": 5.499812367897146e-06, "loss": 0.2617, "num_input_tokens_seen": 19748640, "step": 19665 }, { "epoch": 10.429480381760339, "grad_norm": 34.47968292236328, "learning_rate": 5.497510369537734e-06, "loss": 0.2446, "num_input_tokens_seen": 19752736, "step": 19670 }, { "epoch": 10.432131495227996, "grad_norm": 5.338565826416016, "learning_rate": 5.495208264662654e-06, "loss": 0.1115, "num_input_tokens_seen": 19757248, "step": 19675 }, { "epoch": 10.434782608695652, "grad_norm": 5.319272041320801, "learning_rate": 5.4929060537647795e-06, "loss": 0.0606, "num_input_tokens_seen": 19763168, "step": 19680 }, { "epoch": 10.43743372216331, "grad_norm": 0.9963927865028381, "learning_rate": 5.490603737337011e-06, "loss": 0.1411, "num_input_tokens_seen": 19767904, "step": 19685 }, { "epoch": 10.440084835630966, "grad_norm": 4.172150611877441, "learning_rate": 5.488301315872263e-06, "loss": 0.1012, "num_input_tokens_seen": 19772192, "step": 19690 }, { "epoch": 10.442735949098621, "grad_norm": 1.4372581243515015, "learning_rate": 5.4859987898634826e-06, "loss": 0.0873, "num_input_tokens_seen": 19776896, "step": 19695 }, { "epoch": 10.445387062566278, "grad_norm": 4.359182357788086, "learning_rate": 5.483696159803633e-06, "loss": 0.1949, "num_input_tokens_seen": 19781056, "step": 19700 }, { "epoch": 10.448038176033934, "grad_norm": 23.00486946105957, "learning_rate": 5.481393426185701e-06, "loss": 0.227, "num_input_tokens_seen": 19785376, "step": 19705 }, { "epoch": 10.450689289501591, "grad_norm": 15.107956886291504, "learning_rate": 5.4790905895026965e-06, "loss": 0.2389, "num_input_tokens_seen": 19789888, "step": 19710 }, { "epoch": 10.453340402969246, "grad_norm": 11.475747108459473, "learning_rate": 5.476787650247649e-06, "loss": 0.1417, "num_input_tokens_seen": 19795232, "step": 19715 }, { "epoch": 10.455991516436903, "grad_norm": 4.058704853057861, "learning_rate": 5.474484608913615e-06, "loss": 0.0786, "num_input_tokens_seen": 19800544, "step": 19720 }, { "epoch": 10.45864262990456, "grad_norm": 2.6586222648620605, "learning_rate": 5.472181465993668e-06, "loss": 0.1438, "num_input_tokens_seen": 19805280, "step": 19725 }, { "epoch": 10.461293743372217, "grad_norm": 19.170146942138672, "learning_rate": 5.469878221980904e-06, "loss": 0.1253, "num_input_tokens_seen": 19810080, "step": 19730 }, { "epoch": 10.463944856839873, "grad_norm": 2.183380603790283, "learning_rate": 5.467574877368441e-06, "loss": 0.1219, "num_input_tokens_seen": 19814976, "step": 19735 }, { "epoch": 10.466595970307528, "grad_norm": 17.710783004760742, "learning_rate": 5.4652714326494216e-06, "loss": 0.1807, "num_input_tokens_seen": 19822400, "step": 19740 }, { "epoch": 10.469247083775185, "grad_norm": 9.070260047912598, "learning_rate": 5.462967888317007e-06, "loss": 0.0755, "num_input_tokens_seen": 19828256, "step": 19745 }, { "epoch": 10.471898197242842, "grad_norm": 2.604691743850708, "learning_rate": 5.460664244864378e-06, "loss": 0.0809, "num_input_tokens_seen": 19833344, "step": 19750 }, { "epoch": 10.474549310710499, "grad_norm": 4.01166296005249, "learning_rate": 5.458360502784739e-06, "loss": 0.1152, "num_input_tokens_seen": 19838240, "step": 19755 }, { "epoch": 10.477200424178156, "grad_norm": 2.6281487941741943, "learning_rate": 5.456056662571316e-06, "loss": 0.0835, "num_input_tokens_seen": 19844096, "step": 19760 }, { "epoch": 10.47985153764581, "grad_norm": 15.75788402557373, "learning_rate": 5.453752724717355e-06, "loss": 0.0935, "num_input_tokens_seen": 19849504, "step": 19765 }, { "epoch": 10.482502651113467, "grad_norm": 42.985694885253906, "learning_rate": 5.451448689716123e-06, "loss": 0.3345, "num_input_tokens_seen": 19855712, "step": 19770 }, { "epoch": 10.485153764581124, "grad_norm": 5.201847076416016, "learning_rate": 5.4491445580609086e-06, "loss": 0.0958, "num_input_tokens_seen": 19860096, "step": 19775 }, { "epoch": 10.487804878048781, "grad_norm": 19.244943618774414, "learning_rate": 5.446840330245018e-06, "loss": 0.3082, "num_input_tokens_seen": 19865600, "step": 19780 }, { "epoch": 10.490455991516438, "grad_norm": 6.047175407409668, "learning_rate": 5.444536006761784e-06, "loss": 0.0698, "num_input_tokens_seen": 19870848, "step": 19785 }, { "epoch": 10.493107104984093, "grad_norm": 28.782262802124023, "learning_rate": 5.442231588104553e-06, "loss": 0.121, "num_input_tokens_seen": 19875360, "step": 19790 }, { "epoch": 10.49575821845175, "grad_norm": 0.8659067749977112, "learning_rate": 5.439927074766697e-06, "loss": 0.156, "num_input_tokens_seen": 19880384, "step": 19795 }, { "epoch": 10.498409331919406, "grad_norm": 9.497617721557617, "learning_rate": 5.437622467241605e-06, "loss": 0.0931, "num_input_tokens_seen": 19884992, "step": 19800 }, { "epoch": 10.501060445387063, "grad_norm": 41.239166259765625, "learning_rate": 5.4353177660226875e-06, "loss": 0.2392, "num_input_tokens_seen": 19889568, "step": 19805 }, { "epoch": 10.503711558854718, "grad_norm": 17.03343963623047, "learning_rate": 5.433012971603374e-06, "loss": 0.1458, "num_input_tokens_seen": 19894816, "step": 19810 }, { "epoch": 10.506362672322375, "grad_norm": 11.252861022949219, "learning_rate": 5.430708084477119e-06, "loss": 0.2185, "num_input_tokens_seen": 19900192, "step": 19815 }, { "epoch": 10.509013785790032, "grad_norm": 5.564919471740723, "learning_rate": 5.42840310513739e-06, "loss": 0.03, "num_input_tokens_seen": 19906048, "step": 19820 }, { "epoch": 10.511664899257688, "grad_norm": 7.2812604904174805, "learning_rate": 5.426098034077676e-06, "loss": 0.249, "num_input_tokens_seen": 19910432, "step": 19825 }, { "epoch": 10.514316012725345, "grad_norm": 10.457470893859863, "learning_rate": 5.423792871791487e-06, "loss": 0.1208, "num_input_tokens_seen": 19914880, "step": 19830 }, { "epoch": 10.516967126193002, "grad_norm": 31.595678329467773, "learning_rate": 5.421487618772355e-06, "loss": 0.1344, "num_input_tokens_seen": 19920128, "step": 19835 }, { "epoch": 10.519618239660657, "grad_norm": 37.414878845214844, "learning_rate": 5.419182275513825e-06, "loss": 0.2636, "num_input_tokens_seen": 19925856, "step": 19840 }, { "epoch": 10.522269353128314, "grad_norm": 25.07284164428711, "learning_rate": 5.416876842509468e-06, "loss": 0.2026, "num_input_tokens_seen": 19931648, "step": 19845 }, { "epoch": 10.52492046659597, "grad_norm": 18.32428550720215, "learning_rate": 5.41457132025287e-06, "loss": 0.0705, "num_input_tokens_seen": 19936608, "step": 19850 }, { "epoch": 10.527571580063627, "grad_norm": 5.1493000984191895, "learning_rate": 5.412265709237637e-06, "loss": 0.0755, "num_input_tokens_seen": 19941984, "step": 19855 }, { "epoch": 10.530222693531282, "grad_norm": 0.3306381106376648, "learning_rate": 5.4099600099573945e-06, "loss": 0.094, "num_input_tokens_seen": 19945632, "step": 19860 }, { "epoch": 10.53287380699894, "grad_norm": 22.2292537689209, "learning_rate": 5.407654222905785e-06, "loss": 0.2073, "num_input_tokens_seen": 19949952, "step": 19865 }, { "epoch": 10.535524920466596, "grad_norm": 12.549044609069824, "learning_rate": 5.405348348576475e-06, "loss": 0.0918, "num_input_tokens_seen": 19954400, "step": 19870 }, { "epoch": 10.538176033934253, "grad_norm": 1.7654091119766235, "learning_rate": 5.403042387463144e-06, "loss": 0.1607, "num_input_tokens_seen": 19958464, "step": 19875 }, { "epoch": 10.54082714740191, "grad_norm": 2.3814823627471924, "learning_rate": 5.400736340059492e-06, "loss": 0.2121, "num_input_tokens_seen": 19963712, "step": 19880 }, { "epoch": 10.543478260869565, "grad_norm": 21.33064842224121, "learning_rate": 5.398430206859237e-06, "loss": 0.1123, "num_input_tokens_seen": 19968256, "step": 19885 }, { "epoch": 10.546129374337221, "grad_norm": 21.108423233032227, "learning_rate": 5.39612398835612e-06, "loss": 0.2731, "num_input_tokens_seen": 19972128, "step": 19890 }, { "epoch": 10.548780487804878, "grad_norm": 19.36099624633789, "learning_rate": 5.393817685043893e-06, "loss": 0.2803, "num_input_tokens_seen": 19977760, "step": 19895 }, { "epoch": 10.551431601272535, "grad_norm": 23.38446807861328, "learning_rate": 5.39151129741633e-06, "loss": 0.2687, "num_input_tokens_seen": 19982304, "step": 19900 }, { "epoch": 10.554082714740192, "grad_norm": 5.830677032470703, "learning_rate": 5.389204825967221e-06, "loss": 0.1585, "num_input_tokens_seen": 19986848, "step": 19905 }, { "epoch": 10.556733828207847, "grad_norm": 5.704689979553223, "learning_rate": 5.3868982711903785e-06, "loss": 0.1464, "num_input_tokens_seen": 19991776, "step": 19910 }, { "epoch": 10.559384941675503, "grad_norm": 22.146896362304688, "learning_rate": 5.3845916335796286e-06, "loss": 0.1652, "num_input_tokens_seen": 19997600, "step": 19915 }, { "epoch": 10.56203605514316, "grad_norm": 4.27588415145874, "learning_rate": 5.382284913628817e-06, "loss": 0.1646, "num_input_tokens_seen": 20002144, "step": 19920 }, { "epoch": 10.564687168610817, "grad_norm": 10.560737609863281, "learning_rate": 5.379978111831804e-06, "loss": 0.32, "num_input_tokens_seen": 20008160, "step": 19925 }, { "epoch": 10.567338282078474, "grad_norm": 14.325536727905273, "learning_rate": 5.377671228682473e-06, "loss": 0.0964, "num_input_tokens_seen": 20012448, "step": 19930 }, { "epoch": 10.569989395546129, "grad_norm": 26.51279067993164, "learning_rate": 5.37536426467472e-06, "loss": 0.202, "num_input_tokens_seen": 20016960, "step": 19935 }, { "epoch": 10.572640509013786, "grad_norm": 33.60559844970703, "learning_rate": 5.37305722030246e-06, "loss": 0.1755, "num_input_tokens_seen": 20021632, "step": 19940 }, { "epoch": 10.575291622481442, "grad_norm": 11.893566131591797, "learning_rate": 5.370750096059625e-06, "loss": 0.0634, "num_input_tokens_seen": 20026272, "step": 19945 }, { "epoch": 10.5779427359491, "grad_norm": 10.148799896240234, "learning_rate": 5.368442892440165e-06, "loss": 0.1374, "num_input_tokens_seen": 20033664, "step": 19950 }, { "epoch": 10.580593849416754, "grad_norm": 22.01934242248535, "learning_rate": 5.366135609938046e-06, "loss": 0.0962, "num_input_tokens_seen": 20040192, "step": 19955 }, { "epoch": 10.583244962884411, "grad_norm": 9.247483253479004, "learning_rate": 5.36382824904725e-06, "loss": 0.2071, "num_input_tokens_seen": 20044320, "step": 19960 }, { "epoch": 10.585896076352068, "grad_norm": 15.91799545288086, "learning_rate": 5.361520810261779e-06, "loss": 0.3152, "num_input_tokens_seen": 20048480, "step": 19965 }, { "epoch": 10.588547189819725, "grad_norm": 13.289165496826172, "learning_rate": 5.359213294075648e-06, "loss": 0.2565, "num_input_tokens_seen": 20053472, "step": 19970 }, { "epoch": 10.591198303287381, "grad_norm": 20.779678344726562, "learning_rate": 5.3569057009828905e-06, "loss": 0.1058, "num_input_tokens_seen": 20057632, "step": 19975 }, { "epoch": 10.593849416755036, "grad_norm": 9.342503547668457, "learning_rate": 5.354598031477556e-06, "loss": 0.1486, "num_input_tokens_seen": 20062432, "step": 19980 }, { "epoch": 10.596500530222693, "grad_norm": 20.83500099182129, "learning_rate": 5.352290286053712e-06, "loss": 0.1247, "num_input_tokens_seen": 20067424, "step": 19985 }, { "epoch": 10.59915164369035, "grad_norm": 7.849559307098389, "learning_rate": 5.349982465205439e-06, "loss": 0.0629, "num_input_tokens_seen": 20073632, "step": 19990 }, { "epoch": 10.601802757158007, "grad_norm": 6.510061740875244, "learning_rate": 5.347674569426836e-06, "loss": 0.056, "num_input_tokens_seen": 20078688, "step": 19995 }, { "epoch": 10.604453870625663, "grad_norm": 27.23178482055664, "learning_rate": 5.345366599212018e-06, "loss": 0.1122, "num_input_tokens_seen": 20083808, "step": 20000 }, { "epoch": 10.607104984093318, "grad_norm": 20.56533432006836, "learning_rate": 5.343058555055114e-06, "loss": 0.1336, "num_input_tokens_seen": 20089280, "step": 20005 }, { "epoch": 10.609756097560975, "grad_norm": 2.03434681892395, "learning_rate": 5.340750437450272e-06, "loss": 0.0649, "num_input_tokens_seen": 20094176, "step": 20010 }, { "epoch": 10.612407211028632, "grad_norm": 15.688400268554688, "learning_rate": 5.338442246891652e-06, "loss": 0.3016, "num_input_tokens_seen": 20099040, "step": 20015 }, { "epoch": 10.615058324496289, "grad_norm": 24.93320655822754, "learning_rate": 5.336133983873433e-06, "loss": 0.0872, "num_input_tokens_seen": 20103712, "step": 20020 }, { "epoch": 10.617709437963946, "grad_norm": 6.9354119300842285, "learning_rate": 5.333825648889808e-06, "loss": 0.0878, "num_input_tokens_seen": 20108352, "step": 20025 }, { "epoch": 10.6203605514316, "grad_norm": 4.378899097442627, "learning_rate": 5.331517242434985e-06, "loss": 0.1547, "num_input_tokens_seen": 20113568, "step": 20030 }, { "epoch": 10.623011664899257, "grad_norm": 13.795784950256348, "learning_rate": 5.3292087650031875e-06, "loss": 0.0945, "num_input_tokens_seen": 20118624, "step": 20035 }, { "epoch": 10.625662778366914, "grad_norm": 15.345702171325684, "learning_rate": 5.326900217088654e-06, "loss": 0.2189, "num_input_tokens_seen": 20123456, "step": 20040 }, { "epoch": 10.628313891834571, "grad_norm": 23.84792137145996, "learning_rate": 5.32459159918564e-06, "loss": 0.1812, "num_input_tokens_seen": 20128096, "step": 20045 }, { "epoch": 10.630965005302228, "grad_norm": 19.055727005004883, "learning_rate": 5.322282911788416e-06, "loss": 0.1495, "num_input_tokens_seen": 20133408, "step": 20050 }, { "epoch": 10.633616118769883, "grad_norm": 14.429389953613281, "learning_rate": 5.319974155391262e-06, "loss": 0.1763, "num_input_tokens_seen": 20137824, "step": 20055 }, { "epoch": 10.63626723223754, "grad_norm": 14.456567764282227, "learning_rate": 5.317665330488477e-06, "loss": 0.1396, "num_input_tokens_seen": 20142144, "step": 20060 }, { "epoch": 10.638918345705196, "grad_norm": 7.3633599281311035, "learning_rate": 5.315356437574379e-06, "loss": 0.1133, "num_input_tokens_seen": 20146496, "step": 20065 }, { "epoch": 10.641569459172853, "grad_norm": 7.442054271697998, "learning_rate": 5.313047477143294e-06, "loss": 0.1795, "num_input_tokens_seen": 20151520, "step": 20070 }, { "epoch": 10.64422057264051, "grad_norm": 1.6056815385818481, "learning_rate": 5.31073844968956e-06, "loss": 0.0282, "num_input_tokens_seen": 20156800, "step": 20075 }, { "epoch": 10.646871686108165, "grad_norm": 5.057211399078369, "learning_rate": 5.308429355707538e-06, "loss": 0.2139, "num_input_tokens_seen": 20161216, "step": 20080 }, { "epoch": 10.649522799575822, "grad_norm": 26.119976043701172, "learning_rate": 5.306120195691599e-06, "loss": 0.0762, "num_input_tokens_seen": 20165056, "step": 20085 }, { "epoch": 10.652173913043478, "grad_norm": 18.71857261657715, "learning_rate": 5.303810970136126e-06, "loss": 0.1457, "num_input_tokens_seen": 20170080, "step": 20090 }, { "epoch": 10.654825026511135, "grad_norm": 5.918149948120117, "learning_rate": 5.301501679535519e-06, "loss": 0.0555, "num_input_tokens_seen": 20175648, "step": 20095 }, { "epoch": 10.65747613997879, "grad_norm": 27.068599700927734, "learning_rate": 5.299192324384193e-06, "loss": 0.188, "num_input_tokens_seen": 20179648, "step": 20100 }, { "epoch": 10.660127253446447, "grad_norm": 24.641563415527344, "learning_rate": 5.296882905176571e-06, "loss": 0.1002, "num_input_tokens_seen": 20183840, "step": 20105 }, { "epoch": 10.662778366914104, "grad_norm": 21.94982147216797, "learning_rate": 5.294573422407098e-06, "loss": 0.0689, "num_input_tokens_seen": 20189376, "step": 20110 }, { "epoch": 10.66542948038176, "grad_norm": 26.253189086914062, "learning_rate": 5.292263876570224e-06, "loss": 0.1543, "num_input_tokens_seen": 20197152, "step": 20115 }, { "epoch": 10.668080593849417, "grad_norm": 24.923160552978516, "learning_rate": 5.289954268160421e-06, "loss": 0.1589, "num_input_tokens_seen": 20202656, "step": 20120 }, { "epoch": 10.670731707317072, "grad_norm": 32.092193603515625, "learning_rate": 5.287644597672167e-06, "loss": 0.1322, "num_input_tokens_seen": 20207488, "step": 20125 }, { "epoch": 10.67338282078473, "grad_norm": 6.663637161254883, "learning_rate": 5.285334865599956e-06, "loss": 0.2251, "num_input_tokens_seen": 20212032, "step": 20130 }, { "epoch": 10.676033934252386, "grad_norm": 9.064939498901367, "learning_rate": 5.283025072438299e-06, "loss": 0.1545, "num_input_tokens_seen": 20216960, "step": 20135 }, { "epoch": 10.678685047720043, "grad_norm": 3.6562395095825195, "learning_rate": 5.280715218681713e-06, "loss": 0.0721, "num_input_tokens_seen": 20222176, "step": 20140 }, { "epoch": 10.6813361611877, "grad_norm": 25.230894088745117, "learning_rate": 5.2784053048247356e-06, "loss": 0.1384, "num_input_tokens_seen": 20226816, "step": 20145 }, { "epoch": 10.683987274655355, "grad_norm": 2.6713576316833496, "learning_rate": 5.276095331361908e-06, "loss": 0.1596, "num_input_tokens_seen": 20231264, "step": 20150 }, { "epoch": 10.686638388123011, "grad_norm": 5.049556255340576, "learning_rate": 5.273785298787793e-06, "loss": 0.1371, "num_input_tokens_seen": 20235488, "step": 20155 }, { "epoch": 10.689289501590668, "grad_norm": 20.93416976928711, "learning_rate": 5.271475207596964e-06, "loss": 0.2597, "num_input_tokens_seen": 20240576, "step": 20160 }, { "epoch": 10.691940615058325, "grad_norm": 8.683208465576172, "learning_rate": 5.269165058284003e-06, "loss": 0.2002, "num_input_tokens_seen": 20245408, "step": 20165 }, { "epoch": 10.694591728525982, "grad_norm": 14.059590339660645, "learning_rate": 5.266854851343505e-06, "loss": 0.1305, "num_input_tokens_seen": 20250048, "step": 20170 }, { "epoch": 10.697242841993637, "grad_norm": 10.57894229888916, "learning_rate": 5.264544587270084e-06, "loss": 0.1408, "num_input_tokens_seen": 20255872, "step": 20175 }, { "epoch": 10.699893955461294, "grad_norm": 16.38808822631836, "learning_rate": 5.262234266558358e-06, "loss": 0.0814, "num_input_tokens_seen": 20261056, "step": 20180 }, { "epoch": 10.70254506892895, "grad_norm": 5.217442512512207, "learning_rate": 5.259923889702963e-06, "loss": 0.0996, "num_input_tokens_seen": 20267520, "step": 20185 }, { "epoch": 10.705196182396607, "grad_norm": 16.660158157348633, "learning_rate": 5.2576134571985406e-06, "loss": 0.1319, "num_input_tokens_seen": 20272608, "step": 20190 }, { "epoch": 10.707847295864262, "grad_norm": 24.549530029296875, "learning_rate": 5.255302969539753e-06, "loss": 0.13, "num_input_tokens_seen": 20277056, "step": 20195 }, { "epoch": 10.710498409331919, "grad_norm": 32.19437026977539, "learning_rate": 5.252992427221269e-06, "loss": 0.2477, "num_input_tokens_seen": 20282560, "step": 20200 }, { "epoch": 10.713149522799576, "grad_norm": 1.4419541358947754, "learning_rate": 5.2506818307377665e-06, "loss": 0.1958, "num_input_tokens_seen": 20287200, "step": 20205 }, { "epoch": 10.715800636267232, "grad_norm": 9.783110618591309, "learning_rate": 5.24837118058394e-06, "loss": 0.0765, "num_input_tokens_seen": 20292704, "step": 20210 }, { "epoch": 10.71845174973489, "grad_norm": 7.392271518707275, "learning_rate": 5.246060477254495e-06, "loss": 0.1486, "num_input_tokens_seen": 20298176, "step": 20215 }, { "epoch": 10.721102863202544, "grad_norm": 1.8960639238357544, "learning_rate": 5.243749721244144e-06, "loss": 0.2906, "num_input_tokens_seen": 20304064, "step": 20220 }, { "epoch": 10.723753976670201, "grad_norm": 36.15266418457031, "learning_rate": 5.241438913047616e-06, "loss": 0.1844, "num_input_tokens_seen": 20309184, "step": 20225 }, { "epoch": 10.726405090137858, "grad_norm": 24.216167449951172, "learning_rate": 5.239128053159649e-06, "loss": 0.1469, "num_input_tokens_seen": 20314176, "step": 20230 }, { "epoch": 10.729056203605515, "grad_norm": 5.308985233306885, "learning_rate": 5.236817142074991e-06, "loss": 0.0523, "num_input_tokens_seen": 20319392, "step": 20235 }, { "epoch": 10.731707317073171, "grad_norm": 7.5056376457214355, "learning_rate": 5.234506180288403e-06, "loss": 0.1817, "num_input_tokens_seen": 20325408, "step": 20240 }, { "epoch": 10.734358430540826, "grad_norm": 35.941707611083984, "learning_rate": 5.2321951682946556e-06, "loss": 0.1645, "num_input_tokens_seen": 20329408, "step": 20245 }, { "epoch": 10.737009544008483, "grad_norm": 6.620516300201416, "learning_rate": 5.22988410658853e-06, "loss": 0.0307, "num_input_tokens_seen": 20334464, "step": 20250 }, { "epoch": 10.73966065747614, "grad_norm": 2.842048406600952, "learning_rate": 5.227572995664819e-06, "loss": 0.155, "num_input_tokens_seen": 20339392, "step": 20255 }, { "epoch": 10.742311770943797, "grad_norm": 11.843693733215332, "learning_rate": 5.2252618360183245e-06, "loss": 0.0701, "num_input_tokens_seen": 20344672, "step": 20260 }, { "epoch": 10.744962884411454, "grad_norm": 23.051626205444336, "learning_rate": 5.22295062814386e-06, "loss": 0.2079, "num_input_tokens_seen": 20349632, "step": 20265 }, { "epoch": 10.747613997879109, "grad_norm": 23.185489654541016, "learning_rate": 5.22063937253625e-06, "loss": 0.1755, "num_input_tokens_seen": 20354240, "step": 20270 }, { "epoch": 10.750265111346765, "grad_norm": 4.077610492706299, "learning_rate": 5.218328069690329e-06, "loss": 0.0964, "num_input_tokens_seen": 20359360, "step": 20275 }, { "epoch": 10.752916224814422, "grad_norm": 12.607144355773926, "learning_rate": 5.216016720100939e-06, "loss": 0.1459, "num_input_tokens_seen": 20364224, "step": 20280 }, { "epoch": 10.755567338282079, "grad_norm": 21.5239315032959, "learning_rate": 5.213705324262932e-06, "loss": 0.1293, "num_input_tokens_seen": 20369056, "step": 20285 }, { "epoch": 10.758218451749736, "grad_norm": 10.918327331542969, "learning_rate": 5.211393882671177e-06, "loss": 0.1388, "num_input_tokens_seen": 20373856, "step": 20290 }, { "epoch": 10.76086956521739, "grad_norm": 4.039086818695068, "learning_rate": 5.209082395820544e-06, "loss": 0.1321, "num_input_tokens_seen": 20378272, "step": 20295 }, { "epoch": 10.763520678685047, "grad_norm": 1.164446473121643, "learning_rate": 5.206770864205918e-06, "loss": 0.1955, "num_input_tokens_seen": 20382784, "step": 20300 }, { "epoch": 10.766171792152704, "grad_norm": 1.5328402519226074, "learning_rate": 5.204459288322189e-06, "loss": 0.1461, "num_input_tokens_seen": 20387968, "step": 20305 }, { "epoch": 10.768822905620361, "grad_norm": 22.596893310546875, "learning_rate": 5.202147668664264e-06, "loss": 0.1677, "num_input_tokens_seen": 20395232, "step": 20310 }, { "epoch": 10.771474019088018, "grad_norm": 11.98953914642334, "learning_rate": 5.1998360057270505e-06, "loss": 0.1222, "num_input_tokens_seen": 20399520, "step": 20315 }, { "epoch": 10.774125132555673, "grad_norm": 37.43484878540039, "learning_rate": 5.197524300005471e-06, "loss": 0.3222, "num_input_tokens_seen": 20404736, "step": 20320 }, { "epoch": 10.77677624602333, "grad_norm": 4.511785507202148, "learning_rate": 5.195212551994455e-06, "loss": 0.0195, "num_input_tokens_seen": 20409984, "step": 20325 }, { "epoch": 10.779427359490986, "grad_norm": 0.566935122013092, "learning_rate": 5.192900762188944e-06, "loss": 0.0442, "num_input_tokens_seen": 20415072, "step": 20330 }, { "epoch": 10.782078472958643, "grad_norm": 21.477724075317383, "learning_rate": 5.190588931083884e-06, "loss": 0.1636, "num_input_tokens_seen": 20418624, "step": 20335 }, { "epoch": 10.784729586426298, "grad_norm": 4.632109642028809, "learning_rate": 5.188277059174234e-06, "loss": 0.1071, "num_input_tokens_seen": 20423840, "step": 20340 }, { "epoch": 10.787380699893955, "grad_norm": 9.911543846130371, "learning_rate": 5.1859651469549555e-06, "loss": 0.0932, "num_input_tokens_seen": 20430688, "step": 20345 }, { "epoch": 10.790031813361612, "grad_norm": 32.256507873535156, "learning_rate": 5.183653194921029e-06, "loss": 0.0703, "num_input_tokens_seen": 20436512, "step": 20350 }, { "epoch": 10.792682926829269, "grad_norm": 17.128389358520508, "learning_rate": 5.181341203567433e-06, "loss": 0.0966, "num_input_tokens_seen": 20441312, "step": 20355 }, { "epoch": 10.795334040296925, "grad_norm": 14.624302864074707, "learning_rate": 5.179029173389157e-06, "loss": 0.104, "num_input_tokens_seen": 20446784, "step": 20360 }, { "epoch": 10.79798515376458, "grad_norm": 5.289689540863037, "learning_rate": 5.176717104881206e-06, "loss": 0.1576, "num_input_tokens_seen": 20450848, "step": 20365 }, { "epoch": 10.800636267232237, "grad_norm": 18.323034286499023, "learning_rate": 5.174404998538585e-06, "loss": 0.2183, "num_input_tokens_seen": 20456576, "step": 20370 }, { "epoch": 10.803287380699894, "grad_norm": 4.777399063110352, "learning_rate": 5.17209285485631e-06, "loss": 0.1267, "num_input_tokens_seen": 20460672, "step": 20375 }, { "epoch": 10.80593849416755, "grad_norm": 17.577844619750977, "learning_rate": 5.169780674329404e-06, "loss": 0.2279, "num_input_tokens_seen": 20466080, "step": 20380 }, { "epoch": 10.808589607635207, "grad_norm": 11.684486389160156, "learning_rate": 5.167468457452901e-06, "loss": 0.1062, "num_input_tokens_seen": 20470976, "step": 20385 }, { "epoch": 10.811240721102862, "grad_norm": 13.691306114196777, "learning_rate": 5.165156204721841e-06, "loss": 0.1815, "num_input_tokens_seen": 20475552, "step": 20390 }, { "epoch": 10.81389183457052, "grad_norm": 33.745460510253906, "learning_rate": 5.162843916631267e-06, "loss": 0.2203, "num_input_tokens_seen": 20481152, "step": 20395 }, { "epoch": 10.816542948038176, "grad_norm": 22.511940002441406, "learning_rate": 5.160531593676238e-06, "loss": 0.278, "num_input_tokens_seen": 20485760, "step": 20400 }, { "epoch": 10.819194061505833, "grad_norm": 7.1110310554504395, "learning_rate": 5.158219236351815e-06, "loss": 0.0874, "num_input_tokens_seen": 20490464, "step": 20405 }, { "epoch": 10.82184517497349, "grad_norm": 20.99064064025879, "learning_rate": 5.155906845153067e-06, "loss": 0.1173, "num_input_tokens_seen": 20496032, "step": 20410 }, { "epoch": 10.824496288441145, "grad_norm": 39.07052230834961, "learning_rate": 5.15359442057507e-06, "loss": 0.1845, "num_input_tokens_seen": 20501120, "step": 20415 }, { "epoch": 10.827147401908801, "grad_norm": 0.32585710287094116, "learning_rate": 5.151281963112912e-06, "loss": 0.1221, "num_input_tokens_seen": 20505504, "step": 20420 }, { "epoch": 10.829798515376458, "grad_norm": 11.689288139343262, "learning_rate": 5.1489694732616805e-06, "loss": 0.071, "num_input_tokens_seen": 20512096, "step": 20425 }, { "epoch": 10.832449628844115, "grad_norm": 6.263365268707275, "learning_rate": 5.1466569515164745e-06, "loss": 0.063, "num_input_tokens_seen": 20517472, "step": 20430 }, { "epoch": 10.83510074231177, "grad_norm": 5.376230716705322, "learning_rate": 5.144344398372398e-06, "loss": 0.1783, "num_input_tokens_seen": 20521696, "step": 20435 }, { "epoch": 10.837751855779427, "grad_norm": 25.39902114868164, "learning_rate": 5.142031814324565e-06, "loss": 0.1794, "num_input_tokens_seen": 20527008, "step": 20440 }, { "epoch": 10.840402969247084, "grad_norm": 26.580684661865234, "learning_rate": 5.139719199868093e-06, "loss": 0.1865, "num_input_tokens_seen": 20532352, "step": 20445 }, { "epoch": 10.84305408271474, "grad_norm": 0.6630423665046692, "learning_rate": 5.137406555498105e-06, "loss": 0.1967, "num_input_tokens_seen": 20538624, "step": 20450 }, { "epoch": 10.845705196182397, "grad_norm": 1.5008832216262817, "learning_rate": 5.135093881709731e-06, "loss": 0.1427, "num_input_tokens_seen": 20543584, "step": 20455 }, { "epoch": 10.848356309650054, "grad_norm": 23.728239059448242, "learning_rate": 5.132781178998113e-06, "loss": 0.1345, "num_input_tokens_seen": 20548352, "step": 20460 }, { "epoch": 10.851007423117709, "grad_norm": 5.513129234313965, "learning_rate": 5.130468447858391e-06, "loss": 0.2538, "num_input_tokens_seen": 20554880, "step": 20465 }, { "epoch": 10.853658536585366, "grad_norm": 0.24844293296337128, "learning_rate": 5.128155688785717e-06, "loss": 0.0534, "num_input_tokens_seen": 20558976, "step": 20470 }, { "epoch": 10.856309650053023, "grad_norm": 13.43002700805664, "learning_rate": 5.1258429022752435e-06, "loss": 0.0895, "num_input_tokens_seen": 20563168, "step": 20475 }, { "epoch": 10.85896076352068, "grad_norm": 18.60541343688965, "learning_rate": 5.123530088822134e-06, "loss": 0.1471, "num_input_tokens_seen": 20567616, "step": 20480 }, { "epoch": 10.861611876988334, "grad_norm": 24.41878318786621, "learning_rate": 5.121217248921558e-06, "loss": 0.3069, "num_input_tokens_seen": 20574560, "step": 20485 }, { "epoch": 10.864262990455991, "grad_norm": 10.933459281921387, "learning_rate": 5.118904383068684e-06, "loss": 0.0504, "num_input_tokens_seen": 20579360, "step": 20490 }, { "epoch": 10.866914103923648, "grad_norm": 10.381598472595215, "learning_rate": 5.116591491758693e-06, "loss": 0.1133, "num_input_tokens_seen": 20584384, "step": 20495 }, { "epoch": 10.869565217391305, "grad_norm": 35.08940124511719, "learning_rate": 5.11427857548677e-06, "loss": 0.1366, "num_input_tokens_seen": 20590112, "step": 20500 }, { "epoch": 10.872216330858961, "grad_norm": 34.04245376586914, "learning_rate": 5.111965634748102e-06, "loss": 0.1165, "num_input_tokens_seen": 20595008, "step": 20505 }, { "epoch": 10.874867444326616, "grad_norm": 3.755695343017578, "learning_rate": 5.1096526700378836e-06, "loss": 0.0689, "num_input_tokens_seen": 20600672, "step": 20510 }, { "epoch": 10.877518557794273, "grad_norm": 1.2733193635940552, "learning_rate": 5.107339681851318e-06, "loss": 0.0605, "num_input_tokens_seen": 20607072, "step": 20515 }, { "epoch": 10.88016967126193, "grad_norm": 5.917698860168457, "learning_rate": 5.1050266706836045e-06, "loss": 0.1788, "num_input_tokens_seen": 20612992, "step": 20520 }, { "epoch": 10.882820784729587, "grad_norm": 25.56051254272461, "learning_rate": 5.1027136370299575e-06, "loss": 0.1659, "num_input_tokens_seen": 20617632, "step": 20525 }, { "epoch": 10.885471898197244, "grad_norm": 6.976589679718018, "learning_rate": 5.1004005813855895e-06, "loss": 0.1335, "num_input_tokens_seen": 20622720, "step": 20530 }, { "epoch": 10.888123011664899, "grad_norm": 37.06829833984375, "learning_rate": 5.098087504245719e-06, "loss": 0.2019, "num_input_tokens_seen": 20627264, "step": 20535 }, { "epoch": 10.890774125132555, "grad_norm": 10.349249839782715, "learning_rate": 5.095774406105572e-06, "loss": 0.2785, "num_input_tokens_seen": 20632192, "step": 20540 }, { "epoch": 10.893425238600212, "grad_norm": 1.8057496547698975, "learning_rate": 5.093461287460374e-06, "loss": 0.24, "num_input_tokens_seen": 20637248, "step": 20545 }, { "epoch": 10.896076352067869, "grad_norm": 6.927521228790283, "learning_rate": 5.091148148805358e-06, "loss": 0.1001, "num_input_tokens_seen": 20641600, "step": 20550 }, { "epoch": 10.898727465535526, "grad_norm": 4.761428356170654, "learning_rate": 5.088834990635764e-06, "loss": 0.0714, "num_input_tokens_seen": 20646304, "step": 20555 }, { "epoch": 10.90137857900318, "grad_norm": 3.5453248023986816, "learning_rate": 5.08652181344683e-06, "loss": 0.04, "num_input_tokens_seen": 20652192, "step": 20560 }, { "epoch": 10.904029692470838, "grad_norm": 2.307642698287964, "learning_rate": 5.084208617733802e-06, "loss": 0.2338, "num_input_tokens_seen": 20657056, "step": 20565 }, { "epoch": 10.906680805938494, "grad_norm": 11.49953556060791, "learning_rate": 5.081895403991929e-06, "loss": 0.0736, "num_input_tokens_seen": 20661984, "step": 20570 }, { "epoch": 10.909331919406151, "grad_norm": 12.845601081848145, "learning_rate": 5.079582172716464e-06, "loss": 0.1709, "num_input_tokens_seen": 20666176, "step": 20575 }, { "epoch": 10.911983032873806, "grad_norm": 49.15665817260742, "learning_rate": 5.077268924402665e-06, "loss": 0.2064, "num_input_tokens_seen": 20671200, "step": 20580 }, { "epoch": 10.914634146341463, "grad_norm": 5.088171482086182, "learning_rate": 5.0749556595457915e-06, "loss": 0.2295, "num_input_tokens_seen": 20677504, "step": 20585 }, { "epoch": 10.91728525980912, "grad_norm": 7.744271755218506, "learning_rate": 5.072642378641107e-06, "loss": 0.217, "num_input_tokens_seen": 20682912, "step": 20590 }, { "epoch": 10.919936373276776, "grad_norm": 3.1356496810913086, "learning_rate": 5.070329082183879e-06, "loss": 0.1595, "num_input_tokens_seen": 20687296, "step": 20595 }, { "epoch": 10.922587486744433, "grad_norm": 9.700809478759766, "learning_rate": 5.06801577066938e-06, "loss": 0.2144, "num_input_tokens_seen": 20692224, "step": 20600 }, { "epoch": 10.925238600212088, "grad_norm": 4.010022163391113, "learning_rate": 5.065702444592881e-06, "loss": 0.1124, "num_input_tokens_seen": 20697504, "step": 20605 }, { "epoch": 10.927889713679745, "grad_norm": 4.683710098266602, "learning_rate": 5.063389104449661e-06, "loss": 0.088, "num_input_tokens_seen": 20703136, "step": 20610 }, { "epoch": 10.930540827147402, "grad_norm": 27.468107223510742, "learning_rate": 5.061075750734999e-06, "loss": 0.0911, "num_input_tokens_seen": 20709632, "step": 20615 }, { "epoch": 10.933191940615059, "grad_norm": 8.32169246673584, "learning_rate": 5.05876238394418e-06, "loss": 0.2934, "num_input_tokens_seen": 20714016, "step": 20620 }, { "epoch": 10.935843054082715, "grad_norm": 49.402626037597656, "learning_rate": 5.056449004572488e-06, "loss": 0.3792, "num_input_tokens_seen": 20718816, "step": 20625 }, { "epoch": 10.93849416755037, "grad_norm": 17.07676124572754, "learning_rate": 5.054135613115212e-06, "loss": 0.1794, "num_input_tokens_seen": 20722880, "step": 20630 }, { "epoch": 10.941145281018027, "grad_norm": 31.38433074951172, "learning_rate": 5.051822210067642e-06, "loss": 0.3439, "num_input_tokens_seen": 20726816, "step": 20635 }, { "epoch": 10.943796394485684, "grad_norm": 20.82102394104004, "learning_rate": 5.049508795925073e-06, "loss": 0.1255, "num_input_tokens_seen": 20731744, "step": 20640 }, { "epoch": 10.94644750795334, "grad_norm": 16.5789737701416, "learning_rate": 5.047195371182799e-06, "loss": 0.1525, "num_input_tokens_seen": 20736576, "step": 20645 }, { "epoch": 10.949098621420998, "grad_norm": 39.75028610229492, "learning_rate": 5.044881936336122e-06, "loss": 0.1598, "num_input_tokens_seen": 20742304, "step": 20650 }, { "epoch": 10.951749734888653, "grad_norm": 24.000192642211914, "learning_rate": 5.042568491880338e-06, "loss": 0.1712, "num_input_tokens_seen": 20746976, "step": 20655 }, { "epoch": 10.95440084835631, "grad_norm": 25.6882266998291, "learning_rate": 5.040255038310751e-06, "loss": 0.1381, "num_input_tokens_seen": 20753152, "step": 20660 }, { "epoch": 10.957051961823966, "grad_norm": 4.274637699127197, "learning_rate": 5.037941576122667e-06, "loss": 0.07, "num_input_tokens_seen": 20758240, "step": 20665 }, { "epoch": 10.959703075291623, "grad_norm": 9.304180145263672, "learning_rate": 5.03562810581139e-06, "loss": 0.08, "num_input_tokens_seen": 20762976, "step": 20670 }, { "epoch": 10.96235418875928, "grad_norm": 21.66292953491211, "learning_rate": 5.033314627872229e-06, "loss": 0.3895, "num_input_tokens_seen": 20767424, "step": 20675 }, { "epoch": 10.965005302226935, "grad_norm": 28.72234535217285, "learning_rate": 5.031001142800494e-06, "loss": 0.1657, "num_input_tokens_seen": 20771904, "step": 20680 }, { "epoch": 10.967656415694591, "grad_norm": 15.894820213317871, "learning_rate": 5.028687651091494e-06, "loss": 0.1153, "num_input_tokens_seen": 20778336, "step": 20685 }, { "epoch": 10.970307529162248, "grad_norm": 2.79201602935791, "learning_rate": 5.026374153240544e-06, "loss": 0.0961, "num_input_tokens_seen": 20783104, "step": 20690 }, { "epoch": 10.972958642629905, "grad_norm": 50.249176025390625, "learning_rate": 5.024060649742957e-06, "loss": 0.4186, "num_input_tokens_seen": 20788128, "step": 20695 }, { "epoch": 10.975609756097562, "grad_norm": 1.7788447141647339, "learning_rate": 5.021747141094046e-06, "loss": 0.0735, "num_input_tokens_seen": 20793344, "step": 20700 }, { "epoch": 10.978260869565217, "grad_norm": 26.40123176574707, "learning_rate": 5.0194336277891305e-06, "loss": 0.1505, "num_input_tokens_seen": 20798496, "step": 20705 }, { "epoch": 10.980911983032874, "grad_norm": 6.519003391265869, "learning_rate": 5.017120110323527e-06, "loss": 0.0182, "num_input_tokens_seen": 20803904, "step": 20710 }, { "epoch": 10.98356309650053, "grad_norm": 0.8110318779945374, "learning_rate": 5.0148065891925525e-06, "loss": 0.0814, "num_input_tokens_seen": 20809728, "step": 20715 }, { "epoch": 10.986214209968187, "grad_norm": 1.477117657661438, "learning_rate": 5.0124930648915235e-06, "loss": 0.1737, "num_input_tokens_seen": 20815200, "step": 20720 }, { "epoch": 10.988865323435842, "grad_norm": 35.228668212890625, "learning_rate": 5.010179537915765e-06, "loss": 0.214, "num_input_tokens_seen": 20820864, "step": 20725 }, { "epoch": 10.991516436903499, "grad_norm": 19.187580108642578, "learning_rate": 5.007866008760593e-06, "loss": 0.338, "num_input_tokens_seen": 20824960, "step": 20730 }, { "epoch": 10.994167550371156, "grad_norm": 1.4879233837127686, "learning_rate": 5.005552477921328e-06, "loss": 0.2322, "num_input_tokens_seen": 20829920, "step": 20735 }, { "epoch": 10.996818663838813, "grad_norm": 2.7316644191741943, "learning_rate": 5.003238945893293e-06, "loss": 0.2039, "num_input_tokens_seen": 20836224, "step": 20740 }, { "epoch": 10.99946977730647, "grad_norm": 3.0861616134643555, "learning_rate": 5.0009254131718075e-06, "loss": 0.0536, "num_input_tokens_seen": 20842048, "step": 20745 }, { "epoch": 11.002120890774124, "grad_norm": 10.108379364013672, "learning_rate": 4.998611880252196e-06, "loss": 0.0365, "num_input_tokens_seen": 20846320, "step": 20750 }, { "epoch": 11.004772004241781, "grad_norm": 12.423861503601074, "learning_rate": 4.996298347629774e-06, "loss": 0.0769, "num_input_tokens_seen": 20851184, "step": 20755 }, { "epoch": 11.007423117709438, "grad_norm": 5.932409763336182, "learning_rate": 4.993984815799868e-06, "loss": 0.0948, "num_input_tokens_seen": 20856976, "step": 20760 }, { "epoch": 11.010074231177095, "grad_norm": 17.68355369567871, "learning_rate": 4.991671285257796e-06, "loss": 0.0653, "num_input_tokens_seen": 20861744, "step": 20765 }, { "epoch": 11.012725344644752, "grad_norm": 2.4897215366363525, "learning_rate": 4.989357756498882e-06, "loss": 0.0316, "num_input_tokens_seen": 20866224, "step": 20770 }, { "epoch": 11.015376458112407, "grad_norm": 12.028095245361328, "learning_rate": 4.987044230018442e-06, "loss": 0.0578, "num_input_tokens_seen": 20871568, "step": 20775 }, { "epoch": 11.018027571580063, "grad_norm": 17.808908462524414, "learning_rate": 4.9847307063118006e-06, "loss": 0.1542, "num_input_tokens_seen": 20876240, "step": 20780 }, { "epoch": 11.02067868504772, "grad_norm": 9.206863403320312, "learning_rate": 4.9824171858742734e-06, "loss": 0.0811, "num_input_tokens_seen": 20881648, "step": 20785 }, { "epoch": 11.023329798515377, "grad_norm": 2.448005437850952, "learning_rate": 4.980103669201183e-06, "loss": 0.073, "num_input_tokens_seen": 20886288, "step": 20790 }, { "epoch": 11.025980911983034, "grad_norm": 10.985525131225586, "learning_rate": 4.977790156787842e-06, "loss": 0.0965, "num_input_tokens_seen": 20890576, "step": 20795 }, { "epoch": 11.028632025450689, "grad_norm": 15.244794845581055, "learning_rate": 4.975476649129571e-06, "loss": 0.1049, "num_input_tokens_seen": 20895280, "step": 20800 }, { "epoch": 11.031283138918345, "grad_norm": 12.19970989227295, "learning_rate": 4.973163146721686e-06, "loss": 0.0649, "num_input_tokens_seen": 20900656, "step": 20805 }, { "epoch": 11.033934252386002, "grad_norm": 7.918367385864258, "learning_rate": 4.970849650059502e-06, "loss": 0.0739, "num_input_tokens_seen": 20904560, "step": 20810 }, { "epoch": 11.036585365853659, "grad_norm": 9.928897857666016, "learning_rate": 4.9685361596383295e-06, "loss": 0.0454, "num_input_tokens_seen": 20909104, "step": 20815 }, { "epoch": 11.039236479321316, "grad_norm": 9.858799934387207, "learning_rate": 4.966222675953484e-06, "loss": 0.1038, "num_input_tokens_seen": 20914544, "step": 20820 }, { "epoch": 11.04188759278897, "grad_norm": 1.8394056558609009, "learning_rate": 4.963909199500275e-06, "loss": 0.0912, "num_input_tokens_seen": 20918928, "step": 20825 }, { "epoch": 11.044538706256628, "grad_norm": 0.9974989891052246, "learning_rate": 4.961595730774013e-06, "loss": 0.0994, "num_input_tokens_seen": 20923600, "step": 20830 }, { "epoch": 11.047189819724284, "grad_norm": 38.480079650878906, "learning_rate": 4.959282270270004e-06, "loss": 0.0988, "num_input_tokens_seen": 20928944, "step": 20835 }, { "epoch": 11.049840933191941, "grad_norm": 11.472413063049316, "learning_rate": 4.956968818483553e-06, "loss": 0.0892, "num_input_tokens_seen": 20935568, "step": 20840 }, { "epoch": 11.052492046659596, "grad_norm": 2.664863109588623, "learning_rate": 4.954655375909968e-06, "loss": 0.0401, "num_input_tokens_seen": 20940976, "step": 20845 }, { "epoch": 11.055143160127253, "grad_norm": 21.030548095703125, "learning_rate": 4.952341943044546e-06, "loss": 0.0403, "num_input_tokens_seen": 20945520, "step": 20850 }, { "epoch": 11.05779427359491, "grad_norm": 14.696815490722656, "learning_rate": 4.95002852038259e-06, "loss": 0.0342, "num_input_tokens_seen": 20949968, "step": 20855 }, { "epoch": 11.060445387062567, "grad_norm": 31.22821044921875, "learning_rate": 4.947715108419397e-06, "loss": 0.0961, "num_input_tokens_seen": 20954448, "step": 20860 }, { "epoch": 11.063096500530223, "grad_norm": 7.018080711364746, "learning_rate": 4.9454017076502634e-06, "loss": 0.042, "num_input_tokens_seen": 20958864, "step": 20865 }, { "epoch": 11.065747613997878, "grad_norm": 7.273893356323242, "learning_rate": 4.9430883185704796e-06, "loss": 0.0672, "num_input_tokens_seen": 20963344, "step": 20870 }, { "epoch": 11.068398727465535, "grad_norm": 5.74468994140625, "learning_rate": 4.940774941675338e-06, "loss": 0.0272, "num_input_tokens_seen": 20968400, "step": 20875 }, { "epoch": 11.071049840933192, "grad_norm": 11.683055877685547, "learning_rate": 4.938461577460127e-06, "loss": 0.019, "num_input_tokens_seen": 20972944, "step": 20880 }, { "epoch": 11.073700954400849, "grad_norm": 37.56507110595703, "learning_rate": 4.936148226420133e-06, "loss": 0.1429, "num_input_tokens_seen": 20978096, "step": 20885 }, { "epoch": 11.076352067868505, "grad_norm": 22.46236228942871, "learning_rate": 4.933834889050634e-06, "loss": 0.074, "num_input_tokens_seen": 20982576, "step": 20890 }, { "epoch": 11.07900318133616, "grad_norm": 1.6853764057159424, "learning_rate": 4.931521565846914e-06, "loss": 0.0266, "num_input_tokens_seen": 20986672, "step": 20895 }, { "epoch": 11.081654294803817, "grad_norm": 28.491640090942383, "learning_rate": 4.929208257304245e-06, "loss": 0.2211, "num_input_tokens_seen": 20991216, "step": 20900 }, { "epoch": 11.084305408271474, "grad_norm": 17.743196487426758, "learning_rate": 4.926894963917906e-06, "loss": 0.2162, "num_input_tokens_seen": 20995984, "step": 20905 }, { "epoch": 11.08695652173913, "grad_norm": 4.390822410583496, "learning_rate": 4.924581686183162e-06, "loss": 0.0626, "num_input_tokens_seen": 21000784, "step": 20910 }, { "epoch": 11.089607635206788, "grad_norm": 27.610958099365234, "learning_rate": 4.922268424595282e-06, "loss": 0.173, "num_input_tokens_seen": 21005776, "step": 20915 }, { "epoch": 11.092258748674443, "grad_norm": 16.25704574584961, "learning_rate": 4.919955179649528e-06, "loss": 0.1152, "num_input_tokens_seen": 21010576, "step": 20920 }, { "epoch": 11.0949098621421, "grad_norm": 5.217303276062012, "learning_rate": 4.917641951841163e-06, "loss": 0.0572, "num_input_tokens_seen": 21016080, "step": 20925 }, { "epoch": 11.097560975609756, "grad_norm": 8.108250617980957, "learning_rate": 4.9153287416654386e-06, "loss": 0.1655, "num_input_tokens_seen": 21020944, "step": 20930 }, { "epoch": 11.100212089077413, "grad_norm": 3.732548952102661, "learning_rate": 4.91301554961761e-06, "loss": 0.0544, "num_input_tokens_seen": 21025648, "step": 20935 }, { "epoch": 11.10286320254507, "grad_norm": 36.52517318725586, "learning_rate": 4.910702376192923e-06, "loss": 0.1767, "num_input_tokens_seen": 21031312, "step": 20940 }, { "epoch": 11.105514316012725, "grad_norm": 6.416959285736084, "learning_rate": 4.908389221886627e-06, "loss": 0.2047, "num_input_tokens_seen": 21035536, "step": 20945 }, { "epoch": 11.108165429480382, "grad_norm": 26.229251861572266, "learning_rate": 4.906076087193954e-06, "loss": 0.1511, "num_input_tokens_seen": 21042128, "step": 20950 }, { "epoch": 11.110816542948038, "grad_norm": 14.523839950561523, "learning_rate": 4.9037629726101495e-06, "loss": 0.0611, "num_input_tokens_seen": 21046960, "step": 20955 }, { "epoch": 11.113467656415695, "grad_norm": 1.4406794309616089, "learning_rate": 4.901449878630437e-06, "loss": 0.2172, "num_input_tokens_seen": 21051856, "step": 20960 }, { "epoch": 11.11611876988335, "grad_norm": 6.047017574310303, "learning_rate": 4.899136805750052e-06, "loss": 0.1658, "num_input_tokens_seen": 21056944, "step": 20965 }, { "epoch": 11.118769883351007, "grad_norm": 30.98756217956543, "learning_rate": 4.89682375446421e-06, "loss": 0.1174, "num_input_tokens_seen": 21061296, "step": 20970 }, { "epoch": 11.121420996818664, "grad_norm": 1.101090669631958, "learning_rate": 4.894510725268133e-06, "loss": 0.0644, "num_input_tokens_seen": 21066160, "step": 20975 }, { "epoch": 11.12407211028632, "grad_norm": 25.61544418334961, "learning_rate": 4.892197718657034e-06, "loss": 0.0952, "num_input_tokens_seen": 21071152, "step": 20980 }, { "epoch": 11.126723223753977, "grad_norm": 2.7282907962799072, "learning_rate": 4.889884735126119e-06, "loss": 0.0754, "num_input_tokens_seen": 21076496, "step": 20985 }, { "epoch": 11.129374337221632, "grad_norm": 5.494358539581299, "learning_rate": 4.887571775170597e-06, "loss": 0.0284, "num_input_tokens_seen": 21080880, "step": 20990 }, { "epoch": 11.132025450689289, "grad_norm": 2.196190357208252, "learning_rate": 4.885258839285662e-06, "loss": 0.0311, "num_input_tokens_seen": 21088400, "step": 20995 }, { "epoch": 11.134676564156946, "grad_norm": 8.57479190826416, "learning_rate": 4.88294592796651e-06, "loss": 0.0656, "num_input_tokens_seen": 21093968, "step": 21000 }, { "epoch": 11.137327677624603, "grad_norm": 24.469297409057617, "learning_rate": 4.880633041708327e-06, "loss": 0.0845, "num_input_tokens_seen": 21100144, "step": 21005 }, { "epoch": 11.13997879109226, "grad_norm": 2.306950807571411, "learning_rate": 4.8783201810063e-06, "loss": 0.1036, "num_input_tokens_seen": 21105424, "step": 21010 }, { "epoch": 11.142629904559914, "grad_norm": 0.38249796628952026, "learning_rate": 4.876007346355602e-06, "loss": 0.1434, "num_input_tokens_seen": 21110512, "step": 21015 }, { "epoch": 11.145281018027571, "grad_norm": 3.1653478145599365, "learning_rate": 4.873694538251407e-06, "loss": 0.053, "num_input_tokens_seen": 21115760, "step": 21020 }, { "epoch": 11.147932131495228, "grad_norm": 19.51761245727539, "learning_rate": 4.87138175718888e-06, "loss": 0.0466, "num_input_tokens_seen": 21122256, "step": 21025 }, { "epoch": 11.150583244962885, "grad_norm": 6.662445545196533, "learning_rate": 4.869069003663185e-06, "loss": 0.0594, "num_input_tokens_seen": 21128240, "step": 21030 }, { "epoch": 11.153234358430542, "grad_norm": 20.4639892578125, "learning_rate": 4.866756278169471e-06, "loss": 0.0454, "num_input_tokens_seen": 21132176, "step": 21035 }, { "epoch": 11.155885471898197, "grad_norm": 4.95155668258667, "learning_rate": 4.864443581202891e-06, "loss": 0.0381, "num_input_tokens_seen": 21136272, "step": 21040 }, { "epoch": 11.158536585365853, "grad_norm": 0.9074049592018127, "learning_rate": 4.862130913258583e-06, "loss": 0.0744, "num_input_tokens_seen": 21140304, "step": 21045 }, { "epoch": 11.16118769883351, "grad_norm": 38.64006805419922, "learning_rate": 4.859818274831689e-06, "loss": 0.2605, "num_input_tokens_seen": 21145520, "step": 21050 }, { "epoch": 11.163838812301167, "grad_norm": 1.586613655090332, "learning_rate": 4.857505666417334e-06, "loss": 0.0161, "num_input_tokens_seen": 21150160, "step": 21055 }, { "epoch": 11.166489925768824, "grad_norm": 0.7756893038749695, "learning_rate": 4.855193088510643e-06, "loss": 0.147, "num_input_tokens_seen": 21154896, "step": 21060 }, { "epoch": 11.169141039236479, "grad_norm": 22.509010314941406, "learning_rate": 4.852880541606732e-06, "loss": 0.1902, "num_input_tokens_seen": 21159632, "step": 21065 }, { "epoch": 11.171792152704136, "grad_norm": 0.5208498239517212, "learning_rate": 4.850568026200713e-06, "loss": 0.0699, "num_input_tokens_seen": 21164528, "step": 21070 }, { "epoch": 11.174443266171792, "grad_norm": 0.6770282983779907, "learning_rate": 4.848255542787689e-06, "loss": 0.0241, "num_input_tokens_seen": 21169968, "step": 21075 }, { "epoch": 11.177094379639449, "grad_norm": 0.42672833800315857, "learning_rate": 4.845943091862756e-06, "loss": 0.0172, "num_input_tokens_seen": 21174704, "step": 21080 }, { "epoch": 11.179745493107106, "grad_norm": 28.22241973876953, "learning_rate": 4.843630673921002e-06, "loss": 0.104, "num_input_tokens_seen": 21180496, "step": 21085 }, { "epoch": 11.18239660657476, "grad_norm": 1.1492503881454468, "learning_rate": 4.841318289457514e-06, "loss": 0.0927, "num_input_tokens_seen": 21185648, "step": 21090 }, { "epoch": 11.185047720042418, "grad_norm": 32.78813552856445, "learning_rate": 4.839005938967364e-06, "loss": 0.0581, "num_input_tokens_seen": 21189584, "step": 21095 }, { "epoch": 11.187698833510074, "grad_norm": 11.656672477722168, "learning_rate": 4.836693622945621e-06, "loss": 0.0522, "num_input_tokens_seen": 21194288, "step": 21100 }, { "epoch": 11.190349946977731, "grad_norm": 56.52348327636719, "learning_rate": 4.834381341887346e-06, "loss": 0.2145, "num_input_tokens_seen": 21198320, "step": 21105 }, { "epoch": 11.193001060445386, "grad_norm": 0.9931650757789612, "learning_rate": 4.8320690962875935e-06, "loss": 0.0611, "num_input_tokens_seen": 21205424, "step": 21110 }, { "epoch": 11.195652173913043, "grad_norm": 9.534041404724121, "learning_rate": 4.829756886641408e-06, "loss": 0.1881, "num_input_tokens_seen": 21211664, "step": 21115 }, { "epoch": 11.1983032873807, "grad_norm": 4.488645076751709, "learning_rate": 4.827444713443825e-06, "loss": 0.035, "num_input_tokens_seen": 21215760, "step": 21120 }, { "epoch": 11.200954400848357, "grad_norm": 1.529425024986267, "learning_rate": 4.825132577189881e-06, "loss": 0.0563, "num_input_tokens_seen": 21220464, "step": 21125 }, { "epoch": 11.203605514316013, "grad_norm": 10.738299369812012, "learning_rate": 4.822820478374592e-06, "loss": 0.1501, "num_input_tokens_seen": 21225296, "step": 21130 }, { "epoch": 11.206256627783668, "grad_norm": 1.583711862564087, "learning_rate": 4.820508417492975e-06, "loss": 0.0558, "num_input_tokens_seen": 21231120, "step": 21135 }, { "epoch": 11.208907741251325, "grad_norm": 15.942314147949219, "learning_rate": 4.818196395040036e-06, "loss": 0.0761, "num_input_tokens_seen": 21236336, "step": 21140 }, { "epoch": 11.211558854718982, "grad_norm": 12.662548065185547, "learning_rate": 4.8158844115107755e-06, "loss": 0.147, "num_input_tokens_seen": 21242160, "step": 21145 }, { "epoch": 11.214209968186639, "grad_norm": 24.82217788696289, "learning_rate": 4.813572467400178e-06, "loss": 0.0968, "num_input_tokens_seen": 21247184, "step": 21150 }, { "epoch": 11.216861081654296, "grad_norm": 3.6320340633392334, "learning_rate": 4.811260563203229e-06, "loss": 0.0694, "num_input_tokens_seen": 21252816, "step": 21155 }, { "epoch": 11.21951219512195, "grad_norm": 16.544233322143555, "learning_rate": 4.808948699414898e-06, "loss": 0.0548, "num_input_tokens_seen": 21257328, "step": 21160 }, { "epoch": 11.222163308589607, "grad_norm": 14.27189826965332, "learning_rate": 4.806636876530153e-06, "loss": 0.1674, "num_input_tokens_seen": 21263216, "step": 21165 }, { "epoch": 11.224814422057264, "grad_norm": 7.392589092254639, "learning_rate": 4.804325095043944e-06, "loss": 0.117, "num_input_tokens_seen": 21268400, "step": 21170 }, { "epoch": 11.22746553552492, "grad_norm": 4.46127462387085, "learning_rate": 4.8020133554512214e-06, "loss": 0.0408, "num_input_tokens_seen": 21273328, "step": 21175 }, { "epoch": 11.230116648992578, "grad_norm": 16.091583251953125, "learning_rate": 4.799701658246921e-06, "loss": 0.0815, "num_input_tokens_seen": 21279472, "step": 21180 }, { "epoch": 11.232767762460233, "grad_norm": 29.14113998413086, "learning_rate": 4.797390003925973e-06, "loss": 0.1383, "num_input_tokens_seen": 21284528, "step": 21185 }, { "epoch": 11.23541887592789, "grad_norm": 52.68559265136719, "learning_rate": 4.795078392983293e-06, "loss": 0.0599, "num_input_tokens_seen": 21288912, "step": 21190 }, { "epoch": 11.238069989395546, "grad_norm": 15.95315933227539, "learning_rate": 4.792766825913794e-06, "loss": 0.0508, "num_input_tokens_seen": 21294032, "step": 21195 }, { "epoch": 11.240721102863203, "grad_norm": 5.57429838180542, "learning_rate": 4.7904553032123735e-06, "loss": 0.1703, "num_input_tokens_seen": 21298928, "step": 21200 }, { "epoch": 11.24337221633086, "grad_norm": 38.427364349365234, "learning_rate": 4.788143825373927e-06, "loss": 0.1359, "num_input_tokens_seen": 21303856, "step": 21205 }, { "epoch": 11.246023329798515, "grad_norm": 3.4994964599609375, "learning_rate": 4.785832392893331e-06, "loss": 0.049, "num_input_tokens_seen": 21307920, "step": 21210 }, { "epoch": 11.248674443266172, "grad_norm": 13.492942810058594, "learning_rate": 4.78352100626546e-06, "loss": 0.155, "num_input_tokens_seen": 21312688, "step": 21215 }, { "epoch": 11.251325556733828, "grad_norm": 28.13371467590332, "learning_rate": 4.7812096659851746e-06, "loss": 0.0934, "num_input_tokens_seen": 21317104, "step": 21220 }, { "epoch": 11.253976670201485, "grad_norm": 4.103416442871094, "learning_rate": 4.7788983725473295e-06, "loss": 0.0711, "num_input_tokens_seen": 21321744, "step": 21225 }, { "epoch": 11.25662778366914, "grad_norm": 4.0027174949646, "learning_rate": 4.776587126446761e-06, "loss": 0.0572, "num_input_tokens_seen": 21326704, "step": 21230 }, { "epoch": 11.259278897136797, "grad_norm": 11.723624229431152, "learning_rate": 4.774275928178306e-06, "loss": 0.0641, "num_input_tokens_seen": 21332432, "step": 21235 }, { "epoch": 11.261930010604454, "grad_norm": 10.437895774841309, "learning_rate": 4.771964778236783e-06, "loss": 0.2447, "num_input_tokens_seen": 21337168, "step": 21240 }, { "epoch": 11.26458112407211, "grad_norm": 1.1482374668121338, "learning_rate": 4.769653677117006e-06, "loss": 0.0319, "num_input_tokens_seen": 21342096, "step": 21245 }, { "epoch": 11.267232237539767, "grad_norm": 19.50579071044922, "learning_rate": 4.767342625313773e-06, "loss": 0.1317, "num_input_tokens_seen": 21346224, "step": 21250 }, { "epoch": 11.269883351007422, "grad_norm": 6.1737213134765625, "learning_rate": 4.765031623321874e-06, "loss": 0.1554, "num_input_tokens_seen": 21350960, "step": 21255 }, { "epoch": 11.27253446447508, "grad_norm": 25.031776428222656, "learning_rate": 4.762720671636092e-06, "loss": 0.0492, "num_input_tokens_seen": 21356560, "step": 21260 }, { "epoch": 11.275185577942736, "grad_norm": 17.218137741088867, "learning_rate": 4.760409770751192e-06, "loss": 0.1325, "num_input_tokens_seen": 21360944, "step": 21265 }, { "epoch": 11.277836691410393, "grad_norm": 40.82634735107422, "learning_rate": 4.758098921161933e-06, "loss": 0.1418, "num_input_tokens_seen": 21365136, "step": 21270 }, { "epoch": 11.28048780487805, "grad_norm": 0.5065979957580566, "learning_rate": 4.755788123363062e-06, "loss": 0.1807, "num_input_tokens_seen": 21369840, "step": 21275 }, { "epoch": 11.283138918345704, "grad_norm": 9.824331283569336, "learning_rate": 4.753477377849316e-06, "loss": 0.1785, "num_input_tokens_seen": 21375184, "step": 21280 }, { "epoch": 11.285790031813361, "grad_norm": 4.634650230407715, "learning_rate": 4.7511666851154155e-06, "loss": 0.168, "num_input_tokens_seen": 21382608, "step": 21285 }, { "epoch": 11.288441145281018, "grad_norm": 7.882034778594971, "learning_rate": 4.748856045656079e-06, "loss": 0.0793, "num_input_tokens_seen": 21386512, "step": 21290 }, { "epoch": 11.291092258748675, "grad_norm": 12.135456085205078, "learning_rate": 4.746545459966005e-06, "loss": 0.1019, "num_input_tokens_seen": 21391440, "step": 21295 }, { "epoch": 11.293743372216332, "grad_norm": 27.404531478881836, "learning_rate": 4.744234928539885e-06, "loss": 0.225, "num_input_tokens_seen": 21396592, "step": 21300 }, { "epoch": 11.296394485683987, "grad_norm": 11.597930908203125, "learning_rate": 4.7419244518723975e-06, "loss": 0.0752, "num_input_tokens_seen": 21402000, "step": 21305 }, { "epoch": 11.299045599151643, "grad_norm": 20.55582046508789, "learning_rate": 4.73961403045821e-06, "loss": 0.1057, "num_input_tokens_seen": 21408176, "step": 21310 }, { "epoch": 11.3016967126193, "grad_norm": 26.91362762451172, "learning_rate": 4.737303664791976e-06, "loss": 0.081, "num_input_tokens_seen": 21413104, "step": 21315 }, { "epoch": 11.304347826086957, "grad_norm": 12.992321014404297, "learning_rate": 4.734993355368342e-06, "loss": 0.1656, "num_input_tokens_seen": 21418352, "step": 21320 }, { "epoch": 11.306998939554614, "grad_norm": 4.4736328125, "learning_rate": 4.732683102681935e-06, "loss": 0.0709, "num_input_tokens_seen": 21422704, "step": 21325 }, { "epoch": 11.309650053022269, "grad_norm": 46.394840240478516, "learning_rate": 4.730372907227377e-06, "loss": 0.1077, "num_input_tokens_seen": 21427664, "step": 21330 }, { "epoch": 11.312301166489926, "grad_norm": 11.703520774841309, "learning_rate": 4.728062769499273e-06, "loss": 0.1164, "num_input_tokens_seen": 21431984, "step": 21335 }, { "epoch": 11.314952279957582, "grad_norm": 17.016845703125, "learning_rate": 4.7257526899922215e-06, "loss": 0.0496, "num_input_tokens_seen": 21437776, "step": 21340 }, { "epoch": 11.31760339342524, "grad_norm": 32.28733825683594, "learning_rate": 4.7234426692007985e-06, "loss": 0.2431, "num_input_tokens_seen": 21442576, "step": 21345 }, { "epoch": 11.320254506892894, "grad_norm": 3.4577512741088867, "learning_rate": 4.721132707619577e-06, "loss": 0.1037, "num_input_tokens_seen": 21447440, "step": 21350 }, { "epoch": 11.322905620360551, "grad_norm": 3.8506314754486084, "learning_rate": 4.718822805743113e-06, "loss": 0.0528, "num_input_tokens_seen": 21453072, "step": 21355 }, { "epoch": 11.325556733828208, "grad_norm": 29.199277877807617, "learning_rate": 4.716512964065952e-06, "loss": 0.1605, "num_input_tokens_seen": 21457552, "step": 21360 }, { "epoch": 11.328207847295864, "grad_norm": 5.334163188934326, "learning_rate": 4.714203183082622e-06, "loss": 0.0909, "num_input_tokens_seen": 21464560, "step": 21365 }, { "epoch": 11.330858960763521, "grad_norm": 1.5911953449249268, "learning_rate": 4.711893463287643e-06, "loss": 0.0588, "num_input_tokens_seen": 21469584, "step": 21370 }, { "epoch": 11.333510074231176, "grad_norm": 35.58081817626953, "learning_rate": 4.709583805175519e-06, "loss": 0.175, "num_input_tokens_seen": 21475536, "step": 21375 }, { "epoch": 11.336161187698833, "grad_norm": 9.294585227966309, "learning_rate": 4.707274209240745e-06, "loss": 0.135, "num_input_tokens_seen": 21480176, "step": 21380 }, { "epoch": 11.33881230116649, "grad_norm": 23.311342239379883, "learning_rate": 4.704964675977796e-06, "loss": 0.1633, "num_input_tokens_seen": 21484368, "step": 21385 }, { "epoch": 11.341463414634147, "grad_norm": 20.39995765686035, "learning_rate": 4.702655205881135e-06, "loss": 0.1193, "num_input_tokens_seen": 21490416, "step": 21390 }, { "epoch": 11.344114528101803, "grad_norm": 25.813682556152344, "learning_rate": 4.70034579944522e-06, "loss": 0.0639, "num_input_tokens_seen": 21495088, "step": 21395 }, { "epoch": 11.346765641569458, "grad_norm": 0.3348287045955658, "learning_rate": 4.698036457164483e-06, "loss": 0.0313, "num_input_tokens_seen": 21500368, "step": 21400 }, { "epoch": 11.349416755037115, "grad_norm": 23.187217712402344, "learning_rate": 4.695727179533351e-06, "loss": 0.0363, "num_input_tokens_seen": 21504304, "step": 21405 }, { "epoch": 11.352067868504772, "grad_norm": 25.54473114013672, "learning_rate": 4.693417967046232e-06, "loss": 0.1686, "num_input_tokens_seen": 21509840, "step": 21410 }, { "epoch": 11.354718981972429, "grad_norm": 22.60965347290039, "learning_rate": 4.6911088201975265e-06, "loss": 0.1903, "num_input_tokens_seen": 21514800, "step": 21415 }, { "epoch": 11.357370095440086, "grad_norm": 10.280037879943848, "learning_rate": 4.688799739481613e-06, "loss": 0.0194, "num_input_tokens_seen": 21520112, "step": 21420 }, { "epoch": 11.36002120890774, "grad_norm": 23.760913848876953, "learning_rate": 4.6864907253928616e-06, "loss": 0.1147, "num_input_tokens_seen": 21526064, "step": 21425 }, { "epoch": 11.362672322375397, "grad_norm": 6.169363975524902, "learning_rate": 4.684181778425624e-06, "loss": 0.0285, "num_input_tokens_seen": 21530992, "step": 21430 }, { "epoch": 11.365323435843054, "grad_norm": 1.245425820350647, "learning_rate": 4.681872899074243e-06, "loss": 0.0753, "num_input_tokens_seen": 21536144, "step": 21435 }, { "epoch": 11.367974549310711, "grad_norm": 24.54837417602539, "learning_rate": 4.6795640878330416e-06, "loss": 0.057, "num_input_tokens_seen": 21541264, "step": 21440 }, { "epoch": 11.370625662778368, "grad_norm": 7.5393595695495605, "learning_rate": 4.67725534519633e-06, "loss": 0.0981, "num_input_tokens_seen": 21546160, "step": 21445 }, { "epoch": 11.373276776246023, "grad_norm": 6.2610697746276855, "learning_rate": 4.674946671658403e-06, "loss": 0.1723, "num_input_tokens_seen": 21551056, "step": 21450 }, { "epoch": 11.37592788971368, "grad_norm": 0.9497334957122803, "learning_rate": 4.6726380677135466e-06, "loss": 0.1918, "num_input_tokens_seen": 21556752, "step": 21455 }, { "epoch": 11.378579003181336, "grad_norm": 2.7004847526550293, "learning_rate": 4.67032953385602e-06, "loss": 0.0484, "num_input_tokens_seen": 21563696, "step": 21460 }, { "epoch": 11.381230116648993, "grad_norm": 3.289774179458618, "learning_rate": 4.668021070580079e-06, "loss": 0.0638, "num_input_tokens_seen": 21568784, "step": 21465 }, { "epoch": 11.38388123011665, "grad_norm": 0.19248613715171814, "learning_rate": 4.6657126783799575e-06, "loss": 0.2358, "num_input_tokens_seen": 21573744, "step": 21470 }, { "epoch": 11.386532343584305, "grad_norm": 0.5147577524185181, "learning_rate": 4.663404357749878e-06, "loss": 0.096, "num_input_tokens_seen": 21578352, "step": 21475 }, { "epoch": 11.389183457051962, "grad_norm": 13.702170372009277, "learning_rate": 4.661096109184044e-06, "loss": 0.0687, "num_input_tokens_seen": 21583408, "step": 21480 }, { "epoch": 11.391834570519618, "grad_norm": 21.650102615356445, "learning_rate": 4.6587879331766465e-06, "loss": 0.0858, "num_input_tokens_seen": 21589040, "step": 21485 }, { "epoch": 11.394485683987275, "grad_norm": 3.6960809230804443, "learning_rate": 4.6564798302218575e-06, "loss": 0.0585, "num_input_tokens_seen": 21594448, "step": 21490 }, { "epoch": 11.39713679745493, "grad_norm": 31.8400821685791, "learning_rate": 4.6541718008138416e-06, "loss": 0.0728, "num_input_tokens_seen": 21598736, "step": 21495 }, { "epoch": 11.399787910922587, "grad_norm": 16.45880889892578, "learning_rate": 4.651863845446735e-06, "loss": 0.1366, "num_input_tokens_seen": 21603504, "step": 21500 }, { "epoch": 11.402439024390244, "grad_norm": 48.97816848754883, "learning_rate": 4.64955596461467e-06, "loss": 0.1735, "num_input_tokens_seen": 21610224, "step": 21505 }, { "epoch": 11.4050901378579, "grad_norm": 7.38187837600708, "learning_rate": 4.647248158811755e-06, "loss": 0.031, "num_input_tokens_seen": 21615056, "step": 21510 }, { "epoch": 11.407741251325557, "grad_norm": 3.192380666732788, "learning_rate": 4.644940428532087e-06, "loss": 0.0934, "num_input_tokens_seen": 21619248, "step": 21515 }, { "epoch": 11.410392364793212, "grad_norm": 37.979366302490234, "learning_rate": 4.642632774269743e-06, "loss": 0.0973, "num_input_tokens_seen": 21624784, "step": 21520 }, { "epoch": 11.41304347826087, "grad_norm": 23.65243911743164, "learning_rate": 4.640325196518789e-06, "loss": 0.2166, "num_input_tokens_seen": 21629136, "step": 21525 }, { "epoch": 11.415694591728526, "grad_norm": 2.832390069961548, "learning_rate": 4.638017695773269e-06, "loss": 0.0945, "num_input_tokens_seen": 21633648, "step": 21530 }, { "epoch": 11.418345705196183, "grad_norm": 2.492205858230591, "learning_rate": 4.635710272527211e-06, "loss": 0.0472, "num_input_tokens_seen": 21638640, "step": 21535 }, { "epoch": 11.42099681866384, "grad_norm": 34.63751983642578, "learning_rate": 4.6334029272746315e-06, "loss": 0.1705, "num_input_tokens_seen": 21643792, "step": 21540 }, { "epoch": 11.423647932131495, "grad_norm": 2.1022989749908447, "learning_rate": 4.6310956605095246e-06, "loss": 0.0686, "num_input_tokens_seen": 21649136, "step": 21545 }, { "epoch": 11.426299045599151, "grad_norm": 21.168487548828125, "learning_rate": 4.628788472725874e-06, "loss": 0.1822, "num_input_tokens_seen": 21655184, "step": 21550 }, { "epoch": 11.428950159066808, "grad_norm": 1.0244437456130981, "learning_rate": 4.6264813644176375e-06, "loss": 0.0661, "num_input_tokens_seen": 21659184, "step": 21555 }, { "epoch": 11.431601272534465, "grad_norm": 2.0563876628875732, "learning_rate": 4.624174336078765e-06, "loss": 0.0621, "num_input_tokens_seen": 21663888, "step": 21560 }, { "epoch": 11.434252386002122, "grad_norm": 8.791871070861816, "learning_rate": 4.6218673882031825e-06, "loss": 0.0339, "num_input_tokens_seen": 21668624, "step": 21565 }, { "epoch": 11.436903499469777, "grad_norm": 0.36395713686943054, "learning_rate": 4.6195605212848055e-06, "loss": 0.0878, "num_input_tokens_seen": 21674224, "step": 21570 }, { "epoch": 11.439554612937433, "grad_norm": 22.4663028717041, "learning_rate": 4.617253735817522e-06, "loss": 0.0353, "num_input_tokens_seen": 21678544, "step": 21575 }, { "epoch": 11.44220572640509, "grad_norm": 35.41714096069336, "learning_rate": 4.614947032295215e-06, "loss": 0.1978, "num_input_tokens_seen": 21682640, "step": 21580 }, { "epoch": 11.444856839872747, "grad_norm": 27.28754997253418, "learning_rate": 4.612640411211739e-06, "loss": 0.1956, "num_input_tokens_seen": 21688624, "step": 21585 }, { "epoch": 11.447507953340402, "grad_norm": 47.67634963989258, "learning_rate": 4.610333873060937e-06, "loss": 0.1038, "num_input_tokens_seen": 21693648, "step": 21590 }, { "epoch": 11.450159066808059, "grad_norm": 38.90577697753906, "learning_rate": 4.608027418336633e-06, "loss": 0.2849, "num_input_tokens_seen": 21699824, "step": 21595 }, { "epoch": 11.452810180275716, "grad_norm": 6.576326370239258, "learning_rate": 4.605721047532635e-06, "loss": 0.1267, "num_input_tokens_seen": 21705168, "step": 21600 }, { "epoch": 11.455461293743372, "grad_norm": 29.568531036376953, "learning_rate": 4.603414761142728e-06, "loss": 0.1764, "num_input_tokens_seen": 21709200, "step": 21605 }, { "epoch": 11.45811240721103, "grad_norm": 17.595487594604492, "learning_rate": 4.601108559660683e-06, "loss": 0.1092, "num_input_tokens_seen": 21713840, "step": 21610 }, { "epoch": 11.460763520678684, "grad_norm": 20.731727600097656, "learning_rate": 4.59880244358025e-06, "loss": 0.1039, "num_input_tokens_seen": 21718992, "step": 21615 }, { "epoch": 11.463414634146341, "grad_norm": 49.02792739868164, "learning_rate": 4.5964964133951675e-06, "loss": 0.3673, "num_input_tokens_seen": 21724784, "step": 21620 }, { "epoch": 11.466065747613998, "grad_norm": 9.19880199432373, "learning_rate": 4.594190469599144e-06, "loss": 0.0687, "num_input_tokens_seen": 21729744, "step": 21625 }, { "epoch": 11.468716861081655, "grad_norm": 20.114660263061523, "learning_rate": 4.59188461268588e-06, "loss": 0.0926, "num_input_tokens_seen": 21735600, "step": 21630 }, { "epoch": 11.471367974549311, "grad_norm": 5.773331165313721, "learning_rate": 4.589578843149052e-06, "loss": 0.049, "num_input_tokens_seen": 21739920, "step": 21635 }, { "epoch": 11.474019088016966, "grad_norm": 6.868895530700684, "learning_rate": 4.587273161482321e-06, "loss": 0.094, "num_input_tokens_seen": 21745904, "step": 21640 }, { "epoch": 11.476670201484623, "grad_norm": 3.3970305919647217, "learning_rate": 4.584967568179325e-06, "loss": 0.1415, "num_input_tokens_seen": 21750928, "step": 21645 }, { "epoch": 11.47932131495228, "grad_norm": 6.490922927856445, "learning_rate": 4.5826620637336876e-06, "loss": 0.0213, "num_input_tokens_seen": 21756080, "step": 21650 }, { "epoch": 11.481972428419937, "grad_norm": 4.497189998626709, "learning_rate": 4.580356648639008e-06, "loss": 0.1078, "num_input_tokens_seen": 21761136, "step": 21655 }, { "epoch": 11.484623541887593, "grad_norm": 40.97780990600586, "learning_rate": 4.578051323388875e-06, "loss": 0.3402, "num_input_tokens_seen": 21766224, "step": 21660 }, { "epoch": 11.487274655355248, "grad_norm": 41.0826416015625, "learning_rate": 4.575746088476849e-06, "loss": 0.1536, "num_input_tokens_seen": 21770960, "step": 21665 }, { "epoch": 11.489925768822905, "grad_norm": 12.25953483581543, "learning_rate": 4.573440944396473e-06, "loss": 0.0638, "num_input_tokens_seen": 21775120, "step": 21670 }, { "epoch": 11.492576882290562, "grad_norm": 4.0581536293029785, "learning_rate": 4.571135891641277e-06, "loss": 0.078, "num_input_tokens_seen": 21779536, "step": 21675 }, { "epoch": 11.495227995758219, "grad_norm": 1.9565503597259521, "learning_rate": 4.568830930704762e-06, "loss": 0.035, "num_input_tokens_seen": 21783792, "step": 21680 }, { "epoch": 11.497879109225876, "grad_norm": 2.9864399433135986, "learning_rate": 4.566526062080418e-06, "loss": 0.0232, "num_input_tokens_seen": 21788784, "step": 21685 }, { "epoch": 11.50053022269353, "grad_norm": 9.592079162597656, "learning_rate": 4.564221286261709e-06, "loss": 0.0659, "num_input_tokens_seen": 21793264, "step": 21690 }, { "epoch": 11.503181336161187, "grad_norm": 33.89397430419922, "learning_rate": 4.561916603742084e-06, "loss": 0.1793, "num_input_tokens_seen": 21798736, "step": 21695 }, { "epoch": 11.505832449628844, "grad_norm": 0.2765964865684509, "learning_rate": 4.559612015014967e-06, "loss": 0.1831, "num_input_tokens_seen": 21803632, "step": 21700 }, { "epoch": 11.508483563096501, "grad_norm": 14.439413070678711, "learning_rate": 4.557307520573765e-06, "loss": 0.0901, "num_input_tokens_seen": 21808272, "step": 21705 }, { "epoch": 11.511134676564158, "grad_norm": 0.2315121740102768, "learning_rate": 4.555003120911863e-06, "loss": 0.1077, "num_input_tokens_seen": 21814512, "step": 21710 }, { "epoch": 11.513785790031813, "grad_norm": 14.250540733337402, "learning_rate": 4.552698816522631e-06, "loss": 0.1043, "num_input_tokens_seen": 21818864, "step": 21715 }, { "epoch": 11.51643690349947, "grad_norm": 28.841794967651367, "learning_rate": 4.55039460789941e-06, "loss": 0.1392, "num_input_tokens_seen": 21823952, "step": 21720 }, { "epoch": 11.519088016967126, "grad_norm": 34.62331771850586, "learning_rate": 4.548090495535528e-06, "loss": 0.1477, "num_input_tokens_seen": 21828624, "step": 21725 }, { "epoch": 11.521739130434783, "grad_norm": 0.6770429015159607, "learning_rate": 4.545786479924287e-06, "loss": 0.1836, "num_input_tokens_seen": 21833424, "step": 21730 }, { "epoch": 11.524390243902438, "grad_norm": 11.568171501159668, "learning_rate": 4.543482561558974e-06, "loss": 0.1108, "num_input_tokens_seen": 21837520, "step": 21735 }, { "epoch": 11.527041357370095, "grad_norm": 30.724624633789062, "learning_rate": 4.541178740932849e-06, "loss": 0.1832, "num_input_tokens_seen": 21844080, "step": 21740 }, { "epoch": 11.529692470837752, "grad_norm": 5.292746543884277, "learning_rate": 4.538875018539155e-06, "loss": 0.049, "num_input_tokens_seen": 21848976, "step": 21745 }, { "epoch": 11.532343584305409, "grad_norm": 4.937493324279785, "learning_rate": 4.536571394871112e-06, "loss": 0.1044, "num_input_tokens_seen": 21853616, "step": 21750 }, { "epoch": 11.534994697773065, "grad_norm": 3.162432909011841, "learning_rate": 4.534267870421922e-06, "loss": 0.0277, "num_input_tokens_seen": 21858192, "step": 21755 }, { "epoch": 11.53764581124072, "grad_norm": 26.433223724365234, "learning_rate": 4.531964445684761e-06, "loss": 0.0981, "num_input_tokens_seen": 21864048, "step": 21760 }, { "epoch": 11.540296924708377, "grad_norm": 7.535511016845703, "learning_rate": 4.529661121152789e-06, "loss": 0.0725, "num_input_tokens_seen": 21869392, "step": 21765 }, { "epoch": 11.542948038176034, "grad_norm": 0.807181715965271, "learning_rate": 4.527357897319138e-06, "loss": 0.1188, "num_input_tokens_seen": 21874192, "step": 21770 }, { "epoch": 11.54559915164369, "grad_norm": 22.308578491210938, "learning_rate": 4.525054774676928e-06, "loss": 0.0529, "num_input_tokens_seen": 21879152, "step": 21775 }, { "epoch": 11.548250265111347, "grad_norm": 0.5097073316574097, "learning_rate": 4.522751753719244e-06, "loss": 0.2724, "num_input_tokens_seen": 21884144, "step": 21780 }, { "epoch": 11.550901378579002, "grad_norm": 5.185466289520264, "learning_rate": 4.520448834939164e-06, "loss": 0.2791, "num_input_tokens_seen": 21889584, "step": 21785 }, { "epoch": 11.55355249204666, "grad_norm": 0.9201309680938721, "learning_rate": 4.5181460188297304e-06, "loss": 0.1008, "num_input_tokens_seen": 21894064, "step": 21790 }, { "epoch": 11.556203605514316, "grad_norm": 6.42110013961792, "learning_rate": 4.515843305883977e-06, "loss": 0.1002, "num_input_tokens_seen": 21899408, "step": 21795 }, { "epoch": 11.558854718981973, "grad_norm": 0.9368299841880798, "learning_rate": 4.513540696594902e-06, "loss": 0.0609, "num_input_tokens_seen": 21906896, "step": 21800 }, { "epoch": 11.56150583244963, "grad_norm": 2.1625006198883057, "learning_rate": 4.511238191455491e-06, "loss": 0.1266, "num_input_tokens_seen": 21912528, "step": 21805 }, { "epoch": 11.564156945917285, "grad_norm": 7.013222694396973, "learning_rate": 4.508935790958707e-06, "loss": 0.0455, "num_input_tokens_seen": 21916912, "step": 21810 }, { "epoch": 11.566808059384941, "grad_norm": 39.03867721557617, "learning_rate": 4.506633495597482e-06, "loss": 0.2537, "num_input_tokens_seen": 21921072, "step": 21815 }, { "epoch": 11.569459172852598, "grad_norm": 9.36975383758545, "learning_rate": 4.5043313058647365e-06, "loss": 0.0801, "num_input_tokens_seen": 21925488, "step": 21820 }, { "epoch": 11.572110286320255, "grad_norm": 1.8729320764541626, "learning_rate": 4.502029222253359e-06, "loss": 0.0566, "num_input_tokens_seen": 21930064, "step": 21825 }, { "epoch": 11.574761399787912, "grad_norm": 35.68826675415039, "learning_rate": 4.499727245256226e-06, "loss": 0.155, "num_input_tokens_seen": 21934960, "step": 21830 }, { "epoch": 11.577412513255567, "grad_norm": 1.3375567197799683, "learning_rate": 4.497425375366177e-06, "loss": 0.0515, "num_input_tokens_seen": 21939280, "step": 21835 }, { "epoch": 11.580063626723224, "grad_norm": 32.83678436279297, "learning_rate": 4.495123613076042e-06, "loss": 0.2307, "num_input_tokens_seen": 21943792, "step": 21840 }, { "epoch": 11.58271474019088, "grad_norm": 5.461807727813721, "learning_rate": 4.492821958878619e-06, "loss": 0.0238, "num_input_tokens_seen": 21949488, "step": 21845 }, { "epoch": 11.585365853658537, "grad_norm": 17.44470977783203, "learning_rate": 4.490520413266689e-06, "loss": 0.1449, "num_input_tokens_seen": 21955280, "step": 21850 }, { "epoch": 11.588016967126194, "grad_norm": 2.4998068809509277, "learning_rate": 4.4882189767330036e-06, "loss": 0.0781, "num_input_tokens_seen": 21960880, "step": 21855 }, { "epoch": 11.590668080593849, "grad_norm": 20.540483474731445, "learning_rate": 4.485917649770297e-06, "loss": 0.1333, "num_input_tokens_seen": 21965328, "step": 21860 }, { "epoch": 11.593319194061506, "grad_norm": 3.408811092376709, "learning_rate": 4.483616432871276e-06, "loss": 0.1261, "num_input_tokens_seen": 21970256, "step": 21865 }, { "epoch": 11.595970307529162, "grad_norm": 30.4432430267334, "learning_rate": 4.481315326528627e-06, "loss": 0.1271, "num_input_tokens_seen": 21975600, "step": 21870 }, { "epoch": 11.59862142099682, "grad_norm": 8.512913703918457, "learning_rate": 4.479014331235007e-06, "loss": 0.0406, "num_input_tokens_seen": 21980080, "step": 21875 }, { "epoch": 11.601272534464474, "grad_norm": 3.608048915863037, "learning_rate": 4.476713447483057e-06, "loss": 0.0377, "num_input_tokens_seen": 21985488, "step": 21880 }, { "epoch": 11.603923647932131, "grad_norm": 54.57777786254883, "learning_rate": 4.474412675765387e-06, "loss": 0.2129, "num_input_tokens_seen": 21990096, "step": 21885 }, { "epoch": 11.606574761399788, "grad_norm": 2.823711395263672, "learning_rate": 4.472112016574591e-06, "loss": 0.1435, "num_input_tokens_seen": 21995728, "step": 21890 }, { "epoch": 11.609225874867445, "grad_norm": 43.41755294799805, "learning_rate": 4.469811470403228e-06, "loss": 0.0955, "num_input_tokens_seen": 22000656, "step": 21895 }, { "epoch": 11.611876988335101, "grad_norm": 1.0103096961975098, "learning_rate": 4.4675110377438424e-06, "loss": 0.0481, "num_input_tokens_seen": 22005520, "step": 21900 }, { "epoch": 11.614528101802756, "grad_norm": 28.930034637451172, "learning_rate": 4.46521071908895e-06, "loss": 0.2264, "num_input_tokens_seen": 22011888, "step": 21905 }, { "epoch": 11.617179215270413, "grad_norm": 11.718111038208008, "learning_rate": 4.462910514931045e-06, "loss": 0.0481, "num_input_tokens_seen": 22016112, "step": 21910 }, { "epoch": 11.61983032873807, "grad_norm": 1.5789422988891602, "learning_rate": 4.460610425762591e-06, "loss": 0.0095, "num_input_tokens_seen": 22021264, "step": 21915 }, { "epoch": 11.622481442205727, "grad_norm": 7.7989044189453125, "learning_rate": 4.458310452076034e-06, "loss": 0.261, "num_input_tokens_seen": 22026096, "step": 21920 }, { "epoch": 11.625132555673384, "grad_norm": 0.6155077219009399, "learning_rate": 4.4560105943637925e-06, "loss": 0.0521, "num_input_tokens_seen": 22030832, "step": 21925 }, { "epoch": 11.627783669141039, "grad_norm": 36.18185806274414, "learning_rate": 4.45371085311826e-06, "loss": 0.1535, "num_input_tokens_seen": 22035824, "step": 21930 }, { "epoch": 11.630434782608695, "grad_norm": 0.13612279295921326, "learning_rate": 4.451411228831802e-06, "loss": 0.0717, "num_input_tokens_seen": 22040560, "step": 21935 }, { "epoch": 11.633085896076352, "grad_norm": 40.84706497192383, "learning_rate": 4.449111721996766e-06, "loss": 0.1569, "num_input_tokens_seen": 22046608, "step": 21940 }, { "epoch": 11.635737009544009, "grad_norm": 1.7971546649932861, "learning_rate": 4.44681233310547e-06, "loss": 0.1828, "num_input_tokens_seen": 22054352, "step": 21945 }, { "epoch": 11.638388123011666, "grad_norm": 14.394524574279785, "learning_rate": 4.444513062650203e-06, "loss": 0.1873, "num_input_tokens_seen": 22060624, "step": 21950 }, { "epoch": 11.64103923647932, "grad_norm": 26.110368728637695, "learning_rate": 4.442213911123238e-06, "loss": 0.0878, "num_input_tokens_seen": 22064624, "step": 21955 }, { "epoch": 11.643690349946977, "grad_norm": 13.093669891357422, "learning_rate": 4.4399148790168135e-06, "loss": 0.1319, "num_input_tokens_seen": 22069968, "step": 21960 }, { "epoch": 11.646341463414634, "grad_norm": 20.494979858398438, "learning_rate": 4.43761596682315e-06, "loss": 0.0212, "num_input_tokens_seen": 22075248, "step": 21965 }, { "epoch": 11.648992576882291, "grad_norm": 22.50486946105957, "learning_rate": 4.435317175034434e-06, "loss": 0.1102, "num_input_tokens_seen": 22081712, "step": 21970 }, { "epoch": 11.651643690349946, "grad_norm": 6.9252777099609375, "learning_rate": 4.433018504142835e-06, "loss": 0.0573, "num_input_tokens_seen": 22086672, "step": 21975 }, { "epoch": 11.654294803817603, "grad_norm": 34.62725067138672, "learning_rate": 4.430719954640488e-06, "loss": 0.0674, "num_input_tokens_seen": 22090960, "step": 21980 }, { "epoch": 11.65694591728526, "grad_norm": 39.16783142089844, "learning_rate": 4.428421527019513e-06, "loss": 0.2529, "num_input_tokens_seen": 22095216, "step": 21985 }, { "epoch": 11.659597030752916, "grad_norm": 0.3244672417640686, "learning_rate": 4.42612322177199e-06, "loss": 0.3569, "num_input_tokens_seen": 22099856, "step": 21990 }, { "epoch": 11.662248144220573, "grad_norm": 27.531118392944336, "learning_rate": 4.423825039389985e-06, "loss": 0.4617, "num_input_tokens_seen": 22105392, "step": 21995 }, { "epoch": 11.66489925768823, "grad_norm": 24.485504150390625, "learning_rate": 4.42152698036553e-06, "loss": 0.2205, "num_input_tokens_seen": 22110192, "step": 22000 }, { "epoch": 11.667550371155885, "grad_norm": 34.30963134765625, "learning_rate": 4.419229045190637e-06, "loss": 0.1053, "num_input_tokens_seen": 22114800, "step": 22005 }, { "epoch": 11.670201484623542, "grad_norm": 3.829418659210205, "learning_rate": 4.416931234357285e-06, "loss": 0.0222, "num_input_tokens_seen": 22119632, "step": 22010 }, { "epoch": 11.672852598091199, "grad_norm": 14.514158248901367, "learning_rate": 4.4146335483574295e-06, "loss": 0.0627, "num_input_tokens_seen": 22124624, "step": 22015 }, { "epoch": 11.675503711558855, "grad_norm": 17.893117904663086, "learning_rate": 4.4123359876829985e-06, "loss": 0.0947, "num_input_tokens_seen": 22129136, "step": 22020 }, { "epoch": 11.67815482502651, "grad_norm": 2.7856314182281494, "learning_rate": 4.410038552825897e-06, "loss": 0.0748, "num_input_tokens_seen": 22133744, "step": 22025 }, { "epoch": 11.680805938494167, "grad_norm": 0.41479843854904175, "learning_rate": 4.407741244277996e-06, "loss": 0.0568, "num_input_tokens_seen": 22138160, "step": 22030 }, { "epoch": 11.683457051961824, "grad_norm": 2.1268930435180664, "learning_rate": 4.405444062531145e-06, "loss": 0.1684, "num_input_tokens_seen": 22144528, "step": 22035 }, { "epoch": 11.68610816542948, "grad_norm": 20.819610595703125, "learning_rate": 4.403147008077164e-06, "loss": 0.298, "num_input_tokens_seen": 22148592, "step": 22040 }, { "epoch": 11.688759278897138, "grad_norm": 0.5476641654968262, "learning_rate": 4.400850081407849e-06, "loss": 0.0384, "num_input_tokens_seen": 22153328, "step": 22045 }, { "epoch": 11.691410392364793, "grad_norm": 21.162208557128906, "learning_rate": 4.398553283014962e-06, "loss": 0.1445, "num_input_tokens_seen": 22158640, "step": 22050 }, { "epoch": 11.69406150583245, "grad_norm": 10.044708251953125, "learning_rate": 4.396256613390243e-06, "loss": 0.0663, "num_input_tokens_seen": 22162960, "step": 22055 }, { "epoch": 11.696712619300106, "grad_norm": 14.781889915466309, "learning_rate": 4.393960073025403e-06, "loss": 0.0654, "num_input_tokens_seen": 22167504, "step": 22060 }, { "epoch": 11.699363732767763, "grad_norm": 25.869699478149414, "learning_rate": 4.391663662412128e-06, "loss": 0.1749, "num_input_tokens_seen": 22172720, "step": 22065 }, { "epoch": 11.70201484623542, "grad_norm": 1.5781099796295166, "learning_rate": 4.389367382042068e-06, "loss": 0.2382, "num_input_tokens_seen": 22178288, "step": 22070 }, { "epoch": 11.704665959703075, "grad_norm": 12.721879005432129, "learning_rate": 4.387071232406855e-06, "loss": 0.1331, "num_input_tokens_seen": 22182416, "step": 22075 }, { "epoch": 11.707317073170731, "grad_norm": 5.990090847015381, "learning_rate": 4.384775213998089e-06, "loss": 0.087, "num_input_tokens_seen": 22187920, "step": 22080 }, { "epoch": 11.709968186638388, "grad_norm": 3.417951822280884, "learning_rate": 4.382479327307337e-06, "loss": 0.1334, "num_input_tokens_seen": 22192400, "step": 22085 }, { "epoch": 11.712619300106045, "grad_norm": 9.277209281921387, "learning_rate": 4.380183572826147e-06, "loss": 0.0524, "num_input_tokens_seen": 22196848, "step": 22090 }, { "epoch": 11.715270413573702, "grad_norm": 11.494571685791016, "learning_rate": 4.3778879510460295e-06, "loss": 0.038, "num_input_tokens_seen": 22201680, "step": 22095 }, { "epoch": 11.717921527041357, "grad_norm": 6.788589000701904, "learning_rate": 4.375592462458477e-06, "loss": 0.0658, "num_input_tokens_seen": 22206672, "step": 22100 }, { "epoch": 11.720572640509014, "grad_norm": 43.00249481201172, "learning_rate": 4.373297107554942e-06, "loss": 0.0965, "num_input_tokens_seen": 22211824, "step": 22105 }, { "epoch": 11.72322375397667, "grad_norm": 0.7177967429161072, "learning_rate": 4.371001886826858e-06, "loss": 0.063, "num_input_tokens_seen": 22216240, "step": 22110 }, { "epoch": 11.725874867444327, "grad_norm": 0.47045058012008667, "learning_rate": 4.368706800765622e-06, "loss": 0.2007, "num_input_tokens_seen": 22221264, "step": 22115 }, { "epoch": 11.728525980911982, "grad_norm": 1.9781010150909424, "learning_rate": 4.366411849862611e-06, "loss": 0.1074, "num_input_tokens_seen": 22226224, "step": 22120 }, { "epoch": 11.731177094379639, "grad_norm": 4.950662612915039, "learning_rate": 4.364117034609162e-06, "loss": 0.1305, "num_input_tokens_seen": 22231376, "step": 22125 }, { "epoch": 11.733828207847296, "grad_norm": 8.282474517822266, "learning_rate": 4.3618223554965944e-06, "loss": 0.2244, "num_input_tokens_seen": 22235696, "step": 22130 }, { "epoch": 11.736479321314953, "grad_norm": 7.652164459228516, "learning_rate": 4.359527813016189e-06, "loss": 0.1152, "num_input_tokens_seen": 22241232, "step": 22135 }, { "epoch": 11.73913043478261, "grad_norm": 40.736385345458984, "learning_rate": 4.357233407659204e-06, "loss": 0.1927, "num_input_tokens_seen": 22247632, "step": 22140 }, { "epoch": 11.741781548250264, "grad_norm": 17.454273223876953, "learning_rate": 4.3549391399168626e-06, "loss": 0.0681, "num_input_tokens_seen": 22252752, "step": 22145 }, { "epoch": 11.744432661717921, "grad_norm": 6.749276638031006, "learning_rate": 4.3526450102803654e-06, "loss": 0.1564, "num_input_tokens_seen": 22257424, "step": 22150 }, { "epoch": 11.747083775185578, "grad_norm": 4.859851360321045, "learning_rate": 4.350351019240877e-06, "loss": 0.0413, "num_input_tokens_seen": 22261968, "step": 22155 }, { "epoch": 11.749734888653235, "grad_norm": 16.266908645629883, "learning_rate": 4.348057167289535e-06, "loss": 0.183, "num_input_tokens_seen": 22265968, "step": 22160 }, { "epoch": 11.752386002120891, "grad_norm": 42.180419921875, "learning_rate": 4.345763454917447e-06, "loss": 0.3982, "num_input_tokens_seen": 22270480, "step": 22165 }, { "epoch": 11.755037115588546, "grad_norm": 18.472166061401367, "learning_rate": 4.343469882615694e-06, "loss": 0.1246, "num_input_tokens_seen": 22275696, "step": 22170 }, { "epoch": 11.757688229056203, "grad_norm": 0.9965976476669312, "learning_rate": 4.341176450875318e-06, "loss": 0.0613, "num_input_tokens_seen": 22279504, "step": 22175 }, { "epoch": 11.76033934252386, "grad_norm": 0.4850604832172394, "learning_rate": 4.338883160187342e-06, "loss": 0.105, "num_input_tokens_seen": 22285072, "step": 22180 }, { "epoch": 11.762990455991517, "grad_norm": 3.854783773422241, "learning_rate": 4.336590011042749e-06, "loss": 0.1499, "num_input_tokens_seen": 22289424, "step": 22185 }, { "epoch": 11.765641569459174, "grad_norm": 9.663069725036621, "learning_rate": 4.3342970039325015e-06, "loss": 0.0595, "num_input_tokens_seen": 22293648, "step": 22190 }, { "epoch": 11.768292682926829, "grad_norm": 8.452537536621094, "learning_rate": 4.332004139347522e-06, "loss": 0.0508, "num_input_tokens_seen": 22298640, "step": 22195 }, { "epoch": 11.770943796394485, "grad_norm": 21.2038516998291, "learning_rate": 4.329711417778708e-06, "loss": 0.0374, "num_input_tokens_seen": 22303632, "step": 22200 }, { "epoch": 11.773594909862142, "grad_norm": 4.928699970245361, "learning_rate": 4.327418839716926e-06, "loss": 0.1246, "num_input_tokens_seen": 22307920, "step": 22205 }, { "epoch": 11.776246023329799, "grad_norm": 22.786624908447266, "learning_rate": 4.325126405653012e-06, "loss": 0.2888, "num_input_tokens_seen": 22313360, "step": 22210 }, { "epoch": 11.778897136797456, "grad_norm": 10.771256446838379, "learning_rate": 4.322834116077766e-06, "loss": 0.0882, "num_input_tokens_seen": 22318192, "step": 22215 }, { "epoch": 11.78154825026511, "grad_norm": 40.180702209472656, "learning_rate": 4.3205419714819646e-06, "loss": 0.1596, "num_input_tokens_seen": 22322736, "step": 22220 }, { "epoch": 11.784199363732768, "grad_norm": 1.8129782676696777, "learning_rate": 4.318249972356351e-06, "loss": 0.0719, "num_input_tokens_seen": 22328688, "step": 22225 }, { "epoch": 11.786850477200424, "grad_norm": 9.687899589538574, "learning_rate": 4.315958119191632e-06, "loss": 0.1024, "num_input_tokens_seen": 22333136, "step": 22230 }, { "epoch": 11.789501590668081, "grad_norm": 19.17058563232422, "learning_rate": 4.31366641247849e-06, "loss": 0.1278, "num_input_tokens_seen": 22337616, "step": 22235 }, { "epoch": 11.792152704135738, "grad_norm": 17.33881950378418, "learning_rate": 4.311374852707573e-06, "loss": 0.1237, "num_input_tokens_seen": 22343216, "step": 22240 }, { "epoch": 11.794803817603393, "grad_norm": 34.63949203491211, "learning_rate": 4.3090834403695e-06, "loss": 0.2258, "num_input_tokens_seen": 22348304, "step": 22245 }, { "epoch": 11.79745493107105, "grad_norm": 18.739208221435547, "learning_rate": 4.306792175954853e-06, "loss": 0.12, "num_input_tokens_seen": 22353264, "step": 22250 }, { "epoch": 11.800106044538706, "grad_norm": 8.701579093933105, "learning_rate": 4.3045010599541874e-06, "loss": 0.2632, "num_input_tokens_seen": 22358256, "step": 22255 }, { "epoch": 11.802757158006363, "grad_norm": 31.726661682128906, "learning_rate": 4.302210092858024e-06, "loss": 0.0772, "num_input_tokens_seen": 22363056, "step": 22260 }, { "epoch": 11.805408271474018, "grad_norm": 28.90397834777832, "learning_rate": 4.299919275156857e-06, "loss": 0.0826, "num_input_tokens_seen": 22367728, "step": 22265 }, { "epoch": 11.808059384941675, "grad_norm": 47.26774215698242, "learning_rate": 4.297628607341137e-06, "loss": 0.1415, "num_input_tokens_seen": 22372080, "step": 22270 }, { "epoch": 11.810710498409332, "grad_norm": 40.41941452026367, "learning_rate": 4.295338089901296e-06, "loss": 0.0547, "num_input_tokens_seen": 22376176, "step": 22275 }, { "epoch": 11.813361611876989, "grad_norm": 2.8417160511016846, "learning_rate": 4.293047723327726e-06, "loss": 0.0329, "num_input_tokens_seen": 22380592, "step": 22280 }, { "epoch": 11.816012725344645, "grad_norm": 8.821227073669434, "learning_rate": 4.29075750811079e-06, "loss": 0.2395, "num_input_tokens_seen": 22386480, "step": 22285 }, { "epoch": 11.8186638388123, "grad_norm": 3.8267228603363037, "learning_rate": 4.288467444740814e-06, "loss": 0.0654, "num_input_tokens_seen": 22392464, "step": 22290 }, { "epoch": 11.821314952279957, "grad_norm": 21.251466751098633, "learning_rate": 4.286177533708097e-06, "loss": 0.0831, "num_input_tokens_seen": 22397840, "step": 22295 }, { "epoch": 11.823966065747614, "grad_norm": 18.353778839111328, "learning_rate": 4.2838877755029e-06, "loss": 0.1083, "num_input_tokens_seen": 22402096, "step": 22300 }, { "epoch": 11.82661717921527, "grad_norm": 2.994741201400757, "learning_rate": 4.28159817061546e-06, "loss": 0.0621, "num_input_tokens_seen": 22406864, "step": 22305 }, { "epoch": 11.829268292682928, "grad_norm": 3.31852388381958, "learning_rate": 4.279308719535969e-06, "loss": 0.2745, "num_input_tokens_seen": 22411088, "step": 22310 }, { "epoch": 11.831919406150583, "grad_norm": 4.998472690582275, "learning_rate": 4.2770194227545965e-06, "loss": 0.1956, "num_input_tokens_seen": 22415728, "step": 22315 }, { "epoch": 11.83457051961824, "grad_norm": 15.184981346130371, "learning_rate": 4.274730280761472e-06, "loss": 0.0741, "num_input_tokens_seen": 22420080, "step": 22320 }, { "epoch": 11.837221633085896, "grad_norm": 3.5280861854553223, "learning_rate": 4.2724412940466995e-06, "loss": 0.1639, "num_input_tokens_seen": 22425296, "step": 22325 }, { "epoch": 11.839872746553553, "grad_norm": 3.9869089126586914, "learning_rate": 4.27015246310034e-06, "loss": 0.0644, "num_input_tokens_seen": 22429840, "step": 22330 }, { "epoch": 11.84252386002121, "grad_norm": 5.274107456207275, "learning_rate": 4.267863788412429e-06, "loss": 0.0532, "num_input_tokens_seen": 22434544, "step": 22335 }, { "epoch": 11.845174973488865, "grad_norm": 10.28508472442627, "learning_rate": 4.265575270472964e-06, "loss": 0.2014, "num_input_tokens_seen": 22439952, "step": 22340 }, { "epoch": 11.847826086956522, "grad_norm": 21.804088592529297, "learning_rate": 4.263286909771914e-06, "loss": 0.1293, "num_input_tokens_seen": 22444496, "step": 22345 }, { "epoch": 11.850477200424178, "grad_norm": 28.6063289642334, "learning_rate": 4.2609987067992075e-06, "loss": 0.1437, "num_input_tokens_seen": 22448944, "step": 22350 }, { "epoch": 11.853128313891835, "grad_norm": 32.01140213012695, "learning_rate": 4.258710662044744e-06, "loss": 0.1551, "num_input_tokens_seen": 22452848, "step": 22355 }, { "epoch": 11.85577942735949, "grad_norm": 32.29179382324219, "learning_rate": 4.25642277599839e-06, "loss": 0.1306, "num_input_tokens_seen": 22458096, "step": 22360 }, { "epoch": 11.858430540827147, "grad_norm": 26.79598045349121, "learning_rate": 4.2541350491499715e-06, "loss": 0.2291, "num_input_tokens_seen": 22464432, "step": 22365 }, { "epoch": 11.861081654294804, "grad_norm": 1.5056872367858887, "learning_rate": 4.251847481989288e-06, "loss": 0.0485, "num_input_tokens_seen": 22470064, "step": 22370 }, { "epoch": 11.86373276776246, "grad_norm": 10.898418426513672, "learning_rate": 4.2495600750061e-06, "loss": 0.0833, "num_input_tokens_seen": 22474416, "step": 22375 }, { "epoch": 11.866383881230117, "grad_norm": 12.595444679260254, "learning_rate": 4.247272828690138e-06, "loss": 0.0933, "num_input_tokens_seen": 22479728, "step": 22380 }, { "epoch": 11.869034994697772, "grad_norm": 24.066455841064453, "learning_rate": 4.244985743531092e-06, "loss": 0.1036, "num_input_tokens_seen": 22485264, "step": 22385 }, { "epoch": 11.871686108165429, "grad_norm": 37.784141540527344, "learning_rate": 4.242698820018623e-06, "loss": 0.1956, "num_input_tokens_seen": 22489648, "step": 22390 }, { "epoch": 11.874337221633086, "grad_norm": 40.37348937988281, "learning_rate": 4.2404120586423555e-06, "loss": 0.1155, "num_input_tokens_seen": 22494000, "step": 22395 }, { "epoch": 11.876988335100743, "grad_norm": 42.673404693603516, "learning_rate": 4.238125459891879e-06, "loss": 0.1063, "num_input_tokens_seen": 22498576, "step": 22400 }, { "epoch": 11.8796394485684, "grad_norm": 46.12135314941406, "learning_rate": 4.235839024256747e-06, "loss": 0.1696, "num_input_tokens_seen": 22502928, "step": 22405 }, { "epoch": 11.882290562036054, "grad_norm": 3.892481803894043, "learning_rate": 4.233552752226481e-06, "loss": 0.0523, "num_input_tokens_seen": 22509072, "step": 22410 }, { "epoch": 11.884941675503711, "grad_norm": 19.26828384399414, "learning_rate": 4.231266644290564e-06, "loss": 0.0457, "num_input_tokens_seen": 22514736, "step": 22415 }, { "epoch": 11.887592788971368, "grad_norm": 39.211578369140625, "learning_rate": 4.2289807009384485e-06, "loss": 0.0596, "num_input_tokens_seen": 22519760, "step": 22420 }, { "epoch": 11.890243902439025, "grad_norm": 57.01297378540039, "learning_rate": 4.226694922659546e-06, "loss": 0.1155, "num_input_tokens_seen": 22524208, "step": 22425 }, { "epoch": 11.892895015906682, "grad_norm": 2.7072532176971436, "learning_rate": 4.2244093099432385e-06, "loss": 0.0507, "num_input_tokens_seen": 22528720, "step": 22430 }, { "epoch": 11.895546129374337, "grad_norm": 12.152212142944336, "learning_rate": 4.222123863278866e-06, "loss": 0.0155, "num_input_tokens_seen": 22533488, "step": 22435 }, { "epoch": 11.898197242841993, "grad_norm": 3.3901710510253906, "learning_rate": 4.219838583155741e-06, "loss": 0.0804, "num_input_tokens_seen": 22540752, "step": 22440 }, { "epoch": 11.90084835630965, "grad_norm": 21.978126525878906, "learning_rate": 4.217553470063133e-06, "loss": 0.067, "num_input_tokens_seen": 22545520, "step": 22445 }, { "epoch": 11.903499469777307, "grad_norm": 6.415416717529297, "learning_rate": 4.2152685244902795e-06, "loss": 0.1871, "num_input_tokens_seen": 22551632, "step": 22450 }, { "epoch": 11.906150583244964, "grad_norm": 10.668030738830566, "learning_rate": 4.212983746926381e-06, "loss": 0.1283, "num_input_tokens_seen": 22556848, "step": 22455 }, { "epoch": 11.908801696712619, "grad_norm": 4.935552597045898, "learning_rate": 4.210699137860604e-06, "loss": 0.1706, "num_input_tokens_seen": 22562992, "step": 22460 }, { "epoch": 11.911452810180275, "grad_norm": 31.524585723876953, "learning_rate": 4.208414697782075e-06, "loss": 0.2168, "num_input_tokens_seen": 22568048, "step": 22465 }, { "epoch": 11.914103923647932, "grad_norm": 0.23691381514072418, "learning_rate": 4.206130427179889e-06, "loss": 0.2266, "num_input_tokens_seen": 22572976, "step": 22470 }, { "epoch": 11.916755037115589, "grad_norm": 7.900880813598633, "learning_rate": 4.2038463265431e-06, "loss": 0.3598, "num_input_tokens_seen": 22577552, "step": 22475 }, { "epoch": 11.919406150583246, "grad_norm": 23.51902961730957, "learning_rate": 4.201562396360732e-06, "loss": 0.072, "num_input_tokens_seen": 22582064, "step": 22480 }, { "epoch": 11.9220572640509, "grad_norm": 2.9006974697113037, "learning_rate": 4.199278637121762e-06, "loss": 0.0607, "num_input_tokens_seen": 22587344, "step": 22485 }, { "epoch": 11.924708377518558, "grad_norm": 7.455650806427002, "learning_rate": 4.196995049315142e-06, "loss": 0.018, "num_input_tokens_seen": 22591792, "step": 22490 }, { "epoch": 11.927359490986214, "grad_norm": 1.4726893901824951, "learning_rate": 4.194711633429782e-06, "loss": 0.1826, "num_input_tokens_seen": 22596816, "step": 22495 }, { "epoch": 11.930010604453871, "grad_norm": 5.845005035400391, "learning_rate": 4.192428389954552e-06, "loss": 0.1275, "num_input_tokens_seen": 22602960, "step": 22500 }, { "epoch": 11.932661717921526, "grad_norm": 41.546627044677734, "learning_rate": 4.190145319378292e-06, "loss": 0.2167, "num_input_tokens_seen": 22607536, "step": 22505 }, { "epoch": 11.935312831389183, "grad_norm": 50.61341094970703, "learning_rate": 4.1878624221897985e-06, "loss": 0.1933, "num_input_tokens_seen": 22611888, "step": 22510 }, { "epoch": 11.93796394485684, "grad_norm": 10.658870697021484, "learning_rate": 4.185579698877837e-06, "loss": 0.1086, "num_input_tokens_seen": 22616752, "step": 22515 }, { "epoch": 11.940615058324497, "grad_norm": 29.80352210998535, "learning_rate": 4.183297149931129e-06, "loss": 0.2451, "num_input_tokens_seen": 22621328, "step": 22520 }, { "epoch": 11.943266171792153, "grad_norm": 8.271590232849121, "learning_rate": 4.1810147758383655e-06, "loss": 0.0478, "num_input_tokens_seen": 22626544, "step": 22525 }, { "epoch": 11.945917285259808, "grad_norm": 9.74661636352539, "learning_rate": 4.178732577088193e-06, "loss": 0.1675, "num_input_tokens_seen": 22630256, "step": 22530 }, { "epoch": 11.948568398727465, "grad_norm": 33.621429443359375, "learning_rate": 4.176450554169231e-06, "loss": 0.1843, "num_input_tokens_seen": 22634096, "step": 22535 }, { "epoch": 11.951219512195122, "grad_norm": 23.663846969604492, "learning_rate": 4.174168707570047e-06, "loss": 0.0948, "num_input_tokens_seen": 22639024, "step": 22540 }, { "epoch": 11.953870625662779, "grad_norm": 22.147199630737305, "learning_rate": 4.171887037779183e-06, "loss": 0.0725, "num_input_tokens_seen": 22644080, "step": 22545 }, { "epoch": 11.956521739130435, "grad_norm": 10.234842300415039, "learning_rate": 4.169605545285136e-06, "loss": 0.1545, "num_input_tokens_seen": 22648432, "step": 22550 }, { "epoch": 11.95917285259809, "grad_norm": 31.558561325073242, "learning_rate": 4.167324230576371e-06, "loss": 0.3449, "num_input_tokens_seen": 22654096, "step": 22555 }, { "epoch": 11.961823966065747, "grad_norm": 9.383342742919922, "learning_rate": 4.165043094141307e-06, "loss": 0.3071, "num_input_tokens_seen": 22660304, "step": 22560 }, { "epoch": 11.964475079533404, "grad_norm": 2.2597603797912598, "learning_rate": 4.162762136468334e-06, "loss": 0.0636, "num_input_tokens_seen": 22665808, "step": 22565 }, { "epoch": 11.96712619300106, "grad_norm": 25.25659942626953, "learning_rate": 4.160481358045794e-06, "loss": 0.0808, "num_input_tokens_seen": 22670736, "step": 22570 }, { "epoch": 11.969777306468718, "grad_norm": 0.837870717048645, "learning_rate": 4.158200759362001e-06, "loss": 0.0577, "num_input_tokens_seen": 22675696, "step": 22575 }, { "epoch": 11.972428419936373, "grad_norm": 19.9324893951416, "learning_rate": 4.155920340905221e-06, "loss": 0.0678, "num_input_tokens_seen": 22680336, "step": 22580 }, { "epoch": 11.97507953340403, "grad_norm": 4.204411506652832, "learning_rate": 4.153640103163688e-06, "loss": 0.0663, "num_input_tokens_seen": 22685296, "step": 22585 }, { "epoch": 11.977730646871686, "grad_norm": 4.849092483520508, "learning_rate": 4.151360046625593e-06, "loss": 0.0357, "num_input_tokens_seen": 22690448, "step": 22590 }, { "epoch": 11.980381760339343, "grad_norm": 27.863521575927734, "learning_rate": 4.149080171779094e-06, "loss": 0.1815, "num_input_tokens_seen": 22696592, "step": 22595 }, { "epoch": 11.983032873806998, "grad_norm": 2.8333065509796143, "learning_rate": 4.146800479112301e-06, "loss": 0.1552, "num_input_tokens_seen": 22701488, "step": 22600 }, { "epoch": 11.985683987274655, "grad_norm": 2.0621843338012695, "learning_rate": 4.144520969113295e-06, "loss": 0.0711, "num_input_tokens_seen": 22705744, "step": 22605 }, { "epoch": 11.988335100742312, "grad_norm": 4.636600017547607, "learning_rate": 4.142241642270109e-06, "loss": 0.0165, "num_input_tokens_seen": 22709936, "step": 22610 }, { "epoch": 11.990986214209968, "grad_norm": 26.369443893432617, "learning_rate": 4.139962499070744e-06, "loss": 0.2895, "num_input_tokens_seen": 22716400, "step": 22615 }, { "epoch": 11.993637327677625, "grad_norm": 7.529167175292969, "learning_rate": 4.137683540003157e-06, "loss": 0.0509, "num_input_tokens_seen": 22721968, "step": 22620 }, { "epoch": 11.996288441145282, "grad_norm": 10.640077590942383, "learning_rate": 4.135404765555268e-06, "loss": 0.0433, "num_input_tokens_seen": 22728176, "step": 22625 }, { "epoch": 11.998939554612937, "grad_norm": 43.91273498535156, "learning_rate": 4.133126176214957e-06, "loss": 0.2084, "num_input_tokens_seen": 22733168, "step": 22630 }, { "epoch": 12.0, "eval_loss": 0.5529431104660034, "eval_runtime": 29.2858, "eval_samples_per_second": 64.4, "eval_steps_per_second": 16.117, "num_input_tokens_seen": 22734144, "step": 22632 }, { "epoch": 12.001590668080594, "grad_norm": 1.3475388288497925, "learning_rate": 4.130847772470062e-06, "loss": 0.0925, "num_input_tokens_seen": 22736864, "step": 22635 }, { "epoch": 12.00424178154825, "grad_norm": 0.1410762518644333, "learning_rate": 4.1285695548083846e-06, "loss": 0.0127, "num_input_tokens_seen": 22742592, "step": 22640 }, { "epoch": 12.006892895015907, "grad_norm": 1.7631950378417969, "learning_rate": 4.126291523717685e-06, "loss": 0.1364, "num_input_tokens_seen": 22748736, "step": 22645 }, { "epoch": 12.009544008483562, "grad_norm": 8.403829574584961, "learning_rate": 4.124013679685686e-06, "loss": 0.0354, "num_input_tokens_seen": 22753856, "step": 22650 }, { "epoch": 12.012195121951219, "grad_norm": 2.008455514907837, "learning_rate": 4.121736023200063e-06, "loss": 0.0429, "num_input_tokens_seen": 22758816, "step": 22655 }, { "epoch": 12.014846235418876, "grad_norm": 17.880876541137695, "learning_rate": 4.1194585547484615e-06, "loss": 0.0217, "num_input_tokens_seen": 22763296, "step": 22660 }, { "epoch": 12.017497348886533, "grad_norm": 4.5751566886901855, "learning_rate": 4.117181274818477e-06, "loss": 0.0228, "num_input_tokens_seen": 22768096, "step": 22665 }, { "epoch": 12.02014846235419, "grad_norm": 33.48725128173828, "learning_rate": 4.114904183897674e-06, "loss": 0.0844, "num_input_tokens_seen": 22772608, "step": 22670 }, { "epoch": 12.022799575821844, "grad_norm": 1.5343265533447266, "learning_rate": 4.1126272824735675e-06, "loss": 0.0799, "num_input_tokens_seen": 22776800, "step": 22675 }, { "epoch": 12.025450689289501, "grad_norm": 3.2106575965881348, "learning_rate": 4.110350571033639e-06, "loss": 0.0337, "num_input_tokens_seen": 22781536, "step": 22680 }, { "epoch": 12.028101802757158, "grad_norm": 3.1026840209960938, "learning_rate": 4.108074050065323e-06, "loss": 0.0399, "num_input_tokens_seen": 22786336, "step": 22685 }, { "epoch": 12.030752916224815, "grad_norm": 25.867525100708008, "learning_rate": 4.105797720056023e-06, "loss": 0.0681, "num_input_tokens_seen": 22790688, "step": 22690 }, { "epoch": 12.033404029692472, "grad_norm": 1.7828210592269897, "learning_rate": 4.1035215814930875e-06, "loss": 0.1329, "num_input_tokens_seen": 22795200, "step": 22695 }, { "epoch": 12.036055143160127, "grad_norm": 24.228174209594727, "learning_rate": 4.101245634863837e-06, "loss": 0.0665, "num_input_tokens_seen": 22799968, "step": 22700 }, { "epoch": 12.038706256627783, "grad_norm": 4.9745330810546875, "learning_rate": 4.0989698806555434e-06, "loss": 0.028, "num_input_tokens_seen": 22806624, "step": 22705 }, { "epoch": 12.04135737009544, "grad_norm": 16.43109893798828, "learning_rate": 4.096694319355442e-06, "loss": 0.0307, "num_input_tokens_seen": 22812896, "step": 22710 }, { "epoch": 12.044008483563097, "grad_norm": 0.7424713373184204, "learning_rate": 4.094418951450721e-06, "loss": 0.061, "num_input_tokens_seen": 22817280, "step": 22715 }, { "epoch": 12.046659597030754, "grad_norm": 2.183793544769287, "learning_rate": 4.092143777428535e-06, "loss": 0.0672, "num_input_tokens_seen": 22822240, "step": 22720 }, { "epoch": 12.049310710498409, "grad_norm": 7.082901477813721, "learning_rate": 4.0898687977759895e-06, "loss": 0.0344, "num_input_tokens_seen": 22826752, "step": 22725 }, { "epoch": 12.051961823966066, "grad_norm": 2.9999759197235107, "learning_rate": 4.087594012980155e-06, "loss": 0.0575, "num_input_tokens_seen": 22831168, "step": 22730 }, { "epoch": 12.054612937433722, "grad_norm": 10.60460376739502, "learning_rate": 4.085319423528051e-06, "loss": 0.0884, "num_input_tokens_seen": 22835680, "step": 22735 }, { "epoch": 12.057264050901379, "grad_norm": 6.231082916259766, "learning_rate": 4.083045029906668e-06, "loss": 0.1948, "num_input_tokens_seen": 22840128, "step": 22740 }, { "epoch": 12.059915164369036, "grad_norm": 25.08047866821289, "learning_rate": 4.080770832602943e-06, "loss": 0.0299, "num_input_tokens_seen": 22845216, "step": 22745 }, { "epoch": 12.06256627783669, "grad_norm": 3.3908867835998535, "learning_rate": 4.078496832103778e-06, "loss": 0.0306, "num_input_tokens_seen": 22850016, "step": 22750 }, { "epoch": 12.065217391304348, "grad_norm": 1.4341298341751099, "learning_rate": 4.076223028896029e-06, "loss": 0.0707, "num_input_tokens_seen": 22855360, "step": 22755 }, { "epoch": 12.067868504772004, "grad_norm": 5.881563186645508, "learning_rate": 4.073949423466515e-06, "loss": 0.1282, "num_input_tokens_seen": 22860480, "step": 22760 }, { "epoch": 12.070519618239661, "grad_norm": 22.640188217163086, "learning_rate": 4.071676016302003e-06, "loss": 0.0442, "num_input_tokens_seen": 22865696, "step": 22765 }, { "epoch": 12.073170731707316, "grad_norm": 39.81977462768555, "learning_rate": 4.069402807889228e-06, "loss": 0.3324, "num_input_tokens_seen": 22870400, "step": 22770 }, { "epoch": 12.075821845174973, "grad_norm": 7.555329322814941, "learning_rate": 4.067129798714878e-06, "loss": 0.0968, "num_input_tokens_seen": 22874336, "step": 22775 }, { "epoch": 12.07847295864263, "grad_norm": 29.24135971069336, "learning_rate": 4.064856989265595e-06, "loss": 0.0531, "num_input_tokens_seen": 22879744, "step": 22780 }, { "epoch": 12.081124072110287, "grad_norm": 5.746772289276123, "learning_rate": 4.0625843800279855e-06, "loss": 0.0306, "num_input_tokens_seen": 22884224, "step": 22785 }, { "epoch": 12.083775185577943, "grad_norm": 9.701937675476074, "learning_rate": 4.060311971488604e-06, "loss": 0.0314, "num_input_tokens_seen": 22890880, "step": 22790 }, { "epoch": 12.086426299045598, "grad_norm": 0.7267919182777405, "learning_rate": 4.058039764133975e-06, "loss": 0.1366, "num_input_tokens_seen": 22897376, "step": 22795 }, { "epoch": 12.089077412513255, "grad_norm": 1.8631224632263184, "learning_rate": 4.055767758450564e-06, "loss": 0.0154, "num_input_tokens_seen": 22902624, "step": 22800 }, { "epoch": 12.091728525980912, "grad_norm": 14.464579582214355, "learning_rate": 4.053495954924806e-06, "loss": 0.0858, "num_input_tokens_seen": 22907904, "step": 22805 }, { "epoch": 12.094379639448569, "grad_norm": 1.170082449913025, "learning_rate": 4.051224354043087e-06, "loss": 0.0235, "num_input_tokens_seen": 22914112, "step": 22810 }, { "epoch": 12.097030752916226, "grad_norm": 4.004683494567871, "learning_rate": 4.0489529562917534e-06, "loss": 0.0514, "num_input_tokens_seen": 22919840, "step": 22815 }, { "epoch": 12.09968186638388, "grad_norm": 0.8937333822250366, "learning_rate": 4.046681762157101e-06, "loss": 0.1364, "num_input_tokens_seen": 22924832, "step": 22820 }, { "epoch": 12.102332979851537, "grad_norm": 7.52601432800293, "learning_rate": 4.044410772125389e-06, "loss": 0.0813, "num_input_tokens_seen": 22930560, "step": 22825 }, { "epoch": 12.104984093319194, "grad_norm": 2.476409912109375, "learning_rate": 4.04213998668283e-06, "loss": 0.0992, "num_input_tokens_seen": 22935328, "step": 22830 }, { "epoch": 12.107635206786851, "grad_norm": 12.926346778869629, "learning_rate": 4.039869406315595e-06, "loss": 0.0396, "num_input_tokens_seen": 22939488, "step": 22835 }, { "epoch": 12.110286320254508, "grad_norm": 9.809091567993164, "learning_rate": 4.037599031509806e-06, "loss": 0.0415, "num_input_tokens_seen": 22944448, "step": 22840 }, { "epoch": 12.112937433722163, "grad_norm": 35.74409103393555, "learning_rate": 4.035328862751547e-06, "loss": 0.0828, "num_input_tokens_seen": 22948672, "step": 22845 }, { "epoch": 12.11558854718982, "grad_norm": 2.3257758617401123, "learning_rate": 4.033058900526853e-06, "loss": 0.0098, "num_input_tokens_seen": 22952800, "step": 22850 }, { "epoch": 12.118239660657476, "grad_norm": 28.672061920166016, "learning_rate": 4.03078914532172e-06, "loss": 0.0758, "num_input_tokens_seen": 22958592, "step": 22855 }, { "epoch": 12.120890774125133, "grad_norm": 0.941519558429718, "learning_rate": 4.0285195976220935e-06, "loss": 0.015, "num_input_tokens_seen": 22964416, "step": 22860 }, { "epoch": 12.12354188759279, "grad_norm": 10.061566352844238, "learning_rate": 4.026250257913879e-06, "loss": 0.0214, "num_input_tokens_seen": 22970624, "step": 22865 }, { "epoch": 12.126193001060445, "grad_norm": 12.563941955566406, "learning_rate": 4.023981126682935e-06, "loss": 0.0516, "num_input_tokens_seen": 22975904, "step": 22870 }, { "epoch": 12.128844114528102, "grad_norm": 15.470148086547852, "learning_rate": 4.0217122044150815e-06, "loss": 0.0526, "num_input_tokens_seen": 22979840, "step": 22875 }, { "epoch": 12.131495227995758, "grad_norm": 4.0213942527771, "learning_rate": 4.019443491596082e-06, "loss": 0.0266, "num_input_tokens_seen": 22985536, "step": 22880 }, { "epoch": 12.134146341463415, "grad_norm": 10.523847579956055, "learning_rate": 4.017174988711666e-06, "loss": 0.0258, "num_input_tokens_seen": 22990496, "step": 22885 }, { "epoch": 12.13679745493107, "grad_norm": 6.35181188583374, "learning_rate": 4.014906696247512e-06, "loss": 0.1747, "num_input_tokens_seen": 22996000, "step": 22890 }, { "epoch": 12.139448568398727, "grad_norm": 1.4529993534088135, "learning_rate": 4.0126386146892595e-06, "loss": 0.0737, "num_input_tokens_seen": 23000704, "step": 22895 }, { "epoch": 12.142099681866384, "grad_norm": 2.7128310203552246, "learning_rate": 4.0103707445224935e-06, "loss": 0.0671, "num_input_tokens_seen": 23005728, "step": 22900 }, { "epoch": 12.14475079533404, "grad_norm": 2.374058723449707, "learning_rate": 4.008103086232762e-06, "loss": 0.0226, "num_input_tokens_seen": 23011072, "step": 22905 }, { "epoch": 12.147401908801697, "grad_norm": 0.87184077501297, "learning_rate": 4.005835640305566e-06, "loss": 0.2457, "num_input_tokens_seen": 23015616, "step": 22910 }, { "epoch": 12.150053022269352, "grad_norm": 31.984098434448242, "learning_rate": 4.0035684072263555e-06, "loss": 0.0383, "num_input_tokens_seen": 23020448, "step": 22915 }, { "epoch": 12.15270413573701, "grad_norm": 10.686904907226562, "learning_rate": 4.001301387480543e-06, "loss": 0.0368, "num_input_tokens_seen": 23025568, "step": 22920 }, { "epoch": 12.155355249204666, "grad_norm": 0.5426256060600281, "learning_rate": 3.999034581553489e-06, "loss": 0.0061, "num_input_tokens_seen": 23030048, "step": 22925 }, { "epoch": 12.158006362672323, "grad_norm": 0.575725257396698, "learning_rate": 3.996767989930513e-06, "loss": 0.0136, "num_input_tokens_seen": 23034944, "step": 22930 }, { "epoch": 12.16065747613998, "grad_norm": 0.532843291759491, "learning_rate": 3.994501613096884e-06, "loss": 0.0067, "num_input_tokens_seen": 23040256, "step": 22935 }, { "epoch": 12.163308589607635, "grad_norm": 10.575358390808105, "learning_rate": 3.992235451537829e-06, "loss": 0.1401, "num_input_tokens_seen": 23045856, "step": 22940 }, { "epoch": 12.165959703075291, "grad_norm": 35.171142578125, "learning_rate": 3.989969505738526e-06, "loss": 0.1132, "num_input_tokens_seen": 23051008, "step": 22945 }, { "epoch": 12.168610816542948, "grad_norm": 3.2263166904449463, "learning_rate": 3.987703776184111e-06, "loss": 0.0373, "num_input_tokens_seen": 23055072, "step": 22950 }, { "epoch": 12.171261930010605, "grad_norm": 7.467157363891602, "learning_rate": 3.985438263359667e-06, "loss": 0.0115, "num_input_tokens_seen": 23059776, "step": 22955 }, { "epoch": 12.173913043478262, "grad_norm": 2.217461109161377, "learning_rate": 3.983172967750236e-06, "loss": 0.0365, "num_input_tokens_seen": 23065216, "step": 22960 }, { "epoch": 12.176564156945917, "grad_norm": 2.5186893939971924, "learning_rate": 3.980907889840812e-06, "loss": 0.0718, "num_input_tokens_seen": 23071168, "step": 22965 }, { "epoch": 12.179215270413573, "grad_norm": 42.06881332397461, "learning_rate": 3.978643030116343e-06, "loss": 0.0436, "num_input_tokens_seen": 23075680, "step": 22970 }, { "epoch": 12.18186638388123, "grad_norm": 39.86969757080078, "learning_rate": 3.976378389061727e-06, "loss": 0.0855, "num_input_tokens_seen": 23080864, "step": 22975 }, { "epoch": 12.184517497348887, "grad_norm": 0.34619808197021484, "learning_rate": 3.97411396716182e-06, "loss": 0.0856, "num_input_tokens_seen": 23084992, "step": 22980 }, { "epoch": 12.187168610816544, "grad_norm": 3.340202808380127, "learning_rate": 3.971849764901427e-06, "loss": 0.0957, "num_input_tokens_seen": 23090784, "step": 22985 }, { "epoch": 12.189819724284199, "grad_norm": 1.082749605178833, "learning_rate": 3.9695857827653124e-06, "loss": 0.0517, "num_input_tokens_seen": 23095328, "step": 22990 }, { "epoch": 12.192470837751856, "grad_norm": 8.161919593811035, "learning_rate": 3.9673220212381825e-06, "loss": 0.0564, "num_input_tokens_seen": 23099360, "step": 22995 }, { "epoch": 12.195121951219512, "grad_norm": 1.3307111263275146, "learning_rate": 3.965058480804706e-06, "loss": 0.0178, "num_input_tokens_seen": 23104160, "step": 23000 }, { "epoch": 12.19777306468717, "grad_norm": 0.64155513048172, "learning_rate": 3.962795161949501e-06, "loss": 0.0229, "num_input_tokens_seen": 23108448, "step": 23005 }, { "epoch": 12.200424178154824, "grad_norm": 2.6291818618774414, "learning_rate": 3.960532065157139e-06, "loss": 0.1104, "num_input_tokens_seen": 23114112, "step": 23010 }, { "epoch": 12.203075291622481, "grad_norm": 46.46784973144531, "learning_rate": 3.958269190912139e-06, "loss": 0.2763, "num_input_tokens_seen": 23118432, "step": 23015 }, { "epoch": 12.205726405090138, "grad_norm": 0.9997286200523376, "learning_rate": 3.956006539698981e-06, "loss": 0.0767, "num_input_tokens_seen": 23123008, "step": 23020 }, { "epoch": 12.208377518557795, "grad_norm": 1.8048406839370728, "learning_rate": 3.95374411200209e-06, "loss": 0.0584, "num_input_tokens_seen": 23127328, "step": 23025 }, { "epoch": 12.211028632025451, "grad_norm": 10.670056343078613, "learning_rate": 3.951481908305849e-06, "loss": 0.1182, "num_input_tokens_seen": 23132672, "step": 23030 }, { "epoch": 12.213679745493106, "grad_norm": 26.515432357788086, "learning_rate": 3.949219929094586e-06, "loss": 0.0511, "num_input_tokens_seen": 23136768, "step": 23035 }, { "epoch": 12.216330858960763, "grad_norm": 18.772964477539062, "learning_rate": 3.946958174852587e-06, "loss": 0.0665, "num_input_tokens_seen": 23141792, "step": 23040 }, { "epoch": 12.21898197242842, "grad_norm": 4.787991046905518, "learning_rate": 3.944696646064089e-06, "loss": 0.0422, "num_input_tokens_seen": 23147232, "step": 23045 }, { "epoch": 12.221633085896077, "grad_norm": 32.28514099121094, "learning_rate": 3.942435343213274e-06, "loss": 0.1421, "num_input_tokens_seen": 23152352, "step": 23050 }, { "epoch": 12.224284199363733, "grad_norm": 17.39682388305664, "learning_rate": 3.940174266784287e-06, "loss": 0.0602, "num_input_tokens_seen": 23156800, "step": 23055 }, { "epoch": 12.226935312831388, "grad_norm": 3.066255807876587, "learning_rate": 3.937913417261215e-06, "loss": 0.1239, "num_input_tokens_seen": 23161728, "step": 23060 }, { "epoch": 12.229586426299045, "grad_norm": 27.176542282104492, "learning_rate": 3.935652795128103e-06, "loss": 0.0296, "num_input_tokens_seen": 23167584, "step": 23065 }, { "epoch": 12.232237539766702, "grad_norm": 33.731868743896484, "learning_rate": 3.9333924008689405e-06, "loss": 0.0519, "num_input_tokens_seen": 23173248, "step": 23070 }, { "epoch": 12.234888653234359, "grad_norm": 0.2260504812002182, "learning_rate": 3.931132234967675e-06, "loss": 0.04, "num_input_tokens_seen": 23178528, "step": 23075 }, { "epoch": 12.237539766702016, "grad_norm": 6.599628448486328, "learning_rate": 3.9288722979082e-06, "loss": 0.0234, "num_input_tokens_seen": 23182880, "step": 23080 }, { "epoch": 12.24019088016967, "grad_norm": 4.062585353851318, "learning_rate": 3.926612590174365e-06, "loss": 0.0186, "num_input_tokens_seen": 23187552, "step": 23085 }, { "epoch": 12.242841993637327, "grad_norm": 15.722773551940918, "learning_rate": 3.924353112249964e-06, "loss": 0.2023, "num_input_tokens_seen": 23191648, "step": 23090 }, { "epoch": 12.245493107104984, "grad_norm": 16.54620933532715, "learning_rate": 3.922093864618748e-06, "loss": 0.0238, "num_input_tokens_seen": 23197408, "step": 23095 }, { "epoch": 12.248144220572641, "grad_norm": 10.469114303588867, "learning_rate": 3.919834847764413e-06, "loss": 0.0191, "num_input_tokens_seen": 23201952, "step": 23100 }, { "epoch": 12.250795334040298, "grad_norm": 1.151292324066162, "learning_rate": 3.917576062170614e-06, "loss": 0.0447, "num_input_tokens_seen": 23206752, "step": 23105 }, { "epoch": 12.253446447507953, "grad_norm": 28.716392517089844, "learning_rate": 3.915317508320945e-06, "loss": 0.071, "num_input_tokens_seen": 23211200, "step": 23110 }, { "epoch": 12.25609756097561, "grad_norm": 6.331356048583984, "learning_rate": 3.9130591866989596e-06, "loss": 0.0522, "num_input_tokens_seen": 23217888, "step": 23115 }, { "epoch": 12.258748674443266, "grad_norm": 0.9475852251052856, "learning_rate": 3.910801097788158e-06, "loss": 0.0112, "num_input_tokens_seen": 23223840, "step": 23120 }, { "epoch": 12.261399787910923, "grad_norm": 0.7131706476211548, "learning_rate": 3.9085432420719934e-06, "loss": 0.0378, "num_input_tokens_seen": 23230272, "step": 23125 }, { "epoch": 12.264050901378578, "grad_norm": 2.307114839553833, "learning_rate": 3.906285620033862e-06, "loss": 0.0296, "num_input_tokens_seen": 23235296, "step": 23130 }, { "epoch": 12.266702014846235, "grad_norm": 0.33747804164886475, "learning_rate": 3.904028232157118e-06, "loss": 0.115, "num_input_tokens_seen": 23240000, "step": 23135 }, { "epoch": 12.269353128313892, "grad_norm": 1.6384714841842651, "learning_rate": 3.901771078925062e-06, "loss": 0.1393, "num_input_tokens_seen": 23244512, "step": 23140 }, { "epoch": 12.272004241781548, "grad_norm": 10.192366600036621, "learning_rate": 3.8995141608209446e-06, "loss": 0.2392, "num_input_tokens_seen": 23250400, "step": 23145 }, { "epoch": 12.274655355249205, "grad_norm": 64.88370513916016, "learning_rate": 3.8972574783279644e-06, "loss": 0.1181, "num_input_tokens_seen": 23256000, "step": 23150 }, { "epoch": 12.27730646871686, "grad_norm": 0.9738166332244873, "learning_rate": 3.895001031929274e-06, "loss": 0.0245, "num_input_tokens_seen": 23261408, "step": 23155 }, { "epoch": 12.279957582184517, "grad_norm": 26.73456382751465, "learning_rate": 3.89274482210797e-06, "loss": 0.0645, "num_input_tokens_seen": 23266112, "step": 23160 }, { "epoch": 12.282608695652174, "grad_norm": 0.5259668827056885, "learning_rate": 3.890488849347103e-06, "loss": 0.0834, "num_input_tokens_seen": 23272768, "step": 23165 }, { "epoch": 12.28525980911983, "grad_norm": 5.069518089294434, "learning_rate": 3.888233114129669e-06, "loss": 0.072, "num_input_tokens_seen": 23277504, "step": 23170 }, { "epoch": 12.287910922587487, "grad_norm": 39.393184661865234, "learning_rate": 3.885977616938617e-06, "loss": 0.1259, "num_input_tokens_seen": 23281984, "step": 23175 }, { "epoch": 12.290562036055142, "grad_norm": 40.20841979980469, "learning_rate": 3.883722358256843e-06, "loss": 0.0985, "num_input_tokens_seen": 23286560, "step": 23180 }, { "epoch": 12.2932131495228, "grad_norm": 3.4941539764404297, "learning_rate": 3.88146733856719e-06, "loss": 0.054, "num_input_tokens_seen": 23291328, "step": 23185 }, { "epoch": 12.295864262990456, "grad_norm": 23.09983253479004, "learning_rate": 3.879212558352453e-06, "loss": 0.1979, "num_input_tokens_seen": 23296864, "step": 23190 }, { "epoch": 12.298515376458113, "grad_norm": 5.341620445251465, "learning_rate": 3.876958018095374e-06, "loss": 0.0705, "num_input_tokens_seen": 23301280, "step": 23195 }, { "epoch": 12.30116648992577, "grad_norm": 34.8084831237793, "learning_rate": 3.8747037182786455e-06, "loss": 0.0538, "num_input_tokens_seen": 23305312, "step": 23200 }, { "epoch": 12.303817603393425, "grad_norm": 18.465789794921875, "learning_rate": 3.8724496593849055e-06, "loss": 0.0698, "num_input_tokens_seen": 23309696, "step": 23205 }, { "epoch": 12.306468716861081, "grad_norm": 7.179751396179199, "learning_rate": 3.870195841896743e-06, "loss": 0.1853, "num_input_tokens_seen": 23314592, "step": 23210 }, { "epoch": 12.309119830328738, "grad_norm": 8.590514183044434, "learning_rate": 3.867942266296693e-06, "loss": 0.0727, "num_input_tokens_seen": 23318720, "step": 23215 }, { "epoch": 12.311770943796395, "grad_norm": 43.99088668823242, "learning_rate": 3.865688933067246e-06, "loss": 0.0457, "num_input_tokens_seen": 23324032, "step": 23220 }, { "epoch": 12.314422057264052, "grad_norm": 8.692695617675781, "learning_rate": 3.863435842690826e-06, "loss": 0.0345, "num_input_tokens_seen": 23328576, "step": 23225 }, { "epoch": 12.317073170731707, "grad_norm": 43.97314453125, "learning_rate": 3.861182995649819e-06, "loss": 0.0679, "num_input_tokens_seen": 23332992, "step": 23230 }, { "epoch": 12.319724284199363, "grad_norm": 74.34603118896484, "learning_rate": 3.858930392426553e-06, "loss": 0.1874, "num_input_tokens_seen": 23337472, "step": 23235 }, { "epoch": 12.32237539766702, "grad_norm": 0.016163736581802368, "learning_rate": 3.8566780335033045e-06, "loss": 0.0088, "num_input_tokens_seen": 23342208, "step": 23240 }, { "epoch": 12.325026511134677, "grad_norm": 3.0257349014282227, "learning_rate": 3.854425919362296e-06, "loss": 0.1377, "num_input_tokens_seen": 23347616, "step": 23245 }, { "epoch": 12.327677624602334, "grad_norm": 9.734068870544434, "learning_rate": 3.852174050485702e-06, "loss": 0.0406, "num_input_tokens_seen": 23352704, "step": 23250 }, { "epoch": 12.330328738069989, "grad_norm": 0.7745085954666138, "learning_rate": 3.849922427355638e-06, "loss": 0.0354, "num_input_tokens_seen": 23358432, "step": 23255 }, { "epoch": 12.332979851537646, "grad_norm": 28.688934326171875, "learning_rate": 3.847671050454176e-06, "loss": 0.0618, "num_input_tokens_seen": 23363360, "step": 23260 }, { "epoch": 12.335630965005302, "grad_norm": 1.9075018167495728, "learning_rate": 3.845419920263325e-06, "loss": 0.0405, "num_input_tokens_seen": 23368832, "step": 23265 }, { "epoch": 12.33828207847296, "grad_norm": 23.50029182434082, "learning_rate": 3.843169037265048e-06, "loss": 0.0456, "num_input_tokens_seen": 23373600, "step": 23270 }, { "epoch": 12.340933191940614, "grad_norm": 12.405180931091309, "learning_rate": 3.840918401941252e-06, "loss": 0.1217, "num_input_tokens_seen": 23377824, "step": 23275 }, { "epoch": 12.343584305408271, "grad_norm": 10.718242645263672, "learning_rate": 3.8386680147737955e-06, "loss": 0.0651, "num_input_tokens_seen": 23383264, "step": 23280 }, { "epoch": 12.346235418875928, "grad_norm": 1.8311774730682373, "learning_rate": 3.836417876244475e-06, "loss": 0.009, "num_input_tokens_seen": 23387872, "step": 23285 }, { "epoch": 12.348886532343585, "grad_norm": 51.88303756713867, "learning_rate": 3.834167986835045e-06, "loss": 0.117, "num_input_tokens_seen": 23392960, "step": 23290 }, { "epoch": 12.351537645811241, "grad_norm": 1.6375638246536255, "learning_rate": 3.831918347027197e-06, "loss": 0.1086, "num_input_tokens_seen": 23397440, "step": 23295 }, { "epoch": 12.354188759278896, "grad_norm": 1.0058798789978027, "learning_rate": 3.829668957302576e-06, "loss": 0.0972, "num_input_tokens_seen": 23401472, "step": 23300 }, { "epoch": 12.356839872746553, "grad_norm": 12.66418170928955, "learning_rate": 3.827419818142765e-06, "loss": 0.0251, "num_input_tokens_seen": 23406176, "step": 23305 }, { "epoch": 12.35949098621421, "grad_norm": 3.110311985015869, "learning_rate": 3.825170930029307e-06, "loss": 0.0213, "num_input_tokens_seen": 23410208, "step": 23310 }, { "epoch": 12.362142099681867, "grad_norm": 38.648475646972656, "learning_rate": 3.8229222934436764e-06, "loss": 0.1423, "num_input_tokens_seen": 23415392, "step": 23315 }, { "epoch": 12.364793213149524, "grad_norm": 5.009485244750977, "learning_rate": 3.820673908867304e-06, "loss": 0.0618, "num_input_tokens_seen": 23420384, "step": 23320 }, { "epoch": 12.367444326617179, "grad_norm": 15.373570442199707, "learning_rate": 3.818425776781562e-06, "loss": 0.0364, "num_input_tokens_seen": 23425792, "step": 23325 }, { "epoch": 12.370095440084835, "grad_norm": 33.262264251708984, "learning_rate": 3.816177897667767e-06, "loss": 0.1487, "num_input_tokens_seen": 23430368, "step": 23330 }, { "epoch": 12.372746553552492, "grad_norm": 13.140048027038574, "learning_rate": 3.8139302720071893e-06, "loss": 0.0289, "num_input_tokens_seen": 23435680, "step": 23335 }, { "epoch": 12.375397667020149, "grad_norm": 16.415159225463867, "learning_rate": 3.811682900281033e-06, "loss": 0.0696, "num_input_tokens_seen": 23440352, "step": 23340 }, { "epoch": 12.378048780487806, "grad_norm": 2.980651378631592, "learning_rate": 3.8094357829704608e-06, "loss": 0.0317, "num_input_tokens_seen": 23446048, "step": 23345 }, { "epoch": 12.38069989395546, "grad_norm": 0.801933228969574, "learning_rate": 3.807188920556571e-06, "loss": 0.018, "num_input_tokens_seen": 23451072, "step": 23350 }, { "epoch": 12.383351007423117, "grad_norm": 46.9923210144043, "learning_rate": 3.8049423135204127e-06, "loss": 0.0993, "num_input_tokens_seen": 23456544, "step": 23355 }, { "epoch": 12.386002120890774, "grad_norm": 9.598777770996094, "learning_rate": 3.802695962342977e-06, "loss": 0.0445, "num_input_tokens_seen": 23462048, "step": 23360 }, { "epoch": 12.388653234358431, "grad_norm": 5.819197654724121, "learning_rate": 3.800449867505205e-06, "loss": 0.0823, "num_input_tokens_seen": 23467200, "step": 23365 }, { "epoch": 12.391304347826088, "grad_norm": 23.655460357666016, "learning_rate": 3.7982040294879753e-06, "loss": 0.2657, "num_input_tokens_seen": 23471744, "step": 23370 }, { "epoch": 12.393955461293743, "grad_norm": 4.108153343200684, "learning_rate": 3.795958448772119e-06, "loss": 0.0634, "num_input_tokens_seen": 23476576, "step": 23375 }, { "epoch": 12.3966065747614, "grad_norm": 35.752742767333984, "learning_rate": 3.793713125838407e-06, "loss": 0.1441, "num_input_tokens_seen": 23482368, "step": 23380 }, { "epoch": 12.399257688229056, "grad_norm": 9.50537395477295, "learning_rate": 3.79146806116756e-06, "loss": 0.0237, "num_input_tokens_seen": 23487840, "step": 23385 }, { "epoch": 12.401908801696713, "grad_norm": 2.1012845039367676, "learning_rate": 3.7892232552402374e-06, "loss": 0.0786, "num_input_tokens_seen": 23493248, "step": 23390 }, { "epoch": 12.404559915164368, "grad_norm": 21.26127815246582, "learning_rate": 3.7869787085370487e-06, "loss": 0.0588, "num_input_tokens_seen": 23497824, "step": 23395 }, { "epoch": 12.407211028632025, "grad_norm": 53.147247314453125, "learning_rate": 3.784734421538543e-06, "loss": 0.1272, "num_input_tokens_seen": 23502464, "step": 23400 }, { "epoch": 12.409862142099682, "grad_norm": 39.08346939086914, "learning_rate": 3.782490394725219e-06, "loss": 0.075, "num_input_tokens_seen": 23506368, "step": 23405 }, { "epoch": 12.412513255567339, "grad_norm": 4.87150239944458, "learning_rate": 3.7802466285775142e-06, "loss": 0.058, "num_input_tokens_seen": 23510560, "step": 23410 }, { "epoch": 12.415164369034995, "grad_norm": 4.820864200592041, "learning_rate": 3.778003123575815e-06, "loss": 0.1879, "num_input_tokens_seen": 23515424, "step": 23415 }, { "epoch": 12.41781548250265, "grad_norm": 9.039822578430176, "learning_rate": 3.775759880200449e-06, "loss": 0.0144, "num_input_tokens_seen": 23521056, "step": 23420 }, { "epoch": 12.420466595970307, "grad_norm": 21.666709899902344, "learning_rate": 3.773516898931691e-06, "loss": 0.286, "num_input_tokens_seen": 23525632, "step": 23425 }, { "epoch": 12.423117709437964, "grad_norm": 3.597731351852417, "learning_rate": 3.7712741802497533e-06, "loss": 0.0134, "num_input_tokens_seen": 23529664, "step": 23430 }, { "epoch": 12.42576882290562, "grad_norm": 39.95635986328125, "learning_rate": 3.7690317246347997e-06, "loss": 0.147, "num_input_tokens_seen": 23535040, "step": 23435 }, { "epoch": 12.428419936373277, "grad_norm": 16.732173919677734, "learning_rate": 3.7667895325669312e-06, "loss": 0.0478, "num_input_tokens_seen": 23539200, "step": 23440 }, { "epoch": 12.431071049840932, "grad_norm": 3.7309093475341797, "learning_rate": 3.7645476045261998e-06, "loss": 0.0334, "num_input_tokens_seen": 23543744, "step": 23445 }, { "epoch": 12.43372216330859, "grad_norm": 32.25517272949219, "learning_rate": 3.7623059409925904e-06, "loss": 0.0553, "num_input_tokens_seen": 23549056, "step": 23450 }, { "epoch": 12.436373276776246, "grad_norm": 43.19534683227539, "learning_rate": 3.7600645424460413e-06, "loss": 0.1453, "num_input_tokens_seen": 23554240, "step": 23455 }, { "epoch": 12.439024390243903, "grad_norm": 5.76394510269165, "learning_rate": 3.757823409366431e-06, "loss": 0.0589, "num_input_tokens_seen": 23559968, "step": 23460 }, { "epoch": 12.44167550371156, "grad_norm": 36.660030364990234, "learning_rate": 3.7555825422335758e-06, "loss": 0.0972, "num_input_tokens_seen": 23565184, "step": 23465 }, { "epoch": 12.444326617179215, "grad_norm": 7.215158939361572, "learning_rate": 3.7533419415272426e-06, "loss": 0.0978, "num_input_tokens_seen": 23569984, "step": 23470 }, { "epoch": 12.446977730646871, "grad_norm": 17.466524124145508, "learning_rate": 3.7511016077271367e-06, "loss": 0.0986, "num_input_tokens_seen": 23574176, "step": 23475 }, { "epoch": 12.449628844114528, "grad_norm": 32.544193267822266, "learning_rate": 3.7488615413129113e-06, "loss": 0.0389, "num_input_tokens_seen": 23579104, "step": 23480 }, { "epoch": 12.452279957582185, "grad_norm": 79.72018432617188, "learning_rate": 3.746621742764153e-06, "loss": 0.1722, "num_input_tokens_seen": 23584320, "step": 23485 }, { "epoch": 12.454931071049842, "grad_norm": 14.014677047729492, "learning_rate": 3.744382212560401e-06, "loss": 0.0355, "num_input_tokens_seen": 23592608, "step": 23490 }, { "epoch": 12.457582184517497, "grad_norm": 40.86311340332031, "learning_rate": 3.7421429511811314e-06, "loss": 0.0834, "num_input_tokens_seen": 23597952, "step": 23495 }, { "epoch": 12.460233297985154, "grad_norm": 7.1789398193359375, "learning_rate": 3.739903959105766e-06, "loss": 0.0379, "num_input_tokens_seen": 23602304, "step": 23500 }, { "epoch": 12.46288441145281, "grad_norm": 5.193507671356201, "learning_rate": 3.7376652368136636e-06, "loss": 0.0585, "num_input_tokens_seen": 23607296, "step": 23505 }, { "epoch": 12.465535524920467, "grad_norm": 5.5227437019348145, "learning_rate": 3.735426784784132e-06, "loss": 0.0921, "num_input_tokens_seen": 23611872, "step": 23510 }, { "epoch": 12.468186638388122, "grad_norm": 3.71700382232666, "learning_rate": 3.7331886034964155e-06, "loss": 0.0164, "num_input_tokens_seen": 23616192, "step": 23515 }, { "epoch": 12.470837751855779, "grad_norm": 13.580169677734375, "learning_rate": 3.730950693429706e-06, "loss": 0.0708, "num_input_tokens_seen": 23621024, "step": 23520 }, { "epoch": 12.473488865323436, "grad_norm": 6.057679653167725, "learning_rate": 3.7287130550631302e-06, "loss": 0.0802, "num_input_tokens_seen": 23625920, "step": 23525 }, { "epoch": 12.476139978791092, "grad_norm": 0.6885125637054443, "learning_rate": 3.726475688875764e-06, "loss": 0.0213, "num_input_tokens_seen": 23630176, "step": 23530 }, { "epoch": 12.47879109225875, "grad_norm": 3.2503983974456787, "learning_rate": 3.724238595346619e-06, "loss": 0.0074, "num_input_tokens_seen": 23635712, "step": 23535 }, { "epoch": 12.481442205726404, "grad_norm": 17.53618049621582, "learning_rate": 3.722001774954655e-06, "loss": 0.0594, "num_input_tokens_seen": 23639968, "step": 23540 }, { "epoch": 12.484093319194061, "grad_norm": 44.15308380126953, "learning_rate": 3.719765228178764e-06, "loss": 0.118, "num_input_tokens_seen": 23644608, "step": 23545 }, { "epoch": 12.486744432661718, "grad_norm": 23.678640365600586, "learning_rate": 3.7175289554977895e-06, "loss": 0.1739, "num_input_tokens_seen": 23649312, "step": 23550 }, { "epoch": 12.489395546129375, "grad_norm": 5.865109920501709, "learning_rate": 3.7152929573905084e-06, "loss": 0.1873, "num_input_tokens_seen": 23654560, "step": 23555 }, { "epoch": 12.492046659597031, "grad_norm": 4.086319446563721, "learning_rate": 3.7130572343356454e-06, "loss": 0.0382, "num_input_tokens_seen": 23660320, "step": 23560 }, { "epoch": 12.494697773064686, "grad_norm": 50.403533935546875, "learning_rate": 3.7108217868118584e-06, "loss": 0.1702, "num_input_tokens_seen": 23664928, "step": 23565 }, { "epoch": 12.497348886532343, "grad_norm": 3.267645835876465, "learning_rate": 3.7085866152977547e-06, "loss": 0.0208, "num_input_tokens_seen": 23669312, "step": 23570 }, { "epoch": 12.5, "grad_norm": 29.33283042907715, "learning_rate": 3.706351720271877e-06, "loss": 0.0626, "num_input_tokens_seen": 23673984, "step": 23575 }, { "epoch": 12.502651113467657, "grad_norm": 0.3074789047241211, "learning_rate": 3.7041171022127122e-06, "loss": 0.0379, "num_input_tokens_seen": 23678240, "step": 23580 }, { "epoch": 12.505302226935314, "grad_norm": 2.005655288696289, "learning_rate": 3.7018827615986826e-06, "loss": 0.0553, "num_input_tokens_seen": 23683008, "step": 23585 }, { "epoch": 12.507953340402969, "grad_norm": 2.147614002227783, "learning_rate": 3.699648698908158e-06, "loss": 0.1781, "num_input_tokens_seen": 23687648, "step": 23590 }, { "epoch": 12.510604453870625, "grad_norm": 60.54421615600586, "learning_rate": 3.697414914619446e-06, "loss": 0.318, "num_input_tokens_seen": 23693536, "step": 23595 }, { "epoch": 12.513255567338282, "grad_norm": 1.103848934173584, "learning_rate": 3.6951814092107897e-06, "loss": 0.0497, "num_input_tokens_seen": 23699200, "step": 23600 }, { "epoch": 12.515906680805939, "grad_norm": 7.567850589752197, "learning_rate": 3.6929481831603804e-06, "loss": 0.0547, "num_input_tokens_seen": 23704160, "step": 23605 }, { "epoch": 12.518557794273596, "grad_norm": 3.3752145767211914, "learning_rate": 3.690715236946344e-06, "loss": 0.0387, "num_input_tokens_seen": 23709696, "step": 23610 }, { "epoch": 12.52120890774125, "grad_norm": 0.47395962476730347, "learning_rate": 3.688482571046751e-06, "loss": 0.015, "num_input_tokens_seen": 23713952, "step": 23615 }, { "epoch": 12.523860021208908, "grad_norm": 45.97607421875, "learning_rate": 3.6862501859396062e-06, "loss": 0.2669, "num_input_tokens_seen": 23719072, "step": 23620 }, { "epoch": 12.526511134676564, "grad_norm": 16.14217758178711, "learning_rate": 3.6840180821028594e-06, "loss": 0.1329, "num_input_tokens_seen": 23724032, "step": 23625 }, { "epoch": 12.529162248144221, "grad_norm": 20.116079330444336, "learning_rate": 3.6817862600143968e-06, "loss": 0.0253, "num_input_tokens_seen": 23730880, "step": 23630 }, { "epoch": 12.531813361611878, "grad_norm": 17.94034767150879, "learning_rate": 3.6795547201520487e-06, "loss": 0.0225, "num_input_tokens_seen": 23735488, "step": 23635 }, { "epoch": 12.534464475079533, "grad_norm": 25.825851440429688, "learning_rate": 3.6773234629935774e-06, "loss": 0.0631, "num_input_tokens_seen": 23741312, "step": 23640 }, { "epoch": 12.53711558854719, "grad_norm": 48.9227409362793, "learning_rate": 3.675092489016693e-06, "loss": 0.0908, "num_input_tokens_seen": 23745440, "step": 23645 }, { "epoch": 12.539766702014846, "grad_norm": 15.038290023803711, "learning_rate": 3.672861798699038e-06, "loss": 0.108, "num_input_tokens_seen": 23751072, "step": 23650 }, { "epoch": 12.542417815482503, "grad_norm": 18.252275466918945, "learning_rate": 3.6706313925182012e-06, "loss": 0.0185, "num_input_tokens_seen": 23756768, "step": 23655 }, { "epoch": 12.545068928950158, "grad_norm": 46.213592529296875, "learning_rate": 3.668401270951704e-06, "loss": 0.0943, "num_input_tokens_seen": 23761056, "step": 23660 }, { "epoch": 12.547720042417815, "grad_norm": 9.93425464630127, "learning_rate": 3.6661714344770092e-06, "loss": 0.2213, "num_input_tokens_seen": 23764960, "step": 23665 }, { "epoch": 12.550371155885472, "grad_norm": 32.42444610595703, "learning_rate": 3.663941883571521e-06, "loss": 0.1542, "num_input_tokens_seen": 23769440, "step": 23670 }, { "epoch": 12.553022269353129, "grad_norm": 2.861753463745117, "learning_rate": 3.6617126187125807e-06, "loss": 0.1048, "num_input_tokens_seen": 23774016, "step": 23675 }, { "epoch": 12.555673382820785, "grad_norm": 56.98697280883789, "learning_rate": 3.6594836403774644e-06, "loss": 0.1296, "num_input_tokens_seen": 23779040, "step": 23680 }, { "epoch": 12.55832449628844, "grad_norm": 0.12480717152357101, "learning_rate": 3.657254949043394e-06, "loss": 0.03, "num_input_tokens_seen": 23784768, "step": 23685 }, { "epoch": 12.560975609756097, "grad_norm": 43.51899719238281, "learning_rate": 3.6550265451875244e-06, "loss": 0.1145, "num_input_tokens_seen": 23789632, "step": 23690 }, { "epoch": 12.563626723223754, "grad_norm": 9.216156005859375, "learning_rate": 3.6527984292869546e-06, "loss": 0.0236, "num_input_tokens_seen": 23794112, "step": 23695 }, { "epoch": 12.56627783669141, "grad_norm": 19.46098518371582, "learning_rate": 3.6505706018187126e-06, "loss": 0.0438, "num_input_tokens_seen": 23798336, "step": 23700 }, { "epoch": 12.568928950159068, "grad_norm": 28.838411331176758, "learning_rate": 3.648343063259776e-06, "loss": 0.059, "num_input_tokens_seen": 23803200, "step": 23705 }, { "epoch": 12.571580063626723, "grad_norm": 25.465972900390625, "learning_rate": 3.6461158140870514e-06, "loss": 0.0405, "num_input_tokens_seen": 23808576, "step": 23710 }, { "epoch": 12.57423117709438, "grad_norm": 3.5183165073394775, "learning_rate": 3.64388885477739e-06, "loss": 0.1461, "num_input_tokens_seen": 23813408, "step": 23715 }, { "epoch": 12.576882290562036, "grad_norm": 45.858882904052734, "learning_rate": 3.6416621858075737e-06, "loss": 0.1357, "num_input_tokens_seen": 23818432, "step": 23720 }, { "epoch": 12.579533404029693, "grad_norm": 39.4474983215332, "learning_rate": 3.6394358076543302e-06, "loss": 0.1726, "num_input_tokens_seen": 23823392, "step": 23725 }, { "epoch": 12.58218451749735, "grad_norm": 19.84611701965332, "learning_rate": 3.6372097207943193e-06, "loss": 0.109, "num_input_tokens_seen": 23828800, "step": 23730 }, { "epoch": 12.584835630965005, "grad_norm": 7.7085862159729, "learning_rate": 3.634983925704143e-06, "loss": 0.0496, "num_input_tokens_seen": 23832992, "step": 23735 }, { "epoch": 12.587486744432661, "grad_norm": 26.769432067871094, "learning_rate": 3.632758422860335e-06, "loss": 0.0886, "num_input_tokens_seen": 23838656, "step": 23740 }, { "epoch": 12.590137857900318, "grad_norm": 45.263729095458984, "learning_rate": 3.6305332127393712e-06, "loss": 0.083, "num_input_tokens_seen": 23843264, "step": 23745 }, { "epoch": 12.592788971367975, "grad_norm": 27.178428649902344, "learning_rate": 3.628308295817664e-06, "loss": 0.1126, "num_input_tokens_seen": 23848128, "step": 23750 }, { "epoch": 12.59544008483563, "grad_norm": 32.75479507446289, "learning_rate": 3.62608367257156e-06, "loss": 0.2245, "num_input_tokens_seen": 23853728, "step": 23755 }, { "epoch": 12.598091198303287, "grad_norm": 12.981430053710938, "learning_rate": 3.623859343477346e-06, "loss": 0.1041, "num_input_tokens_seen": 23858912, "step": 23760 }, { "epoch": 12.600742311770944, "grad_norm": 11.345250129699707, "learning_rate": 3.621635309011246e-06, "loss": 0.0962, "num_input_tokens_seen": 23864384, "step": 23765 }, { "epoch": 12.6033934252386, "grad_norm": 7.402377128601074, "learning_rate": 3.6194115696494214e-06, "loss": 0.1372, "num_input_tokens_seen": 23868576, "step": 23770 }, { "epoch": 12.606044538706257, "grad_norm": 28.217126846313477, "learning_rate": 3.6171881258679643e-06, "loss": 0.0522, "num_input_tokens_seen": 23875072, "step": 23775 }, { "epoch": 12.608695652173914, "grad_norm": 0.41204896569252014, "learning_rate": 3.614964978142913e-06, "loss": 0.0399, "num_input_tokens_seen": 23879616, "step": 23780 }, { "epoch": 12.611346765641569, "grad_norm": 29.30980110168457, "learning_rate": 3.612742126950234e-06, "loss": 0.0474, "num_input_tokens_seen": 23884640, "step": 23785 }, { "epoch": 12.613997879109226, "grad_norm": 8.730719566345215, "learning_rate": 3.610519572765837e-06, "loss": 0.0398, "num_input_tokens_seen": 23890016, "step": 23790 }, { "epoch": 12.616648992576883, "grad_norm": 6.895164966583252, "learning_rate": 3.6082973160655627e-06, "loss": 0.0414, "num_input_tokens_seen": 23894624, "step": 23795 }, { "epoch": 12.61930010604454, "grad_norm": 5.444538116455078, "learning_rate": 3.6060753573251916e-06, "loss": 0.0178, "num_input_tokens_seen": 23899360, "step": 23800 }, { "epoch": 12.621951219512194, "grad_norm": 0.6850932836532593, "learning_rate": 3.6038536970204385e-06, "loss": 0.0577, "num_input_tokens_seen": 23903744, "step": 23805 }, { "epoch": 12.624602332979851, "grad_norm": 7.228841304779053, "learning_rate": 3.6016323356269573e-06, "loss": 0.0412, "num_input_tokens_seen": 23908352, "step": 23810 }, { "epoch": 12.627253446447508, "grad_norm": 6.90790319442749, "learning_rate": 3.5994112736203313e-06, "loss": 0.096, "num_input_tokens_seen": 23912544, "step": 23815 }, { "epoch": 12.629904559915165, "grad_norm": 5.472381114959717, "learning_rate": 3.5971905114760885e-06, "loss": 0.0229, "num_input_tokens_seen": 23917952, "step": 23820 }, { "epoch": 12.632555673382821, "grad_norm": 61.083683013916016, "learning_rate": 3.594970049669685e-06, "loss": 0.1736, "num_input_tokens_seen": 23922656, "step": 23825 }, { "epoch": 12.635206786850476, "grad_norm": 0.007645610719919205, "learning_rate": 3.5927498886765193e-06, "loss": 0.0241, "num_input_tokens_seen": 23928032, "step": 23830 }, { "epoch": 12.637857900318133, "grad_norm": 0.7133540511131287, "learning_rate": 3.590530028971918e-06, "loss": 0.0095, "num_input_tokens_seen": 23932384, "step": 23835 }, { "epoch": 12.64050901378579, "grad_norm": 38.5020637512207, "learning_rate": 3.58831047103115e-06, "loss": 0.1454, "num_input_tokens_seen": 23937504, "step": 23840 }, { "epoch": 12.643160127253447, "grad_norm": 2.3834948539733887, "learning_rate": 3.586091215329416e-06, "loss": 0.2754, "num_input_tokens_seen": 23942784, "step": 23845 }, { "epoch": 12.645811240721104, "grad_norm": 28.62965965270996, "learning_rate": 3.583872262341854e-06, "loss": 0.042, "num_input_tokens_seen": 23948128, "step": 23850 }, { "epoch": 12.648462354188759, "grad_norm": 2.876948595046997, "learning_rate": 3.5816536125435332e-06, "loss": 0.0592, "num_input_tokens_seen": 23953888, "step": 23855 }, { "epoch": 12.651113467656415, "grad_norm": 3.685486316680908, "learning_rate": 3.5794352664094644e-06, "loss": 0.0224, "num_input_tokens_seen": 23958144, "step": 23860 }, { "epoch": 12.653764581124072, "grad_norm": 2.998018264770508, "learning_rate": 3.5772172244145862e-06, "loss": 0.0603, "num_input_tokens_seen": 23963136, "step": 23865 }, { "epoch": 12.656415694591729, "grad_norm": 1.964123010635376, "learning_rate": 3.574999487033779e-06, "loss": 0.0455, "num_input_tokens_seen": 23968000, "step": 23870 }, { "epoch": 12.659066808059386, "grad_norm": 3.104585647583008, "learning_rate": 3.5727820547418525e-06, "loss": 0.0624, "num_input_tokens_seen": 23973088, "step": 23875 }, { "epoch": 12.66171792152704, "grad_norm": 0.7070371508598328, "learning_rate": 3.5705649280135513e-06, "loss": 0.0917, "num_input_tokens_seen": 23977920, "step": 23880 }, { "epoch": 12.664369034994698, "grad_norm": 0.7933787703514099, "learning_rate": 3.5683481073235614e-06, "loss": 0.1524, "num_input_tokens_seen": 23983168, "step": 23885 }, { "epoch": 12.667020148462354, "grad_norm": 32.11665344238281, "learning_rate": 3.5661315931464924e-06, "loss": 0.1085, "num_input_tokens_seen": 23988384, "step": 23890 }, { "epoch": 12.669671261930011, "grad_norm": 11.587824821472168, "learning_rate": 3.5639153859569e-06, "loss": 0.0374, "num_input_tokens_seen": 23993024, "step": 23895 }, { "epoch": 12.672322375397666, "grad_norm": 0.4336709678173065, "learning_rate": 3.561699486229263e-06, "loss": 0.0219, "num_input_tokens_seen": 23997152, "step": 23900 }, { "epoch": 12.674973488865323, "grad_norm": 27.553035736083984, "learning_rate": 3.5594838944380056e-06, "loss": 0.2522, "num_input_tokens_seen": 24002176, "step": 23905 }, { "epoch": 12.67762460233298, "grad_norm": 47.04365921020508, "learning_rate": 3.557268611057473e-06, "loss": 0.1097, "num_input_tokens_seen": 24006560, "step": 23910 }, { "epoch": 12.680275715800637, "grad_norm": 34.89047622680664, "learning_rate": 3.5550536365619594e-06, "loss": 0.1302, "num_input_tokens_seen": 24011424, "step": 23915 }, { "epoch": 12.682926829268293, "grad_norm": 1.01156747341156, "learning_rate": 3.5528389714256782e-06, "loss": 0.0319, "num_input_tokens_seen": 24018656, "step": 23920 }, { "epoch": 12.685577942735948, "grad_norm": 41.31871032714844, "learning_rate": 3.550624616122788e-06, "loss": 0.2025, "num_input_tokens_seen": 24023872, "step": 23925 }, { "epoch": 12.688229056203605, "grad_norm": 0.944052517414093, "learning_rate": 3.5484105711273735e-06, "loss": 0.0301, "num_input_tokens_seen": 24028064, "step": 23930 }, { "epoch": 12.690880169671262, "grad_norm": 5.673384189605713, "learning_rate": 3.54619683691346e-06, "loss": 0.0494, "num_input_tokens_seen": 24032896, "step": 23935 }, { "epoch": 12.693531283138919, "grad_norm": 1.1966207027435303, "learning_rate": 3.5439834139549968e-06, "loss": 0.0613, "num_input_tokens_seen": 24038976, "step": 23940 }, { "epoch": 12.696182396606575, "grad_norm": 5.5614848136901855, "learning_rate": 3.5417703027258752e-06, "loss": 0.0622, "num_input_tokens_seen": 24043136, "step": 23945 }, { "epoch": 12.69883351007423, "grad_norm": 0.8945432305335999, "learning_rate": 3.5395575036999165e-06, "loss": 0.0178, "num_input_tokens_seen": 24048512, "step": 23950 }, { "epoch": 12.701484623541887, "grad_norm": 10.006369590759277, "learning_rate": 3.5373450173508755e-06, "loss": 0.1687, "num_input_tokens_seen": 24052480, "step": 23955 }, { "epoch": 12.704135737009544, "grad_norm": 3.864335298538208, "learning_rate": 3.5351328441524367e-06, "loss": 0.0235, "num_input_tokens_seen": 24057856, "step": 23960 }, { "epoch": 12.7067868504772, "grad_norm": 1.3765017986297607, "learning_rate": 3.532920984578225e-06, "loss": 0.0261, "num_input_tokens_seen": 24063008, "step": 23965 }, { "epoch": 12.709437963944858, "grad_norm": 30.635120391845703, "learning_rate": 3.530709439101789e-06, "loss": 0.0603, "num_input_tokens_seen": 24067168, "step": 23970 }, { "epoch": 12.712089077412513, "grad_norm": 7.894904613494873, "learning_rate": 3.5284982081966205e-06, "loss": 0.1287, "num_input_tokens_seen": 24072768, "step": 23975 }, { "epoch": 12.71474019088017, "grad_norm": 1.022362470626831, "learning_rate": 3.526287292336133e-06, "loss": 0.0708, "num_input_tokens_seen": 24077632, "step": 23980 }, { "epoch": 12.717391304347826, "grad_norm": 2.471843957901001, "learning_rate": 3.5240766919936807e-06, "loss": 0.0161, "num_input_tokens_seen": 24084448, "step": 23985 }, { "epoch": 12.720042417815483, "grad_norm": 0.8075243830680847, "learning_rate": 3.5218664076425455e-06, "loss": 0.0818, "num_input_tokens_seen": 24089280, "step": 23990 }, { "epoch": 12.72269353128314, "grad_norm": 16.212583541870117, "learning_rate": 3.5196564397559464e-06, "loss": 0.1037, "num_input_tokens_seen": 24094272, "step": 23995 }, { "epoch": 12.725344644750795, "grad_norm": 0.3547593951225281, "learning_rate": 3.5174467888070283e-06, "loss": 0.0587, "num_input_tokens_seen": 24099040, "step": 24000 }, { "epoch": 12.727995758218452, "grad_norm": 0.2762300968170166, "learning_rate": 3.515237455268874e-06, "loss": 0.0738, "num_input_tokens_seen": 24104448, "step": 24005 }, { "epoch": 12.730646871686108, "grad_norm": 3.0881576538085938, "learning_rate": 3.5130284396144966e-06, "loss": 0.0619, "num_input_tokens_seen": 24109792, "step": 24010 }, { "epoch": 12.733297985153765, "grad_norm": 4.522733211517334, "learning_rate": 3.5108197423168366e-06, "loss": 0.0334, "num_input_tokens_seen": 24114624, "step": 24015 }, { "epoch": 12.735949098621422, "grad_norm": 11.829642295837402, "learning_rate": 3.508611363848774e-06, "loss": 0.0997, "num_input_tokens_seen": 24120800, "step": 24020 }, { "epoch": 12.738600212089077, "grad_norm": 8.352027893066406, "learning_rate": 3.5064033046831157e-06, "loss": 0.2996, "num_input_tokens_seen": 24126624, "step": 24025 }, { "epoch": 12.741251325556734, "grad_norm": 16.392784118652344, "learning_rate": 3.504195565292603e-06, "loss": 0.0662, "num_input_tokens_seen": 24132064, "step": 24030 }, { "epoch": 12.74390243902439, "grad_norm": 12.21560287475586, "learning_rate": 3.5019881461499033e-06, "loss": 0.076, "num_input_tokens_seen": 24137440, "step": 24035 }, { "epoch": 12.746553552492047, "grad_norm": 11.149968147277832, "learning_rate": 3.499781047727623e-06, "loss": 0.1464, "num_input_tokens_seen": 24141632, "step": 24040 }, { "epoch": 12.749204665959702, "grad_norm": 1.278657078742981, "learning_rate": 3.497574270498294e-06, "loss": 0.0759, "num_input_tokens_seen": 24146112, "step": 24045 }, { "epoch": 12.751855779427359, "grad_norm": 24.342702865600586, "learning_rate": 3.495367814934384e-06, "loss": 0.0768, "num_input_tokens_seen": 24150368, "step": 24050 }, { "epoch": 12.754506892895016, "grad_norm": 19.155536651611328, "learning_rate": 3.4931616815082847e-06, "loss": 0.0314, "num_input_tokens_seen": 24155040, "step": 24055 }, { "epoch": 12.757158006362673, "grad_norm": 8.944880485534668, "learning_rate": 3.4909558706923286e-06, "loss": 0.0135, "num_input_tokens_seen": 24159968, "step": 24060 }, { "epoch": 12.75980911983033, "grad_norm": 27.967496871948242, "learning_rate": 3.48875038295877e-06, "loss": 0.023, "num_input_tokens_seen": 24165216, "step": 24065 }, { "epoch": 12.762460233297984, "grad_norm": 22.723487854003906, "learning_rate": 3.4865452187798026e-06, "loss": 0.0838, "num_input_tokens_seen": 24169600, "step": 24070 }, { "epoch": 12.765111346765641, "grad_norm": 21.20619773864746, "learning_rate": 3.484340378627541e-06, "loss": 0.0859, "num_input_tokens_seen": 24175296, "step": 24075 }, { "epoch": 12.767762460233298, "grad_norm": 43.984771728515625, "learning_rate": 3.48213586297404e-06, "loss": 0.0505, "num_input_tokens_seen": 24179936, "step": 24080 }, { "epoch": 12.770413573700955, "grad_norm": 5.840571880340576, "learning_rate": 3.479931672291277e-06, "loss": 0.1397, "num_input_tokens_seen": 24184992, "step": 24085 }, { "epoch": 12.773064687168612, "grad_norm": 0.8109363317489624, "learning_rate": 3.477727807051168e-06, "loss": 0.1227, "num_input_tokens_seen": 24191072, "step": 24090 }, { "epoch": 12.775715800636267, "grad_norm": 46.958133697509766, "learning_rate": 3.4755242677255494e-06, "loss": 0.0759, "num_input_tokens_seen": 24195200, "step": 24095 }, { "epoch": 12.778366914103923, "grad_norm": 2.073955774307251, "learning_rate": 3.473321054786197e-06, "loss": 0.1781, "num_input_tokens_seen": 24199648, "step": 24100 }, { "epoch": 12.78101802757158, "grad_norm": 0.8877121210098267, "learning_rate": 3.4711181687048114e-06, "loss": 0.0185, "num_input_tokens_seen": 24205760, "step": 24105 }, { "epoch": 12.783669141039237, "grad_norm": 6.099052906036377, "learning_rate": 3.4689156099530274e-06, "loss": 0.007, "num_input_tokens_seen": 24210976, "step": 24110 }, { "epoch": 12.786320254506894, "grad_norm": 0.6939918398857117, "learning_rate": 3.4667133790024023e-06, "loss": 0.1716, "num_input_tokens_seen": 24215936, "step": 24115 }, { "epoch": 12.788971367974549, "grad_norm": 11.079219818115234, "learning_rate": 3.4645114763244316e-06, "loss": 0.1432, "num_input_tokens_seen": 24220672, "step": 24120 }, { "epoch": 12.791622481442205, "grad_norm": 7.726589202880859, "learning_rate": 3.4623099023905347e-06, "loss": 0.0402, "num_input_tokens_seen": 24224896, "step": 24125 }, { "epoch": 12.794273594909862, "grad_norm": 4.146066665649414, "learning_rate": 3.4601086576720665e-06, "loss": 0.0366, "num_input_tokens_seen": 24230592, "step": 24130 }, { "epoch": 12.796924708377519, "grad_norm": 64.76607513427734, "learning_rate": 3.457907742640303e-06, "loss": 0.0903, "num_input_tokens_seen": 24235712, "step": 24135 }, { "epoch": 12.799575821845174, "grad_norm": 29.805700302124023, "learning_rate": 3.455707157766457e-06, "loss": 0.0955, "num_input_tokens_seen": 24241632, "step": 24140 }, { "epoch": 12.80222693531283, "grad_norm": 11.539393424987793, "learning_rate": 3.4535069035216675e-06, "loss": 0.0171, "num_input_tokens_seen": 24247136, "step": 24145 }, { "epoch": 12.804878048780488, "grad_norm": 18.147916793823242, "learning_rate": 3.4513069803770044e-06, "loss": 0.0447, "num_input_tokens_seen": 24251584, "step": 24150 }, { "epoch": 12.807529162248144, "grad_norm": 38.16765213012695, "learning_rate": 3.4491073888034635e-06, "loss": 0.071, "num_input_tokens_seen": 24257440, "step": 24155 }, { "epoch": 12.810180275715801, "grad_norm": 14.610498428344727, "learning_rate": 3.446908129271972e-06, "loss": 0.0608, "num_input_tokens_seen": 24262624, "step": 24160 }, { "epoch": 12.812831389183458, "grad_norm": 2.9193170070648193, "learning_rate": 3.444709202253387e-06, "loss": 0.1496, "num_input_tokens_seen": 24267680, "step": 24165 }, { "epoch": 12.815482502651113, "grad_norm": 2.7911620140075684, "learning_rate": 3.4425106082184905e-06, "loss": 0.0659, "num_input_tokens_seen": 24273056, "step": 24170 }, { "epoch": 12.81813361611877, "grad_norm": 5.252361297607422, "learning_rate": 3.4403123476379987e-06, "loss": 0.2743, "num_input_tokens_seen": 24278432, "step": 24175 }, { "epoch": 12.820784729586427, "grad_norm": 14.226871490478516, "learning_rate": 3.43811442098255e-06, "loss": 0.1063, "num_input_tokens_seen": 24283808, "step": 24180 }, { "epoch": 12.823435843054083, "grad_norm": 4.721558570861816, "learning_rate": 3.435916828722719e-06, "loss": 0.1259, "num_input_tokens_seen": 24291680, "step": 24185 }, { "epoch": 12.826086956521738, "grad_norm": 0.8999633193016052, "learning_rate": 3.4337195713290005e-06, "loss": 0.0138, "num_input_tokens_seen": 24297152, "step": 24190 }, { "epoch": 12.828738069989395, "grad_norm": 46.569156646728516, "learning_rate": 3.431522649271824e-06, "loss": 0.1039, "num_input_tokens_seen": 24301376, "step": 24195 }, { "epoch": 12.831389183457052, "grad_norm": 40.47321319580078, "learning_rate": 3.429326063021543e-06, "loss": 0.2578, "num_input_tokens_seen": 24306976, "step": 24200 }, { "epoch": 12.834040296924709, "grad_norm": 5.696269512176514, "learning_rate": 3.427129813048444e-06, "loss": 0.0411, "num_input_tokens_seen": 24312128, "step": 24205 }, { "epoch": 12.836691410392365, "grad_norm": 8.33916187286377, "learning_rate": 3.4249338998227344e-06, "loss": 0.1453, "num_input_tokens_seen": 24317856, "step": 24210 }, { "epoch": 12.83934252386002, "grad_norm": 55.04880905151367, "learning_rate": 3.422738323814556e-06, "loss": 0.2045, "num_input_tokens_seen": 24321312, "step": 24215 }, { "epoch": 12.841993637327677, "grad_norm": 0.06040222942829132, "learning_rate": 3.420543085493975e-06, "loss": 0.0078, "num_input_tokens_seen": 24326016, "step": 24220 }, { "epoch": 12.844644750795334, "grad_norm": 0.5434038639068604, "learning_rate": 3.4183481853309876e-06, "loss": 0.0027, "num_input_tokens_seen": 24331232, "step": 24225 }, { "epoch": 12.84729586426299, "grad_norm": 5.63760232925415, "learning_rate": 3.416153623795513e-06, "loss": 0.2464, "num_input_tokens_seen": 24336032, "step": 24230 }, { "epoch": 12.849946977730648, "grad_norm": 12.382491111755371, "learning_rate": 3.413959401357404e-06, "loss": 0.1277, "num_input_tokens_seen": 24340736, "step": 24235 }, { "epoch": 12.852598091198303, "grad_norm": 20.344837188720703, "learning_rate": 3.4117655184864355e-06, "loss": 0.0779, "num_input_tokens_seen": 24345152, "step": 24240 }, { "epoch": 12.85524920466596, "grad_norm": 40.950355529785156, "learning_rate": 3.409571975652316e-06, "loss": 0.0802, "num_input_tokens_seen": 24349824, "step": 24245 }, { "epoch": 12.857900318133616, "grad_norm": 0.43097972869873047, "learning_rate": 3.4073787733246734e-06, "loss": 0.0702, "num_input_tokens_seen": 24354016, "step": 24250 }, { "epoch": 12.860551431601273, "grad_norm": 0.6924433708190918, "learning_rate": 3.405185911973068e-06, "loss": 0.0147, "num_input_tokens_seen": 24358496, "step": 24255 }, { "epoch": 12.86320254506893, "grad_norm": 51.26555252075195, "learning_rate": 3.402993392066985e-06, "loss": 0.1735, "num_input_tokens_seen": 24363136, "step": 24260 }, { "epoch": 12.865853658536585, "grad_norm": 17.806358337402344, "learning_rate": 3.4008012140758396e-06, "loss": 0.0645, "num_input_tokens_seen": 24368512, "step": 24265 }, { "epoch": 12.868504772004242, "grad_norm": 14.30016040802002, "learning_rate": 3.3986093784689677e-06, "loss": 0.023, "num_input_tokens_seen": 24373056, "step": 24270 }, { "epoch": 12.871155885471898, "grad_norm": 54.225711822509766, "learning_rate": 3.396417885715639e-06, "loss": 0.1653, "num_input_tokens_seen": 24377632, "step": 24275 }, { "epoch": 12.873806998939555, "grad_norm": 1.9784555435180664, "learning_rate": 3.3942267362850436e-06, "loss": 0.0303, "num_input_tokens_seen": 24383072, "step": 24280 }, { "epoch": 12.87645811240721, "grad_norm": 1.2069469690322876, "learning_rate": 3.3920359306463035e-06, "loss": 0.0239, "num_input_tokens_seen": 24387232, "step": 24285 }, { "epoch": 12.879109225874867, "grad_norm": 34.656986236572266, "learning_rate": 3.3898454692684622e-06, "loss": 0.1735, "num_input_tokens_seen": 24392864, "step": 24290 }, { "epoch": 12.881760339342524, "grad_norm": 20.439624786376953, "learning_rate": 3.3876553526204915e-06, "loss": 0.0479, "num_input_tokens_seen": 24396768, "step": 24295 }, { "epoch": 12.88441145281018, "grad_norm": 10.786328315734863, "learning_rate": 3.3854655811712934e-06, "loss": 0.0306, "num_input_tokens_seen": 24401152, "step": 24300 }, { "epoch": 12.887062566277837, "grad_norm": 1.0338730812072754, "learning_rate": 3.383276155389687e-06, "loss": 0.0308, "num_input_tokens_seen": 24405568, "step": 24305 }, { "epoch": 12.889713679745492, "grad_norm": 0.8201693892478943, "learning_rate": 3.3810870757444263e-06, "loss": 0.2417, "num_input_tokens_seen": 24410080, "step": 24310 }, { "epoch": 12.892364793213149, "grad_norm": 1.2002590894699097, "learning_rate": 3.3788983427041856e-06, "loss": 0.1979, "num_input_tokens_seen": 24416800, "step": 24315 }, { "epoch": 12.895015906680806, "grad_norm": 0.8420317769050598, "learning_rate": 3.3767099567375695e-06, "loss": 0.0414, "num_input_tokens_seen": 24421600, "step": 24320 }, { "epoch": 12.897667020148463, "grad_norm": 51.900386810302734, "learning_rate": 3.3745219183131016e-06, "loss": 0.152, "num_input_tokens_seen": 24425664, "step": 24325 }, { "epoch": 12.90031813361612, "grad_norm": 1.945263385772705, "learning_rate": 3.3723342278992395e-06, "loss": 0.0308, "num_input_tokens_seen": 24430752, "step": 24330 }, { "epoch": 12.902969247083774, "grad_norm": 7.621275424957275, "learning_rate": 3.3701468859643583e-06, "loss": 0.0292, "num_input_tokens_seen": 24435360, "step": 24335 }, { "epoch": 12.905620360551431, "grad_norm": 37.1103515625, "learning_rate": 3.3679598929767658e-06, "loss": 0.1399, "num_input_tokens_seen": 24441056, "step": 24340 }, { "epoch": 12.908271474019088, "grad_norm": 33.65269470214844, "learning_rate": 3.365773249404688e-06, "loss": 0.1276, "num_input_tokens_seen": 24445888, "step": 24345 }, { "epoch": 12.910922587486745, "grad_norm": 11.648741722106934, "learning_rate": 3.3635869557162825e-06, "loss": 0.2229, "num_input_tokens_seen": 24451968, "step": 24350 }, { "epoch": 12.913573700954402, "grad_norm": 1.5348529815673828, "learning_rate": 3.3614010123796257e-06, "loss": 0.0138, "num_input_tokens_seen": 24455776, "step": 24355 }, { "epoch": 12.916224814422057, "grad_norm": 41.78534698486328, "learning_rate": 3.359215419862727e-06, "loss": 0.1185, "num_input_tokens_seen": 24460928, "step": 24360 }, { "epoch": 12.918875927889713, "grad_norm": 6.842302322387695, "learning_rate": 3.3570301786335114e-06, "loss": 0.0797, "num_input_tokens_seen": 24467392, "step": 24365 }, { "epoch": 12.92152704135737, "grad_norm": 5.1539154052734375, "learning_rate": 3.3548452891598367e-06, "loss": 0.1501, "num_input_tokens_seen": 24471296, "step": 24370 }, { "epoch": 12.924178154825027, "grad_norm": 1.602840542793274, "learning_rate": 3.352660751909479e-06, "loss": 0.007, "num_input_tokens_seen": 24476192, "step": 24375 }, { "epoch": 12.926829268292684, "grad_norm": 58.122314453125, "learning_rate": 3.3504765673501462e-06, "loss": 0.4307, "num_input_tokens_seen": 24480960, "step": 24380 }, { "epoch": 12.929480381760339, "grad_norm": 0.5833867788314819, "learning_rate": 3.348292735949461e-06, "loss": 0.0859, "num_input_tokens_seen": 24485184, "step": 24385 }, { "epoch": 12.932131495227996, "grad_norm": 38.40300369262695, "learning_rate": 3.346109258174981e-06, "loss": 0.2417, "num_input_tokens_seen": 24490752, "step": 24390 }, { "epoch": 12.934782608695652, "grad_norm": 4.357458114624023, "learning_rate": 3.3439261344941798e-06, "loss": 0.044, "num_input_tokens_seen": 24494976, "step": 24395 }, { "epoch": 12.93743372216331, "grad_norm": 13.89651870727539, "learning_rate": 3.341743365374461e-06, "loss": 0.1239, "num_input_tokens_seen": 24499744, "step": 24400 }, { "epoch": 12.940084835630966, "grad_norm": 33.31540298461914, "learning_rate": 3.339560951283147e-06, "loss": 0.0561, "num_input_tokens_seen": 24504096, "step": 24405 }, { "epoch": 12.942735949098621, "grad_norm": 1.9048172235488892, "learning_rate": 3.3373788926874893e-06, "loss": 0.0365, "num_input_tokens_seen": 24508992, "step": 24410 }, { "epoch": 12.945387062566278, "grad_norm": 12.1661958694458, "learning_rate": 3.33519719005466e-06, "loss": 0.0412, "num_input_tokens_seen": 24513312, "step": 24415 }, { "epoch": 12.948038176033934, "grad_norm": 0.3454064428806305, "learning_rate": 3.333015843851757e-06, "loss": 0.0054, "num_input_tokens_seen": 24517920, "step": 24420 }, { "epoch": 12.950689289501591, "grad_norm": 2.017723798751831, "learning_rate": 3.3308348545457988e-06, "loss": 0.0162, "num_input_tokens_seen": 24523168, "step": 24425 }, { "epoch": 12.953340402969246, "grad_norm": 22.530162811279297, "learning_rate": 3.3286542226037293e-06, "loss": 0.1015, "num_input_tokens_seen": 24528992, "step": 24430 }, { "epoch": 12.955991516436903, "grad_norm": 52.68104553222656, "learning_rate": 3.32647394849242e-06, "loss": 0.0672, "num_input_tokens_seen": 24533632, "step": 24435 }, { "epoch": 12.95864262990456, "grad_norm": 73.32913208007812, "learning_rate": 3.3242940326786564e-06, "loss": 0.1456, "num_input_tokens_seen": 24539456, "step": 24440 }, { "epoch": 12.961293743372217, "grad_norm": 6.267942428588867, "learning_rate": 3.3221144756291566e-06, "loss": 0.0789, "num_input_tokens_seen": 24543808, "step": 24445 }, { "epoch": 12.963944856839873, "grad_norm": 39.650142669677734, "learning_rate": 3.3199352778105565e-06, "loss": 0.094, "num_input_tokens_seen": 24548672, "step": 24450 }, { "epoch": 12.966595970307528, "grad_norm": 54.9967155456543, "learning_rate": 3.3177564396894186e-06, "loss": 0.0552, "num_input_tokens_seen": 24553760, "step": 24455 }, { "epoch": 12.969247083775185, "grad_norm": 3.947209119796753, "learning_rate": 3.315577961732222e-06, "loss": 0.1741, "num_input_tokens_seen": 24559936, "step": 24460 }, { "epoch": 12.971898197242842, "grad_norm": 55.06189727783203, "learning_rate": 3.3133998444053796e-06, "loss": 0.1403, "num_input_tokens_seen": 24566048, "step": 24465 }, { "epoch": 12.974549310710499, "grad_norm": 6.932125091552734, "learning_rate": 3.3112220881752145e-06, "loss": 0.1699, "num_input_tokens_seen": 24571584, "step": 24470 }, { "epoch": 12.977200424178156, "grad_norm": 1.599902629852295, "learning_rate": 3.3090446935079833e-06, "loss": 0.1106, "num_input_tokens_seen": 24575936, "step": 24475 }, { "epoch": 12.97985153764581, "grad_norm": 83.95396423339844, "learning_rate": 3.306867660869856e-06, "loss": 0.1833, "num_input_tokens_seen": 24580320, "step": 24480 }, { "epoch": 12.982502651113467, "grad_norm": 37.94839859008789, "learning_rate": 3.3046909907269347e-06, "loss": 0.0385, "num_input_tokens_seen": 24585024, "step": 24485 }, { "epoch": 12.985153764581124, "grad_norm": 5.800177574157715, "learning_rate": 3.3025146835452344e-06, "loss": 0.0478, "num_input_tokens_seen": 24589408, "step": 24490 }, { "epoch": 12.987804878048781, "grad_norm": 0.4370819926261902, "learning_rate": 3.300338739790699e-06, "loss": 0.0621, "num_input_tokens_seen": 24595040, "step": 24495 }, { "epoch": 12.990455991516438, "grad_norm": 50.56239318847656, "learning_rate": 3.298163159929191e-06, "loss": 0.3399, "num_input_tokens_seen": 24599904, "step": 24500 }, { "epoch": 12.993107104984093, "grad_norm": 2.231940269470215, "learning_rate": 3.295987944426499e-06, "loss": 0.3792, "num_input_tokens_seen": 24604192, "step": 24505 }, { "epoch": 12.99575821845175, "grad_norm": 0.10857878625392914, "learning_rate": 3.2938130937483274e-06, "loss": 0.0902, "num_input_tokens_seen": 24608832, "step": 24510 }, { "epoch": 12.998409331919406, "grad_norm": 1.5879240036010742, "learning_rate": 3.291638608360309e-06, "loss": 0.046, "num_input_tokens_seen": 24613472, "step": 24515 }, { "epoch": 13.001060445387063, "grad_norm": 2.183072090148926, "learning_rate": 3.289464488727993e-06, "loss": 0.0055, "num_input_tokens_seen": 24618568, "step": 24520 }, { "epoch": 13.00371155885472, "grad_norm": 27.324716567993164, "learning_rate": 3.2872907353168558e-06, "loss": 0.0261, "num_input_tokens_seen": 24623336, "step": 24525 }, { "epoch": 13.006362672322375, "grad_norm": 0.4154849648475647, "learning_rate": 3.285117348592289e-06, "loss": 0.0798, "num_input_tokens_seen": 24628840, "step": 24530 }, { "epoch": 13.009013785790032, "grad_norm": 29.357160568237305, "learning_rate": 3.2829443290196106e-06, "loss": 0.0577, "num_input_tokens_seen": 24634472, "step": 24535 }, { "epoch": 13.011664899257688, "grad_norm": 38.928524017333984, "learning_rate": 3.2807716770640578e-06, "loss": 0.0465, "num_input_tokens_seen": 24640328, "step": 24540 }, { "epoch": 13.014316012725345, "grad_norm": 4.321621417999268, "learning_rate": 3.278599393190792e-06, "loss": 0.053, "num_input_tokens_seen": 24645800, "step": 24545 }, { "epoch": 13.016967126193, "grad_norm": 0.1922859251499176, "learning_rate": 3.2764274778648902e-06, "loss": 0.1292, "num_input_tokens_seen": 24650120, "step": 24550 }, { "epoch": 13.019618239660657, "grad_norm": 8.349940299987793, "learning_rate": 3.2742559315513554e-06, "loss": 0.0681, "num_input_tokens_seen": 24655144, "step": 24555 }, { "epoch": 13.022269353128314, "grad_norm": 1.97116219997406, "learning_rate": 3.2720847547151096e-06, "loss": 0.0034, "num_input_tokens_seen": 24660904, "step": 24560 }, { "epoch": 13.02492046659597, "grad_norm": 0.6327964067459106, "learning_rate": 3.2699139478209987e-06, "loss": 0.0093, "num_input_tokens_seen": 24665832, "step": 24565 }, { "epoch": 13.027571580063627, "grad_norm": 0.3325463831424713, "learning_rate": 3.267743511333782e-06, "loss": 0.0124, "num_input_tokens_seen": 24671080, "step": 24570 }, { "epoch": 13.030222693531282, "grad_norm": 0.23370301723480225, "learning_rate": 3.2655734457181465e-06, "loss": 0.0341, "num_input_tokens_seen": 24676840, "step": 24575 }, { "epoch": 13.03287380699894, "grad_norm": 1.159210443496704, "learning_rate": 3.2634037514386997e-06, "loss": 0.0114, "num_input_tokens_seen": 24681544, "step": 24580 }, { "epoch": 13.035524920466596, "grad_norm": 29.973501205444336, "learning_rate": 3.261234428959963e-06, "loss": 0.0511, "num_input_tokens_seen": 24685928, "step": 24585 }, { "epoch": 13.038176033934253, "grad_norm": 0.5119381546974182, "learning_rate": 3.259065478746387e-06, "loss": 0.0903, "num_input_tokens_seen": 24692232, "step": 24590 }, { "epoch": 13.04082714740191, "grad_norm": 1.4002115726470947, "learning_rate": 3.2568969012623348e-06, "loss": 0.0448, "num_input_tokens_seen": 24696936, "step": 24595 }, { "epoch": 13.043478260869565, "grad_norm": 43.23997116088867, "learning_rate": 3.2547286969720972e-06, "loss": 0.0812, "num_input_tokens_seen": 24702312, "step": 24600 }, { "epoch": 13.046129374337221, "grad_norm": 0.07639326900243759, "learning_rate": 3.2525608663398768e-06, "loss": 0.061, "num_input_tokens_seen": 24706920, "step": 24605 }, { "epoch": 13.048780487804878, "grad_norm": 22.779226303100586, "learning_rate": 3.250393409829803e-06, "loss": 0.0176, "num_input_tokens_seen": 24712040, "step": 24610 }, { "epoch": 13.051431601272535, "grad_norm": 1.614335536956787, "learning_rate": 3.24822632790592e-06, "loss": 0.099, "num_input_tokens_seen": 24720168, "step": 24615 }, { "epoch": 13.054082714740192, "grad_norm": 0.19204890727996826, "learning_rate": 3.246059621032199e-06, "loss": 0.0455, "num_input_tokens_seen": 24724776, "step": 24620 }, { "epoch": 13.056733828207847, "grad_norm": 4.81791877746582, "learning_rate": 3.2438932896725222e-06, "loss": 0.006, "num_input_tokens_seen": 24730600, "step": 24625 }, { "epoch": 13.059384941675503, "grad_norm": 17.597370147705078, "learning_rate": 3.2417273342906975e-06, "loss": 0.0356, "num_input_tokens_seen": 24734440, "step": 24630 }, { "epoch": 13.06203605514316, "grad_norm": 0.5905193090438843, "learning_rate": 3.239561755350448e-06, "loss": 0.0464, "num_input_tokens_seen": 24739112, "step": 24635 }, { "epoch": 13.064687168610817, "grad_norm": 0.055703211575746536, "learning_rate": 3.237396553315423e-06, "loss": 0.0039, "num_input_tokens_seen": 24743720, "step": 24640 }, { "epoch": 13.067338282078474, "grad_norm": 1.791149616241455, "learning_rate": 3.2352317286491807e-06, "loss": 0.0099, "num_input_tokens_seen": 24749288, "step": 24645 }, { "epoch": 13.069989395546129, "grad_norm": 0.07312138378620148, "learning_rate": 3.233067281815209e-06, "loss": 0.0026, "num_input_tokens_seen": 24754408, "step": 24650 }, { "epoch": 13.072640509013786, "grad_norm": 0.24278491735458374, "learning_rate": 3.2309032132769073e-06, "loss": 0.1658, "num_input_tokens_seen": 24759496, "step": 24655 }, { "epoch": 13.075291622481442, "grad_norm": 6.669065475463867, "learning_rate": 3.2287395234976005e-06, "loss": 0.173, "num_input_tokens_seen": 24764744, "step": 24660 }, { "epoch": 13.0779427359491, "grad_norm": 1.6008158922195435, "learning_rate": 3.2265762129405253e-06, "loss": 0.0662, "num_input_tokens_seen": 24769576, "step": 24665 }, { "epoch": 13.080593849416754, "grad_norm": 44.62435531616211, "learning_rate": 3.2244132820688423e-06, "loss": 0.2452, "num_input_tokens_seen": 24774408, "step": 24670 }, { "epoch": 13.083244962884411, "grad_norm": 0.9855858087539673, "learning_rate": 3.222250731345629e-06, "loss": 0.0454, "num_input_tokens_seen": 24780712, "step": 24675 }, { "epoch": 13.085896076352068, "grad_norm": 68.41483306884766, "learning_rate": 3.2200885612338846e-06, "loss": 0.112, "num_input_tokens_seen": 24785352, "step": 24680 }, { "epoch": 13.088547189819725, "grad_norm": 0.5967673063278198, "learning_rate": 3.217926772196519e-06, "loss": 0.0134, "num_input_tokens_seen": 24790344, "step": 24685 }, { "epoch": 13.091198303287381, "grad_norm": 75.40608978271484, "learning_rate": 3.2157653646963694e-06, "loss": 0.1716, "num_input_tokens_seen": 24794632, "step": 24690 }, { "epoch": 13.093849416755036, "grad_norm": 1.256116271018982, "learning_rate": 3.2136043391961857e-06, "loss": 0.0035, "num_input_tokens_seen": 24799784, "step": 24695 }, { "epoch": 13.096500530222693, "grad_norm": 1.2009758949279785, "learning_rate": 3.2114436961586404e-06, "loss": 0.0235, "num_input_tokens_seen": 24804168, "step": 24700 }, { "epoch": 13.09915164369035, "grad_norm": 0.40202799439430237, "learning_rate": 3.2092834360463183e-06, "loss": 0.0122, "num_input_tokens_seen": 24809640, "step": 24705 }, { "epoch": 13.101802757158007, "grad_norm": 2.05241322517395, "learning_rate": 3.2071235593217253e-06, "loss": 0.009, "num_input_tokens_seen": 24814312, "step": 24710 }, { "epoch": 13.104453870625663, "grad_norm": 1.203561782836914, "learning_rate": 3.204964066447289e-06, "loss": 0.0058, "num_input_tokens_seen": 24818920, "step": 24715 }, { "epoch": 13.107104984093318, "grad_norm": 0.4059034585952759, "learning_rate": 3.202804957885348e-06, "loss": 0.0053, "num_input_tokens_seen": 24823528, "step": 24720 }, { "epoch": 13.109756097560975, "grad_norm": 32.835418701171875, "learning_rate": 3.2006462340981628e-06, "loss": 0.1038, "num_input_tokens_seen": 24827272, "step": 24725 }, { "epoch": 13.112407211028632, "grad_norm": 0.3175015151500702, "learning_rate": 3.1984878955479094e-06, "loss": 0.0454, "num_input_tokens_seen": 24831752, "step": 24730 }, { "epoch": 13.115058324496289, "grad_norm": 1.8629279136657715, "learning_rate": 3.1963299426966852e-06, "loss": 0.01, "num_input_tokens_seen": 24837576, "step": 24735 }, { "epoch": 13.117709437963946, "grad_norm": 8.167015075683594, "learning_rate": 3.1941723760064993e-06, "loss": 0.0131, "num_input_tokens_seen": 24842440, "step": 24740 }, { "epoch": 13.1203605514316, "grad_norm": 0.24381481111049652, "learning_rate": 3.192015195939283e-06, "loss": 0.0525, "num_input_tokens_seen": 24846728, "step": 24745 }, { "epoch": 13.123011664899257, "grad_norm": 0.7874215841293335, "learning_rate": 3.1898584029568815e-06, "loss": 0.0461, "num_input_tokens_seen": 24850696, "step": 24750 }, { "epoch": 13.125662778366914, "grad_norm": 0.9644753336906433, "learning_rate": 3.187701997521061e-06, "loss": 0.028, "num_input_tokens_seen": 24855656, "step": 24755 }, { "epoch": 13.128313891834571, "grad_norm": 1.6284209489822388, "learning_rate": 3.1855459800934986e-06, "loss": 0.0041, "num_input_tokens_seen": 24861032, "step": 24760 }, { "epoch": 13.130965005302228, "grad_norm": 0.10169089585542679, "learning_rate": 3.1833903511357943e-06, "loss": 0.0181, "num_input_tokens_seen": 24865736, "step": 24765 }, { "epoch": 13.133616118769883, "grad_norm": 0.574165403842926, "learning_rate": 3.1812351111094618e-06, "loss": 0.0095, "num_input_tokens_seen": 24869896, "step": 24770 }, { "epoch": 13.13626723223754, "grad_norm": 1.939518690109253, "learning_rate": 3.1790802604759353e-06, "loss": 0.0287, "num_input_tokens_seen": 24874664, "step": 24775 }, { "epoch": 13.138918345705196, "grad_norm": 12.155162811279297, "learning_rate": 3.1769257996965577e-06, "loss": 0.1328, "num_input_tokens_seen": 24879400, "step": 24780 }, { "epoch": 13.141569459172853, "grad_norm": 11.408838272094727, "learning_rate": 3.1747717292325964e-06, "loss": 0.0233, "num_input_tokens_seen": 24884328, "step": 24785 }, { "epoch": 13.14422057264051, "grad_norm": 6.525371074676514, "learning_rate": 3.172618049545231e-06, "loss": 0.0185, "num_input_tokens_seen": 24889288, "step": 24790 }, { "epoch": 13.146871686108165, "grad_norm": 50.92463302612305, "learning_rate": 3.1704647610955618e-06, "loss": 0.1958, "num_input_tokens_seen": 24895656, "step": 24795 }, { "epoch": 13.149522799575822, "grad_norm": 3.8756020069122314, "learning_rate": 3.1683118643445976e-06, "loss": 0.0197, "num_input_tokens_seen": 24900520, "step": 24800 }, { "epoch": 13.152173913043478, "grad_norm": 0.7581031322479248, "learning_rate": 3.1661593597532714e-06, "loss": 0.0101, "num_input_tokens_seen": 24905352, "step": 24805 }, { "epoch": 13.154825026511135, "grad_norm": 38.70171356201172, "learning_rate": 3.1640072477824258e-06, "loss": 0.0684, "num_input_tokens_seen": 24909576, "step": 24810 }, { "epoch": 13.15747613997879, "grad_norm": 1.8117985725402832, "learning_rate": 3.1618555288928267e-06, "loss": 0.0084, "num_input_tokens_seen": 24915176, "step": 24815 }, { "epoch": 13.160127253446447, "grad_norm": 15.262738227844238, "learning_rate": 3.1597042035451475e-06, "loss": 0.0613, "num_input_tokens_seen": 24921640, "step": 24820 }, { "epoch": 13.162778366914104, "grad_norm": 34.764007568359375, "learning_rate": 3.157553272199983e-06, "loss": 0.0839, "num_input_tokens_seen": 24925960, "step": 24825 }, { "epoch": 13.16542948038176, "grad_norm": 11.959729194641113, "learning_rate": 3.155402735317841e-06, "loss": 0.0094, "num_input_tokens_seen": 24930408, "step": 24830 }, { "epoch": 13.168080593849417, "grad_norm": 0.8534739017486572, "learning_rate": 3.1532525933591486e-06, "loss": 0.012, "num_input_tokens_seen": 24935144, "step": 24835 }, { "epoch": 13.170731707317072, "grad_norm": 23.91474723815918, "learning_rate": 3.151102846784242e-06, "loss": 0.0345, "num_input_tokens_seen": 24940808, "step": 24840 }, { "epoch": 13.17338282078473, "grad_norm": 23.709917068481445, "learning_rate": 3.1489534960533776e-06, "loss": 0.035, "num_input_tokens_seen": 24944968, "step": 24845 }, { "epoch": 13.176033934252386, "grad_norm": 5.233951091766357, "learning_rate": 3.1468045416267272e-06, "loss": 0.0079, "num_input_tokens_seen": 24949192, "step": 24850 }, { "epoch": 13.178685047720043, "grad_norm": 4.947116851806641, "learning_rate": 3.1446559839643735e-06, "loss": 0.032, "num_input_tokens_seen": 24953288, "step": 24855 }, { "epoch": 13.1813361611877, "grad_norm": 39.65434646606445, "learning_rate": 3.1425078235263197e-06, "loss": 0.1112, "num_input_tokens_seen": 24958088, "step": 24860 }, { "epoch": 13.183987274655355, "grad_norm": 0.6929080486297607, "learning_rate": 3.1403600607724792e-06, "loss": 0.103, "num_input_tokens_seen": 24963560, "step": 24865 }, { "epoch": 13.186638388123011, "grad_norm": 21.84273338317871, "learning_rate": 3.1382126961626856e-06, "loss": 0.02, "num_input_tokens_seen": 24968744, "step": 24870 }, { "epoch": 13.189289501590668, "grad_norm": 7.2333502769470215, "learning_rate": 3.13606573015668e-06, "loss": 0.0179, "num_input_tokens_seen": 24974120, "step": 24875 }, { "epoch": 13.191940615058325, "grad_norm": 20.04987335205078, "learning_rate": 3.133919163214125e-06, "loss": 0.0374, "num_input_tokens_seen": 24979016, "step": 24880 }, { "epoch": 13.194591728525982, "grad_norm": 4.957030773162842, "learning_rate": 3.1317729957945938e-06, "loss": 0.0349, "num_input_tokens_seen": 24984168, "step": 24885 }, { "epoch": 13.197242841993637, "grad_norm": 15.913741111755371, "learning_rate": 3.129627228357577e-06, "loss": 0.0238, "num_input_tokens_seen": 24988456, "step": 24890 }, { "epoch": 13.199893955461294, "grad_norm": 5.114404201507568, "learning_rate": 3.127481861362474e-06, "loss": 0.1597, "num_input_tokens_seen": 24993864, "step": 24895 }, { "epoch": 13.20254506892895, "grad_norm": 0.7087140083312988, "learning_rate": 3.1253368952686057e-06, "loss": 0.0114, "num_input_tokens_seen": 24997672, "step": 24900 }, { "epoch": 13.205196182396607, "grad_norm": 1.519863486289978, "learning_rate": 3.123192330535202e-06, "loss": 0.0056, "num_input_tokens_seen": 25002248, "step": 24905 }, { "epoch": 13.207847295864262, "grad_norm": 1.6812434196472168, "learning_rate": 3.121048167621411e-06, "loss": 0.0295, "num_input_tokens_seen": 25006216, "step": 24910 }, { "epoch": 13.210498409331919, "grad_norm": 5.864487171173096, "learning_rate": 3.1189044069862886e-06, "loss": 0.1036, "num_input_tokens_seen": 25010760, "step": 24915 }, { "epoch": 13.213149522799576, "grad_norm": 15.917672157287598, "learning_rate": 3.116761049088811e-06, "loss": 0.0586, "num_input_tokens_seen": 25015912, "step": 24920 }, { "epoch": 13.215800636267232, "grad_norm": 1.688709020614624, "learning_rate": 3.1146180943878644e-06, "loss": 0.1538, "num_input_tokens_seen": 25020456, "step": 24925 }, { "epoch": 13.21845174973489, "grad_norm": 10.96237850189209, "learning_rate": 3.1124755433422527e-06, "loss": 0.012, "num_input_tokens_seen": 25025320, "step": 24930 }, { "epoch": 13.221102863202544, "grad_norm": 4.920202255249023, "learning_rate": 3.1103333964106853e-06, "loss": 0.0606, "num_input_tokens_seen": 25030664, "step": 24935 }, { "epoch": 13.223753976670201, "grad_norm": 0.4014366567134857, "learning_rate": 3.108191654051794e-06, "loss": 0.035, "num_input_tokens_seen": 25036264, "step": 24940 }, { "epoch": 13.226405090137858, "grad_norm": 0.5319715738296509, "learning_rate": 3.1060503167241187e-06, "loss": 0.0358, "num_input_tokens_seen": 25041192, "step": 24945 }, { "epoch": 13.229056203605515, "grad_norm": 15.552106857299805, "learning_rate": 3.1039093848861157e-06, "loss": 0.0845, "num_input_tokens_seen": 25046632, "step": 24950 }, { "epoch": 13.231707317073171, "grad_norm": 0.9787601828575134, "learning_rate": 3.10176885899615e-06, "loss": 0.1741, "num_input_tokens_seen": 25051592, "step": 24955 }, { "epoch": 13.234358430540826, "grad_norm": 18.1484375, "learning_rate": 3.099628739512506e-06, "loss": 0.0457, "num_input_tokens_seen": 25056424, "step": 24960 }, { "epoch": 13.237009544008483, "grad_norm": 38.305885314941406, "learning_rate": 3.0974890268933743e-06, "loss": 0.0576, "num_input_tokens_seen": 25061928, "step": 24965 }, { "epoch": 13.23966065747614, "grad_norm": 42.6334342956543, "learning_rate": 3.0953497215968665e-06, "loss": 0.1074, "num_input_tokens_seen": 25069160, "step": 24970 }, { "epoch": 13.242311770943797, "grad_norm": 24.787805557250977, "learning_rate": 3.0932108240809964e-06, "loss": 0.0216, "num_input_tokens_seen": 25073832, "step": 24975 }, { "epoch": 13.244962884411454, "grad_norm": 0.6597954034805298, "learning_rate": 3.0910723348037014e-06, "loss": 0.1151, "num_input_tokens_seen": 25078600, "step": 24980 }, { "epoch": 13.247613997879109, "grad_norm": 21.755762100219727, "learning_rate": 3.0889342542228253e-06, "loss": 0.2624, "num_input_tokens_seen": 25083016, "step": 24985 }, { "epoch": 13.250265111346765, "grad_norm": 3.9656519889831543, "learning_rate": 3.0867965827961215e-06, "loss": 0.0076, "num_input_tokens_seen": 25087304, "step": 24990 }, { "epoch": 13.252916224814422, "grad_norm": 0.05469007417559624, "learning_rate": 3.084659320981265e-06, "loss": 0.0027, "num_input_tokens_seen": 25092936, "step": 24995 }, { "epoch": 13.255567338282079, "grad_norm": 6.721212387084961, "learning_rate": 3.0825224692358345e-06, "loss": 0.0331, "num_input_tokens_seen": 25098280, "step": 25000 }, { "epoch": 13.258218451749736, "grad_norm": 3.577324867248535, "learning_rate": 3.0803860280173277e-06, "loss": 0.0047, "num_input_tokens_seen": 25103368, "step": 25005 }, { "epoch": 13.26086956521739, "grad_norm": 2.0604143142700195, "learning_rate": 3.078249997783147e-06, "loss": 0.1225, "num_input_tokens_seen": 25107240, "step": 25010 }, { "epoch": 13.263520678685047, "grad_norm": 4.954214096069336, "learning_rate": 3.0761143789906147e-06, "loss": 0.2908, "num_input_tokens_seen": 25113352, "step": 25015 }, { "epoch": 13.266171792152704, "grad_norm": 38.71867752075195, "learning_rate": 3.073979172096958e-06, "loss": 0.0578, "num_input_tokens_seen": 25117896, "step": 25020 }, { "epoch": 13.268822905620361, "grad_norm": 0.5452606081962585, "learning_rate": 3.0718443775593233e-06, "loss": 0.064, "num_input_tokens_seen": 25123432, "step": 25025 }, { "epoch": 13.271474019088018, "grad_norm": 9.701188087463379, "learning_rate": 3.0697099958347594e-06, "loss": 0.026, "num_input_tokens_seen": 25128360, "step": 25030 }, { "epoch": 13.274125132555673, "grad_norm": 0.6630848050117493, "learning_rate": 3.0675760273802352e-06, "loss": 0.0116, "num_input_tokens_seen": 25132168, "step": 25035 }, { "epoch": 13.27677624602333, "grad_norm": 2.6142780780792236, "learning_rate": 3.065442472652626e-06, "loss": 0.0162, "num_input_tokens_seen": 25137256, "step": 25040 }, { "epoch": 13.279427359490986, "grad_norm": 11.104374885559082, "learning_rate": 3.063309332108724e-06, "loss": 0.0506, "num_input_tokens_seen": 25143112, "step": 25045 }, { "epoch": 13.282078472958643, "grad_norm": 4.383764743804932, "learning_rate": 3.0611766062052218e-06, "loss": 0.0201, "num_input_tokens_seen": 25147400, "step": 25050 }, { "epoch": 13.284729586426298, "grad_norm": 29.346141815185547, "learning_rate": 3.0590442953987388e-06, "loss": 0.0329, "num_input_tokens_seen": 25152232, "step": 25055 }, { "epoch": 13.287380699893955, "grad_norm": 0.5360226631164551, "learning_rate": 3.0569124001457905e-06, "loss": 0.0135, "num_input_tokens_seen": 25157096, "step": 25060 }, { "epoch": 13.290031813361612, "grad_norm": 1.1955183744430542, "learning_rate": 3.0547809209028145e-06, "loss": 0.0178, "num_input_tokens_seen": 25163592, "step": 25065 }, { "epoch": 13.292682926829269, "grad_norm": 10.795465469360352, "learning_rate": 3.0526498581261516e-06, "loss": 0.0125, "num_input_tokens_seen": 25167848, "step": 25070 }, { "epoch": 13.295334040296925, "grad_norm": 76.18960571289062, "learning_rate": 3.0505192122720607e-06, "loss": 0.2324, "num_input_tokens_seen": 25172936, "step": 25075 }, { "epoch": 13.29798515376458, "grad_norm": 4.732523441314697, "learning_rate": 3.048388983796703e-06, "loss": 0.0241, "num_input_tokens_seen": 25177256, "step": 25080 }, { "epoch": 13.300636267232237, "grad_norm": 1.0701639652252197, "learning_rate": 3.0462591731561586e-06, "loss": 0.0195, "num_input_tokens_seen": 25182184, "step": 25085 }, { "epoch": 13.303287380699894, "grad_norm": 9.447118759155273, "learning_rate": 3.0441297808064108e-06, "loss": 0.0238, "num_input_tokens_seen": 25187560, "step": 25090 }, { "epoch": 13.30593849416755, "grad_norm": 0.5049080848693848, "learning_rate": 3.0420008072033615e-06, "loss": 0.0306, "num_input_tokens_seen": 25192520, "step": 25095 }, { "epoch": 13.308589607635207, "grad_norm": 37.16279983520508, "learning_rate": 3.039872252802814e-06, "loss": 0.0743, "num_input_tokens_seen": 25197480, "step": 25100 }, { "epoch": 13.311240721102862, "grad_norm": 1.153197169303894, "learning_rate": 3.037744118060489e-06, "loss": 0.0523, "num_input_tokens_seen": 25203080, "step": 25105 }, { "epoch": 13.31389183457052, "grad_norm": 4.894021034240723, "learning_rate": 3.0356164034320123e-06, "loss": 0.0054, "num_input_tokens_seen": 25207720, "step": 25110 }, { "epoch": 13.316542948038176, "grad_norm": 3.072385549545288, "learning_rate": 3.0334891093729258e-06, "loss": 0.005, "num_input_tokens_seen": 25212840, "step": 25115 }, { "epoch": 13.319194061505833, "grad_norm": 6.7158203125, "learning_rate": 3.0313622363386735e-06, "loss": 0.1043, "num_input_tokens_seen": 25217224, "step": 25120 }, { "epoch": 13.32184517497349, "grad_norm": 12.584110260009766, "learning_rate": 3.029235784784614e-06, "loss": 0.0342, "num_input_tokens_seen": 25221960, "step": 25125 }, { "epoch": 13.324496288441145, "grad_norm": 3.4051878452301025, "learning_rate": 3.027109755166019e-06, "loss": 0.0075, "num_input_tokens_seen": 25226536, "step": 25130 }, { "epoch": 13.327147401908801, "grad_norm": 5.437232494354248, "learning_rate": 3.02498414793806e-06, "loss": 0.0302, "num_input_tokens_seen": 25231176, "step": 25135 }, { "epoch": 13.329798515376458, "grad_norm": 9.83098316192627, "learning_rate": 3.022858963555828e-06, "loss": 0.0298, "num_input_tokens_seen": 25236360, "step": 25140 }, { "epoch": 13.332449628844115, "grad_norm": 29.341230392456055, "learning_rate": 3.0207342024743168e-06, "loss": 0.0167, "num_input_tokens_seen": 25243080, "step": 25145 }, { "epoch": 13.335100742311772, "grad_norm": 1.1147466897964478, "learning_rate": 3.0186098651484365e-06, "loss": 0.0063, "num_input_tokens_seen": 25248104, "step": 25150 }, { "epoch": 13.337751855779427, "grad_norm": 2.173624038696289, "learning_rate": 3.016485952032997e-06, "loss": 0.2551, "num_input_tokens_seen": 25252328, "step": 25155 }, { "epoch": 13.340402969247084, "grad_norm": 0.49233877658843994, "learning_rate": 3.014362463582726e-06, "loss": 0.0166, "num_input_tokens_seen": 25258248, "step": 25160 }, { "epoch": 13.34305408271474, "grad_norm": 4.476584434509277, "learning_rate": 3.0122394002522544e-06, "loss": 0.0667, "num_input_tokens_seen": 25262216, "step": 25165 }, { "epoch": 13.345705196182397, "grad_norm": 0.28758925199508667, "learning_rate": 3.0101167624961273e-06, "loss": 0.0111, "num_input_tokens_seen": 25266728, "step": 25170 }, { "epoch": 13.348356309650054, "grad_norm": 69.00652313232422, "learning_rate": 3.007994550768793e-06, "loss": 0.2581, "num_input_tokens_seen": 25271048, "step": 25175 }, { "epoch": 13.351007423117709, "grad_norm": 1.0650241374969482, "learning_rate": 3.0058727655246133e-06, "loss": 0.0159, "num_input_tokens_seen": 25277000, "step": 25180 }, { "epoch": 13.353658536585366, "grad_norm": 13.055403709411621, "learning_rate": 3.0037514072178564e-06, "loss": 0.0324, "num_input_tokens_seen": 25281384, "step": 25185 }, { "epoch": 13.356309650053023, "grad_norm": 2.1186118125915527, "learning_rate": 3.0016304763027006e-06, "loss": 0.029, "num_input_tokens_seen": 25286376, "step": 25190 }, { "epoch": 13.35896076352068, "grad_norm": 35.91690444946289, "learning_rate": 2.9995099732332287e-06, "loss": 0.1747, "num_input_tokens_seen": 25290728, "step": 25195 }, { "epoch": 13.361611876988334, "grad_norm": 4.902472019195557, "learning_rate": 2.9973898984634387e-06, "loss": 0.0075, "num_input_tokens_seen": 25296072, "step": 25200 }, { "epoch": 13.364262990455991, "grad_norm": 11.04134464263916, "learning_rate": 2.9952702524472288e-06, "loss": 0.0387, "num_input_tokens_seen": 25300456, "step": 25205 }, { "epoch": 13.366914103923648, "grad_norm": 0.4223674535751343, "learning_rate": 2.9931510356384154e-06, "loss": 0.0198, "num_input_tokens_seen": 25305832, "step": 25210 }, { "epoch": 13.369565217391305, "grad_norm": 0.3906497061252594, "learning_rate": 2.9910322484907107e-06, "loss": 0.0996, "num_input_tokens_seen": 25310728, "step": 25215 }, { "epoch": 13.372216330858961, "grad_norm": 0.45336103439331055, "learning_rate": 2.9889138914577463e-06, "loss": 0.0183, "num_input_tokens_seen": 25316328, "step": 25220 }, { "epoch": 13.374867444326616, "grad_norm": 44.61100769042969, "learning_rate": 2.986795964993054e-06, "loss": 0.0578, "num_input_tokens_seen": 25321192, "step": 25225 }, { "epoch": 13.377518557794273, "grad_norm": 61.42743682861328, "learning_rate": 2.98467846955008e-06, "loss": 0.0565, "num_input_tokens_seen": 25326024, "step": 25230 }, { "epoch": 13.38016967126193, "grad_norm": 83.94845581054688, "learning_rate": 2.9825614055821692e-06, "loss": 0.1081, "num_input_tokens_seen": 25330888, "step": 25235 }, { "epoch": 13.382820784729587, "grad_norm": 66.38916015625, "learning_rate": 2.9804447735425834e-06, "loss": 0.1642, "num_input_tokens_seen": 25336168, "step": 25240 }, { "epoch": 13.385471898197244, "grad_norm": 5.960427284240723, "learning_rate": 2.9783285738844868e-06, "loss": 0.0294, "num_input_tokens_seen": 25341480, "step": 25245 }, { "epoch": 13.388123011664899, "grad_norm": 1.8379230499267578, "learning_rate": 2.976212807060954e-06, "loss": 0.0145, "num_input_tokens_seen": 25346856, "step": 25250 }, { "epoch": 13.390774125132555, "grad_norm": 8.073878288269043, "learning_rate": 2.9740974735249627e-06, "loss": 0.0116, "num_input_tokens_seen": 25351432, "step": 25255 }, { "epoch": 13.393425238600212, "grad_norm": 14.3773193359375, "learning_rate": 2.9719825737293994e-06, "loss": 0.0111, "num_input_tokens_seen": 25357192, "step": 25260 }, { "epoch": 13.396076352067869, "grad_norm": 0.5459097623825073, "learning_rate": 2.969868108127063e-06, "loss": 0.021, "num_input_tokens_seen": 25361512, "step": 25265 }, { "epoch": 13.398727465535526, "grad_norm": 7.782127857208252, "learning_rate": 2.9677540771706504e-06, "loss": 0.0687, "num_input_tokens_seen": 25366664, "step": 25270 }, { "epoch": 13.40137857900318, "grad_norm": 35.991756439208984, "learning_rate": 2.9656404813127727e-06, "loss": 0.0573, "num_input_tokens_seen": 25371592, "step": 25275 }, { "epoch": 13.404029692470838, "grad_norm": 1.337460994720459, "learning_rate": 2.9635273210059428e-06, "loss": 0.0518, "num_input_tokens_seen": 25376168, "step": 25280 }, { "epoch": 13.406680805938494, "grad_norm": 0.1909545660018921, "learning_rate": 2.9614145967025877e-06, "loss": 0.0054, "num_input_tokens_seen": 25380296, "step": 25285 }, { "epoch": 13.409331919406151, "grad_norm": 12.651819229125977, "learning_rate": 2.9593023088550303e-06, "loss": 0.0473, "num_input_tokens_seen": 25384776, "step": 25290 }, { "epoch": 13.411983032873806, "grad_norm": 0.39607226848602295, "learning_rate": 2.95719045791551e-06, "loss": 0.0088, "num_input_tokens_seen": 25390024, "step": 25295 }, { "epoch": 13.414634146341463, "grad_norm": 61.17395782470703, "learning_rate": 2.955079044336166e-06, "loss": 0.0593, "num_input_tokens_seen": 25394824, "step": 25300 }, { "epoch": 13.41728525980912, "grad_norm": 49.5806884765625, "learning_rate": 2.95296806856905e-06, "loss": 0.2123, "num_input_tokens_seen": 25400488, "step": 25305 }, { "epoch": 13.419936373276776, "grad_norm": 3.439542055130005, "learning_rate": 2.9508575310661115e-06, "loss": 0.0044, "num_input_tokens_seen": 25405352, "step": 25310 }, { "epoch": 13.422587486744433, "grad_norm": 8.473614692687988, "learning_rate": 2.948747432279214e-06, "loss": 0.014, "num_input_tokens_seen": 25410664, "step": 25315 }, { "epoch": 13.425238600212088, "grad_norm": 1.1830840110778809, "learning_rate": 2.946637772660124e-06, "loss": 0.0264, "num_input_tokens_seen": 25415336, "step": 25320 }, { "epoch": 13.427889713679745, "grad_norm": 0.14212951064109802, "learning_rate": 2.9445285526605148e-06, "loss": 0.0094, "num_input_tokens_seen": 25420200, "step": 25325 }, { "epoch": 13.430540827147402, "grad_norm": 5.434852600097656, "learning_rate": 2.9424197727319624e-06, "loss": 0.0727, "num_input_tokens_seen": 25425768, "step": 25330 }, { "epoch": 13.433191940615059, "grad_norm": 9.654315948486328, "learning_rate": 2.940311433325953e-06, "loss": 0.0478, "num_input_tokens_seen": 25430344, "step": 25335 }, { "epoch": 13.435843054082715, "grad_norm": 0.6706653237342834, "learning_rate": 2.938203534893875e-06, "loss": 0.0102, "num_input_tokens_seen": 25435272, "step": 25340 }, { "epoch": 13.43849416755037, "grad_norm": 12.237692832946777, "learning_rate": 2.936096077887028e-06, "loss": 0.0528, "num_input_tokens_seen": 25439176, "step": 25345 }, { "epoch": 13.441145281018027, "grad_norm": 1.5788190364837646, "learning_rate": 2.933989062756607e-06, "loss": 0.0516, "num_input_tokens_seen": 25443976, "step": 25350 }, { "epoch": 13.443796394485684, "grad_norm": 2.323944330215454, "learning_rate": 2.931882489953723e-06, "loss": 0.0267, "num_input_tokens_seen": 25448904, "step": 25355 }, { "epoch": 13.44644750795334, "grad_norm": 1.2781143188476562, "learning_rate": 2.929776359929385e-06, "loss": 0.1109, "num_input_tokens_seen": 25453512, "step": 25360 }, { "epoch": 13.449098621420998, "grad_norm": 0.23789221048355103, "learning_rate": 2.927670673134513e-06, "loss": 0.0061, "num_input_tokens_seen": 25459208, "step": 25365 }, { "epoch": 13.451749734888653, "grad_norm": 0.7699776291847229, "learning_rate": 2.9255654300199253e-06, "loss": 0.0051, "num_input_tokens_seen": 25467400, "step": 25370 }, { "epoch": 13.45440084835631, "grad_norm": 23.210168838500977, "learning_rate": 2.9234606310363518e-06, "loss": 0.0665, "num_input_tokens_seen": 25473480, "step": 25375 }, { "epoch": 13.457051961823966, "grad_norm": 0.3120713233947754, "learning_rate": 2.921356276634422e-06, "loss": 0.2215, "num_input_tokens_seen": 25478376, "step": 25380 }, { "epoch": 13.459703075291623, "grad_norm": 5.72735595703125, "learning_rate": 2.9192523672646765e-06, "loss": 0.0335, "num_input_tokens_seen": 25483432, "step": 25385 }, { "epoch": 13.46235418875928, "grad_norm": 7.170020580291748, "learning_rate": 2.9171489033775514e-06, "loss": 0.017, "num_input_tokens_seen": 25490088, "step": 25390 }, { "epoch": 13.465005302226935, "grad_norm": 6.047935485839844, "learning_rate": 2.9150458854234e-06, "loss": 0.024, "num_input_tokens_seen": 25495464, "step": 25395 }, { "epoch": 13.467656415694591, "grad_norm": 77.60083770751953, "learning_rate": 2.9129433138524666e-06, "loss": 0.3374, "num_input_tokens_seen": 25500328, "step": 25400 }, { "epoch": 13.470307529162248, "grad_norm": 27.12165641784668, "learning_rate": 2.9108411891149063e-06, "loss": 0.1176, "num_input_tokens_seen": 25505064, "step": 25405 }, { "epoch": 13.472958642629905, "grad_norm": 32.564327239990234, "learning_rate": 2.908739511660785e-06, "loss": 0.0582, "num_input_tokens_seen": 25509960, "step": 25410 }, { "epoch": 13.475609756097562, "grad_norm": 2.7382888793945312, "learning_rate": 2.906638281940057e-06, "loss": 0.0404, "num_input_tokens_seen": 25515592, "step": 25415 }, { "epoch": 13.478260869565217, "grad_norm": 6.157615661621094, "learning_rate": 2.904537500402598e-06, "loss": 0.0252, "num_input_tokens_seen": 25520968, "step": 25420 }, { "epoch": 13.480911983032874, "grad_norm": 8.938916206359863, "learning_rate": 2.9024371674981767e-06, "loss": 0.0719, "num_input_tokens_seen": 25526888, "step": 25425 }, { "epoch": 13.48356309650053, "grad_norm": 55.63331985473633, "learning_rate": 2.900337283676469e-06, "loss": 0.0435, "num_input_tokens_seen": 25531656, "step": 25430 }, { "epoch": 13.486214209968187, "grad_norm": 69.51956176757812, "learning_rate": 2.8982378493870543e-06, "loss": 0.1187, "num_input_tokens_seen": 25535496, "step": 25435 }, { "epoch": 13.488865323435842, "grad_norm": 9.590742111206055, "learning_rate": 2.8961388650794178e-06, "loss": 0.1224, "num_input_tokens_seen": 25539976, "step": 25440 }, { "epoch": 13.491516436903499, "grad_norm": 6.997676849365234, "learning_rate": 2.8940403312029407e-06, "loss": 0.0543, "num_input_tokens_seen": 25545512, "step": 25445 }, { "epoch": 13.494167550371156, "grad_norm": 3.4705653190612793, "learning_rate": 2.8919422482069235e-06, "loss": 0.0093, "num_input_tokens_seen": 25550056, "step": 25450 }, { "epoch": 13.496818663838813, "grad_norm": 17.367815017700195, "learning_rate": 2.88984461654055e-06, "loss": 0.0792, "num_input_tokens_seen": 25555528, "step": 25455 }, { "epoch": 13.49946977730647, "grad_norm": 4.4828386306762695, "learning_rate": 2.887747436652925e-06, "loss": 0.0102, "num_input_tokens_seen": 25559560, "step": 25460 }, { "epoch": 13.502120890774124, "grad_norm": 69.75470733642578, "learning_rate": 2.8856507089930463e-06, "loss": 0.0808, "num_input_tokens_seen": 25564776, "step": 25465 }, { "epoch": 13.504772004241781, "grad_norm": 0.046429529786109924, "learning_rate": 2.8835544340098183e-06, "loss": 0.0542, "num_input_tokens_seen": 25570600, "step": 25470 }, { "epoch": 13.507423117709438, "grad_norm": 32.43558120727539, "learning_rate": 2.881458612152047e-06, "loss": 0.0266, "num_input_tokens_seen": 25575176, "step": 25475 }, { "epoch": 13.510074231177095, "grad_norm": 0.5837829113006592, "learning_rate": 2.8793632438684443e-06, "loss": 0.0859, "num_input_tokens_seen": 25579368, "step": 25480 }, { "epoch": 13.512725344644752, "grad_norm": 33.07112121582031, "learning_rate": 2.8772683296076197e-06, "loss": 0.0518, "num_input_tokens_seen": 25584776, "step": 25485 }, { "epoch": 13.515376458112407, "grad_norm": 59.08517837524414, "learning_rate": 2.875173869818094e-06, "loss": 0.046, "num_input_tokens_seen": 25588904, "step": 25490 }, { "epoch": 13.518027571580063, "grad_norm": 8.610343933105469, "learning_rate": 2.87307986494828e-06, "loss": 0.1405, "num_input_tokens_seen": 25593416, "step": 25495 }, { "epoch": 13.52067868504772, "grad_norm": 2.6037797927856445, "learning_rate": 2.870986315446503e-06, "loss": 0.0187, "num_input_tokens_seen": 25598280, "step": 25500 }, { "epoch": 13.523329798515377, "grad_norm": 4.530488967895508, "learning_rate": 2.868893221760984e-06, "loss": 0.019, "num_input_tokens_seen": 25602408, "step": 25505 }, { "epoch": 13.525980911983034, "grad_norm": 1.6300259828567505, "learning_rate": 2.8668005843398506e-06, "loss": 0.1939, "num_input_tokens_seen": 25607176, "step": 25510 }, { "epoch": 13.528632025450689, "grad_norm": 47.57746124267578, "learning_rate": 2.86470840363113e-06, "loss": 0.0972, "num_input_tokens_seen": 25612104, "step": 25515 }, { "epoch": 13.531283138918345, "grad_norm": 1.7809593677520752, "learning_rate": 2.8626166800827524e-06, "loss": 0.0227, "num_input_tokens_seen": 25616456, "step": 25520 }, { "epoch": 13.533934252386002, "grad_norm": 44.476261138916016, "learning_rate": 2.8605254141425485e-06, "loss": 0.1129, "num_input_tokens_seen": 25621160, "step": 25525 }, { "epoch": 13.536585365853659, "grad_norm": 45.35072326660156, "learning_rate": 2.858434606258259e-06, "loss": 0.0675, "num_input_tokens_seen": 25626536, "step": 25530 }, { "epoch": 13.539236479321314, "grad_norm": 48.23379898071289, "learning_rate": 2.856344256877515e-06, "loss": 0.055, "num_input_tokens_seen": 25632488, "step": 25535 }, { "epoch": 13.54188759278897, "grad_norm": 17.985382080078125, "learning_rate": 2.854254366447854e-06, "loss": 0.0156, "num_input_tokens_seen": 25636968, "step": 25540 }, { "epoch": 13.544538706256628, "grad_norm": 29.54006576538086, "learning_rate": 2.8521649354167215e-06, "loss": 0.0483, "num_input_tokens_seen": 25643112, "step": 25545 }, { "epoch": 13.547189819724284, "grad_norm": 7.645557880401611, "learning_rate": 2.8500759642314523e-06, "loss": 0.0806, "num_input_tokens_seen": 25647560, "step": 25550 }, { "epoch": 13.549840933191941, "grad_norm": 79.53205108642578, "learning_rate": 2.8479874533392958e-06, "loss": 0.1085, "num_input_tokens_seen": 25652264, "step": 25555 }, { "epoch": 13.552492046659598, "grad_norm": 60.71955108642578, "learning_rate": 2.8458994031873944e-06, "loss": 0.0747, "num_input_tokens_seen": 25656776, "step": 25560 }, { "epoch": 13.555143160127253, "grad_norm": 10.479718208312988, "learning_rate": 2.843811814222794e-06, "loss": 0.0302, "num_input_tokens_seen": 25661928, "step": 25565 }, { "epoch": 13.55779427359491, "grad_norm": 0.4575521647930145, "learning_rate": 2.8417246868924418e-06, "loss": 0.0246, "num_input_tokens_seen": 25667016, "step": 25570 }, { "epoch": 13.560445387062567, "grad_norm": 0.925440788269043, "learning_rate": 2.839638021643186e-06, "loss": 0.0033, "num_input_tokens_seen": 25672040, "step": 25575 }, { "epoch": 13.563096500530223, "grad_norm": 2.8047709465026855, "learning_rate": 2.8375518189217756e-06, "loss": 0.1367, "num_input_tokens_seen": 25676136, "step": 25580 }, { "epoch": 13.565747613997878, "grad_norm": 0.6830475926399231, "learning_rate": 2.835466079174866e-06, "loss": 0.0785, "num_input_tokens_seen": 25680904, "step": 25585 }, { "epoch": 13.568398727465535, "grad_norm": 15.635993957519531, "learning_rate": 2.8333808028490005e-06, "loss": 0.0289, "num_input_tokens_seen": 25688680, "step": 25590 }, { "epoch": 13.571049840933192, "grad_norm": 67.05321502685547, "learning_rate": 2.831295990390638e-06, "loss": 0.0683, "num_input_tokens_seen": 25693896, "step": 25595 }, { "epoch": 13.573700954400849, "grad_norm": 0.9856876134872437, "learning_rate": 2.8292116422461283e-06, "loss": 0.0046, "num_input_tokens_seen": 25699048, "step": 25600 }, { "epoch": 13.576352067868505, "grad_norm": 1.6860430240631104, "learning_rate": 2.827127758861726e-06, "loss": 0.1232, "num_input_tokens_seen": 25704648, "step": 25605 }, { "epoch": 13.57900318133616, "grad_norm": 0.662989616394043, "learning_rate": 2.825044340683583e-06, "loss": 0.025, "num_input_tokens_seen": 25709960, "step": 25610 }, { "epoch": 13.581654294803817, "grad_norm": 0.06946390122175217, "learning_rate": 2.822961388157759e-06, "loss": 0.0997, "num_input_tokens_seen": 25714568, "step": 25615 }, { "epoch": 13.584305408271474, "grad_norm": 2.1744396686553955, "learning_rate": 2.8208789017302012e-06, "loss": 0.1857, "num_input_tokens_seen": 25718824, "step": 25620 }, { "epoch": 13.58695652173913, "grad_norm": 56.805789947509766, "learning_rate": 2.8187968818467725e-06, "loss": 0.0946, "num_input_tokens_seen": 25723784, "step": 25625 }, { "epoch": 13.589607635206788, "grad_norm": 0.5042884349822998, "learning_rate": 2.81671532895322e-06, "loss": 0.0309, "num_input_tokens_seen": 25729768, "step": 25630 }, { "epoch": 13.592258748674443, "grad_norm": 46.82351303100586, "learning_rate": 2.8146342434952038e-06, "loss": 0.0613, "num_input_tokens_seen": 25735208, "step": 25635 }, { "epoch": 13.5949098621421, "grad_norm": 17.4918155670166, "learning_rate": 2.812553625918278e-06, "loss": 0.0449, "num_input_tokens_seen": 25739880, "step": 25640 }, { "epoch": 13.597560975609756, "grad_norm": 0.013524916023015976, "learning_rate": 2.8104734766678964e-06, "loss": 0.0104, "num_input_tokens_seen": 25746056, "step": 25645 }, { "epoch": 13.600212089077413, "grad_norm": 44.33633041381836, "learning_rate": 2.8083937961894127e-06, "loss": 0.0952, "num_input_tokens_seen": 25750728, "step": 25650 }, { "epoch": 13.60286320254507, "grad_norm": 37.37687301635742, "learning_rate": 2.806314584928086e-06, "loss": 0.0457, "num_input_tokens_seen": 25755688, "step": 25655 }, { "epoch": 13.605514316012725, "grad_norm": 0.05018177628517151, "learning_rate": 2.804235843329063e-06, "loss": 0.0529, "num_input_tokens_seen": 25760232, "step": 25660 }, { "epoch": 13.608165429480382, "grad_norm": 28.881877899169922, "learning_rate": 2.8021575718374017e-06, "loss": 0.1415, "num_input_tokens_seen": 25765320, "step": 25665 }, { "epoch": 13.610816542948038, "grad_norm": 0.7698870897293091, "learning_rate": 2.800079770898055e-06, "loss": 0.0112, "num_input_tokens_seen": 25770504, "step": 25670 }, { "epoch": 13.613467656415695, "grad_norm": 60.713050842285156, "learning_rate": 2.7980024409558687e-06, "loss": 0.06, "num_input_tokens_seen": 25776168, "step": 25675 }, { "epoch": 13.61611876988335, "grad_norm": 14.813150405883789, "learning_rate": 2.7959255824555996e-06, "loss": 0.0231, "num_input_tokens_seen": 25782024, "step": 25680 }, { "epoch": 13.618769883351007, "grad_norm": 41.33409118652344, "learning_rate": 2.793849195841896e-06, "loss": 0.0422, "num_input_tokens_seen": 25786344, "step": 25685 }, { "epoch": 13.621420996818664, "grad_norm": 51.20004653930664, "learning_rate": 2.791773281559307e-06, "loss": 0.0665, "num_input_tokens_seen": 25791816, "step": 25690 }, { "epoch": 13.62407211028632, "grad_norm": 1.2100951671600342, "learning_rate": 2.78969784005228e-06, "loss": 0.0349, "num_input_tokens_seen": 25797960, "step": 25695 }, { "epoch": 13.626723223753977, "grad_norm": 1.5570693016052246, "learning_rate": 2.787622871765161e-06, "loss": 0.0519, "num_input_tokens_seen": 25802376, "step": 25700 }, { "epoch": 13.629374337221632, "grad_norm": 9.118130683898926, "learning_rate": 2.7855483771421947e-06, "loss": 0.0684, "num_input_tokens_seen": 25807304, "step": 25705 }, { "epoch": 13.632025450689289, "grad_norm": 2.4651777744293213, "learning_rate": 2.78347435662753e-06, "loss": 0.0129, "num_input_tokens_seen": 25812456, "step": 25710 }, { "epoch": 13.634676564156946, "grad_norm": 6.833725452423096, "learning_rate": 2.781400810665201e-06, "loss": 0.0131, "num_input_tokens_seen": 25816968, "step": 25715 }, { "epoch": 13.637327677624603, "grad_norm": 2.056962013244629, "learning_rate": 2.779327739699156e-06, "loss": 0.0092, "num_input_tokens_seen": 25821320, "step": 25720 }, { "epoch": 13.63997879109226, "grad_norm": 0.1074075847864151, "learning_rate": 2.7772551441732297e-06, "loss": 0.0022, "num_input_tokens_seen": 25827400, "step": 25725 }, { "epoch": 13.642629904559914, "grad_norm": 5.9916839599609375, "learning_rate": 2.7751830245311616e-06, "loss": 0.0685, "num_input_tokens_seen": 25831624, "step": 25730 }, { "epoch": 13.645281018027571, "grad_norm": 1.663611888885498, "learning_rate": 2.7731113812165866e-06, "loss": 0.0083, "num_input_tokens_seen": 25837096, "step": 25735 }, { "epoch": 13.647932131495228, "grad_norm": 0.0571708157658577, "learning_rate": 2.7710402146730365e-06, "loss": 0.1626, "num_input_tokens_seen": 25842728, "step": 25740 }, { "epoch": 13.650583244962885, "grad_norm": 0.6045464873313904, "learning_rate": 2.768969525343943e-06, "loss": 0.097, "num_input_tokens_seen": 25847400, "step": 25745 }, { "epoch": 13.653234358430542, "grad_norm": 0.47659799456596375, "learning_rate": 2.7668993136726395e-06, "loss": 0.0033, "num_input_tokens_seen": 25852232, "step": 25750 }, { "epoch": 13.655885471898197, "grad_norm": 7.612441539764404, "learning_rate": 2.7648295801023452e-06, "loss": 0.1029, "num_input_tokens_seen": 25856200, "step": 25755 }, { "epoch": 13.658536585365853, "grad_norm": 42.295902252197266, "learning_rate": 2.7627603250761915e-06, "loss": 0.1447, "num_input_tokens_seen": 25860840, "step": 25760 }, { "epoch": 13.66118769883351, "grad_norm": 14.503960609436035, "learning_rate": 2.7606915490371977e-06, "loss": 0.0738, "num_input_tokens_seen": 25865864, "step": 25765 }, { "epoch": 13.663838812301167, "grad_norm": 2.2550086975097656, "learning_rate": 2.7586232524282834e-06, "loss": 0.1169, "num_input_tokens_seen": 25870760, "step": 25770 }, { "epoch": 13.666489925768824, "grad_norm": 3.57973575592041, "learning_rate": 2.756555435692266e-06, "loss": 0.0112, "num_input_tokens_seen": 25875144, "step": 25775 }, { "epoch": 13.669141039236479, "grad_norm": 5.004094123840332, "learning_rate": 2.7544880992718592e-06, "loss": 0.0472, "num_input_tokens_seen": 25880264, "step": 25780 }, { "epoch": 13.671792152704136, "grad_norm": 8.865147590637207, "learning_rate": 2.752421243609672e-06, "loss": 0.0139, "num_input_tokens_seen": 25885096, "step": 25785 }, { "epoch": 13.674443266171792, "grad_norm": 0.4958136975765228, "learning_rate": 2.75035486914822e-06, "loss": 0.0174, "num_input_tokens_seen": 25891848, "step": 25790 }, { "epoch": 13.677094379639449, "grad_norm": 31.927427291870117, "learning_rate": 2.7482889763298993e-06, "loss": 0.0224, "num_input_tokens_seen": 25897736, "step": 25795 }, { "epoch": 13.679745493107106, "grad_norm": 28.441679000854492, "learning_rate": 2.7462235655970188e-06, "loss": 0.0205, "num_input_tokens_seen": 25903240, "step": 25800 }, { "epoch": 13.68239660657476, "grad_norm": 15.576406478881836, "learning_rate": 2.744158637391775e-06, "loss": 0.0512, "num_input_tokens_seen": 25907816, "step": 25805 }, { "epoch": 13.685047720042418, "grad_norm": 0.05506409704685211, "learning_rate": 2.742094192156265e-06, "loss": 0.0213, "num_input_tokens_seen": 25913544, "step": 25810 }, { "epoch": 13.687698833510074, "grad_norm": 1.1517338752746582, "learning_rate": 2.740030230332479e-06, "loss": 0.0083, "num_input_tokens_seen": 25918984, "step": 25815 }, { "epoch": 13.690349946977731, "grad_norm": 2.084773063659668, "learning_rate": 2.7379667523623077e-06, "loss": 0.0159, "num_input_tokens_seen": 25923784, "step": 25820 }, { "epoch": 13.693001060445386, "grad_norm": 0.8746092915534973, "learning_rate": 2.7359037586875347e-06, "loss": 0.1416, "num_input_tokens_seen": 25928296, "step": 25825 }, { "epoch": 13.695652173913043, "grad_norm": 4.284721374511719, "learning_rate": 2.7338412497498435e-06, "loss": 0.021, "num_input_tokens_seen": 25933288, "step": 25830 }, { "epoch": 13.6983032873807, "grad_norm": 65.09944915771484, "learning_rate": 2.7317792259908098e-06, "loss": 0.0683, "num_input_tokens_seen": 25938024, "step": 25835 }, { "epoch": 13.700954400848357, "grad_norm": 0.5768559575080872, "learning_rate": 2.729717687851907e-06, "loss": 0.0976, "num_input_tokens_seen": 25943272, "step": 25840 }, { "epoch": 13.703605514316013, "grad_norm": 0.3720680773258209, "learning_rate": 2.7276566357745105e-06, "loss": 0.0085, "num_input_tokens_seen": 25948712, "step": 25845 }, { "epoch": 13.706256627783668, "grad_norm": 53.78484344482422, "learning_rate": 2.7255960701998783e-06, "loss": 0.1419, "num_input_tokens_seen": 25953000, "step": 25850 }, { "epoch": 13.708907741251325, "grad_norm": 5.263442039489746, "learning_rate": 2.7235359915691774e-06, "loss": 0.0074, "num_input_tokens_seen": 25957800, "step": 25855 }, { "epoch": 13.711558854718982, "grad_norm": 46.9400520324707, "learning_rate": 2.7214764003234637e-06, "loss": 0.1986, "num_input_tokens_seen": 25963240, "step": 25860 }, { "epoch": 13.714209968186639, "grad_norm": 0.06976436823606491, "learning_rate": 2.7194172969036912e-06, "loss": 0.0363, "num_input_tokens_seen": 25967624, "step": 25865 }, { "epoch": 13.716861081654296, "grad_norm": 1.5912377834320068, "learning_rate": 2.7173586817507075e-06, "loss": 0.0353, "num_input_tokens_seen": 25973032, "step": 25870 }, { "epoch": 13.71951219512195, "grad_norm": 3.691295862197876, "learning_rate": 2.7153005553052568e-06, "loss": 0.0585, "num_input_tokens_seen": 25979112, "step": 25875 }, { "epoch": 13.722163308589607, "grad_norm": 0.6476324200630188, "learning_rate": 2.7132429180079767e-06, "loss": 0.0205, "num_input_tokens_seen": 25983688, "step": 25880 }, { "epoch": 13.724814422057264, "grad_norm": 2.1570780277252197, "learning_rate": 2.711185770299408e-06, "loss": 0.0776, "num_input_tokens_seen": 25987496, "step": 25885 }, { "epoch": 13.72746553552492, "grad_norm": 2.7957212924957275, "learning_rate": 2.7091291126199732e-06, "loss": 0.0574, "num_input_tokens_seen": 25992200, "step": 25890 }, { "epoch": 13.730116648992578, "grad_norm": 24.615537643432617, "learning_rate": 2.707072945410002e-06, "loss": 0.0456, "num_input_tokens_seen": 25997672, "step": 25895 }, { "epoch": 13.732767762460233, "grad_norm": 28.253217697143555, "learning_rate": 2.7050172691097143e-06, "loss": 0.1012, "num_input_tokens_seen": 26002216, "step": 25900 }, { "epoch": 13.73541887592789, "grad_norm": 0.12758059799671173, "learning_rate": 2.702962084159223e-06, "loss": 0.007, "num_input_tokens_seen": 26007144, "step": 25905 }, { "epoch": 13.738069989395546, "grad_norm": 7.116919994354248, "learning_rate": 2.7009073909985385e-06, "loss": 0.0093, "num_input_tokens_seen": 26011304, "step": 25910 }, { "epoch": 13.740721102863203, "grad_norm": 1.2368537187576294, "learning_rate": 2.6988531900675665e-06, "loss": 0.0053, "num_input_tokens_seen": 26016008, "step": 25915 }, { "epoch": 13.743372216330858, "grad_norm": 0.41392406821250916, "learning_rate": 2.6967994818061016e-06, "loss": 0.097, "num_input_tokens_seen": 26020552, "step": 25920 }, { "epoch": 13.746023329798515, "grad_norm": 3.4548399448394775, "learning_rate": 2.694746266653846e-06, "loss": 0.0549, "num_input_tokens_seen": 26025928, "step": 25925 }, { "epoch": 13.748674443266172, "grad_norm": 7.660516738891602, "learning_rate": 2.6926935450503776e-06, "loss": 0.005, "num_input_tokens_seen": 26032648, "step": 25930 }, { "epoch": 13.751325556733828, "grad_norm": 32.69276428222656, "learning_rate": 2.6906413174351864e-06, "loss": 0.1539, "num_input_tokens_seen": 26038504, "step": 25935 }, { "epoch": 13.753976670201485, "grad_norm": 4.770357608795166, "learning_rate": 2.688589584247646e-06, "loss": 0.0224, "num_input_tokens_seen": 26042600, "step": 25940 }, { "epoch": 13.756627783669142, "grad_norm": 11.963680267333984, "learning_rate": 2.686538345927027e-06, "loss": 0.0193, "num_input_tokens_seen": 26047784, "step": 25945 }, { "epoch": 13.759278897136797, "grad_norm": 43.45428466796875, "learning_rate": 2.6844876029124946e-06, "loss": 0.1205, "num_input_tokens_seen": 26052648, "step": 25950 }, { "epoch": 13.761930010604454, "grad_norm": 0.025136733427643776, "learning_rate": 2.6824373556431072e-06, "loss": 0.0612, "num_input_tokens_seen": 26057736, "step": 25955 }, { "epoch": 13.76458112407211, "grad_norm": 3.4240562915802, "learning_rate": 2.680387604557817e-06, "loss": 0.0059, "num_input_tokens_seen": 26062728, "step": 25960 }, { "epoch": 13.767232237539767, "grad_norm": 0.3320271372795105, "learning_rate": 2.678338350095472e-06, "loss": 0.0158, "num_input_tokens_seen": 26067496, "step": 25965 }, { "epoch": 13.769883351007422, "grad_norm": 6.076292514801025, "learning_rate": 2.67628959269481e-06, "loss": 0.0478, "num_input_tokens_seen": 26072200, "step": 25970 }, { "epoch": 13.77253446447508, "grad_norm": 0.6416746973991394, "learning_rate": 2.6742413327944637e-06, "loss": 0.0205, "num_input_tokens_seen": 26076584, "step": 25975 }, { "epoch": 13.775185577942736, "grad_norm": 37.929229736328125, "learning_rate": 2.6721935708329673e-06, "loss": 0.0357, "num_input_tokens_seen": 26081416, "step": 25980 }, { "epoch": 13.777836691410393, "grad_norm": 0.6484785079956055, "learning_rate": 2.6701463072487312e-06, "loss": 0.0093, "num_input_tokens_seen": 26085512, "step": 25985 }, { "epoch": 13.78048780487805, "grad_norm": 3.7602527141571045, "learning_rate": 2.668099542480076e-06, "loss": 0.0433, "num_input_tokens_seen": 26091720, "step": 25990 }, { "epoch": 13.783138918345704, "grad_norm": 20.771869659423828, "learning_rate": 2.666053276965207e-06, "loss": 0.2076, "num_input_tokens_seen": 26097224, "step": 25995 }, { "epoch": 13.785790031813361, "grad_norm": 20.87070083618164, "learning_rate": 2.664007511142225e-06, "loss": 0.038, "num_input_tokens_seen": 26102440, "step": 26000 }, { "epoch": 13.788441145281018, "grad_norm": 0.1295015811920166, "learning_rate": 2.661962245449121e-06, "loss": 0.0424, "num_input_tokens_seen": 26106760, "step": 26005 }, { "epoch": 13.791092258748675, "grad_norm": 0.3840465247631073, "learning_rate": 2.6599174803237826e-06, "loss": 0.0521, "num_input_tokens_seen": 26112616, "step": 26010 }, { "epoch": 13.793743372216332, "grad_norm": 2.0303072929382324, "learning_rate": 2.657873216203986e-06, "loss": 0.1145, "num_input_tokens_seen": 26117512, "step": 26015 }, { "epoch": 13.796394485683987, "grad_norm": 14.741573333740234, "learning_rate": 2.6558294535274094e-06, "loss": 0.0151, "num_input_tokens_seen": 26122120, "step": 26020 }, { "epoch": 13.799045599151643, "grad_norm": 30.67636489868164, "learning_rate": 2.653786192731609e-06, "loss": 0.0958, "num_input_tokens_seen": 26127272, "step": 26025 }, { "epoch": 13.8016967126193, "grad_norm": 34.83574295043945, "learning_rate": 2.6517434342540465e-06, "loss": 0.0238, "num_input_tokens_seen": 26132072, "step": 26030 }, { "epoch": 13.804347826086957, "grad_norm": 0.3800014555454254, "learning_rate": 2.64970117853207e-06, "loss": 0.0169, "num_input_tokens_seen": 26138376, "step": 26035 }, { "epoch": 13.806998939554614, "grad_norm": 0.5737857818603516, "learning_rate": 2.647659426002922e-06, "loss": 0.0739, "num_input_tokens_seen": 26143080, "step": 26040 }, { "epoch": 13.809650053022269, "grad_norm": 0.5179604291915894, "learning_rate": 2.6456181771037347e-06, "loss": 0.0167, "num_input_tokens_seen": 26148424, "step": 26045 }, { "epoch": 13.812301166489926, "grad_norm": 0.07014717906713486, "learning_rate": 2.6435774322715353e-06, "loss": 0.0286, "num_input_tokens_seen": 26152616, "step": 26050 }, { "epoch": 13.814952279957582, "grad_norm": 11.01948070526123, "learning_rate": 2.6415371919432393e-06, "loss": 0.0735, "num_input_tokens_seen": 26157320, "step": 26055 }, { "epoch": 13.81760339342524, "grad_norm": 8.880184173583984, "learning_rate": 2.639497456555663e-06, "loss": 0.0343, "num_input_tokens_seen": 26162472, "step": 26060 }, { "epoch": 13.820254506892894, "grad_norm": 2.349714994430542, "learning_rate": 2.6374582265455007e-06, "loss": 0.0318, "num_input_tokens_seen": 26167144, "step": 26065 }, { "epoch": 13.822905620360551, "grad_norm": 5.641659736633301, "learning_rate": 2.635419502349351e-06, "loss": 0.0654, "num_input_tokens_seen": 26171912, "step": 26070 }, { "epoch": 13.825556733828208, "grad_norm": 0.3841650187969208, "learning_rate": 2.6333812844037e-06, "loss": 0.0229, "num_input_tokens_seen": 26176776, "step": 26075 }, { "epoch": 13.828207847295864, "grad_norm": 56.91579055786133, "learning_rate": 2.6313435731449224e-06, "loss": 0.0362, "num_input_tokens_seen": 26181544, "step": 26080 }, { "epoch": 13.830858960763521, "grad_norm": 0.7260228395462036, "learning_rate": 2.629306369009287e-06, "loss": 0.0376, "num_input_tokens_seen": 26187208, "step": 26085 }, { "epoch": 13.833510074231176, "grad_norm": 4.249791622161865, "learning_rate": 2.627269672432955e-06, "loss": 0.0058, "num_input_tokens_seen": 26191496, "step": 26090 }, { "epoch": 13.836161187698833, "grad_norm": 0.5420987606048584, "learning_rate": 2.625233483851978e-06, "loss": 0.0126, "num_input_tokens_seen": 26196392, "step": 26095 }, { "epoch": 13.83881230116649, "grad_norm": 53.75204849243164, "learning_rate": 2.6231978037022965e-06, "loss": 0.0453, "num_input_tokens_seen": 26202536, "step": 26100 }, { "epoch": 13.841463414634147, "grad_norm": 0.3973683714866638, "learning_rate": 2.621162632419747e-06, "loss": 0.0794, "num_input_tokens_seen": 26207752, "step": 26105 }, { "epoch": 13.844114528101803, "grad_norm": 0.44237929582595825, "learning_rate": 2.6191279704400506e-06, "loss": 0.0931, "num_input_tokens_seen": 26213448, "step": 26110 }, { "epoch": 13.846765641569458, "grad_norm": 22.18667984008789, "learning_rate": 2.6170938181988297e-06, "loss": 0.0642, "num_input_tokens_seen": 26218472, "step": 26115 }, { "epoch": 13.849416755037115, "grad_norm": 0.047342102974653244, "learning_rate": 2.615060176131583e-06, "loss": 0.0074, "num_input_tokens_seen": 26224392, "step": 26120 }, { "epoch": 13.852067868504772, "grad_norm": 15.463645935058594, "learning_rate": 2.613027044673714e-06, "loss": 0.0435, "num_input_tokens_seen": 26229576, "step": 26125 }, { "epoch": 13.854718981972429, "grad_norm": 0.09741286188364029, "learning_rate": 2.6109944242605077e-06, "loss": 0.0091, "num_input_tokens_seen": 26235112, "step": 26130 }, { "epoch": 13.857370095440086, "grad_norm": 87.76703643798828, "learning_rate": 2.608962315327145e-06, "loss": 0.2076, "num_input_tokens_seen": 26240136, "step": 26135 }, { "epoch": 13.86002120890774, "grad_norm": 16.84327507019043, "learning_rate": 2.6069307183086933e-06, "loss": 0.075, "num_input_tokens_seen": 26246632, "step": 26140 }, { "epoch": 13.862672322375397, "grad_norm": 5.751044273376465, "learning_rate": 2.6048996336401126e-06, "loss": 0.0097, "num_input_tokens_seen": 26253256, "step": 26145 }, { "epoch": 13.865323435843054, "grad_norm": 1.6984920501708984, "learning_rate": 2.6028690617562513e-06, "loss": 0.0284, "num_input_tokens_seen": 26259048, "step": 26150 }, { "epoch": 13.867974549310711, "grad_norm": 1.849069356918335, "learning_rate": 2.600839003091855e-06, "loss": 0.0071, "num_input_tokens_seen": 26263816, "step": 26155 }, { "epoch": 13.870625662778368, "grad_norm": 0.048834461718797684, "learning_rate": 2.5988094580815466e-06, "loss": 0.0175, "num_input_tokens_seen": 26269000, "step": 26160 }, { "epoch": 13.873276776246023, "grad_norm": 3.7280631065368652, "learning_rate": 2.5967804271598517e-06, "loss": 0.1206, "num_input_tokens_seen": 26273672, "step": 26165 }, { "epoch": 13.87592788971368, "grad_norm": 14.297170639038086, "learning_rate": 2.5947519107611785e-06, "loss": 0.0548, "num_input_tokens_seen": 26280712, "step": 26170 }, { "epoch": 13.878579003181336, "grad_norm": 32.6387939453125, "learning_rate": 2.5927239093198273e-06, "loss": 0.0273, "num_input_tokens_seen": 26285320, "step": 26175 }, { "epoch": 13.881230116648993, "grad_norm": 31.534313201904297, "learning_rate": 2.5906964232699882e-06, "loss": 0.1426, "num_input_tokens_seen": 26289768, "step": 26180 }, { "epoch": 13.88388123011665, "grad_norm": 0.1867646425962448, "learning_rate": 2.5886694530457406e-06, "loss": 0.191, "num_input_tokens_seen": 26294664, "step": 26185 }, { "epoch": 13.886532343584305, "grad_norm": 1.1894195079803467, "learning_rate": 2.586642999081051e-06, "loss": 0.0228, "num_input_tokens_seen": 26299208, "step": 26190 }, { "epoch": 13.889183457051962, "grad_norm": 17.163021087646484, "learning_rate": 2.5846170618097844e-06, "loss": 0.1042, "num_input_tokens_seen": 26303752, "step": 26195 }, { "epoch": 13.891834570519618, "grad_norm": 2.0849525928497314, "learning_rate": 2.58259164166568e-06, "loss": 0.0259, "num_input_tokens_seen": 26308584, "step": 26200 }, { "epoch": 13.894485683987275, "grad_norm": 1.2204927206039429, "learning_rate": 2.580566739082382e-06, "loss": 0.0018, "num_input_tokens_seen": 26314472, "step": 26205 }, { "epoch": 13.89713679745493, "grad_norm": 0.1314382404088974, "learning_rate": 2.578542354493413e-06, "loss": 0.0063, "num_input_tokens_seen": 26319368, "step": 26210 }, { "epoch": 13.899787910922587, "grad_norm": 2.507283926010132, "learning_rate": 2.57651848833219e-06, "loss": 0.0221, "num_input_tokens_seen": 26323144, "step": 26215 }, { "epoch": 13.902439024390244, "grad_norm": 60.16376495361328, "learning_rate": 2.5744951410320163e-06, "loss": 0.0459, "num_input_tokens_seen": 26327560, "step": 26220 }, { "epoch": 13.9050901378579, "grad_norm": 0.38634437322616577, "learning_rate": 2.572472313026089e-06, "loss": 0.0222, "num_input_tokens_seen": 26333064, "step": 26225 }, { "epoch": 13.907741251325557, "grad_norm": 45.81462097167969, "learning_rate": 2.570450004747485e-06, "loss": 0.0732, "num_input_tokens_seen": 26337832, "step": 26230 }, { "epoch": 13.910392364793212, "grad_norm": 5.040156364440918, "learning_rate": 2.5684282166291758e-06, "loss": 0.116, "num_input_tokens_seen": 26342600, "step": 26235 }, { "epoch": 13.91304347826087, "grad_norm": 1.0317375659942627, "learning_rate": 2.5664069491040265e-06, "loss": 0.0314, "num_input_tokens_seen": 26347944, "step": 26240 }, { "epoch": 13.915694591728526, "grad_norm": 0.36990875005722046, "learning_rate": 2.564386202604778e-06, "loss": 0.0204, "num_input_tokens_seen": 26353256, "step": 26245 }, { "epoch": 13.918345705196183, "grad_norm": 3.8410863876342773, "learning_rate": 2.5623659775640714e-06, "loss": 0.0065, "num_input_tokens_seen": 26358216, "step": 26250 }, { "epoch": 13.92099681866384, "grad_norm": 5.152157783508301, "learning_rate": 2.56034627441443e-06, "loss": 0.0323, "num_input_tokens_seen": 26362888, "step": 26255 }, { "epoch": 13.923647932131495, "grad_norm": 27.192928314208984, "learning_rate": 2.5583270935882687e-06, "loss": 0.1084, "num_input_tokens_seen": 26367848, "step": 26260 }, { "epoch": 13.926299045599151, "grad_norm": 76.13096618652344, "learning_rate": 2.5563084355178867e-06, "loss": 0.1053, "num_input_tokens_seen": 26373288, "step": 26265 }, { "epoch": 13.928950159066808, "grad_norm": 40.10076904296875, "learning_rate": 2.5542903006354736e-06, "loss": 0.2672, "num_input_tokens_seen": 26378088, "step": 26270 }, { "epoch": 13.931601272534465, "grad_norm": 5.917727470397949, "learning_rate": 2.5522726893731066e-06, "loss": 0.1185, "num_input_tokens_seen": 26382664, "step": 26275 }, { "epoch": 13.934252386002122, "grad_norm": 17.15923500061035, "learning_rate": 2.5502556021627545e-06, "loss": 0.024, "num_input_tokens_seen": 26386664, "step": 26280 }, { "epoch": 13.936903499469777, "grad_norm": 8.386080741882324, "learning_rate": 2.548239039436264e-06, "loss": 0.0114, "num_input_tokens_seen": 26390984, "step": 26285 }, { "epoch": 13.939554612937433, "grad_norm": 2.4919919967651367, "learning_rate": 2.546223001625382e-06, "loss": 0.0223, "num_input_tokens_seen": 26395912, "step": 26290 }, { "epoch": 13.94220572640509, "grad_norm": 0.09739410132169724, "learning_rate": 2.5442074891617343e-06, "loss": 0.011, "num_input_tokens_seen": 26401480, "step": 26295 }, { "epoch": 13.944856839872747, "grad_norm": 0.5188329219818115, "learning_rate": 2.542192502476837e-06, "loss": 0.0115, "num_input_tokens_seen": 26405864, "step": 26300 }, { "epoch": 13.947507953340402, "grad_norm": 23.66714859008789, "learning_rate": 2.5401780420020935e-06, "loss": 0.0253, "num_input_tokens_seen": 26411176, "step": 26305 }, { "epoch": 13.950159066808059, "grad_norm": 3.447916030883789, "learning_rate": 2.538164108168794e-06, "loss": 0.0046, "num_input_tokens_seen": 26417128, "step": 26310 }, { "epoch": 13.952810180275716, "grad_norm": 2.259341239929199, "learning_rate": 2.5361507014081155e-06, "loss": 0.0505, "num_input_tokens_seen": 26421992, "step": 26315 }, { "epoch": 13.955461293743372, "grad_norm": 0.6864450573921204, "learning_rate": 2.5341378221511282e-06, "loss": 0.0313, "num_input_tokens_seen": 26427016, "step": 26320 }, { "epoch": 13.95811240721103, "grad_norm": 0.15347504615783691, "learning_rate": 2.5321254708287774e-06, "loss": 0.0352, "num_input_tokens_seen": 26432424, "step": 26325 }, { "epoch": 13.960763520678686, "grad_norm": 79.05821228027344, "learning_rate": 2.5301136478719067e-06, "loss": 0.0902, "num_input_tokens_seen": 26436968, "step": 26330 }, { "epoch": 13.963414634146341, "grad_norm": 7.266596794128418, "learning_rate": 2.5281023537112417e-06, "loss": 0.2573, "num_input_tokens_seen": 26442760, "step": 26335 }, { "epoch": 13.966065747613998, "grad_norm": 0.029063045978546143, "learning_rate": 2.526091588777394e-06, "loss": 0.0314, "num_input_tokens_seen": 26447336, "step": 26340 }, { "epoch": 13.968716861081655, "grad_norm": 7.163522243499756, "learning_rate": 2.5240813535008634e-06, "loss": 0.1406, "num_input_tokens_seen": 26451816, "step": 26345 }, { "epoch": 13.971367974549311, "grad_norm": 14.626620292663574, "learning_rate": 2.522071648312037e-06, "loss": 0.2528, "num_input_tokens_seen": 26458056, "step": 26350 }, { "epoch": 13.974019088016966, "grad_norm": 20.361499786376953, "learning_rate": 2.520062473641184e-06, "loss": 0.0488, "num_input_tokens_seen": 26462856, "step": 26355 }, { "epoch": 13.976670201484623, "grad_norm": 1.5985558032989502, "learning_rate": 2.51805382991847e-06, "loss": 0.0197, "num_input_tokens_seen": 26468328, "step": 26360 }, { "epoch": 13.97932131495228, "grad_norm": 1.274476170539856, "learning_rate": 2.5160457175739337e-06, "loss": 0.0081, "num_input_tokens_seen": 26473672, "step": 26365 }, { "epoch": 13.981972428419937, "grad_norm": 0.708411455154419, "learning_rate": 2.514038137037509e-06, "loss": 0.0078, "num_input_tokens_seen": 26478472, "step": 26370 }, { "epoch": 13.984623541887593, "grad_norm": 5.183390140533447, "learning_rate": 2.5120310887390163e-06, "loss": 0.0856, "num_input_tokens_seen": 26482888, "step": 26375 }, { "epoch": 13.987274655355248, "grad_norm": 6.49507999420166, "learning_rate": 2.5100245731081545e-06, "loss": 0.038, "num_input_tokens_seen": 26487880, "step": 26380 }, { "epoch": 13.989925768822905, "grad_norm": 71.71339416503906, "learning_rate": 2.508018590574518e-06, "loss": 0.1708, "num_input_tokens_seen": 26493224, "step": 26385 }, { "epoch": 13.992576882290562, "grad_norm": 8.278069496154785, "learning_rate": 2.5060131415675798e-06, "loss": 0.298, "num_input_tokens_seen": 26497448, "step": 26390 }, { "epoch": 13.995227995758219, "grad_norm": 0.5077017545700073, "learning_rate": 2.504008226516702e-06, "loss": 0.0125, "num_input_tokens_seen": 26501448, "step": 26395 }, { "epoch": 13.997879109225876, "grad_norm": 45.49373245239258, "learning_rate": 2.502003845851132e-06, "loss": 0.1017, "num_input_tokens_seen": 26507720, "step": 26400 }, { "epoch": 14.0, "eval_loss": 0.7290266156196594, "eval_runtime": 29.4167, "eval_samples_per_second": 64.113, "eval_steps_per_second": 16.045, "num_input_tokens_seen": 26510480, "step": 26404 }, { "epoch": 14.00053022269353, "grad_norm": 0.4395175874233246, "learning_rate": 2.5000000000000015e-06, "loss": 0.0268, "num_input_tokens_seen": 26511728, "step": 26405 }, { "epoch": 14.003181336161187, "grad_norm": 1.5622355937957764, "learning_rate": 2.497996689392327e-06, "loss": 0.0142, "num_input_tokens_seen": 26516240, "step": 26410 }, { "epoch": 14.005832449628844, "grad_norm": 39.31336975097656, "learning_rate": 2.4959939144570184e-06, "loss": 0.1573, "num_input_tokens_seen": 26520720, "step": 26415 }, { "epoch": 14.008483563096501, "grad_norm": 0.14109887182712555, "learning_rate": 2.4939916756228565e-06, "loss": 0.0197, "num_input_tokens_seen": 26527088, "step": 26420 }, { "epoch": 14.011134676564158, "grad_norm": 10.955076217651367, "learning_rate": 2.4919899733185216e-06, "loss": 0.0081, "num_input_tokens_seen": 26532752, "step": 26425 }, { "epoch": 14.013785790031813, "grad_norm": 1.1942700147628784, "learning_rate": 2.4899888079725705e-06, "loss": 0.0046, "num_input_tokens_seen": 26537840, "step": 26430 }, { "epoch": 14.01643690349947, "grad_norm": 0.020659063011407852, "learning_rate": 2.487988180013447e-06, "loss": 0.004, "num_input_tokens_seen": 26542576, "step": 26435 }, { "epoch": 14.019088016967126, "grad_norm": 0.1256096214056015, "learning_rate": 2.485988089869481e-06, "loss": 0.006, "num_input_tokens_seen": 26548208, "step": 26440 }, { "epoch": 14.021739130434783, "grad_norm": 1.0836654901504517, "learning_rate": 2.483988537968887e-06, "loss": 0.0084, "num_input_tokens_seen": 26552816, "step": 26445 }, { "epoch": 14.024390243902438, "grad_norm": 0.36116543412208557, "learning_rate": 2.481989524739761e-06, "loss": 0.0214, "num_input_tokens_seen": 26557776, "step": 26450 }, { "epoch": 14.027041357370095, "grad_norm": 0.0569193959236145, "learning_rate": 2.4799910506100927e-06, "loss": 0.0041, "num_input_tokens_seen": 26563056, "step": 26455 }, { "epoch": 14.029692470837752, "grad_norm": 0.5629674196243286, "learning_rate": 2.4779931160077426e-06, "loss": 0.0054, "num_input_tokens_seen": 26567472, "step": 26460 }, { "epoch": 14.032343584305409, "grad_norm": 19.380878448486328, "learning_rate": 2.4759957213604686e-06, "loss": 0.0318, "num_input_tokens_seen": 26571952, "step": 26465 }, { "epoch": 14.034994697773065, "grad_norm": 0.6977324485778809, "learning_rate": 2.473998867095906e-06, "loss": 0.0021, "num_input_tokens_seen": 26576176, "step": 26470 }, { "epoch": 14.03764581124072, "grad_norm": 12.941670417785645, "learning_rate": 2.472002553641576e-06, "loss": 0.0099, "num_input_tokens_seen": 26580912, "step": 26475 }, { "epoch": 14.040296924708377, "grad_norm": 26.685890197753906, "learning_rate": 2.4700067814248844e-06, "loss": 0.0122, "num_input_tokens_seen": 26586384, "step": 26480 }, { "epoch": 14.042948038176034, "grad_norm": 0.1472027450799942, "learning_rate": 2.468011550873121e-06, "loss": 0.0011, "num_input_tokens_seen": 26592016, "step": 26485 }, { "epoch": 14.04559915164369, "grad_norm": 0.20893479883670807, "learning_rate": 2.466016862413456e-06, "loss": 0.0032, "num_input_tokens_seen": 26596528, "step": 26490 }, { "epoch": 14.048250265111347, "grad_norm": 77.35014343261719, "learning_rate": 2.4640227164729547e-06, "loss": 0.1045, "num_input_tokens_seen": 26601808, "step": 26495 }, { "epoch": 14.050901378579002, "grad_norm": 0.503719687461853, "learning_rate": 2.462029113478551e-06, "loss": 0.0029, "num_input_tokens_seen": 26607344, "step": 26500 }, { "epoch": 14.05355249204666, "grad_norm": 0.4371132254600525, "learning_rate": 2.4600360538570724e-06, "loss": 0.0529, "num_input_tokens_seen": 26612688, "step": 26505 }, { "epoch": 14.056203605514316, "grad_norm": 1.2161682844161987, "learning_rate": 2.458043538035231e-06, "loss": 0.0128, "num_input_tokens_seen": 26619888, "step": 26510 }, { "epoch": 14.058854718981973, "grad_norm": 0.35972538590431213, "learning_rate": 2.456051566439613e-06, "loss": 0.0093, "num_input_tokens_seen": 26624848, "step": 26515 }, { "epoch": 14.06150583244963, "grad_norm": 0.3072405755519867, "learning_rate": 2.4540601394966996e-06, "loss": 0.0088, "num_input_tokens_seen": 26632752, "step": 26520 }, { "epoch": 14.064156945917285, "grad_norm": 2.4070510864257812, "learning_rate": 2.452069257632848e-06, "loss": 0.0533, "num_input_tokens_seen": 26637136, "step": 26525 }, { "epoch": 14.066808059384941, "grad_norm": 8.758004188537598, "learning_rate": 2.4500789212743016e-06, "loss": 0.0334, "num_input_tokens_seen": 26641712, "step": 26530 }, { "epoch": 14.069459172852598, "grad_norm": 0.1263919323682785, "learning_rate": 2.4480891308471855e-06, "loss": 0.0013, "num_input_tokens_seen": 26646672, "step": 26535 }, { "epoch": 14.072110286320255, "grad_norm": 0.10593519359827042, "learning_rate": 2.446099886777509e-06, "loss": 0.003, "num_input_tokens_seen": 26650960, "step": 26540 }, { "epoch": 14.074761399787912, "grad_norm": 2.395752429962158, "learning_rate": 2.4441111894911616e-06, "loss": 0.0161, "num_input_tokens_seen": 26656048, "step": 26545 }, { "epoch": 14.077412513255567, "grad_norm": 9.829248428344727, "learning_rate": 2.4421230394139244e-06, "loss": 0.0149, "num_input_tokens_seen": 26660496, "step": 26550 }, { "epoch": 14.080063626723224, "grad_norm": 0.7502285838127136, "learning_rate": 2.4401354369714476e-06, "loss": 0.0037, "num_input_tokens_seen": 26666576, "step": 26555 }, { "epoch": 14.08271474019088, "grad_norm": 2.9380078315734863, "learning_rate": 2.4381483825892765e-06, "loss": 0.2617, "num_input_tokens_seen": 26671184, "step": 26560 }, { "epoch": 14.085365853658537, "grad_norm": 0.19749966263771057, "learning_rate": 2.4361618766928334e-06, "loss": 0.0175, "num_input_tokens_seen": 26676624, "step": 26565 }, { "epoch": 14.088016967126194, "grad_norm": 23.94019317626953, "learning_rate": 2.434175919707424e-06, "loss": 0.0173, "num_input_tokens_seen": 26681776, "step": 26570 }, { "epoch": 14.090668080593849, "grad_norm": 0.04537449777126312, "learning_rate": 2.4321905120582357e-06, "loss": 0.0007, "num_input_tokens_seen": 26687408, "step": 26575 }, { "epoch": 14.093319194061506, "grad_norm": 0.8869311809539795, "learning_rate": 2.4302056541703396e-06, "loss": 0.159, "num_input_tokens_seen": 26691696, "step": 26580 }, { "epoch": 14.095970307529162, "grad_norm": 9.236692428588867, "learning_rate": 2.428221346468686e-06, "loss": 0.0115, "num_input_tokens_seen": 26697552, "step": 26585 }, { "epoch": 14.09862142099682, "grad_norm": 2.4979467391967773, "learning_rate": 2.4262375893781172e-06, "loss": 0.008, "num_input_tokens_seen": 26703152, "step": 26590 }, { "epoch": 14.101272534464474, "grad_norm": 0.569055438041687, "learning_rate": 2.4242543833233413e-06, "loss": 0.0667, "num_input_tokens_seen": 26708432, "step": 26595 }, { "epoch": 14.103923647932131, "grad_norm": 0.36112332344055176, "learning_rate": 2.4222717287289638e-06, "loss": 0.1021, "num_input_tokens_seen": 26713488, "step": 26600 }, { "epoch": 14.106574761399788, "grad_norm": 0.28679585456848145, "learning_rate": 2.4202896260194638e-06, "loss": 0.0594, "num_input_tokens_seen": 26718064, "step": 26605 }, { "epoch": 14.109225874867445, "grad_norm": 2.373145818710327, "learning_rate": 2.4183080756192048e-06, "loss": 0.1233, "num_input_tokens_seen": 26723536, "step": 26610 }, { "epoch": 14.111876988335101, "grad_norm": 15.314040184020996, "learning_rate": 2.4163270779524313e-06, "loss": 0.0132, "num_input_tokens_seen": 26728368, "step": 26615 }, { "epoch": 14.114528101802756, "grad_norm": 0.4007110893726349, "learning_rate": 2.4143466334432693e-06, "loss": 0.0026, "num_input_tokens_seen": 26732272, "step": 26620 }, { "epoch": 14.117179215270413, "grad_norm": 4.08860445022583, "learning_rate": 2.4123667425157247e-06, "loss": 0.0079, "num_input_tokens_seen": 26736240, "step": 26625 }, { "epoch": 14.11983032873807, "grad_norm": 0.14998753368854523, "learning_rate": 2.4103874055936945e-06, "loss": 0.0142, "num_input_tokens_seen": 26741520, "step": 26630 }, { "epoch": 14.122481442205727, "grad_norm": 19.448490142822266, "learning_rate": 2.40840862310094e-06, "loss": 0.0101, "num_input_tokens_seen": 26747440, "step": 26635 }, { "epoch": 14.125132555673384, "grad_norm": 3.830895185470581, "learning_rate": 2.406430395461119e-06, "loss": 0.0089, "num_input_tokens_seen": 26752528, "step": 26640 }, { "epoch": 14.127783669141039, "grad_norm": 0.814468502998352, "learning_rate": 2.404452723097766e-06, "loss": 0.0013, "num_input_tokens_seen": 26757552, "step": 26645 }, { "epoch": 14.130434782608695, "grad_norm": 0.7481362223625183, "learning_rate": 2.4024756064342896e-06, "loss": 0.0513, "num_input_tokens_seen": 26761680, "step": 26650 }, { "epoch": 14.133085896076352, "grad_norm": 36.71363067626953, "learning_rate": 2.4004990458939907e-06, "loss": 0.0266, "num_input_tokens_seen": 26766224, "step": 26655 }, { "epoch": 14.135737009544009, "grad_norm": 0.0332450270652771, "learning_rate": 2.3985230419000437e-06, "loss": 0.0506, "num_input_tokens_seen": 26771344, "step": 26660 }, { "epoch": 14.138388123011666, "grad_norm": 0.36219045519828796, "learning_rate": 2.3965475948755063e-06, "loss": 0.0043, "num_input_tokens_seen": 26776048, "step": 26665 }, { "epoch": 14.14103923647932, "grad_norm": 1.200850248336792, "learning_rate": 2.394572705243316e-06, "loss": 0.0071, "num_input_tokens_seen": 26780592, "step": 26670 }, { "epoch": 14.143690349946977, "grad_norm": 28.742916107177734, "learning_rate": 2.392598373426292e-06, "loss": 0.0623, "num_input_tokens_seen": 26785424, "step": 26675 }, { "epoch": 14.146341463414634, "grad_norm": 18.273649215698242, "learning_rate": 2.3906245998471312e-06, "loss": 0.0098, "num_input_tokens_seen": 26790704, "step": 26680 }, { "epoch": 14.148992576882291, "grad_norm": 0.8898906707763672, "learning_rate": 2.388651384928419e-06, "loss": 0.0082, "num_input_tokens_seen": 26794640, "step": 26685 }, { "epoch": 14.151643690349948, "grad_norm": 2.3906185626983643, "learning_rate": 2.3866787290926093e-06, "loss": 0.0075, "num_input_tokens_seen": 26798768, "step": 26690 }, { "epoch": 14.154294803817603, "grad_norm": 3.1793901920318604, "learning_rate": 2.3847066327620465e-06, "loss": 0.0029, "num_input_tokens_seen": 26803056, "step": 26695 }, { "epoch": 14.15694591728526, "grad_norm": 57.55644226074219, "learning_rate": 2.38273509635895e-06, "loss": 0.0648, "num_input_tokens_seen": 26807568, "step": 26700 }, { "epoch": 14.159597030752916, "grad_norm": 69.91065216064453, "learning_rate": 2.380764120305421e-06, "loss": 0.0831, "num_input_tokens_seen": 26812336, "step": 26705 }, { "epoch": 14.162248144220573, "grad_norm": 10.058880805969238, "learning_rate": 2.37879370502344e-06, "loss": 0.0107, "num_input_tokens_seen": 26817904, "step": 26710 }, { "epoch": 14.164899257688228, "grad_norm": 16.848215103149414, "learning_rate": 2.376823850934868e-06, "loss": 0.1253, "num_input_tokens_seen": 26822640, "step": 26715 }, { "epoch": 14.167550371155885, "grad_norm": 0.40891966223716736, "learning_rate": 2.374854558461443e-06, "loss": 0.0085, "num_input_tokens_seen": 26828016, "step": 26720 }, { "epoch": 14.170201484623542, "grad_norm": 0.1700470894575119, "learning_rate": 2.372885828024792e-06, "loss": 0.0543, "num_input_tokens_seen": 26832784, "step": 26725 }, { "epoch": 14.172852598091199, "grad_norm": 1.8027185201644897, "learning_rate": 2.370917660046406e-06, "loss": 0.0047, "num_input_tokens_seen": 26838032, "step": 26730 }, { "epoch": 14.175503711558855, "grad_norm": 4.092036724090576, "learning_rate": 2.3689500549476723e-06, "loss": 0.0034, "num_input_tokens_seen": 26843536, "step": 26735 }, { "epoch": 14.17815482502651, "grad_norm": 46.16733932495117, "learning_rate": 2.3669830131498465e-06, "loss": 0.0218, "num_input_tokens_seen": 26848560, "step": 26740 }, { "epoch": 14.180805938494167, "grad_norm": 1.2904441356658936, "learning_rate": 2.3650165350740677e-06, "loss": 0.0128, "num_input_tokens_seen": 26853648, "step": 26745 }, { "epoch": 14.183457051961824, "grad_norm": 12.453722953796387, "learning_rate": 2.363050621141354e-06, "loss": 0.0077, "num_input_tokens_seen": 26858448, "step": 26750 }, { "epoch": 14.18610816542948, "grad_norm": 0.18262922763824463, "learning_rate": 2.361085271772602e-06, "loss": 0.0012, "num_input_tokens_seen": 26862448, "step": 26755 }, { "epoch": 14.188759278897138, "grad_norm": 14.065496444702148, "learning_rate": 2.359120487388586e-06, "loss": 0.0586, "num_input_tokens_seen": 26867952, "step": 26760 }, { "epoch": 14.191410392364793, "grad_norm": 1.2539056539535522, "learning_rate": 2.3571562684099667e-06, "loss": 0.0138, "num_input_tokens_seen": 26872464, "step": 26765 }, { "epoch": 14.19406150583245, "grad_norm": 1.1251119375228882, "learning_rate": 2.3551926152572713e-06, "loss": 0.0026, "num_input_tokens_seen": 26877296, "step": 26770 }, { "epoch": 14.196712619300106, "grad_norm": 82.2209701538086, "learning_rate": 2.3532295283509183e-06, "loss": 0.1345, "num_input_tokens_seen": 26882864, "step": 26775 }, { "epoch": 14.199363732767763, "grad_norm": 1.3022822141647339, "learning_rate": 2.351267008111199e-06, "loss": 0.0046, "num_input_tokens_seen": 26887824, "step": 26780 }, { "epoch": 14.20201484623542, "grad_norm": 0.6877485513687134, "learning_rate": 2.349305054958278e-06, "loss": 0.0774, "num_input_tokens_seen": 26893424, "step": 26785 }, { "epoch": 14.204665959703075, "grad_norm": 0.0884845033288002, "learning_rate": 2.347343669312211e-06, "loss": 0.0048, "num_input_tokens_seen": 26897456, "step": 26790 }, { "epoch": 14.207317073170731, "grad_norm": 0.5236916542053223, "learning_rate": 2.345382851592923e-06, "loss": 0.0064, "num_input_tokens_seen": 26902480, "step": 26795 }, { "epoch": 14.209968186638388, "grad_norm": 41.63768768310547, "learning_rate": 2.34342260222022e-06, "loss": 0.0274, "num_input_tokens_seen": 26907664, "step": 26800 }, { "epoch": 14.212619300106045, "grad_norm": 30.49696922302246, "learning_rate": 2.3414629216137842e-06, "loss": 0.0155, "num_input_tokens_seen": 26914320, "step": 26805 }, { "epoch": 14.215270413573702, "grad_norm": 0.07302059233188629, "learning_rate": 2.3395038101931843e-06, "loss": 0.0541, "num_input_tokens_seen": 26918448, "step": 26810 }, { "epoch": 14.217921527041357, "grad_norm": 86.4513168334961, "learning_rate": 2.337545268377853e-06, "loss": 0.1312, "num_input_tokens_seen": 26923536, "step": 26815 }, { "epoch": 14.220572640509014, "grad_norm": 3.581559658050537, "learning_rate": 2.3355872965871147e-06, "loss": 0.038, "num_input_tokens_seen": 26928912, "step": 26820 }, { "epoch": 14.22322375397667, "grad_norm": 51.63207244873047, "learning_rate": 2.3336298952401636e-06, "loss": 0.2187, "num_input_tokens_seen": 26934064, "step": 26825 }, { "epoch": 14.225874867444327, "grad_norm": 3.1771488189697266, "learning_rate": 2.3316730647560753e-06, "loss": 0.0244, "num_input_tokens_seen": 26939920, "step": 26830 }, { "epoch": 14.228525980911982, "grad_norm": 2.811204195022583, "learning_rate": 2.3297168055538018e-06, "loss": 0.0065, "num_input_tokens_seen": 26945264, "step": 26835 }, { "epoch": 14.231177094379639, "grad_norm": 5.157653331756592, "learning_rate": 2.327761118052172e-06, "loss": 0.0346, "num_input_tokens_seen": 26951056, "step": 26840 }, { "epoch": 14.233828207847296, "grad_norm": 21.873937606811523, "learning_rate": 2.325806002669893e-06, "loss": 0.0168, "num_input_tokens_seen": 26955504, "step": 26845 }, { "epoch": 14.236479321314953, "grad_norm": 0.06342481076717377, "learning_rate": 2.3238514598255544e-06, "loss": 0.0489, "num_input_tokens_seen": 26960240, "step": 26850 }, { "epoch": 14.23913043478261, "grad_norm": 0.13438424468040466, "learning_rate": 2.3218974899376124e-06, "loss": 0.0092, "num_input_tokens_seen": 26967280, "step": 26855 }, { "epoch": 14.241781548250264, "grad_norm": 1.003889799118042, "learning_rate": 2.319944093424411e-06, "loss": 0.0218, "num_input_tokens_seen": 26972240, "step": 26860 }, { "epoch": 14.244432661717921, "grad_norm": 0.010395958088338375, "learning_rate": 2.317991270704167e-06, "loss": 0.0195, "num_input_tokens_seen": 26977648, "step": 26865 }, { "epoch": 14.247083775185578, "grad_norm": 0.21595363318920135, "learning_rate": 2.3160390221949737e-06, "loss": 0.0102, "num_input_tokens_seen": 26983088, "step": 26870 }, { "epoch": 14.249734888653235, "grad_norm": 1.5660834312438965, "learning_rate": 2.3140873483148018e-06, "loss": 0.0217, "num_input_tokens_seen": 26987920, "step": 26875 }, { "epoch": 14.252386002120891, "grad_norm": 2.4419491291046143, "learning_rate": 2.312136249481501e-06, "loss": 0.0795, "num_input_tokens_seen": 26992560, "step": 26880 }, { "epoch": 14.255037115588546, "grad_norm": 0.6176154613494873, "learning_rate": 2.3101857261127935e-06, "loss": 0.07, "num_input_tokens_seen": 26998768, "step": 26885 }, { "epoch": 14.257688229056203, "grad_norm": 13.470568656921387, "learning_rate": 2.308235778626288e-06, "loss": 0.0886, "num_input_tokens_seen": 27004560, "step": 26890 }, { "epoch": 14.26033934252386, "grad_norm": 1.272731900215149, "learning_rate": 2.306286407439454e-06, "loss": 0.0047, "num_input_tokens_seen": 27010352, "step": 26895 }, { "epoch": 14.262990455991517, "grad_norm": 116.44482421875, "learning_rate": 2.304337612969654e-06, "loss": 0.06, "num_input_tokens_seen": 27015600, "step": 26900 }, { "epoch": 14.265641569459174, "grad_norm": 92.03459930419922, "learning_rate": 2.3023893956341163e-06, "loss": 0.1023, "num_input_tokens_seen": 27020592, "step": 26905 }, { "epoch": 14.268292682926829, "grad_norm": 5.946107387542725, "learning_rate": 2.300441755849951e-06, "loss": 0.0501, "num_input_tokens_seen": 27025168, "step": 26910 }, { "epoch": 14.270943796394485, "grad_norm": 0.02447701059281826, "learning_rate": 2.2984946940341415e-06, "loss": 0.0476, "num_input_tokens_seen": 27030032, "step": 26915 }, { "epoch": 14.273594909862142, "grad_norm": 16.063230514526367, "learning_rate": 2.296548210603549e-06, "loss": 0.0473, "num_input_tokens_seen": 27035120, "step": 26920 }, { "epoch": 14.276246023329799, "grad_norm": 8.575542449951172, "learning_rate": 2.2946023059749093e-06, "loss": 0.0198, "num_input_tokens_seen": 27039248, "step": 26925 }, { "epoch": 14.278897136797456, "grad_norm": 1.2888820171356201, "learning_rate": 2.292656980564838e-06, "loss": 0.0084, "num_input_tokens_seen": 27043312, "step": 26930 }, { "epoch": 14.28154825026511, "grad_norm": 1.1890274286270142, "learning_rate": 2.290712234789822e-06, "loss": 0.043, "num_input_tokens_seen": 27048208, "step": 26935 }, { "epoch": 14.284199363732768, "grad_norm": 0.29202860593795776, "learning_rate": 2.288768069066226e-06, "loss": 0.0043, "num_input_tokens_seen": 27054416, "step": 26940 }, { "epoch": 14.286850477200424, "grad_norm": 35.069210052490234, "learning_rate": 2.2868244838102944e-06, "loss": 0.0158, "num_input_tokens_seen": 27058736, "step": 26945 }, { "epoch": 14.289501590668081, "grad_norm": 0.033107466995716095, "learning_rate": 2.284881479438138e-06, "loss": 0.0084, "num_input_tokens_seen": 27063632, "step": 26950 }, { "epoch": 14.292152704135738, "grad_norm": 0.6890202760696411, "learning_rate": 2.282939056365754e-06, "loss": 0.0138, "num_input_tokens_seen": 27068464, "step": 26955 }, { "epoch": 14.294803817603393, "grad_norm": 15.874799728393555, "learning_rate": 2.280997215009008e-06, "loss": 0.2147, "num_input_tokens_seen": 27074480, "step": 26960 }, { "epoch": 14.29745493107105, "grad_norm": 48.64000701904297, "learning_rate": 2.2790559557836435e-06, "loss": 0.034, "num_input_tokens_seen": 27079856, "step": 26965 }, { "epoch": 14.300106044538706, "grad_norm": 7.139283657073975, "learning_rate": 2.277115279105279e-06, "loss": 0.0652, "num_input_tokens_seen": 27085264, "step": 26970 }, { "epoch": 14.302757158006363, "grad_norm": 0.07918538898229599, "learning_rate": 2.2751751853894076e-06, "loss": 0.0206, "num_input_tokens_seen": 27090608, "step": 26975 }, { "epoch": 14.305408271474018, "grad_norm": 3.2413458824157715, "learning_rate": 2.273235675051398e-06, "loss": 0.1574, "num_input_tokens_seen": 27095696, "step": 26980 }, { "epoch": 14.308059384941675, "grad_norm": 0.2371620088815689, "learning_rate": 2.271296748506498e-06, "loss": 0.0037, "num_input_tokens_seen": 27100592, "step": 26985 }, { "epoch": 14.310710498409332, "grad_norm": 0.043117716908454895, "learning_rate": 2.2693584061698203e-06, "loss": 0.0191, "num_input_tokens_seen": 27104592, "step": 26990 }, { "epoch": 14.313361611876989, "grad_norm": 7.300515174865723, "learning_rate": 2.2674206484563644e-06, "loss": 0.0108, "num_input_tokens_seen": 27109520, "step": 26995 }, { "epoch": 14.316012725344645, "grad_norm": 0.20294371247291565, "learning_rate": 2.2654834757809975e-06, "loss": 0.0041, "num_input_tokens_seen": 27115312, "step": 27000 }, { "epoch": 14.3186638388123, "grad_norm": 0.5641267895698547, "learning_rate": 2.2635468885584626e-06, "loss": 0.0064, "num_input_tokens_seen": 27120240, "step": 27005 }, { "epoch": 14.321314952279957, "grad_norm": 1.4254988431930542, "learning_rate": 2.261610887203378e-06, "loss": 0.0259, "num_input_tokens_seen": 27124720, "step": 27010 }, { "epoch": 14.323966065747614, "grad_norm": 0.12566694617271423, "learning_rate": 2.2596754721302378e-06, "loss": 0.0124, "num_input_tokens_seen": 27130256, "step": 27015 }, { "epoch": 14.32661717921527, "grad_norm": 0.3019775152206421, "learning_rate": 2.2577406437534055e-06, "loss": 0.0473, "num_input_tokens_seen": 27135952, "step": 27020 }, { "epoch": 14.329268292682928, "grad_norm": 41.61461639404297, "learning_rate": 2.2558064024871297e-06, "loss": 0.0601, "num_input_tokens_seen": 27141168, "step": 27025 }, { "epoch": 14.331919406150583, "grad_norm": 0.7303705811500549, "learning_rate": 2.2538727487455186e-06, "loss": 0.0044, "num_input_tokens_seen": 27145296, "step": 27030 }, { "epoch": 14.33457051961824, "grad_norm": 4.369741916656494, "learning_rate": 2.2519396829425682e-06, "loss": 0.0067, "num_input_tokens_seen": 27149712, "step": 27035 }, { "epoch": 14.337221633085896, "grad_norm": 4.472364902496338, "learning_rate": 2.25000720549214e-06, "loss": 0.0042, "num_input_tokens_seen": 27154192, "step": 27040 }, { "epoch": 14.339872746553553, "grad_norm": 0.1376233547925949, "learning_rate": 2.2480753168079737e-06, "loss": 0.0074, "num_input_tokens_seen": 27159472, "step": 27045 }, { "epoch": 14.34252386002121, "grad_norm": 4.245030403137207, "learning_rate": 2.2461440173036807e-06, "loss": 0.0039, "num_input_tokens_seen": 27163824, "step": 27050 }, { "epoch": 14.345174973488865, "grad_norm": 90.83380126953125, "learning_rate": 2.244213307392748e-06, "loss": 0.0981, "num_input_tokens_seen": 27168464, "step": 27055 }, { "epoch": 14.347826086956522, "grad_norm": 10.606012344360352, "learning_rate": 2.242283187488534e-06, "loss": 0.012, "num_input_tokens_seen": 27173424, "step": 27060 }, { "epoch": 14.350477200424178, "grad_norm": 5.796304225921631, "learning_rate": 2.2403536580042746e-06, "loss": 0.026, "num_input_tokens_seen": 27178192, "step": 27065 }, { "epoch": 14.353128313891835, "grad_norm": 17.112760543823242, "learning_rate": 2.238424719353075e-06, "loss": 0.0344, "num_input_tokens_seen": 27182576, "step": 27070 }, { "epoch": 14.35577942735949, "grad_norm": 0.20916403830051422, "learning_rate": 2.2364963719479147e-06, "loss": 0.0598, "num_input_tokens_seen": 27187600, "step": 27075 }, { "epoch": 14.358430540827147, "grad_norm": 11.591602325439453, "learning_rate": 2.234568616201654e-06, "loss": 0.0083, "num_input_tokens_seen": 27192880, "step": 27080 }, { "epoch": 14.361081654294804, "grad_norm": 2.970144510269165, "learning_rate": 2.2326414525270125e-06, "loss": 0.0235, "num_input_tokens_seen": 27199056, "step": 27085 }, { "epoch": 14.36373276776246, "grad_norm": 3.913527727127075, "learning_rate": 2.230714881336596e-06, "loss": 0.007, "num_input_tokens_seen": 27203664, "step": 27090 }, { "epoch": 14.366383881230117, "grad_norm": 2.195744752883911, "learning_rate": 2.228788903042877e-06, "loss": 0.0034, "num_input_tokens_seen": 27209040, "step": 27095 }, { "epoch": 14.369034994697772, "grad_norm": 2.8000059127807617, "learning_rate": 2.226863518058203e-06, "loss": 0.0655, "num_input_tokens_seen": 27214608, "step": 27100 }, { "epoch": 14.371686108165429, "grad_norm": 3.0401296615600586, "learning_rate": 2.224938726794792e-06, "loss": 0.1148, "num_input_tokens_seen": 27219760, "step": 27105 }, { "epoch": 14.374337221633086, "grad_norm": 6.34034538269043, "learning_rate": 2.2230145296647386e-06, "loss": 0.0167, "num_input_tokens_seen": 27225680, "step": 27110 }, { "epoch": 14.376988335100743, "grad_norm": 2.765062093734741, "learning_rate": 2.221090927080006e-06, "loss": 0.2192, "num_input_tokens_seen": 27232400, "step": 27115 }, { "epoch": 14.3796394485684, "grad_norm": 0.07079500705003738, "learning_rate": 2.219167919452438e-06, "loss": 0.0982, "num_input_tokens_seen": 27237008, "step": 27120 }, { "epoch": 14.382290562036054, "grad_norm": 14.58799934387207, "learning_rate": 2.2172455071937378e-06, "loss": 0.0596, "num_input_tokens_seen": 27242288, "step": 27125 }, { "epoch": 14.384941675503711, "grad_norm": 24.503101348876953, "learning_rate": 2.2153236907154946e-06, "loss": 0.0606, "num_input_tokens_seen": 27246576, "step": 27130 }, { "epoch": 14.387592788971368, "grad_norm": 19.952598571777344, "learning_rate": 2.213402470429163e-06, "loss": 0.0123, "num_input_tokens_seen": 27252336, "step": 27135 }, { "epoch": 14.390243902439025, "grad_norm": 8.203511238098145, "learning_rate": 2.2114818467460704e-06, "loss": 0.2781, "num_input_tokens_seen": 27258896, "step": 27140 }, { "epoch": 14.392895015906682, "grad_norm": 0.7634260058403015, "learning_rate": 2.209561820077417e-06, "loss": 0.0139, "num_input_tokens_seen": 27264944, "step": 27145 }, { "epoch": 14.395546129374337, "grad_norm": 13.77879524230957, "learning_rate": 2.2076423908342763e-06, "loss": 0.085, "num_input_tokens_seen": 27269584, "step": 27150 }, { "epoch": 14.398197242841993, "grad_norm": 0.8875847458839417, "learning_rate": 2.2057235594275915e-06, "loss": 0.0436, "num_input_tokens_seen": 27274000, "step": 27155 }, { "epoch": 14.40084835630965, "grad_norm": 62.07715606689453, "learning_rate": 2.203805326268184e-06, "loss": 0.0693, "num_input_tokens_seen": 27279024, "step": 27160 }, { "epoch": 14.403499469777307, "grad_norm": 0.47497624158859253, "learning_rate": 2.201887691766736e-06, "loss": 0.0269, "num_input_tokens_seen": 27284240, "step": 27165 }, { "epoch": 14.406150583244964, "grad_norm": 0.18657556176185608, "learning_rate": 2.199970656333813e-06, "loss": 0.0615, "num_input_tokens_seen": 27289392, "step": 27170 }, { "epoch": 14.408801696712619, "grad_norm": 3.4237518310546875, "learning_rate": 2.1980542203798457e-06, "loss": 0.0044, "num_input_tokens_seen": 27294480, "step": 27175 }, { "epoch": 14.411452810180275, "grad_norm": 0.5798844695091248, "learning_rate": 2.1961383843151373e-06, "loss": 0.0222, "num_input_tokens_seen": 27298608, "step": 27180 }, { "epoch": 14.414103923647932, "grad_norm": 1.7099889516830444, "learning_rate": 2.194223148549864e-06, "loss": 0.0319, "num_input_tokens_seen": 27303440, "step": 27185 }, { "epoch": 14.416755037115589, "grad_norm": 0.10183209925889969, "learning_rate": 2.1923085134940727e-06, "loss": 0.0113, "num_input_tokens_seen": 27308336, "step": 27190 }, { "epoch": 14.419406150583246, "grad_norm": 0.4936155080795288, "learning_rate": 2.1903944795576816e-06, "loss": 0.0017, "num_input_tokens_seen": 27314384, "step": 27195 }, { "epoch": 14.4220572640509, "grad_norm": 0.0573769249022007, "learning_rate": 2.1884810471504808e-06, "loss": 0.0142, "num_input_tokens_seen": 27320016, "step": 27200 }, { "epoch": 14.424708377518558, "grad_norm": 0.985162079334259, "learning_rate": 2.1865682166821307e-06, "loss": 0.0115, "num_input_tokens_seen": 27325552, "step": 27205 }, { "epoch": 14.427359490986214, "grad_norm": 0.1622559279203415, "learning_rate": 2.184655988562161e-06, "loss": 0.0748, "num_input_tokens_seen": 27331152, "step": 27210 }, { "epoch": 14.430010604453871, "grad_norm": 0.34085020422935486, "learning_rate": 2.182744363199981e-06, "loss": 0.1284, "num_input_tokens_seen": 27336080, "step": 27215 }, { "epoch": 14.432661717921526, "grad_norm": 0.0626165121793747, "learning_rate": 2.180833341004858e-06, "loss": 0.0102, "num_input_tokens_seen": 27341232, "step": 27220 }, { "epoch": 14.435312831389183, "grad_norm": 0.23705846071243286, "learning_rate": 2.1789229223859403e-06, "loss": 0.0028, "num_input_tokens_seen": 27345808, "step": 27225 }, { "epoch": 14.43796394485684, "grad_norm": 2.015653133392334, "learning_rate": 2.177013107752244e-06, "loss": 0.0065, "num_input_tokens_seen": 27352400, "step": 27230 }, { "epoch": 14.440615058324497, "grad_norm": 15.188028335571289, "learning_rate": 2.1751038975126538e-06, "loss": 0.0099, "num_input_tokens_seen": 27356400, "step": 27235 }, { "epoch": 14.443266171792153, "grad_norm": 0.11696930229663849, "learning_rate": 2.173195292075927e-06, "loss": 0.0133, "num_input_tokens_seen": 27360784, "step": 27240 }, { "epoch": 14.445917285259808, "grad_norm": 0.4179815351963043, "learning_rate": 2.171287291850691e-06, "loss": 0.0988, "num_input_tokens_seen": 27365936, "step": 27245 }, { "epoch": 14.448568398727465, "grad_norm": 1.9890878200531006, "learning_rate": 2.169379897245443e-06, "loss": 0.0209, "num_input_tokens_seen": 27370192, "step": 27250 }, { "epoch": 14.451219512195122, "grad_norm": 4.814348220825195, "learning_rate": 2.1674731086685545e-06, "loss": 0.0517, "num_input_tokens_seen": 27375664, "step": 27255 }, { "epoch": 14.453870625662779, "grad_norm": 0.05569484457373619, "learning_rate": 2.165566926528258e-06, "loss": 0.0025, "num_input_tokens_seen": 27380592, "step": 27260 }, { "epoch": 14.456521739130435, "grad_norm": 119.45647430419922, "learning_rate": 2.1636613512326675e-06, "loss": 0.2255, "num_input_tokens_seen": 27386064, "step": 27265 }, { "epoch": 14.45917285259809, "grad_norm": 0.1756129413843155, "learning_rate": 2.1617563831897604e-06, "loss": 0.1463, "num_input_tokens_seen": 27391248, "step": 27270 }, { "epoch": 14.461823966065747, "grad_norm": 93.93150329589844, "learning_rate": 2.159852022807384e-06, "loss": 0.0998, "num_input_tokens_seen": 27396016, "step": 27275 }, { "epoch": 14.464475079533404, "grad_norm": 1.6414309740066528, "learning_rate": 2.157948270493258e-06, "loss": 0.0124, "num_input_tokens_seen": 27401552, "step": 27280 }, { "epoch": 14.46712619300106, "grad_norm": 0.6365848779678345, "learning_rate": 2.1560451266549705e-06, "loss": 0.0092, "num_input_tokens_seen": 27406256, "step": 27285 }, { "epoch": 14.469777306468718, "grad_norm": 25.05461883544922, "learning_rate": 2.154142591699977e-06, "loss": 0.0191, "num_input_tokens_seen": 27410928, "step": 27290 }, { "epoch": 14.472428419936373, "grad_norm": 0.07587272673845291, "learning_rate": 2.1522406660356116e-06, "loss": 0.0014, "num_input_tokens_seen": 27415760, "step": 27295 }, { "epoch": 14.47507953340403, "grad_norm": 0.35763803124427795, "learning_rate": 2.1503393500690643e-06, "loss": 0.1082, "num_input_tokens_seen": 27420304, "step": 27300 }, { "epoch": 14.477730646871686, "grad_norm": 71.32393646240234, "learning_rate": 2.1484386442074075e-06, "loss": 0.063, "num_input_tokens_seen": 27425712, "step": 27305 }, { "epoch": 14.480381760339343, "grad_norm": 2.21087908744812, "learning_rate": 2.1465385488575745e-06, "loss": 0.0212, "num_input_tokens_seen": 27430096, "step": 27310 }, { "epoch": 14.483032873807, "grad_norm": 2.323023796081543, "learning_rate": 2.144639064426372e-06, "loss": 0.0048, "num_input_tokens_seen": 27435024, "step": 27315 }, { "epoch": 14.485683987274655, "grad_norm": 2.1764698028564453, "learning_rate": 2.1427401913204737e-06, "loss": 0.0018, "num_input_tokens_seen": 27440816, "step": 27320 }, { "epoch": 14.488335100742312, "grad_norm": 52.779903411865234, "learning_rate": 2.1408419299464245e-06, "loss": 0.0464, "num_input_tokens_seen": 27445616, "step": 27325 }, { "epoch": 14.490986214209968, "grad_norm": 0.20515990257263184, "learning_rate": 2.1389442807106353e-06, "loss": 0.008, "num_input_tokens_seen": 27450928, "step": 27330 }, { "epoch": 14.493637327677625, "grad_norm": 0.25917088985443115, "learning_rate": 2.1370472440193897e-06, "loss": 0.0212, "num_input_tokens_seen": 27457488, "step": 27335 }, { "epoch": 14.496288441145282, "grad_norm": 0.9433892369270325, "learning_rate": 2.1351508202788375e-06, "loss": 0.0135, "num_input_tokens_seen": 27462064, "step": 27340 }, { "epoch": 14.498939554612937, "grad_norm": 55.527076721191406, "learning_rate": 2.1332550098949963e-06, "loss": 0.0312, "num_input_tokens_seen": 27468304, "step": 27345 }, { "epoch": 14.501590668080594, "grad_norm": 4.862443447113037, "learning_rate": 2.1313598132737596e-06, "loss": 0.0682, "num_input_tokens_seen": 27472784, "step": 27350 }, { "epoch": 14.50424178154825, "grad_norm": 40.48041534423828, "learning_rate": 2.129465230820876e-06, "loss": 0.0373, "num_input_tokens_seen": 27478128, "step": 27355 }, { "epoch": 14.506892895015907, "grad_norm": 3.974059581756592, "learning_rate": 2.127571262941978e-06, "loss": 0.0079, "num_input_tokens_seen": 27483216, "step": 27360 }, { "epoch": 14.509544008483562, "grad_norm": 0.01839369907975197, "learning_rate": 2.1256779100425557e-06, "loss": 0.2004, "num_input_tokens_seen": 27487920, "step": 27365 }, { "epoch": 14.512195121951219, "grad_norm": 0.15208521485328674, "learning_rate": 2.1237851725279716e-06, "loss": 0.0163, "num_input_tokens_seen": 27492432, "step": 27370 }, { "epoch": 14.514846235418876, "grad_norm": 1.0302734375, "learning_rate": 2.121893050803454e-06, "loss": 0.0067, "num_input_tokens_seen": 27497808, "step": 27375 }, { "epoch": 14.517497348886533, "grad_norm": 7.399377346038818, "learning_rate": 2.1200015452741067e-06, "loss": 0.0053, "num_input_tokens_seen": 27504752, "step": 27380 }, { "epoch": 14.52014846235419, "grad_norm": 0.16942048072814941, "learning_rate": 2.1181106563448883e-06, "loss": 0.1144, "num_input_tokens_seen": 27509296, "step": 27385 }, { "epoch": 14.522799575821844, "grad_norm": 6.21605110168457, "learning_rate": 2.1162203844206412e-06, "loss": 0.0055, "num_input_tokens_seen": 27514320, "step": 27390 }, { "epoch": 14.525450689289501, "grad_norm": 2.0028254985809326, "learning_rate": 2.11433072990606e-06, "loss": 0.1053, "num_input_tokens_seen": 27519152, "step": 27395 }, { "epoch": 14.528101802757158, "grad_norm": 0.4705768823623657, "learning_rate": 2.11244169320572e-06, "loss": 0.0008, "num_input_tokens_seen": 27523760, "step": 27400 }, { "epoch": 14.530752916224815, "grad_norm": 0.1581188440322876, "learning_rate": 2.110553274724058e-06, "loss": 0.001, "num_input_tokens_seen": 27528688, "step": 27405 }, { "epoch": 14.533404029692472, "grad_norm": 31.121477127075195, "learning_rate": 2.108665474865379e-06, "loss": 0.0223, "num_input_tokens_seen": 27532368, "step": 27410 }, { "epoch": 14.536055143160127, "grad_norm": 2.002807140350342, "learning_rate": 2.106778294033854e-06, "loss": 0.0064, "num_input_tokens_seen": 27536272, "step": 27415 }, { "epoch": 14.538706256627783, "grad_norm": 11.731912612915039, "learning_rate": 2.104891732633529e-06, "loss": 0.0049, "num_input_tokens_seen": 27540432, "step": 27420 }, { "epoch": 14.54135737009544, "grad_norm": 0.8393725752830505, "learning_rate": 2.1030057910683043e-06, "loss": 0.2197, "num_input_tokens_seen": 27544400, "step": 27425 }, { "epoch": 14.544008483563097, "grad_norm": 0.5536840558052063, "learning_rate": 2.101120469741962e-06, "loss": 0.0019, "num_input_tokens_seen": 27549552, "step": 27430 }, { "epoch": 14.546659597030754, "grad_norm": 0.7219635844230652, "learning_rate": 2.0992357690581406e-06, "loss": 0.226, "num_input_tokens_seen": 27554224, "step": 27435 }, { "epoch": 14.549310710498409, "grad_norm": 2.7208847999572754, "learning_rate": 2.0973516894203507e-06, "loss": 0.0024, "num_input_tokens_seen": 27558448, "step": 27440 }, { "epoch": 14.551961823966066, "grad_norm": 5.410405158996582, "learning_rate": 2.0954682312319686e-06, "loss": 0.0069, "num_input_tokens_seen": 27562992, "step": 27445 }, { "epoch": 14.554612937433722, "grad_norm": 70.57266998291016, "learning_rate": 2.0935853948962374e-06, "loss": 0.0959, "num_input_tokens_seen": 27567856, "step": 27450 }, { "epoch": 14.557264050901379, "grad_norm": 0.98078852891922, "learning_rate": 2.091703180816267e-06, "loss": 0.0276, "num_input_tokens_seen": 27573264, "step": 27455 }, { "epoch": 14.559915164369034, "grad_norm": 1.6675764322280884, "learning_rate": 2.089821589395038e-06, "loss": 0.0026, "num_input_tokens_seen": 27578480, "step": 27460 }, { "epoch": 14.56256627783669, "grad_norm": 0.6283811330795288, "learning_rate": 2.08794062103539e-06, "loss": 0.0112, "num_input_tokens_seen": 27583568, "step": 27465 }, { "epoch": 14.565217391304348, "grad_norm": 0.26640573143959045, "learning_rate": 2.0860602761400333e-06, "loss": 0.0044, "num_input_tokens_seen": 27588048, "step": 27470 }, { "epoch": 14.567868504772004, "grad_norm": 0.11401951313018799, "learning_rate": 2.08418055511155e-06, "loss": 0.0179, "num_input_tokens_seen": 27592880, "step": 27475 }, { "epoch": 14.570519618239661, "grad_norm": 0.7385857105255127, "learning_rate": 2.0823014583523767e-06, "loss": 0.0031, "num_input_tokens_seen": 27597744, "step": 27480 }, { "epoch": 14.573170731707316, "grad_norm": 0.016965195536613464, "learning_rate": 2.080422986264828e-06, "loss": 0.008, "num_input_tokens_seen": 27602480, "step": 27485 }, { "epoch": 14.575821845174973, "grad_norm": 4.275199890136719, "learning_rate": 2.078545139251078e-06, "loss": 0.0223, "num_input_tokens_seen": 27607600, "step": 27490 }, { "epoch": 14.57847295864263, "grad_norm": 0.016043925657868385, "learning_rate": 2.0766679177131686e-06, "loss": 0.0398, "num_input_tokens_seen": 27612176, "step": 27495 }, { "epoch": 14.581124072110287, "grad_norm": 1.4619816541671753, "learning_rate": 2.074791322053009e-06, "loss": 0.0911, "num_input_tokens_seen": 27618288, "step": 27500 }, { "epoch": 14.583775185577943, "grad_norm": 0.12137066572904587, "learning_rate": 2.072915352672373e-06, "loss": 0.0043, "num_input_tokens_seen": 27623536, "step": 27505 }, { "epoch": 14.586426299045598, "grad_norm": 0.43794047832489014, "learning_rate": 2.071040009972898e-06, "loss": 0.0445, "num_input_tokens_seen": 27628080, "step": 27510 }, { "epoch": 14.589077412513255, "grad_norm": 0.31950971484184265, "learning_rate": 2.069165294356097e-06, "loss": 0.001, "num_input_tokens_seen": 27633488, "step": 27515 }, { "epoch": 14.591728525980912, "grad_norm": 22.962772369384766, "learning_rate": 2.067291206223333e-06, "loss": 0.0219, "num_input_tokens_seen": 27638320, "step": 27520 }, { "epoch": 14.594379639448569, "grad_norm": 1.588585376739502, "learning_rate": 2.0654177459758493e-06, "loss": 0.0464, "num_input_tokens_seen": 27645136, "step": 27525 }, { "epoch": 14.597030752916226, "grad_norm": 4.367098331451416, "learning_rate": 2.063544914014748e-06, "loss": 0.0069, "num_input_tokens_seen": 27649904, "step": 27530 }, { "epoch": 14.59968186638388, "grad_norm": 2.1157965660095215, "learning_rate": 2.0616727107409963e-06, "loss": 0.0018, "num_input_tokens_seen": 27654224, "step": 27535 }, { "epoch": 14.602332979851537, "grad_norm": 11.941393852233887, "learning_rate": 2.0598011365554284e-06, "loss": 0.0065, "num_input_tokens_seen": 27659216, "step": 27540 }, { "epoch": 14.604984093319194, "grad_norm": 0.06539520621299744, "learning_rate": 2.057930191858743e-06, "loss": 0.0335, "num_input_tokens_seen": 27664208, "step": 27545 }, { "epoch": 14.607635206786851, "grad_norm": 1.7832238674163818, "learning_rate": 2.056059877051503e-06, "loss": 0.0027, "num_input_tokens_seen": 27668432, "step": 27550 }, { "epoch": 14.610286320254508, "grad_norm": 0.7538211345672607, "learning_rate": 2.0541901925341446e-06, "loss": 0.0046, "num_input_tokens_seen": 27674768, "step": 27555 }, { "epoch": 14.612937433722163, "grad_norm": 1.199260950088501, "learning_rate": 2.0523211387069532e-06, "loss": 0.0126, "num_input_tokens_seen": 27679088, "step": 27560 }, { "epoch": 14.61558854718982, "grad_norm": 0.06419476121664047, "learning_rate": 2.0504527159700943e-06, "loss": 0.0277, "num_input_tokens_seen": 27684176, "step": 27565 }, { "epoch": 14.618239660657476, "grad_norm": 0.05028742179274559, "learning_rate": 2.048584924723591e-06, "loss": 0.0023, "num_input_tokens_seen": 27688656, "step": 27570 }, { "epoch": 14.620890774125133, "grad_norm": 2.677065134048462, "learning_rate": 2.046717765367332e-06, "loss": 0.0053, "num_input_tokens_seen": 27693776, "step": 27575 }, { "epoch": 14.62354188759279, "grad_norm": 63.65773391723633, "learning_rate": 2.044851238301072e-06, "loss": 0.161, "num_input_tokens_seen": 27698800, "step": 27580 }, { "epoch": 14.626193001060445, "grad_norm": 35.53491973876953, "learning_rate": 2.0429853439244285e-06, "loss": 0.0213, "num_input_tokens_seen": 27703728, "step": 27585 }, { "epoch": 14.628844114528102, "grad_norm": 0.8686187863349915, "learning_rate": 2.0411200826368837e-06, "loss": 0.0437, "num_input_tokens_seen": 27708144, "step": 27590 }, { "epoch": 14.631495227995758, "grad_norm": 0.511951208114624, "learning_rate": 2.0392554548377902e-06, "loss": 0.0288, "num_input_tokens_seen": 27712880, "step": 27595 }, { "epoch": 14.634146341463415, "grad_norm": 5.6923627853393555, "learning_rate": 2.037391460926352e-06, "loss": 0.0196, "num_input_tokens_seen": 27719568, "step": 27600 }, { "epoch": 14.63679745493107, "grad_norm": 0.016927048563957214, "learning_rate": 2.0355281013016526e-06, "loss": 0.0101, "num_input_tokens_seen": 27724560, "step": 27605 }, { "epoch": 14.639448568398727, "grad_norm": 1.6154909133911133, "learning_rate": 2.0336653763626308e-06, "loss": 0.1918, "num_input_tokens_seen": 27730064, "step": 27610 }, { "epoch": 14.642099681866384, "grad_norm": 8.726268768310547, "learning_rate": 2.031803286508086e-06, "loss": 0.008, "num_input_tokens_seen": 27734608, "step": 27615 }, { "epoch": 14.64475079533404, "grad_norm": 33.826656341552734, "learning_rate": 2.0299418321366924e-06, "loss": 0.0175, "num_input_tokens_seen": 27738928, "step": 27620 }, { "epoch": 14.647401908801697, "grad_norm": 0.02392437681555748, "learning_rate": 2.028081013646981e-06, "loss": 0.0858, "num_input_tokens_seen": 27743408, "step": 27625 }, { "epoch": 14.650053022269352, "grad_norm": 4.794009208679199, "learning_rate": 2.0262208314373473e-06, "loss": 0.0183, "num_input_tokens_seen": 27750896, "step": 27630 }, { "epoch": 14.65270413573701, "grad_norm": 3.691371440887451, "learning_rate": 2.0243612859060526e-06, "loss": 0.0366, "num_input_tokens_seen": 27755312, "step": 27635 }, { "epoch": 14.655355249204666, "grad_norm": 38.484825134277344, "learning_rate": 2.0225023774512197e-06, "loss": 0.0114, "num_input_tokens_seen": 27759856, "step": 27640 }, { "epoch": 14.658006362672323, "grad_norm": 0.5643777847290039, "learning_rate": 2.020644106470835e-06, "loss": 0.0122, "num_input_tokens_seen": 27764400, "step": 27645 }, { "epoch": 14.66065747613998, "grad_norm": 8.922487258911133, "learning_rate": 2.0187864733627537e-06, "loss": 0.035, "num_input_tokens_seen": 27769712, "step": 27650 }, { "epoch": 14.663308589607635, "grad_norm": 39.0586051940918, "learning_rate": 2.0169294785246845e-06, "loss": 0.1847, "num_input_tokens_seen": 27774160, "step": 27655 }, { "epoch": 14.665959703075291, "grad_norm": 0.5653585195541382, "learning_rate": 2.0150731223542102e-06, "loss": 0.0714, "num_input_tokens_seen": 27779792, "step": 27660 }, { "epoch": 14.668610816542948, "grad_norm": 1.8720142841339111, "learning_rate": 2.013217405248769e-06, "loss": 0.057, "num_input_tokens_seen": 27784592, "step": 27665 }, { "epoch": 14.671261930010605, "grad_norm": 0.07314036041498184, "learning_rate": 2.011362327605666e-06, "loss": 0.009, "num_input_tokens_seen": 27790352, "step": 27670 }, { "epoch": 14.673913043478262, "grad_norm": 0.1340397149324417, "learning_rate": 2.009507889822068e-06, "loss": 0.0082, "num_input_tokens_seen": 27794096, "step": 27675 }, { "epoch": 14.676564156945917, "grad_norm": 3.323225975036621, "learning_rate": 2.0076540922950057e-06, "loss": 0.0391, "num_input_tokens_seen": 27798384, "step": 27680 }, { "epoch": 14.679215270413573, "grad_norm": 11.2877779006958, "learning_rate": 2.0058009354213697e-06, "loss": 0.0794, "num_input_tokens_seen": 27803184, "step": 27685 }, { "epoch": 14.68186638388123, "grad_norm": 0.033762842416763306, "learning_rate": 2.0039484195979227e-06, "loss": 0.0258, "num_input_tokens_seen": 27807536, "step": 27690 }, { "epoch": 14.684517497348887, "grad_norm": 29.963552474975586, "learning_rate": 2.0020965452212754e-06, "loss": 0.0157, "num_input_tokens_seen": 27812880, "step": 27695 }, { "epoch": 14.687168610816542, "grad_norm": 6.1756672859191895, "learning_rate": 2.0002453126879144e-06, "loss": 0.0181, "num_input_tokens_seen": 27818448, "step": 27700 }, { "epoch": 14.689819724284199, "grad_norm": 61.2802848815918, "learning_rate": 1.998394722394183e-06, "loss": 0.0532, "num_input_tokens_seen": 27823088, "step": 27705 }, { "epoch": 14.692470837751856, "grad_norm": 1.8842588663101196, "learning_rate": 1.996544774736287e-06, "loss": 0.3083, "num_input_tokens_seen": 27828080, "step": 27710 }, { "epoch": 14.695121951219512, "grad_norm": 1.4238885641098022, "learning_rate": 1.994695470110295e-06, "loss": 0.0033, "num_input_tokens_seen": 27832912, "step": 27715 }, { "epoch": 14.69777306468717, "grad_norm": 1.6019039154052734, "learning_rate": 1.9928468089121395e-06, "loss": 0.0153, "num_input_tokens_seen": 27837904, "step": 27720 }, { "epoch": 14.700424178154826, "grad_norm": 66.3266830444336, "learning_rate": 1.990998791537611e-06, "loss": 0.1378, "num_input_tokens_seen": 27842704, "step": 27725 }, { "epoch": 14.703075291622481, "grad_norm": 0.3779085874557495, "learning_rate": 1.989151418382371e-06, "loss": 0.0105, "num_input_tokens_seen": 27848208, "step": 27730 }, { "epoch": 14.705726405090138, "grad_norm": 4.550799369812012, "learning_rate": 1.98730468984193e-06, "loss": 0.0046, "num_input_tokens_seen": 27854000, "step": 27735 }, { "epoch": 14.708377518557795, "grad_norm": 0.12307471036911011, "learning_rate": 1.9854586063116737e-06, "loss": 0.0033, "num_input_tokens_seen": 27858544, "step": 27740 }, { "epoch": 14.711028632025451, "grad_norm": 2.9494194984436035, "learning_rate": 1.983613168186843e-06, "loss": 0.0611, "num_input_tokens_seen": 27863504, "step": 27745 }, { "epoch": 14.713679745493106, "grad_norm": 0.05979815125465393, "learning_rate": 1.9817683758625366e-06, "loss": 0.0065, "num_input_tokens_seen": 27869136, "step": 27750 }, { "epoch": 14.716330858960763, "grad_norm": 0.5282044410705566, "learning_rate": 1.979924229733725e-06, "loss": 0.0018, "num_input_tokens_seen": 27874416, "step": 27755 }, { "epoch": 14.71898197242842, "grad_norm": 103.97969818115234, "learning_rate": 1.9780807301952326e-06, "loss": 0.0834, "num_input_tokens_seen": 27879152, "step": 27760 }, { "epoch": 14.721633085896077, "grad_norm": 0.24337390065193176, "learning_rate": 1.9762378776417485e-06, "loss": 0.0024, "num_input_tokens_seen": 27885168, "step": 27765 }, { "epoch": 14.724284199363733, "grad_norm": 54.638282775878906, "learning_rate": 1.9743956724678227e-06, "loss": 0.2257, "num_input_tokens_seen": 27889744, "step": 27770 }, { "epoch": 14.726935312831388, "grad_norm": 0.4679704010486603, "learning_rate": 1.9725541150678656e-06, "loss": 0.0287, "num_input_tokens_seen": 27893680, "step": 27775 }, { "epoch": 14.729586426299045, "grad_norm": 0.5579937100410461, "learning_rate": 1.970713205836149e-06, "loss": 0.017, "num_input_tokens_seen": 27899280, "step": 27780 }, { "epoch": 14.732237539766702, "grad_norm": 9.338173866271973, "learning_rate": 1.9688729451668116e-06, "loss": 0.0053, "num_input_tokens_seen": 27904368, "step": 27785 }, { "epoch": 14.734888653234359, "grad_norm": 1.0004724264144897, "learning_rate": 1.9670333334538416e-06, "loss": 0.005, "num_input_tokens_seen": 27908752, "step": 27790 }, { "epoch": 14.737539766702016, "grad_norm": 13.611449241638184, "learning_rate": 1.9651943710911004e-06, "loss": 0.0161, "num_input_tokens_seen": 27914000, "step": 27795 }, { "epoch": 14.74019088016967, "grad_norm": 0.7104077935218811, "learning_rate": 1.9633560584723028e-06, "loss": 0.022, "num_input_tokens_seen": 27918512, "step": 27800 }, { "epoch": 14.742841993637327, "grad_norm": 46.126678466796875, "learning_rate": 1.9615183959910265e-06, "loss": 0.0153, "num_input_tokens_seen": 27923568, "step": 27805 }, { "epoch": 14.745493107104984, "grad_norm": 35.13700485229492, "learning_rate": 1.9596813840407112e-06, "loss": 0.0221, "num_input_tokens_seen": 27928272, "step": 27810 }, { "epoch": 14.748144220572641, "grad_norm": 1.0226608514785767, "learning_rate": 1.9578450230146563e-06, "loss": 0.081, "num_input_tokens_seen": 27932976, "step": 27815 }, { "epoch": 14.750795334040298, "grad_norm": 1.9091943502426147, "learning_rate": 1.956009313306019e-06, "loss": 0.0337, "num_input_tokens_seen": 27937232, "step": 27820 }, { "epoch": 14.753446447507953, "grad_norm": 0.35109391808509827, "learning_rate": 1.9541742553078263e-06, "loss": 0.0135, "num_input_tokens_seen": 27942352, "step": 27825 }, { "epoch": 14.75609756097561, "grad_norm": 118.2278060913086, "learning_rate": 1.9523398494129514e-06, "loss": 0.0924, "num_input_tokens_seen": 27946672, "step": 27830 }, { "epoch": 14.758748674443266, "grad_norm": 0.5989831686019897, "learning_rate": 1.950506096014142e-06, "loss": 0.0114, "num_input_tokens_seen": 27951664, "step": 27835 }, { "epoch": 14.761399787910923, "grad_norm": 3.8069708347320557, "learning_rate": 1.948672995503998e-06, "loss": 0.0193, "num_input_tokens_seen": 27956400, "step": 27840 }, { "epoch": 14.764050901378578, "grad_norm": 2.2712318897247314, "learning_rate": 1.9468405482749806e-06, "loss": 0.0021, "num_input_tokens_seen": 27960560, "step": 27845 }, { "epoch": 14.766702014846235, "grad_norm": 0.2668331563472748, "learning_rate": 1.9450087547194124e-06, "loss": 0.0499, "num_input_tokens_seen": 27966288, "step": 27850 }, { "epoch": 14.769353128313892, "grad_norm": 5.433338165283203, "learning_rate": 1.943177615229476e-06, "loss": 0.0033, "num_input_tokens_seen": 27971504, "step": 27855 }, { "epoch": 14.772004241781548, "grad_norm": 0.657153308391571, "learning_rate": 1.941347130197212e-06, "loss": 0.0971, "num_input_tokens_seen": 27975888, "step": 27860 }, { "epoch": 14.774655355249205, "grad_norm": 0.21684075891971588, "learning_rate": 1.939517300014527e-06, "loss": 0.002, "num_input_tokens_seen": 27982128, "step": 27865 }, { "epoch": 14.77730646871686, "grad_norm": 0.30131658911705017, "learning_rate": 1.9376881250731753e-06, "loss": 0.0194, "num_input_tokens_seen": 27986672, "step": 27870 }, { "epoch": 14.779957582184517, "grad_norm": 0.7904822826385498, "learning_rate": 1.9358596057647855e-06, "loss": 0.0062, "num_input_tokens_seen": 27991408, "step": 27875 }, { "epoch": 14.782608695652174, "grad_norm": 49.5541877746582, "learning_rate": 1.9340317424808375e-06, "loss": 0.0382, "num_input_tokens_seen": 27997712, "step": 27880 }, { "epoch": 14.78525980911983, "grad_norm": 0.2522602379322052, "learning_rate": 1.932204535612667e-06, "loss": 0.0041, "num_input_tokens_seen": 28002256, "step": 27885 }, { "epoch": 14.787910922587487, "grad_norm": 0.0926319882273674, "learning_rate": 1.9303779855514793e-06, "loss": 0.009, "num_input_tokens_seen": 28006608, "step": 27890 }, { "epoch": 14.790562036055142, "grad_norm": 4.666563987731934, "learning_rate": 1.9285520926883328e-06, "loss": 0.0733, "num_input_tokens_seen": 28011376, "step": 27895 }, { "epoch": 14.7932131495228, "grad_norm": 1.4923748970031738, "learning_rate": 1.926726857414146e-06, "loss": 0.1899, "num_input_tokens_seen": 28016304, "step": 27900 }, { "epoch": 14.795864262990456, "grad_norm": 19.91179656982422, "learning_rate": 1.924902280119697e-06, "loss": 0.0921, "num_input_tokens_seen": 28020496, "step": 27905 }, { "epoch": 14.798515376458113, "grad_norm": 2.9216434955596924, "learning_rate": 1.923078361195623e-06, "loss": 0.0035, "num_input_tokens_seen": 28025392, "step": 27910 }, { "epoch": 14.80116648992577, "grad_norm": 1.6069837808609009, "learning_rate": 1.921255101032419e-06, "loss": 0.0043, "num_input_tokens_seen": 28029808, "step": 27915 }, { "epoch": 14.803817603393425, "grad_norm": 33.62314224243164, "learning_rate": 1.919432500020445e-06, "loss": 0.0132, "num_input_tokens_seen": 28035344, "step": 27920 }, { "epoch": 14.806468716861081, "grad_norm": 9.49742317199707, "learning_rate": 1.9176105585499082e-06, "loss": 0.0448, "num_input_tokens_seen": 28041264, "step": 27925 }, { "epoch": 14.809119830328738, "grad_norm": 23.89166831970215, "learning_rate": 1.9157892770108872e-06, "loss": 0.0143, "num_input_tokens_seen": 28046352, "step": 27930 }, { "epoch": 14.811770943796395, "grad_norm": 6.6927947998046875, "learning_rate": 1.913968655793311e-06, "loss": 0.0064, "num_input_tokens_seen": 28050416, "step": 27935 }, { "epoch": 14.814422057264052, "grad_norm": 5.225345611572266, "learning_rate": 1.91214869528697e-06, "loss": 0.0074, "num_input_tokens_seen": 28054992, "step": 27940 }, { "epoch": 14.817073170731707, "grad_norm": 0.7356735467910767, "learning_rate": 1.9103293958815116e-06, "loss": 0.0016, "num_input_tokens_seen": 28059248, "step": 27945 }, { "epoch": 14.819724284199363, "grad_norm": 0.059946827590465546, "learning_rate": 1.9085107579664485e-06, "loss": 0.0253, "num_input_tokens_seen": 28063696, "step": 27950 }, { "epoch": 14.82237539766702, "grad_norm": 67.54427337646484, "learning_rate": 1.9066927819311387e-06, "loss": 0.1204, "num_input_tokens_seen": 28067344, "step": 27955 }, { "epoch": 14.825026511134677, "grad_norm": 1.4838685989379883, "learning_rate": 1.9048754681648135e-06, "loss": 0.1127, "num_input_tokens_seen": 28071472, "step": 27960 }, { "epoch": 14.827677624602334, "grad_norm": 2.4973511695861816, "learning_rate": 1.9030588170565478e-06, "loss": 0.0509, "num_input_tokens_seen": 28076944, "step": 27965 }, { "epoch": 14.830328738069989, "grad_norm": 87.46221160888672, "learning_rate": 1.9012428289952877e-06, "loss": 0.0824, "num_input_tokens_seen": 28082096, "step": 27970 }, { "epoch": 14.832979851537646, "grad_norm": 0.5824255347251892, "learning_rate": 1.899427504369829e-06, "loss": 0.0432, "num_input_tokens_seen": 28086640, "step": 27975 }, { "epoch": 14.835630965005302, "grad_norm": 0.7114113569259644, "learning_rate": 1.8976128435688285e-06, "loss": 0.0026, "num_input_tokens_seen": 28091632, "step": 27980 }, { "epoch": 14.83828207847296, "grad_norm": 4.022697925567627, "learning_rate": 1.895798846980798e-06, "loss": 0.0065, "num_input_tokens_seen": 28098896, "step": 27985 }, { "epoch": 14.840933191940614, "grad_norm": 0.1188812106847763, "learning_rate": 1.8939855149941149e-06, "loss": 0.0073, "num_input_tokens_seen": 28103408, "step": 27990 }, { "epoch": 14.843584305408271, "grad_norm": 0.05611983314156532, "learning_rate": 1.892172847997002e-06, "loss": 0.0076, "num_input_tokens_seen": 28109648, "step": 27995 }, { "epoch": 14.846235418875928, "grad_norm": 1.271264910697937, "learning_rate": 1.8903608463775518e-06, "loss": 0.0312, "num_input_tokens_seen": 28114512, "step": 28000 }, { "epoch": 14.848886532343585, "grad_norm": 0.32572469115257263, "learning_rate": 1.8885495105237074e-06, "loss": 0.039, "num_input_tokens_seen": 28119248, "step": 28005 }, { "epoch": 14.851537645811241, "grad_norm": 4.5726213455200195, "learning_rate": 1.8867388408232706e-06, "loss": 0.0024, "num_input_tokens_seen": 28124944, "step": 28010 }, { "epoch": 14.854188759278896, "grad_norm": 8.834156036376953, "learning_rate": 1.884928837663902e-06, "loss": 0.0163, "num_input_tokens_seen": 28131344, "step": 28015 }, { "epoch": 14.856839872746553, "grad_norm": 0.12787379324436188, "learning_rate": 1.883119501433117e-06, "loss": 0.0124, "num_input_tokens_seen": 28136592, "step": 28020 }, { "epoch": 14.85949098621421, "grad_norm": 0.20450618863105774, "learning_rate": 1.8813108325182916e-06, "loss": 0.0064, "num_input_tokens_seen": 28140432, "step": 28025 }, { "epoch": 14.862142099681867, "grad_norm": 59.26995086669922, "learning_rate": 1.879502831306656e-06, "loss": 0.0797, "num_input_tokens_seen": 28145616, "step": 28030 }, { "epoch": 14.864793213149524, "grad_norm": 0.9890462160110474, "learning_rate": 1.8776954981852984e-06, "loss": 0.0033, "num_input_tokens_seen": 28150736, "step": 28035 }, { "epoch": 14.867444326617179, "grad_norm": 0.5440049171447754, "learning_rate": 1.875888833541163e-06, "loss": 0.0015, "num_input_tokens_seen": 28155568, "step": 28040 }, { "epoch": 14.870095440084835, "grad_norm": 7.71779203414917, "learning_rate": 1.8740828377610564e-06, "loss": 0.0078, "num_input_tokens_seen": 28160112, "step": 28045 }, { "epoch": 14.872746553552492, "grad_norm": 23.604951858520508, "learning_rate": 1.872277511231631e-06, "loss": 0.0156, "num_input_tokens_seen": 28164688, "step": 28050 }, { "epoch": 14.875397667020149, "grad_norm": 29.349977493286133, "learning_rate": 1.8704728543394086e-06, "loss": 0.207, "num_input_tokens_seen": 28169872, "step": 28055 }, { "epoch": 14.878048780487806, "grad_norm": 2.5369458198547363, "learning_rate": 1.868668867470758e-06, "loss": 0.0337, "num_input_tokens_seen": 28174384, "step": 28060 }, { "epoch": 14.88069989395546, "grad_norm": 0.5133613348007202, "learning_rate": 1.8668655510119094e-06, "loss": 0.0017, "num_input_tokens_seen": 28178928, "step": 28065 }, { "epoch": 14.883351007423117, "grad_norm": 0.3352218270301819, "learning_rate": 1.8650629053489477e-06, "loss": 0.0119, "num_input_tokens_seen": 28184016, "step": 28070 }, { "epoch": 14.886002120890774, "grad_norm": 2.205303430557251, "learning_rate": 1.8632609308678146e-06, "loss": 0.0264, "num_input_tokens_seen": 28188080, "step": 28075 }, { "epoch": 14.888653234358431, "grad_norm": 9.01596450805664, "learning_rate": 1.8614596279543068e-06, "loss": 0.1431, "num_input_tokens_seen": 28193328, "step": 28080 }, { "epoch": 14.891304347826086, "grad_norm": 11.393929481506348, "learning_rate": 1.8596589969940832e-06, "loss": 0.0045, "num_input_tokens_seen": 28197584, "step": 28085 }, { "epoch": 14.893955461293743, "grad_norm": 19.62355613708496, "learning_rate": 1.8578590383726474e-06, "loss": 0.0628, "num_input_tokens_seen": 28205200, "step": 28090 }, { "epoch": 14.8966065747614, "grad_norm": 31.691749572753906, "learning_rate": 1.856059752475371e-06, "loss": 0.0978, "num_input_tokens_seen": 28210224, "step": 28095 }, { "epoch": 14.899257688229056, "grad_norm": 46.631351470947266, "learning_rate": 1.8542611396874743e-06, "loss": 0.0561, "num_input_tokens_seen": 28214992, "step": 28100 }, { "epoch": 14.901908801696713, "grad_norm": 1.630867838859558, "learning_rate": 1.8524632003940367e-06, "loss": 0.0085, "num_input_tokens_seen": 28222224, "step": 28105 }, { "epoch": 14.90455991516437, "grad_norm": 0.4389305114746094, "learning_rate": 1.8506659349799915e-06, "loss": 0.014, "num_input_tokens_seen": 28226800, "step": 28110 }, { "epoch": 14.907211028632025, "grad_norm": 0.3402734398841858, "learning_rate": 1.8488693438301286e-06, "loss": 0.0139, "num_input_tokens_seen": 28231536, "step": 28115 }, { "epoch": 14.909862142099682, "grad_norm": 4.629300594329834, "learning_rate": 1.847073427329092e-06, "loss": 0.0091, "num_input_tokens_seen": 28236752, "step": 28120 }, { "epoch": 14.912513255567339, "grad_norm": 0.8806607723236084, "learning_rate": 1.845278185861387e-06, "loss": 0.0268, "num_input_tokens_seen": 28241264, "step": 28125 }, { "epoch": 14.915164369034995, "grad_norm": 2.164750576019287, "learning_rate": 1.8434836198113642e-06, "loss": 0.0811, "num_input_tokens_seen": 28245808, "step": 28130 }, { "epoch": 14.91781548250265, "grad_norm": 0.9219194650650024, "learning_rate": 1.8416897295632407e-06, "loss": 0.013, "num_input_tokens_seen": 28250928, "step": 28135 }, { "epoch": 14.920466595970307, "grad_norm": 5.194416046142578, "learning_rate": 1.839896515501081e-06, "loss": 0.0242, "num_input_tokens_seen": 28255312, "step": 28140 }, { "epoch": 14.923117709437964, "grad_norm": 0.844457745552063, "learning_rate": 1.838103978008809e-06, "loss": 0.1674, "num_input_tokens_seen": 28260912, "step": 28145 }, { "epoch": 14.92576882290562, "grad_norm": 0.1696648746728897, "learning_rate": 1.8363121174702008e-06, "loss": 0.0035, "num_input_tokens_seen": 28265648, "step": 28150 }, { "epoch": 14.928419936373277, "grad_norm": 1.6170634031295776, "learning_rate": 1.8345209342688903e-06, "loss": 0.01, "num_input_tokens_seen": 28270128, "step": 28155 }, { "epoch": 14.931071049840932, "grad_norm": 0.017197582870721817, "learning_rate": 1.832730428788364e-06, "loss": 0.1239, "num_input_tokens_seen": 28275600, "step": 28160 }, { "epoch": 14.93372216330859, "grad_norm": 0.29587703943252563, "learning_rate": 1.8309406014119652e-06, "loss": 0.0442, "num_input_tokens_seen": 28282544, "step": 28165 }, { "epoch": 14.936373276776246, "grad_norm": 2.685499429702759, "learning_rate": 1.8291514525228909e-06, "loss": 0.0106, "num_input_tokens_seen": 28288144, "step": 28170 }, { "epoch": 14.939024390243903, "grad_norm": 6.993406772613525, "learning_rate": 1.8273629825041915e-06, "loss": 0.1085, "num_input_tokens_seen": 28292400, "step": 28175 }, { "epoch": 14.94167550371156, "grad_norm": 78.0058364868164, "learning_rate": 1.8255751917387792e-06, "loss": 0.1717, "num_input_tokens_seen": 28297616, "step": 28180 }, { "epoch": 14.944326617179215, "grad_norm": 0.01849767006933689, "learning_rate": 1.8237880806094088e-06, "loss": 0.0057, "num_input_tokens_seen": 28303056, "step": 28185 }, { "epoch": 14.946977730646871, "grad_norm": 4.802833557128906, "learning_rate": 1.8220016494987002e-06, "loss": 0.0046, "num_input_tokens_seen": 28306928, "step": 28190 }, { "epoch": 14.949628844114528, "grad_norm": 37.84676742553711, "learning_rate": 1.8202158987891222e-06, "loss": 0.028, "num_input_tokens_seen": 28311760, "step": 28195 }, { "epoch": 14.952279957582185, "grad_norm": 0.014737498015165329, "learning_rate": 1.8184308288630004e-06, "loss": 0.0031, "num_input_tokens_seen": 28316752, "step": 28200 }, { "epoch": 14.954931071049842, "grad_norm": 0.6837162375450134, "learning_rate": 1.816646440102512e-06, "loss": 0.008, "num_input_tokens_seen": 28321104, "step": 28205 }, { "epoch": 14.957582184517497, "grad_norm": 0.9425452947616577, "learning_rate": 1.8148627328896911e-06, "loss": 0.001, "num_input_tokens_seen": 28326896, "step": 28210 }, { "epoch": 14.960233297985154, "grad_norm": 0.8868814706802368, "learning_rate": 1.8130797076064227e-06, "loss": 0.0011, "num_input_tokens_seen": 28330896, "step": 28215 }, { "epoch": 14.96288441145281, "grad_norm": 12.896048545837402, "learning_rate": 1.8112973646344534e-06, "loss": 0.0057, "num_input_tokens_seen": 28335856, "step": 28220 }, { "epoch": 14.965535524920467, "grad_norm": 0.16954413056373596, "learning_rate": 1.8095157043553708e-06, "loss": 0.1596, "num_input_tokens_seen": 28341200, "step": 28225 }, { "epoch": 14.968186638388122, "grad_norm": 1.1102004051208496, "learning_rate": 1.8077347271506284e-06, "loss": 0.0012, "num_input_tokens_seen": 28345488, "step": 28230 }, { "epoch": 14.970837751855779, "grad_norm": 6.407876491546631, "learning_rate": 1.8059544334015279e-06, "loss": 0.0176, "num_input_tokens_seen": 28350608, "step": 28235 }, { "epoch": 14.973488865323436, "grad_norm": 1.0193626880645752, "learning_rate": 1.8041748234892253e-06, "loss": 0.0159, "num_input_tokens_seen": 28356272, "step": 28240 }, { "epoch": 14.976139978791092, "grad_norm": 0.40133777260780334, "learning_rate": 1.8023958977947303e-06, "loss": 0.0428, "num_input_tokens_seen": 28360656, "step": 28245 }, { "epoch": 14.97879109225875, "grad_norm": 1.4856452941894531, "learning_rate": 1.8006176566989064e-06, "loss": 0.004, "num_input_tokens_seen": 28364624, "step": 28250 }, { "epoch": 14.981442205726404, "grad_norm": 2.598398208618164, "learning_rate": 1.7988401005824686e-06, "loss": 0.0044, "num_input_tokens_seen": 28369744, "step": 28255 }, { "epoch": 14.984093319194061, "grad_norm": 1.2090011835098267, "learning_rate": 1.7970632298259916e-06, "loss": 0.0171, "num_input_tokens_seen": 28374480, "step": 28260 }, { "epoch": 14.986744432661718, "grad_norm": 71.71710205078125, "learning_rate": 1.795287044809893e-06, "loss": 0.0402, "num_input_tokens_seen": 28379376, "step": 28265 }, { "epoch": 14.989395546129375, "grad_norm": 19.651159286499023, "learning_rate": 1.7935115459144537e-06, "loss": 0.0152, "num_input_tokens_seen": 28384432, "step": 28270 }, { "epoch": 14.992046659597031, "grad_norm": 69.58576965332031, "learning_rate": 1.7917367335198016e-06, "loss": 0.0897, "num_input_tokens_seen": 28388912, "step": 28275 }, { "epoch": 14.994697773064686, "grad_norm": 0.29956793785095215, "learning_rate": 1.7899626080059202e-06, "loss": 0.0071, "num_input_tokens_seen": 28394544, "step": 28280 }, { "epoch": 14.997348886532343, "grad_norm": 0.5369403958320618, "learning_rate": 1.7881891697526443e-06, "loss": 0.0491, "num_input_tokens_seen": 28399184, "step": 28285 }, { "epoch": 15.0, "grad_norm": 5.859635829925537, "learning_rate": 1.7864164191396627e-06, "loss": 0.0056, "num_input_tokens_seen": 28403312, "step": 28290 }, { "epoch": 15.002651113467657, "grad_norm": 2.114809274673462, "learning_rate": 1.784644356546517e-06, "loss": 0.002, "num_input_tokens_seen": 28408560, "step": 28295 }, { "epoch": 15.005302226935314, "grad_norm": 3.4121384620666504, "learning_rate": 1.7828729823526003e-06, "loss": 0.0046, "num_input_tokens_seen": 28415056, "step": 28300 }, { "epoch": 15.007953340402969, "grad_norm": 0.20490965247154236, "learning_rate": 1.78110229693716e-06, "loss": 0.0127, "num_input_tokens_seen": 28419184, "step": 28305 }, { "epoch": 15.010604453870625, "grad_norm": 0.15183408558368683, "learning_rate": 1.7793323006792934e-06, "loss": 0.0252, "num_input_tokens_seen": 28424368, "step": 28310 }, { "epoch": 15.013255567338282, "grad_norm": 0.3942767083644867, "learning_rate": 1.777562993957957e-06, "loss": 0.0038, "num_input_tokens_seen": 28430128, "step": 28315 }, { "epoch": 15.015906680805939, "grad_norm": 0.013888363726437092, "learning_rate": 1.7757943771519476e-06, "loss": 0.0015, "num_input_tokens_seen": 28434384, "step": 28320 }, { "epoch": 15.018557794273596, "grad_norm": 0.017172040417790413, "learning_rate": 1.774026450639927e-06, "loss": 0.0046, "num_input_tokens_seen": 28439248, "step": 28325 }, { "epoch": 15.02120890774125, "grad_norm": 2.9358880519866943, "learning_rate": 1.7722592148004015e-06, "loss": 0.0058, "num_input_tokens_seen": 28443728, "step": 28330 }, { "epoch": 15.023860021208908, "grad_norm": 0.05101291090250015, "learning_rate": 1.7704926700117326e-06, "loss": 0.0062, "num_input_tokens_seen": 28448144, "step": 28335 }, { "epoch": 15.026511134676564, "grad_norm": 1.8288613557815552, "learning_rate": 1.7687268166521315e-06, "loss": 0.0033, "num_input_tokens_seen": 28453680, "step": 28340 }, { "epoch": 15.029162248144221, "grad_norm": 0.4473559558391571, "learning_rate": 1.7669616550996643e-06, "loss": 0.0071, "num_input_tokens_seen": 28458192, "step": 28345 }, { "epoch": 15.031813361611878, "grad_norm": 0.5264180302619934, "learning_rate": 1.7651971857322446e-06, "loss": 0.0059, "num_input_tokens_seen": 28463088, "step": 28350 }, { "epoch": 15.034464475079533, "grad_norm": 0.2596660852432251, "learning_rate": 1.7634334089276461e-06, "loss": 0.0038, "num_input_tokens_seen": 28467888, "step": 28355 }, { "epoch": 15.03711558854719, "grad_norm": 2.233846426010132, "learning_rate": 1.7616703250634826e-06, "loss": 0.0034, "num_input_tokens_seen": 28474256, "step": 28360 }, { "epoch": 15.039766702014846, "grad_norm": 0.07237088680267334, "learning_rate": 1.75990793451723e-06, "loss": 0.0014, "num_input_tokens_seen": 28479632, "step": 28365 }, { "epoch": 15.042417815482503, "grad_norm": 0.15667717158794403, "learning_rate": 1.7581462376662101e-06, "loss": 0.2368, "num_input_tokens_seen": 28485616, "step": 28370 }, { "epoch": 15.045068928950158, "grad_norm": 0.19666747748851776, "learning_rate": 1.7563852348875976e-06, "loss": 0.0017, "num_input_tokens_seen": 28490320, "step": 28375 }, { "epoch": 15.047720042417815, "grad_norm": 13.902578353881836, "learning_rate": 1.7546249265584193e-06, "loss": 0.0076, "num_input_tokens_seen": 28495920, "step": 28380 }, { "epoch": 15.050371155885472, "grad_norm": 0.25933587551116943, "learning_rate": 1.7528653130555512e-06, "loss": 0.0028, "num_input_tokens_seen": 28501424, "step": 28385 }, { "epoch": 15.053022269353129, "grad_norm": 0.23136252164840698, "learning_rate": 1.751106394755721e-06, "loss": 0.069, "num_input_tokens_seen": 28507344, "step": 28390 }, { "epoch": 15.055673382820785, "grad_norm": 71.5010757446289, "learning_rate": 1.7493481720355137e-06, "loss": 0.0969, "num_input_tokens_seen": 28511696, "step": 28395 }, { "epoch": 15.05832449628844, "grad_norm": 0.016161508858203888, "learning_rate": 1.7475906452713536e-06, "loss": 0.0006, "num_input_tokens_seen": 28516368, "step": 28400 }, { "epoch": 15.060975609756097, "grad_norm": 0.123029924929142, "learning_rate": 1.745833814839527e-06, "loss": 0.0044, "num_input_tokens_seen": 28520592, "step": 28405 }, { "epoch": 15.063626723223754, "grad_norm": 1.0836114883422852, "learning_rate": 1.7440776811161653e-06, "loss": 0.0022, "num_input_tokens_seen": 28525840, "step": 28410 }, { "epoch": 15.06627783669141, "grad_norm": 0.6369734406471252, "learning_rate": 1.7423222444772525e-06, "loss": 0.0013, "num_input_tokens_seen": 28531280, "step": 28415 }, { "epoch": 15.068928950159068, "grad_norm": 1.2423475980758667, "learning_rate": 1.7405675052986226e-06, "loss": 0.0032, "num_input_tokens_seen": 28536592, "step": 28420 }, { "epoch": 15.071580063626723, "grad_norm": 0.149179607629776, "learning_rate": 1.7388134639559605e-06, "loss": 0.1054, "num_input_tokens_seen": 28544144, "step": 28425 }, { "epoch": 15.07423117709438, "grad_norm": 1.3086844682693481, "learning_rate": 1.7370601208248005e-06, "loss": 0.0048, "num_input_tokens_seen": 28548432, "step": 28430 }, { "epoch": 15.076882290562036, "grad_norm": 28.99114990234375, "learning_rate": 1.7353074762805339e-06, "loss": 0.0308, "num_input_tokens_seen": 28553200, "step": 28435 }, { "epoch": 15.079533404029693, "grad_norm": 6.866462707519531, "learning_rate": 1.733555530698392e-06, "loss": 0.0495, "num_input_tokens_seen": 28558704, "step": 28440 }, { "epoch": 15.08218451749735, "grad_norm": 0.23791521787643433, "learning_rate": 1.7318042844534622e-06, "loss": 0.0038, "num_input_tokens_seen": 28563152, "step": 28445 }, { "epoch": 15.084835630965005, "grad_norm": 0.9271265268325806, "learning_rate": 1.730053737920686e-06, "loss": 0.0416, "num_input_tokens_seen": 28567984, "step": 28450 }, { "epoch": 15.087486744432661, "grad_norm": 33.800453186035156, "learning_rate": 1.7283038914748446e-06, "loss": 0.0237, "num_input_tokens_seen": 28573424, "step": 28455 }, { "epoch": 15.090137857900318, "grad_norm": 0.23477675020694733, "learning_rate": 1.7265547454905807e-06, "loss": 0.0003, "num_input_tokens_seen": 28578704, "step": 28460 }, { "epoch": 15.092788971367975, "grad_norm": 0.3879936933517456, "learning_rate": 1.7248063003423798e-06, "loss": 0.0024, "num_input_tokens_seen": 28584944, "step": 28465 }, { "epoch": 15.095440084835632, "grad_norm": 2.8557021617889404, "learning_rate": 1.7230585564045805e-06, "loss": 0.0045, "num_input_tokens_seen": 28589008, "step": 28470 }, { "epoch": 15.098091198303287, "grad_norm": 3.3304123878479004, "learning_rate": 1.7213115140513687e-06, "loss": 0.0022, "num_input_tokens_seen": 28593360, "step": 28475 }, { "epoch": 15.100742311770944, "grad_norm": 0.23800183832645416, "learning_rate": 1.719565173656782e-06, "loss": 0.0018, "num_input_tokens_seen": 28600144, "step": 28480 }, { "epoch": 15.1033934252386, "grad_norm": 0.13336728513240814, "learning_rate": 1.7178195355947064e-06, "loss": 0.0018, "num_input_tokens_seen": 28604656, "step": 28485 }, { "epoch": 15.106044538706257, "grad_norm": 73.00559997558594, "learning_rate": 1.716074600238883e-06, "loss": 0.0221, "num_input_tokens_seen": 28609008, "step": 28490 }, { "epoch": 15.108695652173912, "grad_norm": 0.7193148732185364, "learning_rate": 1.714330367962891e-06, "loss": 0.0028, "num_input_tokens_seen": 28613424, "step": 28495 }, { "epoch": 15.111346765641569, "grad_norm": 0.40906643867492676, "learning_rate": 1.7125868391401712e-06, "loss": 0.0067, "num_input_tokens_seen": 28617840, "step": 28500 }, { "epoch": 15.113997879109226, "grad_norm": 0.006817553658038378, "learning_rate": 1.7108440141440064e-06, "loss": 0.0013, "num_input_tokens_seen": 28621840, "step": 28505 }, { "epoch": 15.116648992576883, "grad_norm": 0.037271611392498016, "learning_rate": 1.7091018933475318e-06, "loss": 0.1527, "num_input_tokens_seen": 28626672, "step": 28510 }, { "epoch": 15.11930010604454, "grad_norm": 0.07759195566177368, "learning_rate": 1.707360477123729e-06, "loss": 0.0147, "num_input_tokens_seen": 28632464, "step": 28515 }, { "epoch": 15.121951219512194, "grad_norm": 0.42529821395874023, "learning_rate": 1.7056197658454349e-06, "loss": 0.0058, "num_input_tokens_seen": 28636784, "step": 28520 }, { "epoch": 15.124602332979851, "grad_norm": 0.06166248396039009, "learning_rate": 1.7038797598853252e-06, "loss": 0.0031, "num_input_tokens_seen": 28641776, "step": 28525 }, { "epoch": 15.127253446447508, "grad_norm": 66.0761947631836, "learning_rate": 1.7021404596159375e-06, "loss": 0.0842, "num_input_tokens_seen": 28646064, "step": 28530 }, { "epoch": 15.129904559915165, "grad_norm": 1.3204255104064941, "learning_rate": 1.7004018654096437e-06, "loss": 0.0848, "num_input_tokens_seen": 28650704, "step": 28535 }, { "epoch": 15.132555673382821, "grad_norm": 36.797603607177734, "learning_rate": 1.6986639776386787e-06, "loss": 0.0216, "num_input_tokens_seen": 28655888, "step": 28540 }, { "epoch": 15.135206786850476, "grad_norm": 0.5249825716018677, "learning_rate": 1.6969267966751175e-06, "loss": 0.001, "num_input_tokens_seen": 28660912, "step": 28545 }, { "epoch": 15.137857900318133, "grad_norm": 0.08637230098247528, "learning_rate": 1.695190322890885e-06, "loss": 0.0145, "num_input_tokens_seen": 28665680, "step": 28550 }, { "epoch": 15.14050901378579, "grad_norm": 0.43289658427238464, "learning_rate": 1.6934545566577559e-06, "loss": 0.005, "num_input_tokens_seen": 28670064, "step": 28555 }, { "epoch": 15.143160127253447, "grad_norm": 0.44091567397117615, "learning_rate": 1.6917194983473568e-06, "loss": 0.0952, "num_input_tokens_seen": 28674960, "step": 28560 }, { "epoch": 15.145811240721104, "grad_norm": 10.738520622253418, "learning_rate": 1.6899851483311524e-06, "loss": 0.2476, "num_input_tokens_seen": 28680592, "step": 28565 }, { "epoch": 15.148462354188759, "grad_norm": 66.89256286621094, "learning_rate": 1.6882515069804684e-06, "loss": 0.0394, "num_input_tokens_seen": 28687152, "step": 28570 }, { "epoch": 15.151113467656415, "grad_norm": 0.014364946633577347, "learning_rate": 1.6865185746664725e-06, "loss": 0.0007, "num_input_tokens_seen": 28691792, "step": 28575 }, { "epoch": 15.153764581124072, "grad_norm": 18.52940559387207, "learning_rate": 1.684786351760176e-06, "loss": 0.009, "num_input_tokens_seen": 28696624, "step": 28580 }, { "epoch": 15.156415694591729, "grad_norm": 0.2683798372745514, "learning_rate": 1.6830548386324474e-06, "loss": 0.0073, "num_input_tokens_seen": 28701520, "step": 28585 }, { "epoch": 15.159066808059386, "grad_norm": 0.02102195844054222, "learning_rate": 1.681324035653999e-06, "loss": 0.0031, "num_input_tokens_seen": 28706096, "step": 28590 }, { "epoch": 15.16171792152704, "grad_norm": 0.025816859677433968, "learning_rate": 1.6795939431953905e-06, "loss": 0.0011, "num_input_tokens_seen": 28712176, "step": 28595 }, { "epoch": 15.164369034994698, "grad_norm": 0.017073825001716614, "learning_rate": 1.67786456162703e-06, "loss": 0.0204, "num_input_tokens_seen": 28718416, "step": 28600 }, { "epoch": 15.167020148462354, "grad_norm": 0.7155235409736633, "learning_rate": 1.6761358913191739e-06, "loss": 0.0208, "num_input_tokens_seen": 28725296, "step": 28605 }, { "epoch": 15.169671261930011, "grad_norm": 0.13313162326812744, "learning_rate": 1.674407932641924e-06, "loss": 0.0005, "num_input_tokens_seen": 28730384, "step": 28610 }, { "epoch": 15.172322375397666, "grad_norm": 3.585848569869995, "learning_rate": 1.6726806859652366e-06, "loss": 0.0227, "num_input_tokens_seen": 28734800, "step": 28615 }, { "epoch": 15.174973488865323, "grad_norm": 0.027828902006149292, "learning_rate": 1.6709541516589046e-06, "loss": 0.0006, "num_input_tokens_seen": 28738704, "step": 28620 }, { "epoch": 15.17762460233298, "grad_norm": 85.16987609863281, "learning_rate": 1.6692283300925787e-06, "loss": 0.0831, "num_input_tokens_seen": 28743504, "step": 28625 }, { "epoch": 15.180275715800637, "grad_norm": 0.010357704944908619, "learning_rate": 1.6675032216357523e-06, "loss": 0.127, "num_input_tokens_seen": 28748080, "step": 28630 }, { "epoch": 15.182926829268293, "grad_norm": 0.688431441783905, "learning_rate": 1.6657788266577652e-06, "loss": 0.1452, "num_input_tokens_seen": 28753680, "step": 28635 }, { "epoch": 15.185577942735948, "grad_norm": 0.45521023869514465, "learning_rate": 1.664055145527807e-06, "loss": 0.002, "num_input_tokens_seen": 28759760, "step": 28640 }, { "epoch": 15.188229056203605, "grad_norm": 0.4377457797527313, "learning_rate": 1.6623321786149127e-06, "loss": 0.003, "num_input_tokens_seen": 28765040, "step": 28645 }, { "epoch": 15.190880169671262, "grad_norm": 0.01555074006319046, "learning_rate": 1.6606099262879633e-06, "loss": 0.0048, "num_input_tokens_seen": 28769232, "step": 28650 }, { "epoch": 15.193531283138919, "grad_norm": 0.05316762253642082, "learning_rate": 1.6588883889156942e-06, "loss": 0.0009, "num_input_tokens_seen": 28774416, "step": 28655 }, { "epoch": 15.196182396606575, "grad_norm": 0.31221216917037964, "learning_rate": 1.6571675668666742e-06, "loss": 0.0007, "num_input_tokens_seen": 28778800, "step": 28660 }, { "epoch": 15.19883351007423, "grad_norm": 0.7196857929229736, "learning_rate": 1.6554474605093328e-06, "loss": 0.0005, "num_input_tokens_seen": 28783888, "step": 28665 }, { "epoch": 15.201484623541887, "grad_norm": 35.694091796875, "learning_rate": 1.6537280702119386e-06, "loss": 0.0217, "num_input_tokens_seen": 28789424, "step": 28670 }, { "epoch": 15.204135737009544, "grad_norm": 0.08912454545497894, "learning_rate": 1.6520093963426081e-06, "loss": 0.0004, "num_input_tokens_seen": 28794224, "step": 28675 }, { "epoch": 15.2067868504772, "grad_norm": 33.41136169433594, "learning_rate": 1.6502914392693049e-06, "loss": 0.0165, "num_input_tokens_seen": 28798512, "step": 28680 }, { "epoch": 15.209437963944858, "grad_norm": 0.726702868938446, "learning_rate": 1.6485741993598392e-06, "loss": 0.0023, "num_input_tokens_seen": 28803376, "step": 28685 }, { "epoch": 15.212089077412513, "grad_norm": 0.09516361355781555, "learning_rate": 1.6468576769818663e-06, "loss": 0.001, "num_input_tokens_seen": 28808752, "step": 28690 }, { "epoch": 15.21474019088017, "grad_norm": 0.12002874165773392, "learning_rate": 1.6451418725028934e-06, "loss": 0.0004, "num_input_tokens_seen": 28813680, "step": 28695 }, { "epoch": 15.217391304347826, "grad_norm": 0.058943748474121094, "learning_rate": 1.6434267862902636e-06, "loss": 0.0104, "num_input_tokens_seen": 28818480, "step": 28700 }, { "epoch": 15.220042417815483, "grad_norm": 0.026449205353856087, "learning_rate": 1.6417124187111778e-06, "loss": 0.1303, "num_input_tokens_seen": 28825200, "step": 28705 }, { "epoch": 15.22269353128314, "grad_norm": 6.651317119598389, "learning_rate": 1.6399987701326764e-06, "loss": 0.004, "num_input_tokens_seen": 28829840, "step": 28710 }, { "epoch": 15.225344644750795, "grad_norm": 0.911587119102478, "learning_rate": 1.638285840921643e-06, "loss": 0.0037, "num_input_tokens_seen": 28834576, "step": 28715 }, { "epoch": 15.227995758218452, "grad_norm": 2.361372232437134, "learning_rate": 1.6365736314448154e-06, "loss": 0.0032, "num_input_tokens_seen": 28840080, "step": 28720 }, { "epoch": 15.230646871686108, "grad_norm": 3.322333812713623, "learning_rate": 1.6348621420687726e-06, "loss": 0.0091, "num_input_tokens_seen": 28845136, "step": 28725 }, { "epoch": 15.233297985153765, "grad_norm": 10.776290893554688, "learning_rate": 1.6331513731599397e-06, "loss": 0.0066, "num_input_tokens_seen": 28851408, "step": 28730 }, { "epoch": 15.235949098621422, "grad_norm": 7.207732677459717, "learning_rate": 1.631441325084588e-06, "loss": 0.0042, "num_input_tokens_seen": 28857232, "step": 28735 }, { "epoch": 15.238600212089077, "grad_norm": 83.16777038574219, "learning_rate": 1.6297319982088333e-06, "loss": 0.0894, "num_input_tokens_seen": 28862320, "step": 28740 }, { "epoch": 15.241251325556734, "grad_norm": 0.08660885691642761, "learning_rate": 1.6280233928986372e-06, "loss": 0.0016, "num_input_tokens_seen": 28867600, "step": 28745 }, { "epoch": 15.24390243902439, "grad_norm": 70.72055053710938, "learning_rate": 1.6263155095198125e-06, "loss": 0.0609, "num_input_tokens_seen": 28873904, "step": 28750 }, { "epoch": 15.246553552492047, "grad_norm": 6.811311721801758, "learning_rate": 1.6246083484380055e-06, "loss": 0.0086, "num_input_tokens_seen": 28878672, "step": 28755 }, { "epoch": 15.249204665959702, "grad_norm": 1.6809364557266235, "learning_rate": 1.6229019100187204e-06, "loss": 0.0572, "num_input_tokens_seen": 28884304, "step": 28760 }, { "epoch": 15.251855779427359, "grad_norm": 0.1261998414993286, "learning_rate": 1.6211961946272997e-06, "loss": 0.0065, "num_input_tokens_seen": 28889360, "step": 28765 }, { "epoch": 15.254506892895016, "grad_norm": 0.15515944361686707, "learning_rate": 1.6194912026289323e-06, "loss": 0.2035, "num_input_tokens_seen": 28894640, "step": 28770 }, { "epoch": 15.257158006362673, "grad_norm": 2.168633222579956, "learning_rate": 1.617786934388652e-06, "loss": 0.0015, "num_input_tokens_seen": 28899216, "step": 28775 }, { "epoch": 15.25980911983033, "grad_norm": 0.759155809879303, "learning_rate": 1.6160833902713391e-06, "loss": 0.0014, "num_input_tokens_seen": 28903824, "step": 28780 }, { "epoch": 15.262460233297984, "grad_norm": 0.42308303713798523, "learning_rate": 1.6143805706417153e-06, "loss": 0.0052, "num_input_tokens_seen": 28908368, "step": 28785 }, { "epoch": 15.265111346765641, "grad_norm": 3.211108684539795, "learning_rate": 1.612678475864356e-06, "loss": 0.0025, "num_input_tokens_seen": 28913296, "step": 28790 }, { "epoch": 15.267762460233298, "grad_norm": 8.997178077697754, "learning_rate": 1.6109771063036677e-06, "loss": 0.0038, "num_input_tokens_seen": 28918256, "step": 28795 }, { "epoch": 15.270413573700955, "grad_norm": 3.3639299869537354, "learning_rate": 1.6092764623239149e-06, "loss": 0.0082, "num_input_tokens_seen": 28923312, "step": 28800 }, { "epoch": 15.273064687168612, "grad_norm": 0.8839413523674011, "learning_rate": 1.6075765442891983e-06, "loss": 0.0016, "num_input_tokens_seen": 28928432, "step": 28805 }, { "epoch": 15.275715800636267, "grad_norm": 0.6563340425491333, "learning_rate": 1.6058773525634668e-06, "loss": 0.0074, "num_input_tokens_seen": 28932848, "step": 28810 }, { "epoch": 15.278366914103923, "grad_norm": 4.235137462615967, "learning_rate": 1.6041788875105125e-06, "loss": 0.0431, "num_input_tokens_seen": 28937552, "step": 28815 }, { "epoch": 15.28101802757158, "grad_norm": 62.93700408935547, "learning_rate": 1.6024811494939723e-06, "loss": 0.0413, "num_input_tokens_seen": 28942448, "step": 28820 }, { "epoch": 15.283669141039237, "grad_norm": 0.6274019479751587, "learning_rate": 1.600784138877326e-06, "loss": 0.0164, "num_input_tokens_seen": 28947440, "step": 28825 }, { "epoch": 15.286320254506894, "grad_norm": 0.3891516625881195, "learning_rate": 1.5990878560239042e-06, "loss": 0.0008, "num_input_tokens_seen": 28953168, "step": 28830 }, { "epoch": 15.288971367974549, "grad_norm": 0.33320483565330505, "learning_rate": 1.5973923012968695e-06, "loss": 0.0259, "num_input_tokens_seen": 28957744, "step": 28835 }, { "epoch": 15.291622481442205, "grad_norm": 79.57059478759766, "learning_rate": 1.5956974750592414e-06, "loss": 0.102, "num_input_tokens_seen": 28963792, "step": 28840 }, { "epoch": 15.294273594909862, "grad_norm": 0.9125203490257263, "learning_rate": 1.5940033776738757e-06, "loss": 0.001, "num_input_tokens_seen": 28969104, "step": 28845 }, { "epoch": 15.296924708377519, "grad_norm": 9.198582649230957, "learning_rate": 1.5923100095034738e-06, "loss": 0.0025, "num_input_tokens_seen": 28973808, "step": 28850 }, { "epoch": 15.299575821845174, "grad_norm": 0.04837209731340408, "learning_rate": 1.5906173709105822e-06, "loss": 0.0145, "num_input_tokens_seen": 28979472, "step": 28855 }, { "epoch": 15.30222693531283, "grad_norm": 0.05094285309314728, "learning_rate": 1.5889254622575901e-06, "loss": 0.0043, "num_input_tokens_seen": 28984144, "step": 28860 }, { "epoch": 15.304878048780488, "grad_norm": 0.6210349798202515, "learning_rate": 1.5872342839067305e-06, "loss": 0.0017, "num_input_tokens_seen": 28989616, "step": 28865 }, { "epoch": 15.307529162248144, "grad_norm": 0.8230668902397156, "learning_rate": 1.5855438362200804e-06, "loss": 0.0204, "num_input_tokens_seen": 28994544, "step": 28870 }, { "epoch": 15.310180275715801, "grad_norm": 6.123676776885986, "learning_rate": 1.5838541195595602e-06, "loss": 0.0171, "num_input_tokens_seen": 28999408, "step": 28875 }, { "epoch": 15.312831389183456, "grad_norm": 58.28675842285156, "learning_rate": 1.582165134286932e-06, "loss": 0.0411, "num_input_tokens_seen": 29005840, "step": 28880 }, { "epoch": 15.315482502651113, "grad_norm": 0.1423357129096985, "learning_rate": 1.5804768807638087e-06, "loss": 0.0007, "num_input_tokens_seen": 29010864, "step": 28885 }, { "epoch": 15.31813361611877, "grad_norm": 0.42853477597236633, "learning_rate": 1.578789359351634e-06, "loss": 0.0057, "num_input_tokens_seen": 29015120, "step": 28890 }, { "epoch": 15.320784729586427, "grad_norm": 0.20167796313762665, "learning_rate": 1.577102570411706e-06, "loss": 0.0033, "num_input_tokens_seen": 29019888, "step": 28895 }, { "epoch": 15.323435843054083, "grad_norm": 5.163102149963379, "learning_rate": 1.5754165143051614e-06, "loss": 0.0072, "num_input_tokens_seen": 29025008, "step": 28900 }, { "epoch": 15.326086956521738, "grad_norm": 0.4317185580730438, "learning_rate": 1.57373119139298e-06, "loss": 0.0005, "num_input_tokens_seen": 29029808, "step": 28905 }, { "epoch": 15.328738069989395, "grad_norm": 11.072348594665527, "learning_rate": 1.5720466020359847e-06, "loss": 0.0048, "num_input_tokens_seen": 29034512, "step": 28910 }, { "epoch": 15.331389183457052, "grad_norm": 0.35623809695243835, "learning_rate": 1.5703627465948418e-06, "loss": 0.0267, "num_input_tokens_seen": 29038480, "step": 28915 }, { "epoch": 15.334040296924709, "grad_norm": 11.119596481323242, "learning_rate": 1.5686796254300585e-06, "loss": 0.0056, "num_input_tokens_seen": 29042992, "step": 28920 }, { "epoch": 15.336691410392365, "grad_norm": 0.0953494980931282, "learning_rate": 1.5669972389019922e-06, "loss": 0.0073, "num_input_tokens_seen": 29047408, "step": 28925 }, { "epoch": 15.33934252386002, "grad_norm": 7.056656360626221, "learning_rate": 1.5653155873708304e-06, "loss": 0.0046, "num_input_tokens_seen": 29051824, "step": 28930 }, { "epoch": 15.341993637327677, "grad_norm": 0.13655689358711243, "learning_rate": 1.5636346711966154e-06, "loss": 0.0075, "num_input_tokens_seen": 29055920, "step": 28935 }, { "epoch": 15.344644750795334, "grad_norm": 0.05745778977870941, "learning_rate": 1.5619544907392253e-06, "loss": 0.0027, "num_input_tokens_seen": 29061744, "step": 28940 }, { "epoch": 15.34729586426299, "grad_norm": 0.15406033396720886, "learning_rate": 1.5602750463583822e-06, "loss": 0.1476, "num_input_tokens_seen": 29066992, "step": 28945 }, { "epoch": 15.349946977730648, "grad_norm": 14.89522647857666, "learning_rate": 1.5585963384136505e-06, "loss": 0.006, "num_input_tokens_seen": 29072080, "step": 28950 }, { "epoch": 15.352598091198303, "grad_norm": 0.029535874724388123, "learning_rate": 1.5569183672644379e-06, "loss": 0.0025, "num_input_tokens_seen": 29076368, "step": 28955 }, { "epoch": 15.35524920466596, "grad_norm": 0.6079484820365906, "learning_rate": 1.5552411332699913e-06, "loss": 0.0253, "num_input_tokens_seen": 29081104, "step": 28960 }, { "epoch": 15.357900318133616, "grad_norm": 1.6259589195251465, "learning_rate": 1.5535646367894075e-06, "loss": 0.0036, "num_input_tokens_seen": 29086288, "step": 28965 }, { "epoch": 15.360551431601273, "grad_norm": 11.225944519042969, "learning_rate": 1.5518888781816132e-06, "loss": 0.0058, "num_input_tokens_seen": 29092208, "step": 28970 }, { "epoch": 15.36320254506893, "grad_norm": 7.698798656463623, "learning_rate": 1.5502138578053888e-06, "loss": 0.0033, "num_input_tokens_seen": 29097040, "step": 28975 }, { "epoch": 15.365853658536585, "grad_norm": 0.03823825344443321, "learning_rate": 1.5485395760193505e-06, "loss": 0.0214, "num_input_tokens_seen": 29102480, "step": 28980 }, { "epoch": 15.368504772004242, "grad_norm": 0.910940408706665, "learning_rate": 1.5468660331819574e-06, "loss": 0.0029, "num_input_tokens_seen": 29108944, "step": 28985 }, { "epoch": 15.371155885471898, "grad_norm": 0.03359409049153328, "learning_rate": 1.5451932296515104e-06, "loss": 0.0063, "num_input_tokens_seen": 29113456, "step": 28990 }, { "epoch": 15.373806998939555, "grad_norm": 5.303304195404053, "learning_rate": 1.5435211657861532e-06, "loss": 0.0735, "num_input_tokens_seen": 29118512, "step": 28995 }, { "epoch": 15.37645811240721, "grad_norm": 0.022497592493891716, "learning_rate": 1.5418498419438694e-06, "loss": 0.0025, "num_input_tokens_seen": 29123440, "step": 29000 }, { "epoch": 15.379109225874867, "grad_norm": 0.18085476756095886, "learning_rate": 1.5401792584824848e-06, "loss": 0.0034, "num_input_tokens_seen": 29128336, "step": 29005 }, { "epoch": 15.381760339342524, "grad_norm": 0.28411945700645447, "learning_rate": 1.538509415759668e-06, "loss": 0.1303, "num_input_tokens_seen": 29132144, "step": 29010 }, { "epoch": 15.38441145281018, "grad_norm": 9.196931838989258, "learning_rate": 1.5368403141329257e-06, "loss": 0.0027, "num_input_tokens_seen": 29136592, "step": 29015 }, { "epoch": 15.387062566277837, "grad_norm": 29.925960540771484, "learning_rate": 1.5351719539596137e-06, "loss": 0.0106, "num_input_tokens_seen": 29141200, "step": 29020 }, { "epoch": 15.389713679745492, "grad_norm": 0.26931512355804443, "learning_rate": 1.5335043355969166e-06, "loss": 0.0316, "num_input_tokens_seen": 29146256, "step": 29025 }, { "epoch": 15.392364793213149, "grad_norm": 2.784440040588379, "learning_rate": 1.5318374594018715e-06, "loss": 0.0032, "num_input_tokens_seen": 29150768, "step": 29030 }, { "epoch": 15.395015906680806, "grad_norm": 0.6017489433288574, "learning_rate": 1.5301713257313512e-06, "loss": 0.0078, "num_input_tokens_seen": 29156016, "step": 29035 }, { "epoch": 15.397667020148463, "grad_norm": 4.532209873199463, "learning_rate": 1.52850593494207e-06, "loss": 0.0032, "num_input_tokens_seen": 29161264, "step": 29040 }, { "epoch": 15.40031813361612, "grad_norm": 7.100613594055176, "learning_rate": 1.5268412873905848e-06, "loss": 0.0068, "num_input_tokens_seen": 29166704, "step": 29045 }, { "epoch": 15.402969247083774, "grad_norm": 0.8253811001777649, "learning_rate": 1.525177383433291e-06, "loss": 0.0028, "num_input_tokens_seen": 29173840, "step": 29050 }, { "epoch": 15.405620360551431, "grad_norm": 0.3296433985233307, "learning_rate": 1.5235142234264262e-06, "loss": 0.0131, "num_input_tokens_seen": 29180080, "step": 29055 }, { "epoch": 15.408271474019088, "grad_norm": 0.9161731600761414, "learning_rate": 1.5218518077260713e-06, "loss": 0.002, "num_input_tokens_seen": 29183888, "step": 29060 }, { "epoch": 15.410922587486745, "grad_norm": 0.29506367444992065, "learning_rate": 1.52019013668814e-06, "loss": 0.0003, "num_input_tokens_seen": 29188784, "step": 29065 }, { "epoch": 15.413573700954402, "grad_norm": 0.7172483205795288, "learning_rate": 1.5185292106683963e-06, "loss": 0.0042, "num_input_tokens_seen": 29194640, "step": 29070 }, { "epoch": 15.416224814422057, "grad_norm": 0.0952913835644722, "learning_rate": 1.5168690300224387e-06, "loss": 0.0352, "num_input_tokens_seen": 29199504, "step": 29075 }, { "epoch": 15.418875927889713, "grad_norm": 0.31424376368522644, "learning_rate": 1.5152095951057073e-06, "loss": 0.0011, "num_input_tokens_seen": 29204368, "step": 29080 }, { "epoch": 15.42152704135737, "grad_norm": 4.835017681121826, "learning_rate": 1.5135509062734826e-06, "loss": 0.0304, "num_input_tokens_seen": 29208560, "step": 29085 }, { "epoch": 15.424178154825027, "grad_norm": 0.010996108874678612, "learning_rate": 1.511892963880886e-06, "loss": 0.0281, "num_input_tokens_seen": 29213584, "step": 29090 }, { "epoch": 15.426829268292684, "grad_norm": 5.150406360626221, "learning_rate": 1.5102357682828767e-06, "loss": 0.0065, "num_input_tokens_seen": 29217776, "step": 29095 }, { "epoch": 15.429480381760339, "grad_norm": 18.751922607421875, "learning_rate": 1.5085793198342608e-06, "loss": 0.0056, "num_input_tokens_seen": 29222320, "step": 29100 }, { "epoch": 15.432131495227996, "grad_norm": 0.5690585970878601, "learning_rate": 1.506923618889673e-06, "loss": 0.0202, "num_input_tokens_seen": 29226608, "step": 29105 }, { "epoch": 15.434782608695652, "grad_norm": 0.20639631152153015, "learning_rate": 1.5052686658035998e-06, "loss": 0.0008, "num_input_tokens_seen": 29230864, "step": 29110 }, { "epoch": 15.43743372216331, "grad_norm": 19.972524642944336, "learning_rate": 1.5036144609303605e-06, "loss": 0.0076, "num_input_tokens_seen": 29235056, "step": 29115 }, { "epoch": 15.440084835630966, "grad_norm": 0.690260112285614, "learning_rate": 1.5019610046241157e-06, "loss": 0.0396, "num_input_tokens_seen": 29240624, "step": 29120 }, { "epoch": 15.442735949098621, "grad_norm": 0.013255076482892036, "learning_rate": 1.5003082972388667e-06, "loss": 0.0022, "num_input_tokens_seen": 29246352, "step": 29125 }, { "epoch": 15.445387062566278, "grad_norm": 0.1930965930223465, "learning_rate": 1.498656339128453e-06, "loss": 0.0016, "num_input_tokens_seen": 29251792, "step": 29130 }, { "epoch": 15.448038176033934, "grad_norm": 0.07709412276744843, "learning_rate": 1.4970051306465555e-06, "loss": 0.0097, "num_input_tokens_seen": 29257040, "step": 29135 }, { "epoch": 15.450689289501591, "grad_norm": 0.5324794054031372, "learning_rate": 1.4953546721466915e-06, "loss": 0.0006, "num_input_tokens_seen": 29260880, "step": 29140 }, { "epoch": 15.453340402969246, "grad_norm": 3.2790026664733887, "learning_rate": 1.4937049639822244e-06, "loss": 0.0034, "num_input_tokens_seen": 29265968, "step": 29145 }, { "epoch": 15.455991516436903, "grad_norm": 0.43442580103874207, "learning_rate": 1.492056006506346e-06, "loss": 0.0004, "num_input_tokens_seen": 29270320, "step": 29150 }, { "epoch": 15.45864262990456, "grad_norm": 0.8632968664169312, "learning_rate": 1.4904078000720995e-06, "loss": 0.0078, "num_input_tokens_seen": 29274992, "step": 29155 }, { "epoch": 15.461293743372217, "grad_norm": 1.227769136428833, "learning_rate": 1.488760345032359e-06, "loss": 0.0025, "num_input_tokens_seen": 29279728, "step": 29160 }, { "epoch": 15.463944856839873, "grad_norm": 0.8867275714874268, "learning_rate": 1.4871136417398407e-06, "loss": 0.0008, "num_input_tokens_seen": 29284816, "step": 29165 }, { "epoch": 15.466595970307528, "grad_norm": 2.7270607948303223, "learning_rate": 1.4854676905470993e-06, "loss": 0.0013, "num_input_tokens_seen": 29291088, "step": 29170 }, { "epoch": 15.469247083775185, "grad_norm": 3.9665920734405518, "learning_rate": 1.4838224918065291e-06, "loss": 0.0046, "num_input_tokens_seen": 29296496, "step": 29175 }, { "epoch": 15.471898197242842, "grad_norm": 0.7104726433753967, "learning_rate": 1.4821780458703605e-06, "loss": 0.0016, "num_input_tokens_seen": 29302032, "step": 29180 }, { "epoch": 15.474549310710499, "grad_norm": 2.740795612335205, "learning_rate": 1.4805343530906703e-06, "loss": 0.0049, "num_input_tokens_seen": 29307728, "step": 29185 }, { "epoch": 15.477200424178156, "grad_norm": 0.1229557991027832, "learning_rate": 1.4788914138193627e-06, "loss": 0.0011, "num_input_tokens_seen": 29312176, "step": 29190 }, { "epoch": 15.47985153764581, "grad_norm": 8.613935470581055, "learning_rate": 1.4772492284081908e-06, "loss": 0.0029, "num_input_tokens_seen": 29317168, "step": 29195 }, { "epoch": 15.482502651113467, "grad_norm": 0.05051908269524574, "learning_rate": 1.4756077972087412e-06, "loss": 0.0078, "num_input_tokens_seen": 29321968, "step": 29200 }, { "epoch": 15.485153764581124, "grad_norm": 1.0388548374176025, "learning_rate": 1.4739671205724398e-06, "loss": 0.0035, "num_input_tokens_seen": 29326896, "step": 29205 }, { "epoch": 15.487804878048781, "grad_norm": 0.32032740116119385, "learning_rate": 1.4723271988505512e-06, "loss": 0.0012, "num_input_tokens_seen": 29331600, "step": 29210 }, { "epoch": 15.490455991516438, "grad_norm": 0.047540124505758286, "learning_rate": 1.470688032394178e-06, "loss": 0.0029, "num_input_tokens_seen": 29337840, "step": 29215 }, { "epoch": 15.493107104984093, "grad_norm": 5.644817352294922, "learning_rate": 1.4690496215542604e-06, "loss": 0.008, "num_input_tokens_seen": 29342448, "step": 29220 }, { "epoch": 15.49575821845175, "grad_norm": 0.42730486392974854, "learning_rate": 1.4674119666815828e-06, "loss": 0.0641, "num_input_tokens_seen": 29347664, "step": 29225 }, { "epoch": 15.498409331919406, "grad_norm": 38.29481887817383, "learning_rate": 1.4657750681267557e-06, "loss": 0.0634, "num_input_tokens_seen": 29352208, "step": 29230 }, { "epoch": 15.501060445387063, "grad_norm": 123.3198013305664, "learning_rate": 1.4641389262402406e-06, "loss": 0.1248, "num_input_tokens_seen": 29356368, "step": 29235 }, { "epoch": 15.503711558854718, "grad_norm": 0.6673060655593872, "learning_rate": 1.4625035413723282e-06, "loss": 0.0675, "num_input_tokens_seen": 29360208, "step": 29240 }, { "epoch": 15.506362672322375, "grad_norm": 2.3438925743103027, "learning_rate": 1.4608689138731513e-06, "loss": 0.0113, "num_input_tokens_seen": 29364112, "step": 29245 }, { "epoch": 15.509013785790032, "grad_norm": 15.625889778137207, "learning_rate": 1.459235044092679e-06, "loss": 0.0138, "num_input_tokens_seen": 29369424, "step": 29250 }, { "epoch": 15.511664899257688, "grad_norm": 0.093619205057621, "learning_rate": 1.4576019323807179e-06, "loss": 0.0151, "num_input_tokens_seen": 29375344, "step": 29255 }, { "epoch": 15.514316012725345, "grad_norm": 0.13182079792022705, "learning_rate": 1.4559695790869128e-06, "loss": 0.0014, "num_input_tokens_seen": 29379888, "step": 29260 }, { "epoch": 15.516967126193002, "grad_norm": 107.62311553955078, "learning_rate": 1.4543379845607497e-06, "loss": 0.0722, "num_input_tokens_seen": 29385456, "step": 29265 }, { "epoch": 15.519618239660657, "grad_norm": 26.61825942993164, "learning_rate": 1.4527071491515442e-06, "loss": 0.0118, "num_input_tokens_seen": 29390256, "step": 29270 }, { "epoch": 15.522269353128314, "grad_norm": 8.110738754272461, "learning_rate": 1.451077073208455e-06, "loss": 0.0028, "num_input_tokens_seen": 29396112, "step": 29275 }, { "epoch": 15.52492046659597, "grad_norm": 0.7874994277954102, "learning_rate": 1.4494477570804804e-06, "loss": 0.002, "num_input_tokens_seen": 29400368, "step": 29280 }, { "epoch": 15.527571580063627, "grad_norm": 2.91357421875, "learning_rate": 1.4478192011164471e-06, "loss": 0.0016, "num_input_tokens_seen": 29406512, "step": 29285 }, { "epoch": 15.530222693531282, "grad_norm": 0.032885126769542694, "learning_rate": 1.4461914056650296e-06, "loss": 0.0991, "num_input_tokens_seen": 29410448, "step": 29290 }, { "epoch": 15.53287380699894, "grad_norm": 4.044046878814697, "learning_rate": 1.444564371074732e-06, "loss": 0.0053, "num_input_tokens_seen": 29414960, "step": 29295 }, { "epoch": 15.535524920466596, "grad_norm": 0.21311798691749573, "learning_rate": 1.4429380976938996e-06, "loss": 0.0006, "num_input_tokens_seen": 29418864, "step": 29300 }, { "epoch": 15.538176033934253, "grad_norm": 1.7821725606918335, "learning_rate": 1.4413125858707123e-06, "loss": 0.0291, "num_input_tokens_seen": 29422896, "step": 29305 }, { "epoch": 15.54082714740191, "grad_norm": 65.60347747802734, "learning_rate": 1.4396878359531886e-06, "loss": 0.0521, "num_input_tokens_seen": 29428400, "step": 29310 }, { "epoch": 15.543478260869565, "grad_norm": 0.10644322633743286, "learning_rate": 1.4380638482891806e-06, "loss": 0.0003, "num_input_tokens_seen": 29432240, "step": 29315 }, { "epoch": 15.546129374337221, "grad_norm": 0.5545052886009216, "learning_rate": 1.4364406232263856e-06, "loss": 0.0635, "num_input_tokens_seen": 29436976, "step": 29320 }, { "epoch": 15.548780487804878, "grad_norm": 0.19801630079746246, "learning_rate": 1.4348181611123252e-06, "loss": 0.0101, "num_input_tokens_seen": 29442608, "step": 29325 }, { "epoch": 15.551431601272535, "grad_norm": 4.323635101318359, "learning_rate": 1.433196462294369e-06, "loss": 0.0601, "num_input_tokens_seen": 29447312, "step": 29330 }, { "epoch": 15.554082714740192, "grad_norm": 0.16353179514408112, "learning_rate": 1.4315755271197163e-06, "loss": 0.0018, "num_input_tokens_seen": 29451088, "step": 29335 }, { "epoch": 15.556733828207847, "grad_norm": 0.18782208859920502, "learning_rate": 1.4299553559354062e-06, "loss": 0.0022, "num_input_tokens_seen": 29456368, "step": 29340 }, { "epoch": 15.559384941675503, "grad_norm": 56.25613784790039, "learning_rate": 1.4283359490883119e-06, "loss": 0.0195, "num_input_tokens_seen": 29460656, "step": 29345 }, { "epoch": 15.56203605514316, "grad_norm": 0.03885422274470329, "learning_rate": 1.4267173069251456e-06, "loss": 0.003, "num_input_tokens_seen": 29466000, "step": 29350 }, { "epoch": 15.564687168610817, "grad_norm": 62.0434455871582, "learning_rate": 1.4250994297924515e-06, "loss": 0.047, "num_input_tokens_seen": 29472208, "step": 29355 }, { "epoch": 15.567338282078474, "grad_norm": 0.044192805886268616, "learning_rate": 1.4234823180366181e-06, "loss": 0.0201, "num_input_tokens_seen": 29477968, "step": 29360 }, { "epoch": 15.569989395546129, "grad_norm": 0.7015625238418579, "learning_rate": 1.4218659720038586e-06, "loss": 0.0153, "num_input_tokens_seen": 29484720, "step": 29365 }, { "epoch": 15.572640509013786, "grad_norm": 0.23115314543247223, "learning_rate": 1.4202503920402328e-06, "loss": 0.0282, "num_input_tokens_seen": 29489744, "step": 29370 }, { "epoch": 15.575291622481442, "grad_norm": 1.0913194417953491, "learning_rate": 1.4186355784916306e-06, "loss": 0.0018, "num_input_tokens_seen": 29494736, "step": 29375 }, { "epoch": 15.5779427359491, "grad_norm": 1.1879230737686157, "learning_rate": 1.41702153170378e-06, "loss": 0.1495, "num_input_tokens_seen": 29499280, "step": 29380 }, { "epoch": 15.580593849416754, "grad_norm": 0.5191084742546082, "learning_rate": 1.415408252022244e-06, "loss": 0.0009, "num_input_tokens_seen": 29505552, "step": 29385 }, { "epoch": 15.583244962884411, "grad_norm": 0.2292877733707428, "learning_rate": 1.4137957397924207e-06, "loss": 0.0131, "num_input_tokens_seen": 29509584, "step": 29390 }, { "epoch": 15.585896076352068, "grad_norm": 0.6896409392356873, "learning_rate": 1.412183995359544e-06, "loss": 0.0012, "num_input_tokens_seen": 29514128, "step": 29395 }, { "epoch": 15.588547189819725, "grad_norm": 0.532778263092041, "learning_rate": 1.410573019068689e-06, "loss": 0.0007, "num_input_tokens_seen": 29518992, "step": 29400 }, { "epoch": 15.591198303287381, "grad_norm": 53.94965744018555, "learning_rate": 1.4089628112647557e-06, "loss": 0.1072, "num_input_tokens_seen": 29523536, "step": 29405 }, { "epoch": 15.593849416755036, "grad_norm": 0.032630909234285355, "learning_rate": 1.407353372292487e-06, "loss": 0.0032, "num_input_tokens_seen": 29527888, "step": 29410 }, { "epoch": 15.596500530222693, "grad_norm": 0.023989573121070862, "learning_rate": 1.4057447024964626e-06, "loss": 0.0034, "num_input_tokens_seen": 29532592, "step": 29415 }, { "epoch": 15.59915164369035, "grad_norm": 0.12388881295919418, "learning_rate": 1.40413680222109e-06, "loss": 0.0019, "num_input_tokens_seen": 29537328, "step": 29420 }, { "epoch": 15.601802757158007, "grad_norm": 0.6986292004585266, "learning_rate": 1.4025296718106197e-06, "loss": 0.0038, "num_input_tokens_seen": 29541392, "step": 29425 }, { "epoch": 15.604453870625663, "grad_norm": 33.79585266113281, "learning_rate": 1.4009233116091325e-06, "loss": 0.0123, "num_input_tokens_seen": 29546736, "step": 29430 }, { "epoch": 15.607104984093318, "grad_norm": 0.16639453172683716, "learning_rate": 1.399317721960547e-06, "loss": 0.0198, "num_input_tokens_seen": 29553104, "step": 29435 }, { "epoch": 15.609756097560975, "grad_norm": 24.190221786499023, "learning_rate": 1.397712903208615e-06, "loss": 0.0288, "num_input_tokens_seen": 29559024, "step": 29440 }, { "epoch": 15.612407211028632, "grad_norm": 0.26386553049087524, "learning_rate": 1.3961088556969244e-06, "loss": 0.0055, "num_input_tokens_seen": 29564688, "step": 29445 }, { "epoch": 15.615058324496289, "grad_norm": 0.6627315282821655, "learning_rate": 1.394505579768896e-06, "loss": 0.0018, "num_input_tokens_seen": 29569488, "step": 29450 }, { "epoch": 15.617709437963946, "grad_norm": 0.03319967910647392, "learning_rate": 1.3929030757677908e-06, "loss": 0.0283, "num_input_tokens_seen": 29575248, "step": 29455 }, { "epoch": 15.6203605514316, "grad_norm": 0.016449319198727608, "learning_rate": 1.3913013440366957e-06, "loss": 0.0006, "num_input_tokens_seen": 29579824, "step": 29460 }, { "epoch": 15.623011664899257, "grad_norm": 70.44356536865234, "learning_rate": 1.3897003849185415e-06, "loss": 0.1009, "num_input_tokens_seen": 29584720, "step": 29465 }, { "epoch": 15.625662778366914, "grad_norm": 0.2590693533420563, "learning_rate": 1.3881001987560872e-06, "loss": 0.0009, "num_input_tokens_seen": 29590672, "step": 29470 }, { "epoch": 15.628313891834571, "grad_norm": 33.60049057006836, "learning_rate": 1.3865007858919288e-06, "loss": 0.1448, "num_input_tokens_seen": 29595824, "step": 29475 }, { "epoch": 15.630965005302228, "grad_norm": 0.14811013638973236, "learning_rate": 1.384902146668497e-06, "loss": 0.0566, "num_input_tokens_seen": 29600016, "step": 29480 }, { "epoch": 15.633616118769883, "grad_norm": 0.2039264291524887, "learning_rate": 1.3833042814280563e-06, "loss": 0.0033, "num_input_tokens_seen": 29605104, "step": 29485 }, { "epoch": 15.63626723223754, "grad_norm": 1.0760341882705688, "learning_rate": 1.3817071905127028e-06, "loss": 0.0035, "num_input_tokens_seen": 29610064, "step": 29490 }, { "epoch": 15.638918345705196, "grad_norm": 113.06840515136719, "learning_rate": 1.3801108742643748e-06, "loss": 0.089, "num_input_tokens_seen": 29614992, "step": 29495 }, { "epoch": 15.641569459172853, "grad_norm": 31.688329696655273, "learning_rate": 1.378515333024833e-06, "loss": 0.0168, "num_input_tokens_seen": 29619184, "step": 29500 }, { "epoch": 15.64422057264051, "grad_norm": 3.032498836517334, "learning_rate": 1.376920567135684e-06, "loss": 0.0035, "num_input_tokens_seen": 29623632, "step": 29505 }, { "epoch": 15.646871686108165, "grad_norm": 2.719052314758301, "learning_rate": 1.3753265769383605e-06, "loss": 0.0136, "num_input_tokens_seen": 29628912, "step": 29510 }, { "epoch": 15.649522799575822, "grad_norm": 1.059084177017212, "learning_rate": 1.3737333627741328e-06, "loss": 0.0083, "num_input_tokens_seen": 29634640, "step": 29515 }, { "epoch": 15.652173913043478, "grad_norm": 0.01941816508769989, "learning_rate": 1.3721409249841023e-06, "loss": 0.0008, "num_input_tokens_seen": 29638896, "step": 29520 }, { "epoch": 15.654825026511135, "grad_norm": 2.911301612854004, "learning_rate": 1.3705492639092072e-06, "loss": 0.0054, "num_input_tokens_seen": 29643920, "step": 29525 }, { "epoch": 15.65747613997879, "grad_norm": 4.509973049163818, "learning_rate": 1.3689583798902156e-06, "loss": 0.0019, "num_input_tokens_seen": 29648784, "step": 29530 }, { "epoch": 15.660127253446447, "grad_norm": 5.394204139709473, "learning_rate": 1.3673682732677363e-06, "loss": 0.0152, "num_input_tokens_seen": 29653776, "step": 29535 }, { "epoch": 15.662778366914104, "grad_norm": 0.059974052011966705, "learning_rate": 1.3657789443822028e-06, "loss": 0.1316, "num_input_tokens_seen": 29658672, "step": 29540 }, { "epoch": 15.66542948038176, "grad_norm": 0.3312150537967682, "learning_rate": 1.364190393573886e-06, "loss": 0.0074, "num_input_tokens_seen": 29664144, "step": 29545 }, { "epoch": 15.668080593849417, "grad_norm": 0.22863200306892395, "learning_rate": 1.3626026211828947e-06, "loss": 0.0129, "num_input_tokens_seen": 29670160, "step": 29550 }, { "epoch": 15.670731707317072, "grad_norm": 7.254644393920898, "learning_rate": 1.3610156275491615e-06, "loss": 0.0229, "num_input_tokens_seen": 29674640, "step": 29555 }, { "epoch": 15.67338282078473, "grad_norm": 1.0304358005523682, "learning_rate": 1.3594294130124613e-06, "loss": 0.0011, "num_input_tokens_seen": 29679376, "step": 29560 }, { "epoch": 15.676033934252386, "grad_norm": 0.3847014605998993, "learning_rate": 1.3578439779123975e-06, "loss": 0.003, "num_input_tokens_seen": 29687088, "step": 29565 }, { "epoch": 15.678685047720043, "grad_norm": 0.2292415350675583, "learning_rate": 1.3562593225884074e-06, "loss": 0.0007, "num_input_tokens_seen": 29692240, "step": 29570 }, { "epoch": 15.6813361611877, "grad_norm": 0.12131963670253754, "learning_rate": 1.3546754473797618e-06, "loss": 0.0045, "num_input_tokens_seen": 29697200, "step": 29575 }, { "epoch": 15.683987274655355, "grad_norm": 0.7194454073905945, "learning_rate": 1.3530923526255641e-06, "loss": 0.004, "num_input_tokens_seen": 29701680, "step": 29580 }, { "epoch": 15.686638388123011, "grad_norm": 0.11738407611846924, "learning_rate": 1.351510038664749e-06, "loss": 0.0498, "num_input_tokens_seen": 29706384, "step": 29585 }, { "epoch": 15.689289501590668, "grad_norm": 0.09116144478321075, "learning_rate": 1.3499285058360905e-06, "loss": 0.0052, "num_input_tokens_seen": 29711440, "step": 29590 }, { "epoch": 15.691940615058325, "grad_norm": 6.489476203918457, "learning_rate": 1.3483477544781847e-06, "loss": 0.0042, "num_input_tokens_seen": 29715952, "step": 29595 }, { "epoch": 15.694591728525982, "grad_norm": 0.017500540241599083, "learning_rate": 1.3467677849294708e-06, "loss": 0.0029, "num_input_tokens_seen": 29720336, "step": 29600 }, { "epoch": 15.697242841993637, "grad_norm": 19.030115127563477, "learning_rate": 1.3451885975282147e-06, "loss": 0.0171, "num_input_tokens_seen": 29725584, "step": 29605 }, { "epoch": 15.699893955461294, "grad_norm": 6.345243453979492, "learning_rate": 1.3436101926125156e-06, "loss": 0.0025, "num_input_tokens_seen": 29729552, "step": 29610 }, { "epoch": 15.70254506892895, "grad_norm": 5.173274517059326, "learning_rate": 1.342032570520307e-06, "loss": 0.0036, "num_input_tokens_seen": 29733104, "step": 29615 }, { "epoch": 15.705196182396607, "grad_norm": 0.34835416078567505, "learning_rate": 1.340455731589353e-06, "loss": 0.0015, "num_input_tokens_seen": 29738352, "step": 29620 }, { "epoch": 15.707847295864262, "grad_norm": 0.7803381085395813, "learning_rate": 1.3388796761572493e-06, "loss": 0.0012, "num_input_tokens_seen": 29743760, "step": 29625 }, { "epoch": 15.710498409331919, "grad_norm": 2.1003904342651367, "learning_rate": 1.3373044045614297e-06, "loss": 0.0174, "num_input_tokens_seen": 29749392, "step": 29630 }, { "epoch": 15.713149522799576, "grad_norm": 14.950600624084473, "learning_rate": 1.3357299171391503e-06, "loss": 0.0079, "num_input_tokens_seen": 29754576, "step": 29635 }, { "epoch": 15.715800636267232, "grad_norm": 121.66494750976562, "learning_rate": 1.3341562142275078e-06, "loss": 0.1207, "num_input_tokens_seen": 29761008, "step": 29640 }, { "epoch": 15.71845174973489, "grad_norm": 0.3901696801185608, "learning_rate": 1.3325832961634277e-06, "loss": 0.0912, "num_input_tokens_seen": 29766544, "step": 29645 }, { "epoch": 15.721102863202544, "grad_norm": 1.4886902570724487, "learning_rate": 1.3310111632836675e-06, "loss": 0.0593, "num_input_tokens_seen": 29771024, "step": 29650 }, { "epoch": 15.723753976670201, "grad_norm": 0.9612028002738953, "learning_rate": 1.3294398159248162e-06, "loss": 0.0684, "num_input_tokens_seen": 29776176, "step": 29655 }, { "epoch": 15.726405090137858, "grad_norm": 2.034397840499878, "learning_rate": 1.3278692544232953e-06, "loss": 0.0064, "num_input_tokens_seen": 29781392, "step": 29660 }, { "epoch": 15.729056203605515, "grad_norm": 1.373153567314148, "learning_rate": 1.3262994791153566e-06, "loss": 0.0113, "num_input_tokens_seen": 29785936, "step": 29665 }, { "epoch": 15.731707317073171, "grad_norm": 0.050845514982938766, "learning_rate": 1.3247304903370895e-06, "loss": 0.0111, "num_input_tokens_seen": 29790928, "step": 29670 }, { "epoch": 15.734358430540826, "grad_norm": 7.626673698425293, "learning_rate": 1.3231622884244045e-06, "loss": 0.0085, "num_input_tokens_seen": 29795824, "step": 29675 }, { "epoch": 15.737009544008483, "grad_norm": 0.8578786849975586, "learning_rate": 1.321594873713053e-06, "loss": 0.0092, "num_input_tokens_seen": 29800944, "step": 29680 }, { "epoch": 15.73966065747614, "grad_norm": 0.013427778147161007, "learning_rate": 1.3200282465386156e-06, "loss": 0.0012, "num_input_tokens_seen": 29807120, "step": 29685 }, { "epoch": 15.742311770943797, "grad_norm": 0.8957647681236267, "learning_rate": 1.3184624072364976e-06, "loss": 0.0047, "num_input_tokens_seen": 29811952, "step": 29690 }, { "epoch": 15.744962884411454, "grad_norm": 1.985921859741211, "learning_rate": 1.3168973561419468e-06, "loss": 0.0042, "num_input_tokens_seen": 29816624, "step": 29695 }, { "epoch": 15.747613997879109, "grad_norm": 1.1669491529464722, "learning_rate": 1.3153330935900338e-06, "loss": 0.0364, "num_input_tokens_seen": 29821456, "step": 29700 }, { "epoch": 15.750265111346765, "grad_norm": 0.028297511860728264, "learning_rate": 1.313769619915664e-06, "loss": 0.0011, "num_input_tokens_seen": 29827504, "step": 29705 }, { "epoch": 15.752916224814422, "grad_norm": 0.024477330967783928, "learning_rate": 1.312206935453571e-06, "loss": 0.0015, "num_input_tokens_seen": 29832784, "step": 29710 }, { "epoch": 15.755567338282079, "grad_norm": 0.09741334617137909, "learning_rate": 1.3106450405383264e-06, "loss": 0.0055, "num_input_tokens_seen": 29836880, "step": 29715 }, { "epoch": 15.758218451749736, "grad_norm": 8.34615421295166, "learning_rate": 1.309083935504321e-06, "loss": 0.0065, "num_input_tokens_seen": 29843536, "step": 29720 }, { "epoch": 15.76086956521739, "grad_norm": 65.18794250488281, "learning_rate": 1.3075236206857906e-06, "loss": 0.0606, "num_input_tokens_seen": 29847856, "step": 29725 }, { "epoch": 15.763520678685047, "grad_norm": 0.03823885694146156, "learning_rate": 1.3059640964167869e-06, "loss": 0.0073, "num_input_tokens_seen": 29852528, "step": 29730 }, { "epoch": 15.766171792152704, "grad_norm": 2.6487741470336914, "learning_rate": 1.304405363031206e-06, "loss": 0.0053, "num_input_tokens_seen": 29856624, "step": 29735 }, { "epoch": 15.768822905620361, "grad_norm": 0.024315914139151573, "learning_rate": 1.302847420862766e-06, "loss": 0.0024, "num_input_tokens_seen": 29861456, "step": 29740 }, { "epoch": 15.771474019088018, "grad_norm": 63.92637252807617, "learning_rate": 1.301290270245019e-06, "loss": 0.0199, "num_input_tokens_seen": 29866672, "step": 29745 }, { "epoch": 15.774125132555673, "grad_norm": 0.14313849806785583, "learning_rate": 1.2997339115113454e-06, "loss": 0.0009, "num_input_tokens_seen": 29872144, "step": 29750 }, { "epoch": 15.77677624602333, "grad_norm": 0.29818183183670044, "learning_rate": 1.2981783449949615e-06, "loss": 0.0695, "num_input_tokens_seen": 29876656, "step": 29755 }, { "epoch": 15.779427359490986, "grad_norm": 0.014193445444107056, "learning_rate": 1.2966235710289044e-06, "loss": 0.0131, "num_input_tokens_seen": 29882288, "step": 29760 }, { "epoch": 15.782078472958643, "grad_norm": 0.21276827156543732, "learning_rate": 1.2950695899460509e-06, "loss": 0.0386, "num_input_tokens_seen": 29886608, "step": 29765 }, { "epoch": 15.784729586426298, "grad_norm": 1.3590071201324463, "learning_rate": 1.2935164020791035e-06, "loss": 0.1353, "num_input_tokens_seen": 29891472, "step": 29770 }, { "epoch": 15.787380699893955, "grad_norm": 0.042617280036211014, "learning_rate": 1.291964007760596e-06, "loss": 0.0075, "num_input_tokens_seen": 29896784, "step": 29775 }, { "epoch": 15.790031813361612, "grad_norm": 2.3907482624053955, "learning_rate": 1.2904124073228913e-06, "loss": 0.0025, "num_input_tokens_seen": 29900720, "step": 29780 }, { "epoch": 15.792682926829269, "grad_norm": 0.07791595160961151, "learning_rate": 1.288861601098183e-06, "loss": 0.0163, "num_input_tokens_seen": 29905040, "step": 29785 }, { "epoch": 15.795334040296925, "grad_norm": 91.24366760253906, "learning_rate": 1.2873115894184935e-06, "loss": 0.2594, "num_input_tokens_seen": 29910864, "step": 29790 }, { "epoch": 15.79798515376458, "grad_norm": 0.411293625831604, "learning_rate": 1.2857623726156804e-06, "loss": 0.0013, "num_input_tokens_seen": 29916912, "step": 29795 }, { "epoch": 15.800636267232237, "grad_norm": 0.12451436370611191, "learning_rate": 1.2842139510214213e-06, "loss": 0.002, "num_input_tokens_seen": 29921744, "step": 29800 }, { "epoch": 15.803287380699894, "grad_norm": 0.43774253129959106, "learning_rate": 1.282666324967234e-06, "loss": 0.0059, "num_input_tokens_seen": 29928080, "step": 29805 }, { "epoch": 15.80593849416755, "grad_norm": 13.000168800354004, "learning_rate": 1.2811194947844585e-06, "loss": 0.1245, "num_input_tokens_seen": 29934000, "step": 29810 }, { "epoch": 15.808589607635207, "grad_norm": 9.866948127746582, "learning_rate": 1.2795734608042682e-06, "loss": 0.2355, "num_input_tokens_seen": 29938384, "step": 29815 }, { "epoch": 15.811240721102862, "grad_norm": 0.1044887825846672, "learning_rate": 1.278028223357664e-06, "loss": 0.0112, "num_input_tokens_seen": 29943408, "step": 29820 }, { "epoch": 15.81389183457052, "grad_norm": 0.8700383305549622, "learning_rate": 1.2764837827754784e-06, "loss": 0.0094, "num_input_tokens_seen": 29948304, "step": 29825 }, { "epoch": 15.816542948038176, "grad_norm": 0.5350497364997864, "learning_rate": 1.2749401393883715e-06, "loss": 0.0014, "num_input_tokens_seen": 29953104, "step": 29830 }, { "epoch": 15.819194061505833, "grad_norm": 0.5156589150428772, "learning_rate": 1.2733972935268329e-06, "loss": 0.0085, "num_input_tokens_seen": 29957552, "step": 29835 }, { "epoch": 15.82184517497349, "grad_norm": 19.50240135192871, "learning_rate": 1.2718552455211824e-06, "loss": 0.0149, "num_input_tokens_seen": 29963536, "step": 29840 }, { "epoch": 15.824496288441145, "grad_norm": 0.5253692865371704, "learning_rate": 1.270313995701567e-06, "loss": 0.0031, "num_input_tokens_seen": 29969104, "step": 29845 }, { "epoch": 15.827147401908801, "grad_norm": 0.04310065135359764, "learning_rate": 1.2687735443979683e-06, "loss": 0.0006, "num_input_tokens_seen": 29973776, "step": 29850 }, { "epoch": 15.829798515376458, "grad_norm": 0.8165587782859802, "learning_rate": 1.2672338919401866e-06, "loss": 0.0029, "num_input_tokens_seen": 29978704, "step": 29855 }, { "epoch": 15.832449628844115, "grad_norm": 0.056344106793403625, "learning_rate": 1.2656950386578632e-06, "loss": 0.1571, "num_input_tokens_seen": 29983792, "step": 29860 }, { "epoch": 15.83510074231177, "grad_norm": 0.02555454708635807, "learning_rate": 1.2641569848804598e-06, "loss": 0.0009, "num_input_tokens_seen": 29988912, "step": 29865 }, { "epoch": 15.837751855779427, "grad_norm": 0.1293017566204071, "learning_rate": 1.2626197309372707e-06, "loss": 0.001, "num_input_tokens_seen": 29993776, "step": 29870 }, { "epoch": 15.840402969247084, "grad_norm": 109.23957824707031, "learning_rate": 1.2610832771574171e-06, "loss": 0.0948, "num_input_tokens_seen": 29999632, "step": 29875 }, { "epoch": 15.84305408271474, "grad_norm": 0.10003428161144257, "learning_rate": 1.2595476238698501e-06, "loss": 0.0493, "num_input_tokens_seen": 30005840, "step": 29880 }, { "epoch": 15.845705196182397, "grad_norm": 0.13513800501823425, "learning_rate": 1.258012771403348e-06, "loss": 0.0005, "num_input_tokens_seen": 30010640, "step": 29885 }, { "epoch": 15.848356309650054, "grad_norm": 1.2721468210220337, "learning_rate": 1.2564787200865226e-06, "loss": 0.0013, "num_input_tokens_seen": 30015344, "step": 29890 }, { "epoch": 15.851007423117709, "grad_norm": 0.28171783685684204, "learning_rate": 1.2549454702478038e-06, "loss": 0.0052, "num_input_tokens_seen": 30020016, "step": 29895 }, { "epoch": 15.853658536585366, "grad_norm": 74.71492004394531, "learning_rate": 1.2534130222154623e-06, "loss": 0.1167, "num_input_tokens_seen": 30024688, "step": 29900 }, { "epoch": 15.856309650053023, "grad_norm": 64.81971740722656, "learning_rate": 1.2518813763175885e-06, "loss": 0.0253, "num_input_tokens_seen": 30029552, "step": 29905 }, { "epoch": 15.85896076352068, "grad_norm": 2.6986489295959473, "learning_rate": 1.2503505328821042e-06, "loss": 0.1446, "num_input_tokens_seen": 30034160, "step": 29910 }, { "epoch": 15.861611876988334, "grad_norm": 1.2779605388641357, "learning_rate": 1.248820492236759e-06, "loss": 0.0049, "num_input_tokens_seen": 30038000, "step": 29915 }, { "epoch": 15.864262990455991, "grad_norm": 0.7439101338386536, "learning_rate": 1.2472912547091304e-06, "loss": 0.0061, "num_input_tokens_seen": 30044080, "step": 29920 }, { "epoch": 15.866914103923648, "grad_norm": 0.21600250899791718, "learning_rate": 1.2457628206266231e-06, "loss": 0.0134, "num_input_tokens_seen": 30049328, "step": 29925 }, { "epoch": 15.869565217391305, "grad_norm": 4.119946479797363, "learning_rate": 1.244235190316475e-06, "loss": 0.0023, "num_input_tokens_seen": 30054032, "step": 29930 }, { "epoch": 15.872216330858961, "grad_norm": 49.81760025024414, "learning_rate": 1.242708364105742e-06, "loss": 0.0218, "num_input_tokens_seen": 30058672, "step": 29935 }, { "epoch": 15.874867444326616, "grad_norm": 80.24560546875, "learning_rate": 1.2411823423213177e-06, "loss": 0.0367, "num_input_tokens_seen": 30062864, "step": 29940 }, { "epoch": 15.877518557794273, "grad_norm": 1.8045412302017212, "learning_rate": 1.2396571252899182e-06, "loss": 0.0009, "num_input_tokens_seen": 30068496, "step": 29945 }, { "epoch": 15.88016967126193, "grad_norm": 32.100608825683594, "learning_rate": 1.2381327133380877e-06, "loss": 0.0169, "num_input_tokens_seen": 30073392, "step": 29950 }, { "epoch": 15.882820784729587, "grad_norm": 4.538843631744385, "learning_rate": 1.2366091067922004e-06, "loss": 0.048, "num_input_tokens_seen": 30077808, "step": 29955 }, { "epoch": 15.885471898197244, "grad_norm": 0.6361729502677917, "learning_rate": 1.2350863059784546e-06, "loss": 0.0004, "num_input_tokens_seen": 30082576, "step": 29960 }, { "epoch": 15.888123011664899, "grad_norm": 3.7611522674560547, "learning_rate": 1.2335643112228795e-06, "loss": 0.1301, "num_input_tokens_seen": 30087536, "step": 29965 }, { "epoch": 15.890774125132555, "grad_norm": 0.40968120098114014, "learning_rate": 1.2320431228513297e-06, "loss": 0.0024, "num_input_tokens_seen": 30091952, "step": 29970 }, { "epoch": 15.893425238600212, "grad_norm": 0.8269537091255188, "learning_rate": 1.2305227411894871e-06, "loss": 0.0016, "num_input_tokens_seen": 30096976, "step": 29975 }, { "epoch": 15.896076352067869, "grad_norm": 0.0337534099817276, "learning_rate": 1.229003166562861e-06, "loss": 0.0014, "num_input_tokens_seen": 30101488, "step": 29980 }, { "epoch": 15.898727465535526, "grad_norm": 91.29557037353516, "learning_rate": 1.2274843992967916e-06, "loss": 0.0484, "num_input_tokens_seen": 30105232, "step": 29985 }, { "epoch": 15.90137857900318, "grad_norm": 0.04459104314446449, "learning_rate": 1.2259664397164377e-06, "loss": 0.088, "num_input_tokens_seen": 30109744, "step": 29990 }, { "epoch": 15.904029692470838, "grad_norm": 41.9770622253418, "learning_rate": 1.2244492881467956e-06, "loss": 0.1561, "num_input_tokens_seen": 30115472, "step": 29995 }, { "epoch": 15.906680805938494, "grad_norm": 0.07054290175437927, "learning_rate": 1.2229329449126808e-06, "loss": 0.0178, "num_input_tokens_seen": 30121328, "step": 30000 }, { "epoch": 15.909331919406151, "grad_norm": 2.532837390899658, "learning_rate": 1.2214174103387388e-06, "loss": 0.2141, "num_input_tokens_seen": 30126288, "step": 30005 }, { "epoch": 15.911983032873806, "grad_norm": 0.15202660858631134, "learning_rate": 1.2199026847494416e-06, "loss": 0.0213, "num_input_tokens_seen": 30131888, "step": 30010 }, { "epoch": 15.914634146341463, "grad_norm": 0.029829660430550575, "learning_rate": 1.218388768469088e-06, "loss": 0.0083, "num_input_tokens_seen": 30137136, "step": 30015 }, { "epoch": 15.91728525980912, "grad_norm": 0.030567998066544533, "learning_rate": 1.216875661821803e-06, "loss": 0.0392, "num_input_tokens_seen": 30141808, "step": 30020 }, { "epoch": 15.919936373276776, "grad_norm": 44.409461975097656, "learning_rate": 1.2153633651315417e-06, "loss": 0.016, "num_input_tokens_seen": 30146512, "step": 30025 }, { "epoch": 15.922587486744433, "grad_norm": 1.9395663738250732, "learning_rate": 1.2138518787220777e-06, "loss": 0.0021, "num_input_tokens_seen": 30151920, "step": 30030 }, { "epoch": 15.925238600212088, "grad_norm": 15.652193069458008, "learning_rate": 1.21234120291702e-06, "loss": 0.0079, "num_input_tokens_seen": 30156208, "step": 30035 }, { "epoch": 15.927889713679745, "grad_norm": 0.030186563730239868, "learning_rate": 1.2108313380398e-06, "loss": 0.0465, "num_input_tokens_seen": 30161200, "step": 30040 }, { "epoch": 15.930540827147402, "grad_norm": 0.273657888174057, "learning_rate": 1.2093222844136754e-06, "loss": 0.0017, "num_input_tokens_seen": 30166672, "step": 30045 }, { "epoch": 15.933191940615059, "grad_norm": 0.5205932855606079, "learning_rate": 1.20781404236173e-06, "loss": 0.0021, "num_input_tokens_seen": 30170864, "step": 30050 }, { "epoch": 15.935843054082715, "grad_norm": 0.02371625043451786, "learning_rate": 1.2063066122068756e-06, "loss": 0.0016, "num_input_tokens_seen": 30176656, "step": 30055 }, { "epoch": 15.93849416755037, "grad_norm": 0.12131263315677643, "learning_rate": 1.2047999942718464e-06, "loss": 0.0008, "num_input_tokens_seen": 30182544, "step": 30060 }, { "epoch": 15.941145281018027, "grad_norm": 4.1571526527404785, "learning_rate": 1.2032941888792105e-06, "loss": 0.0221, "num_input_tokens_seen": 30187824, "step": 30065 }, { "epoch": 15.943796394485684, "grad_norm": 3.4029293060302734, "learning_rate": 1.2017891963513512e-06, "loss": 0.0342, "num_input_tokens_seen": 30192624, "step": 30070 }, { "epoch": 15.94644750795334, "grad_norm": 0.09101928770542145, "learning_rate": 1.2002850170104875e-06, "loss": 0.0216, "num_input_tokens_seen": 30198256, "step": 30075 }, { "epoch": 15.949098621420998, "grad_norm": 1.319667935371399, "learning_rate": 1.1987816511786582e-06, "loss": 0.0007, "num_input_tokens_seen": 30203600, "step": 30080 }, { "epoch": 15.951749734888653, "grad_norm": 1.2734936475753784, "learning_rate": 1.197279099177731e-06, "loss": 0.0069, "num_input_tokens_seen": 30208368, "step": 30085 }, { "epoch": 15.95440084835631, "grad_norm": 0.4728398621082306, "learning_rate": 1.1957773613293978e-06, "loss": 0.0097, "num_input_tokens_seen": 30212560, "step": 30090 }, { "epoch": 15.957051961823966, "grad_norm": 0.08529782295227051, "learning_rate": 1.194276437955177e-06, "loss": 0.1272, "num_input_tokens_seen": 30217360, "step": 30095 }, { "epoch": 15.959703075291623, "grad_norm": 0.9902392625808716, "learning_rate": 1.1927763293764122e-06, "loss": 0.0036, "num_input_tokens_seen": 30222512, "step": 30100 }, { "epoch": 15.96235418875928, "grad_norm": 0.06545842438936234, "learning_rate": 1.1912770359142727e-06, "loss": 0.0131, "num_input_tokens_seen": 30227920, "step": 30105 }, { "epoch": 15.965005302226935, "grad_norm": 6.950829029083252, "learning_rate": 1.1897785578897535e-06, "loss": 0.033, "num_input_tokens_seen": 30232304, "step": 30110 }, { "epoch": 15.967656415694591, "grad_norm": 0.16525617241859436, "learning_rate": 1.1882808956236735e-06, "loss": 0.0039, "num_input_tokens_seen": 30236944, "step": 30115 }, { "epoch": 15.970307529162248, "grad_norm": 0.11147747933864594, "learning_rate": 1.1867840494366818e-06, "loss": 0.0042, "num_input_tokens_seen": 30242832, "step": 30120 }, { "epoch": 15.972958642629905, "grad_norm": 0.9616537690162659, "learning_rate": 1.1852880196492444e-06, "loss": 0.0032, "num_input_tokens_seen": 30247728, "step": 30125 }, { "epoch": 15.975609756097562, "grad_norm": 0.2508636713027954, "learning_rate": 1.1837928065816613e-06, "loss": 0.0009, "num_input_tokens_seen": 30252912, "step": 30130 }, { "epoch": 15.978260869565217, "grad_norm": 0.2154378592967987, "learning_rate": 1.1822984105540526e-06, "loss": 0.0031, "num_input_tokens_seen": 30257552, "step": 30135 }, { "epoch": 15.980911983032874, "grad_norm": 3.8828516006469727, "learning_rate": 1.1808048318863641e-06, "loss": 0.0019, "num_input_tokens_seen": 30262576, "step": 30140 }, { "epoch": 15.98356309650053, "grad_norm": 1.3157150745391846, "learning_rate": 1.1793120708983668e-06, "loss": 0.0007, "num_input_tokens_seen": 30266640, "step": 30145 }, { "epoch": 15.986214209968187, "grad_norm": 1.2570966482162476, "learning_rate": 1.177820127909658e-06, "loss": 0.0035, "num_input_tokens_seen": 30272144, "step": 30150 }, { "epoch": 15.988865323435842, "grad_norm": 3.0466415882110596, "learning_rate": 1.1763290032396563e-06, "loss": 0.0067, "num_input_tokens_seen": 30276848, "step": 30155 }, { "epoch": 15.991516436903499, "grad_norm": 0.43509289622306824, "learning_rate": 1.1748386972076125e-06, "loss": 0.0063, "num_input_tokens_seen": 30282384, "step": 30160 }, { "epoch": 15.994167550371156, "grad_norm": 34.16103744506836, "learning_rate": 1.1733492101325916e-06, "loss": 0.023, "num_input_tokens_seen": 30286384, "step": 30165 }, { "epoch": 15.996818663838813, "grad_norm": 37.614784240722656, "learning_rate": 1.1718605423334927e-06, "loss": 0.2489, "num_input_tokens_seen": 30290640, "step": 30170 }, { "epoch": 15.99946977730647, "grad_norm": 0.01376216858625412, "learning_rate": 1.1703726941290343e-06, "loss": 0.0012, "num_input_tokens_seen": 30297168, "step": 30175 }, { "epoch": 16.0, "eval_loss": 0.9380632638931274, "eval_runtime": 29.4078, "eval_samples_per_second": 64.133, "eval_steps_per_second": 16.05, "num_input_tokens_seen": 30297384, "step": 30176 }, { "epoch": 16.002120890774126, "grad_norm": 1.3296949863433838, "learning_rate": 1.1688856658377612e-06, "loss": 0.0026, "num_input_tokens_seen": 30303176, "step": 30180 }, { "epoch": 16.004772004241783, "grad_norm": 0.08716629445552826, "learning_rate": 1.1673994577780418e-06, "loss": 0.0008, "num_input_tokens_seen": 30307560, "step": 30185 }, { "epoch": 16.00742311770944, "grad_norm": 99.1216049194336, "learning_rate": 1.16591407026807e-06, "loss": 0.0863, "num_input_tokens_seen": 30313480, "step": 30190 }, { "epoch": 16.010074231177093, "grad_norm": 0.4990287721157074, "learning_rate": 1.1644295036258612e-06, "loss": 0.0004, "num_input_tokens_seen": 30318152, "step": 30195 }, { "epoch": 16.01272534464475, "grad_norm": 26.534799575805664, "learning_rate": 1.1629457581692616e-06, "loss": 0.0103, "num_input_tokens_seen": 30322248, "step": 30200 }, { "epoch": 16.015376458112407, "grad_norm": 0.481191486120224, "learning_rate": 1.1614628342159323e-06, "loss": 0.0445, "num_input_tokens_seen": 30326728, "step": 30205 }, { "epoch": 16.018027571580063, "grad_norm": 0.256996214389801, "learning_rate": 1.1599807320833668e-06, "loss": 0.0031, "num_input_tokens_seen": 30331912, "step": 30210 }, { "epoch": 16.02067868504772, "grad_norm": 0.03395894169807434, "learning_rate": 1.1584994520888782e-06, "loss": 0.0051, "num_input_tokens_seen": 30337384, "step": 30215 }, { "epoch": 16.023329798515377, "grad_norm": 0.13268540799617767, "learning_rate": 1.1570189945496051e-06, "loss": 0.0013, "num_input_tokens_seen": 30342184, "step": 30220 }, { "epoch": 16.025980911983034, "grad_norm": 15.780096054077148, "learning_rate": 1.1555393597825087e-06, "loss": 0.005, "num_input_tokens_seen": 30347336, "step": 30225 }, { "epoch": 16.02863202545069, "grad_norm": 5.892240047454834, "learning_rate": 1.154060548104376e-06, "loss": 0.0027, "num_input_tokens_seen": 30353160, "step": 30230 }, { "epoch": 16.031283138918347, "grad_norm": 0.17064104974269867, "learning_rate": 1.1525825598318157e-06, "loss": 0.0007, "num_input_tokens_seen": 30357672, "step": 30235 }, { "epoch": 16.033934252386, "grad_norm": 0.02900806814432144, "learning_rate": 1.1511053952812618e-06, "loss": 0.0005, "num_input_tokens_seen": 30362280, "step": 30240 }, { "epoch": 16.036585365853657, "grad_norm": 73.83940887451172, "learning_rate": 1.1496290547689716e-06, "loss": 0.1379, "num_input_tokens_seen": 30366408, "step": 30245 }, { "epoch": 16.039236479321314, "grad_norm": 0.5570884346961975, "learning_rate": 1.1481535386110232e-06, "loss": 0.0015, "num_input_tokens_seen": 30371272, "step": 30250 }, { "epoch": 16.04188759278897, "grad_norm": 0.4986300766468048, "learning_rate": 1.1466788471233265e-06, "loss": 0.0069, "num_input_tokens_seen": 30377352, "step": 30255 }, { "epoch": 16.044538706256628, "grad_norm": 0.29598501324653625, "learning_rate": 1.145204980621602e-06, "loss": 0.0066, "num_input_tokens_seen": 30382344, "step": 30260 }, { "epoch": 16.047189819724284, "grad_norm": 0.47408515214920044, "learning_rate": 1.1437319394214064e-06, "loss": 0.0015, "num_input_tokens_seen": 30387464, "step": 30265 }, { "epoch": 16.04984093319194, "grad_norm": 0.1154060885310173, "learning_rate": 1.1422597238381111e-06, "loss": 0.001, "num_input_tokens_seen": 30392936, "step": 30270 }, { "epoch": 16.052492046659598, "grad_norm": 0.3842522203922272, "learning_rate": 1.140788334186914e-06, "loss": 0.0011, "num_input_tokens_seen": 30398728, "step": 30275 }, { "epoch": 16.055143160127255, "grad_norm": 4.60397481918335, "learning_rate": 1.1393177707828352e-06, "loss": 0.0032, "num_input_tokens_seen": 30402504, "step": 30280 }, { "epoch": 16.05779427359491, "grad_norm": 0.33248046040534973, "learning_rate": 1.1378480339407222e-06, "loss": 0.0005, "num_input_tokens_seen": 30408456, "step": 30285 }, { "epoch": 16.060445387062565, "grad_norm": 0.0514015331864357, "learning_rate": 1.1363791239752352e-06, "loss": 0.0018, "num_input_tokens_seen": 30413672, "step": 30290 }, { "epoch": 16.06309650053022, "grad_norm": 44.85460662841797, "learning_rate": 1.1349110412008712e-06, "loss": 0.0372, "num_input_tokens_seen": 30419688, "step": 30295 }, { "epoch": 16.06574761399788, "grad_norm": 1.226328730583191, "learning_rate": 1.133443785931937e-06, "loss": 0.0013, "num_input_tokens_seen": 30425160, "step": 30300 }, { "epoch": 16.068398727465535, "grad_norm": 50.47195816040039, "learning_rate": 1.1319773584825711e-06, "loss": 0.0177, "num_input_tokens_seen": 30429512, "step": 30305 }, { "epoch": 16.071049840933192, "grad_norm": 0.022008800879120827, "learning_rate": 1.1305117591667315e-06, "loss": 0.0013, "num_input_tokens_seen": 30435016, "step": 30310 }, { "epoch": 16.07370095440085, "grad_norm": 0.1173381507396698, "learning_rate": 1.1290469882981987e-06, "loss": 0.0003, "num_input_tokens_seen": 30440328, "step": 30315 }, { "epoch": 16.076352067868505, "grad_norm": 0.03730696067214012, "learning_rate": 1.1275830461905745e-06, "loss": 0.0028, "num_input_tokens_seen": 30445192, "step": 30320 }, { "epoch": 16.079003181336162, "grad_norm": 0.0495326891541481, "learning_rate": 1.126119933157291e-06, "loss": 0.0003, "num_input_tokens_seen": 30450280, "step": 30325 }, { "epoch": 16.08165429480382, "grad_norm": 6.144260883331299, "learning_rate": 1.124657649511589e-06, "loss": 0.0066, "num_input_tokens_seen": 30456520, "step": 30330 }, { "epoch": 16.084305408271472, "grad_norm": 0.24909305572509766, "learning_rate": 1.1231961955665456e-06, "loss": 0.004, "num_input_tokens_seen": 30461608, "step": 30335 }, { "epoch": 16.08695652173913, "grad_norm": 0.0472305603325367, "learning_rate": 1.1217355716350515e-06, "loss": 0.0016, "num_input_tokens_seen": 30466920, "step": 30340 }, { "epoch": 16.089607635206786, "grad_norm": 0.03256247192621231, "learning_rate": 1.1202757780298235e-06, "loss": 0.0138, "num_input_tokens_seen": 30471240, "step": 30345 }, { "epoch": 16.092258748674443, "grad_norm": 0.06043552607297897, "learning_rate": 1.1188168150633992e-06, "loss": 0.0007, "num_input_tokens_seen": 30476680, "step": 30350 }, { "epoch": 16.0949098621421, "grad_norm": 1.111311912536621, "learning_rate": 1.1173586830481386e-06, "loss": 0.0174, "num_input_tokens_seen": 30481064, "step": 30355 }, { "epoch": 16.097560975609756, "grad_norm": 1.5325630903244019, "learning_rate": 1.1159013822962234e-06, "loss": 0.0169, "num_input_tokens_seen": 30486536, "step": 30360 }, { "epoch": 16.100212089077413, "grad_norm": 0.044048409909009933, "learning_rate": 1.1144449131196606e-06, "loss": 0.0004, "num_input_tokens_seen": 30491080, "step": 30365 }, { "epoch": 16.10286320254507, "grad_norm": 0.051002588123083115, "learning_rate": 1.1129892758302735e-06, "loss": 0.0011, "num_input_tokens_seen": 30496008, "step": 30370 }, { "epoch": 16.105514316012727, "grad_norm": 0.2527230978012085, "learning_rate": 1.1115344707397096e-06, "loss": 0.0003, "num_input_tokens_seen": 30500264, "step": 30375 }, { "epoch": 16.108165429480383, "grad_norm": 0.12031587958335876, "learning_rate": 1.110080498159443e-06, "loss": 0.0031, "num_input_tokens_seen": 30505544, "step": 30380 }, { "epoch": 16.110816542948037, "grad_norm": 0.02738417498767376, "learning_rate": 1.10862735840076e-06, "loss": 0.0011, "num_input_tokens_seen": 30510056, "step": 30385 }, { "epoch": 16.113467656415693, "grad_norm": 0.050978995859622955, "learning_rate": 1.1071750517747787e-06, "loss": 0.0006, "num_input_tokens_seen": 30515368, "step": 30390 }, { "epoch": 16.11611876988335, "grad_norm": 0.41029971837997437, "learning_rate": 1.105723578592432e-06, "loss": 0.0012, "num_input_tokens_seen": 30519304, "step": 30395 }, { "epoch": 16.118769883351007, "grad_norm": 0.1279388815164566, "learning_rate": 1.1042729391644774e-06, "loss": 0.0017, "num_input_tokens_seen": 30524328, "step": 30400 }, { "epoch": 16.121420996818664, "grad_norm": 0.5558939576148987, "learning_rate": 1.1028231338014916e-06, "loss": 0.0009, "num_input_tokens_seen": 30530312, "step": 30405 }, { "epoch": 16.12407211028632, "grad_norm": 0.0675601214170456, "learning_rate": 1.1013741628138759e-06, "loss": 0.0004, "num_input_tokens_seen": 30535432, "step": 30410 }, { "epoch": 16.126723223753977, "grad_norm": 0.16761796176433563, "learning_rate": 1.0999260265118478e-06, "loss": 0.0007, "num_input_tokens_seen": 30540360, "step": 30415 }, { "epoch": 16.129374337221634, "grad_norm": 0.9772737622261047, "learning_rate": 1.0984787252054556e-06, "loss": 0.0005, "num_input_tokens_seen": 30545224, "step": 30420 }, { "epoch": 16.13202545068929, "grad_norm": 1.0109665393829346, "learning_rate": 1.0970322592045558e-06, "loss": 0.0008, "num_input_tokens_seen": 30551144, "step": 30425 }, { "epoch": 16.134676564156948, "grad_norm": 3.531567335128784, "learning_rate": 1.0955866288188377e-06, "loss": 0.0015, "num_input_tokens_seen": 30555560, "step": 30430 }, { "epoch": 16.1373276776246, "grad_norm": 0.42045822739601135, "learning_rate": 1.094141834357806e-06, "loss": 0.0211, "num_input_tokens_seen": 30559816, "step": 30435 }, { "epoch": 16.139978791092258, "grad_norm": 0.7321693897247314, "learning_rate": 1.0926978761307866e-06, "loss": 0.002, "num_input_tokens_seen": 30563752, "step": 30440 }, { "epoch": 16.142629904559914, "grad_norm": 0.051432814449071884, "learning_rate": 1.0912547544469276e-06, "loss": 0.0014, "num_input_tokens_seen": 30569064, "step": 30445 }, { "epoch": 16.14528101802757, "grad_norm": 5.084846019744873, "learning_rate": 1.0898124696151974e-06, "loss": 0.0032, "num_input_tokens_seen": 30574280, "step": 30450 }, { "epoch": 16.147932131495228, "grad_norm": 0.062189407646656036, "learning_rate": 1.0883710219443844e-06, "loss": 0.0003, "num_input_tokens_seen": 30578632, "step": 30455 }, { "epoch": 16.150583244962885, "grad_norm": 0.04300999268889427, "learning_rate": 1.0869304117431022e-06, "loss": 0.0007, "num_input_tokens_seen": 30583336, "step": 30460 }, { "epoch": 16.15323435843054, "grad_norm": 0.3423921763896942, "learning_rate": 1.0854906393197772e-06, "loss": 0.0006, "num_input_tokens_seen": 30589224, "step": 30465 }, { "epoch": 16.1558854718982, "grad_norm": 0.011784697882831097, "learning_rate": 1.0840517049826644e-06, "loss": 0.0006, "num_input_tokens_seen": 30593384, "step": 30470 }, { "epoch": 16.158536585365855, "grad_norm": 0.312028169631958, "learning_rate": 1.0826136090398342e-06, "loss": 0.0016, "num_input_tokens_seen": 30598824, "step": 30475 }, { "epoch": 16.16118769883351, "grad_norm": 0.02830110676586628, "learning_rate": 1.0811763517991796e-06, "loss": 0.0005, "num_input_tokens_seen": 30603496, "step": 30480 }, { "epoch": 16.163838812301165, "grad_norm": 1.1533135175704956, "learning_rate": 1.0797399335684132e-06, "loss": 0.1014, "num_input_tokens_seen": 30607976, "step": 30485 }, { "epoch": 16.166489925768822, "grad_norm": 0.05285536125302315, "learning_rate": 1.0783043546550697e-06, "loss": 0.0025, "num_input_tokens_seen": 30613096, "step": 30490 }, { "epoch": 16.16914103923648, "grad_norm": 0.04715627059340477, "learning_rate": 1.0768696153664997e-06, "loss": 0.0034, "num_input_tokens_seen": 30618632, "step": 30495 }, { "epoch": 16.171792152704136, "grad_norm": 0.8917461633682251, "learning_rate": 1.0754357160098822e-06, "loss": 0.0012, "num_input_tokens_seen": 30623368, "step": 30500 }, { "epoch": 16.174443266171792, "grad_norm": 10.890204429626465, "learning_rate": 1.0740026568922058e-06, "loss": 0.0034, "num_input_tokens_seen": 30627752, "step": 30505 }, { "epoch": 16.17709437963945, "grad_norm": 0.6298331022262573, "learning_rate": 1.0725704383202884e-06, "loss": 0.0016, "num_input_tokens_seen": 30632392, "step": 30510 }, { "epoch": 16.179745493107106, "grad_norm": 19.323463439941406, "learning_rate": 1.0711390606007643e-06, "loss": 0.0056, "num_input_tokens_seen": 30637288, "step": 30515 }, { "epoch": 16.182396606574763, "grad_norm": 0.27118444442749023, "learning_rate": 1.0697085240400845e-06, "loss": 0.0007, "num_input_tokens_seen": 30641000, "step": 30520 }, { "epoch": 16.18504772004242, "grad_norm": 0.3668432831764221, "learning_rate": 1.0682788289445257e-06, "loss": 0.1198, "num_input_tokens_seen": 30645704, "step": 30525 }, { "epoch": 16.187698833510073, "grad_norm": 0.010699690319597721, "learning_rate": 1.0668499756201817e-06, "loss": 0.0022, "num_input_tokens_seen": 30651144, "step": 30530 }, { "epoch": 16.19034994697773, "grad_norm": 0.014754489995539188, "learning_rate": 1.0654219643729662e-06, "loss": 0.0498, "num_input_tokens_seen": 30655752, "step": 30535 }, { "epoch": 16.193001060445386, "grad_norm": 2.0523481369018555, "learning_rate": 1.0639947955086121e-06, "loss": 0.0752, "num_input_tokens_seen": 30661064, "step": 30540 }, { "epoch": 16.195652173913043, "grad_norm": 0.05822674185037613, "learning_rate": 1.0625684693326727e-06, "loss": 0.0009, "num_input_tokens_seen": 30665928, "step": 30545 }, { "epoch": 16.1983032873807, "grad_norm": 0.30695757269859314, "learning_rate": 1.0611429861505196e-06, "loss": 0.0066, "num_input_tokens_seen": 30670600, "step": 30550 }, { "epoch": 16.200954400848357, "grad_norm": 0.17631874978542328, "learning_rate": 1.0597183462673488e-06, "loss": 0.0004, "num_input_tokens_seen": 30675400, "step": 30555 }, { "epoch": 16.203605514316013, "grad_norm": 0.049987174570560455, "learning_rate": 1.058294549988167e-06, "loss": 0.001, "num_input_tokens_seen": 30679624, "step": 30560 }, { "epoch": 16.20625662778367, "grad_norm": 0.023849088698625565, "learning_rate": 1.0568715976178085e-06, "loss": 0.01, "num_input_tokens_seen": 30683464, "step": 30565 }, { "epoch": 16.208907741251327, "grad_norm": 0.03795076534152031, "learning_rate": 1.0554494894609229e-06, "loss": 0.0006, "num_input_tokens_seen": 30688200, "step": 30570 }, { "epoch": 16.211558854718984, "grad_norm": 2.0696442127227783, "learning_rate": 1.0540282258219802e-06, "loss": 0.0008, "num_input_tokens_seen": 30695464, "step": 30575 }, { "epoch": 16.214209968186637, "grad_norm": 33.185203552246094, "learning_rate": 1.052607807005268e-06, "loss": 0.0098, "num_input_tokens_seen": 30701480, "step": 30580 }, { "epoch": 16.216861081654294, "grad_norm": 0.45259565114974976, "learning_rate": 1.051188233314896e-06, "loss": 0.0571, "num_input_tokens_seen": 30706696, "step": 30585 }, { "epoch": 16.21951219512195, "grad_norm": 0.1278989166021347, "learning_rate": 1.0497695050547885e-06, "loss": 0.0008, "num_input_tokens_seen": 30711048, "step": 30590 }, { "epoch": 16.222163308589607, "grad_norm": 3.8181800842285156, "learning_rate": 1.0483516225286956e-06, "loss": 0.0015, "num_input_tokens_seen": 30717032, "step": 30595 }, { "epoch": 16.224814422057264, "grad_norm": 2.902454376220703, "learning_rate": 1.0469345860401775e-06, "loss": 0.0014, "num_input_tokens_seen": 30721704, "step": 30600 }, { "epoch": 16.22746553552492, "grad_norm": 0.3363880515098572, "learning_rate": 1.0455183958926219e-06, "loss": 0.0071, "num_input_tokens_seen": 30726728, "step": 30605 }, { "epoch": 16.230116648992578, "grad_norm": 1.2673171758651733, "learning_rate": 1.0441030523892298e-06, "loss": 0.006, "num_input_tokens_seen": 30732744, "step": 30610 }, { "epoch": 16.232767762460234, "grad_norm": 0.44664376974105835, "learning_rate": 1.0426885558330225e-06, "loss": 0.0016, "num_input_tokens_seen": 30737640, "step": 30615 }, { "epoch": 16.23541887592789, "grad_norm": 0.023059744387865067, "learning_rate": 1.0412749065268402e-06, "loss": 0.0138, "num_input_tokens_seen": 30742280, "step": 30620 }, { "epoch": 16.238069989395544, "grad_norm": 1.9439198970794678, "learning_rate": 1.0398621047733415e-06, "loss": 0.0007, "num_input_tokens_seen": 30747336, "step": 30625 }, { "epoch": 16.2407211028632, "grad_norm": 0.016548186540603638, "learning_rate": 1.0384501508750023e-06, "loss": 0.0348, "num_input_tokens_seen": 30751848, "step": 30630 }, { "epoch": 16.243372216330858, "grad_norm": 0.04169173166155815, "learning_rate": 1.0370390451341222e-06, "loss": 0.0012, "num_input_tokens_seen": 30757480, "step": 30635 }, { "epoch": 16.246023329798515, "grad_norm": 0.05387800931930542, "learning_rate": 1.0356287878528093e-06, "loss": 0.0007, "num_input_tokens_seen": 30762728, "step": 30640 }, { "epoch": 16.24867444326617, "grad_norm": 0.1256706267595291, "learning_rate": 1.0342193793330013e-06, "loss": 0.0111, "num_input_tokens_seen": 30767400, "step": 30645 }, { "epoch": 16.25132555673383, "grad_norm": 0.5323474407196045, "learning_rate": 1.0328108198764475e-06, "loss": 0.022, "num_input_tokens_seen": 30773096, "step": 30650 }, { "epoch": 16.253976670201485, "grad_norm": 0.4911714196205139, "learning_rate": 1.0314031097847137e-06, "loss": 0.0015, "num_input_tokens_seen": 30779048, "step": 30655 }, { "epoch": 16.256627783669142, "grad_norm": 0.20067588984966278, "learning_rate": 1.0299962493591908e-06, "loss": 0.0009, "num_input_tokens_seen": 30783176, "step": 30660 }, { "epoch": 16.2592788971368, "grad_norm": 0.5622826814651489, "learning_rate": 1.028590238901082e-06, "loss": 0.0013, "num_input_tokens_seen": 30788488, "step": 30665 }, { "epoch": 16.261930010604456, "grad_norm": 0.027007639408111572, "learning_rate": 1.0271850787114113e-06, "loss": 0.0123, "num_input_tokens_seen": 30792808, "step": 30670 }, { "epoch": 16.26458112407211, "grad_norm": 2.312579393386841, "learning_rate": 1.0257807690910194e-06, "loss": 0.0011, "num_input_tokens_seen": 30797544, "step": 30675 }, { "epoch": 16.267232237539766, "grad_norm": 1.0650908946990967, "learning_rate": 1.0243773103405652e-06, "loss": 0.0016, "num_input_tokens_seen": 30802504, "step": 30680 }, { "epoch": 16.269883351007422, "grad_norm": 0.19791336357593536, "learning_rate": 1.0229747027605247e-06, "loss": 0.0039, "num_input_tokens_seen": 30806696, "step": 30685 }, { "epoch": 16.27253446447508, "grad_norm": 0.16496577858924866, "learning_rate": 1.021572946651196e-06, "loss": 0.001, "num_input_tokens_seen": 30812648, "step": 30690 }, { "epoch": 16.275185577942736, "grad_norm": 0.09841836988925934, "learning_rate": 1.020172042312686e-06, "loss": 0.0005, "num_input_tokens_seen": 30817608, "step": 30695 }, { "epoch": 16.277836691410393, "grad_norm": 0.9876561164855957, "learning_rate": 1.0187719900449289e-06, "loss": 0.0042, "num_input_tokens_seen": 30822920, "step": 30700 }, { "epoch": 16.28048780487805, "grad_norm": 0.028985416516661644, "learning_rate": 1.017372790147671e-06, "loss": 0.0005, "num_input_tokens_seen": 30827784, "step": 30705 }, { "epoch": 16.283138918345706, "grad_norm": 0.0985582247376442, "learning_rate": 1.0159744429204776e-06, "loss": 0.0015, "num_input_tokens_seen": 30833160, "step": 30710 }, { "epoch": 16.285790031813363, "grad_norm": 0.17386840283870697, "learning_rate": 1.0145769486627305e-06, "loss": 0.0008, "num_input_tokens_seen": 30838888, "step": 30715 }, { "epoch": 16.28844114528102, "grad_norm": 0.1294768750667572, "learning_rate": 1.0131803076736302e-06, "loss": 0.0006, "num_input_tokens_seen": 30843912, "step": 30720 }, { "epoch": 16.291092258748673, "grad_norm": 0.20168931782245636, "learning_rate": 1.011784520252192e-06, "loss": 0.1322, "num_input_tokens_seen": 30848104, "step": 30725 }, { "epoch": 16.29374337221633, "grad_norm": 0.17984889447689056, "learning_rate": 1.0103895866972547e-06, "loss": 0.0012, "num_input_tokens_seen": 30853064, "step": 30730 }, { "epoch": 16.296394485683987, "grad_norm": 1.8074548244476318, "learning_rate": 1.0089955073074642e-06, "loss": 0.0038, "num_input_tokens_seen": 30857896, "step": 30735 }, { "epoch": 16.299045599151643, "grad_norm": 0.079775869846344, "learning_rate": 1.0076022823812937e-06, "loss": 0.1325, "num_input_tokens_seen": 30862696, "step": 30740 }, { "epoch": 16.3016967126193, "grad_norm": 69.7347640991211, "learning_rate": 1.0062099122170272e-06, "loss": 0.0432, "num_input_tokens_seen": 30867784, "step": 30745 }, { "epoch": 16.304347826086957, "grad_norm": 0.14077478647232056, "learning_rate": 1.0048183971127685e-06, "loss": 0.0007, "num_input_tokens_seen": 30873128, "step": 30750 }, { "epoch": 16.306998939554614, "grad_norm": 0.6376233696937561, "learning_rate": 1.003427737366436e-06, "loss": 0.0011, "num_input_tokens_seen": 30879080, "step": 30755 }, { "epoch": 16.30965005302227, "grad_norm": 0.0849928930401802, "learning_rate": 1.0020379332757668e-06, "loss": 0.0033, "num_input_tokens_seen": 30884488, "step": 30760 }, { "epoch": 16.312301166489927, "grad_norm": 0.7215670943260193, "learning_rate": 1.0006489851383138e-06, "loss": 0.0007, "num_input_tokens_seen": 30889960, "step": 30765 }, { "epoch": 16.31495227995758, "grad_norm": 0.42430657148361206, "learning_rate": 9.992608932514502e-07, "loss": 0.0087, "num_input_tokens_seen": 30894888, "step": 30770 }, { "epoch": 16.317603393425237, "grad_norm": 1.4020683765411377, "learning_rate": 9.978736579123577e-07, "loss": 0.0024, "num_input_tokens_seen": 30899624, "step": 30775 }, { "epoch": 16.320254506892894, "grad_norm": 0.8505143523216248, "learning_rate": 9.96487279418044e-07, "loss": 0.0016, "num_input_tokens_seen": 30905128, "step": 30780 }, { "epoch": 16.32290562036055, "grad_norm": 0.009674768894910812, "learning_rate": 9.951017580653287e-07, "loss": 0.0024, "num_input_tokens_seen": 30910344, "step": 30785 }, { "epoch": 16.325556733828208, "grad_norm": 0.0199428703635931, "learning_rate": 9.93717094150845e-07, "loss": 0.0021, "num_input_tokens_seen": 30915080, "step": 30790 }, { "epoch": 16.328207847295864, "grad_norm": 0.12055158615112305, "learning_rate": 9.92333287971049e-07, "loss": 0.0007, "num_input_tokens_seen": 30920136, "step": 30795 }, { "epoch": 16.33085896076352, "grad_norm": 0.7521263957023621, "learning_rate": 9.909503398222092e-07, "loss": 0.0046, "num_input_tokens_seen": 30925576, "step": 30800 }, { "epoch": 16.333510074231178, "grad_norm": 0.07738955318927765, "learning_rate": 9.89568250000411e-07, "loss": 0.0009, "num_input_tokens_seen": 30929864, "step": 30805 }, { "epoch": 16.336161187698835, "grad_norm": 1.3665075302124023, "learning_rate": 9.881870188015569e-07, "loss": 0.0014, "num_input_tokens_seen": 30934920, "step": 30810 }, { "epoch": 16.33881230116649, "grad_norm": 0.5737707018852234, "learning_rate": 9.868066465213632e-07, "loss": 0.0045, "num_input_tokens_seen": 30939848, "step": 30815 }, { "epoch": 16.341463414634145, "grad_norm": 1.1264870166778564, "learning_rate": 9.854271334553645e-07, "loss": 0.1073, "num_input_tokens_seen": 30945064, "step": 30820 }, { "epoch": 16.3441145281018, "grad_norm": 0.04810868576169014, "learning_rate": 9.840484798989135e-07, "loss": 0.0147, "num_input_tokens_seen": 30950440, "step": 30825 }, { "epoch": 16.34676564156946, "grad_norm": 0.02655310183763504, "learning_rate": 9.826706861471719e-07, "loss": 0.0635, "num_input_tokens_seen": 30954920, "step": 30830 }, { "epoch": 16.349416755037115, "grad_norm": 0.07123847305774689, "learning_rate": 9.812937524951249e-07, "loss": 0.018, "num_input_tokens_seen": 30959336, "step": 30835 }, { "epoch": 16.352067868504772, "grad_norm": 0.11979272216558456, "learning_rate": 9.799176792375703e-07, "loss": 0.0004, "num_input_tokens_seen": 30964456, "step": 30840 }, { "epoch": 16.35471898197243, "grad_norm": 0.04388707876205444, "learning_rate": 9.785424666691206e-07, "loss": 0.0331, "num_input_tokens_seen": 30969640, "step": 30845 }, { "epoch": 16.357370095440086, "grad_norm": 39.333045959472656, "learning_rate": 9.77168115084205e-07, "loss": 0.0133, "num_input_tokens_seen": 30973704, "step": 30850 }, { "epoch": 16.360021208907742, "grad_norm": 0.01000180933624506, "learning_rate": 9.75794624777071e-07, "loss": 0.001, "num_input_tokens_seen": 30978600, "step": 30855 }, { "epoch": 16.3626723223754, "grad_norm": 0.032701123505830765, "learning_rate": 9.744219960417756e-07, "loss": 0.0028, "num_input_tokens_seen": 30984392, "step": 30860 }, { "epoch": 16.365323435843052, "grad_norm": 1.1466608047485352, "learning_rate": 9.730502291721993e-07, "loss": 0.0013, "num_input_tokens_seen": 30989448, "step": 30865 }, { "epoch": 16.36797454931071, "grad_norm": 0.6993523836135864, "learning_rate": 9.716793244620293e-07, "loss": 0.007, "num_input_tokens_seen": 30994504, "step": 30870 }, { "epoch": 16.370625662778366, "grad_norm": 46.24566650390625, "learning_rate": 9.703092822047756e-07, "loss": 0.0294, "num_input_tokens_seen": 30999624, "step": 30875 }, { "epoch": 16.373276776246023, "grad_norm": 0.0694526880979538, "learning_rate": 9.689401026937606e-07, "loss": 0.0004, "num_input_tokens_seen": 31004104, "step": 30880 }, { "epoch": 16.37592788971368, "grad_norm": 0.051823221147060394, "learning_rate": 9.675717862221206e-07, "loss": 0.0004, "num_input_tokens_seen": 31008392, "step": 30885 }, { "epoch": 16.378579003181336, "grad_norm": 4.020786762237549, "learning_rate": 9.662043330828086e-07, "loss": 0.0011, "num_input_tokens_seen": 31013192, "step": 30890 }, { "epoch": 16.381230116648993, "grad_norm": 0.14067137241363525, "learning_rate": 9.648377435685957e-07, "loss": 0.0008, "num_input_tokens_seen": 31018280, "step": 30895 }, { "epoch": 16.38388123011665, "grad_norm": 0.3731101453304291, "learning_rate": 9.634720179720607e-07, "loss": 0.0016, "num_input_tokens_seen": 31023880, "step": 30900 }, { "epoch": 16.386532343584307, "grad_norm": 0.16503390669822693, "learning_rate": 9.621071565856055e-07, "loss": 0.0006, "num_input_tokens_seen": 31030056, "step": 30905 }, { "epoch": 16.389183457051963, "grad_norm": 0.27112001180648804, "learning_rate": 9.607431597014417e-07, "loss": 0.0102, "num_input_tokens_seen": 31034568, "step": 30910 }, { "epoch": 16.391834570519617, "grad_norm": 2.789051055908203, "learning_rate": 9.593800276115978e-07, "loss": 0.0011, "num_input_tokens_seen": 31039400, "step": 30915 }, { "epoch": 16.394485683987273, "grad_norm": 21.877525329589844, "learning_rate": 9.580177606079173e-07, "loss": 0.0391, "num_input_tokens_seen": 31043752, "step": 30920 }, { "epoch": 16.39713679745493, "grad_norm": 0.008021601475775242, "learning_rate": 9.566563589820565e-07, "loss": 0.0124, "num_input_tokens_seen": 31048328, "step": 30925 }, { "epoch": 16.399787910922587, "grad_norm": 1.856925368309021, "learning_rate": 9.552958230254893e-07, "loss": 0.0013, "num_input_tokens_seen": 31053992, "step": 30930 }, { "epoch": 16.402439024390244, "grad_norm": 1.1982945203781128, "learning_rate": 9.539361530295026e-07, "loss": 0.0256, "num_input_tokens_seen": 31058280, "step": 30935 }, { "epoch": 16.4050901378579, "grad_norm": 8.029288291931152, "learning_rate": 9.525773492851981e-07, "loss": 0.1632, "num_input_tokens_seen": 31062952, "step": 30940 }, { "epoch": 16.407741251325557, "grad_norm": 1.1266014575958252, "learning_rate": 9.512194120834906e-07, "loss": 0.0063, "num_input_tokens_seen": 31067080, "step": 30945 }, { "epoch": 16.410392364793214, "grad_norm": 0.14097213745117188, "learning_rate": 9.498623417151149e-07, "loss": 0.0009, "num_input_tokens_seen": 31071208, "step": 30950 }, { "epoch": 16.41304347826087, "grad_norm": 0.11510220170021057, "learning_rate": 9.485061384706112e-07, "loss": 0.0007, "num_input_tokens_seen": 31075592, "step": 30955 }, { "epoch": 16.415694591728524, "grad_norm": 11.774842262268066, "learning_rate": 9.471508026403431e-07, "loss": 0.0027, "num_input_tokens_seen": 31080232, "step": 30960 }, { "epoch": 16.41834570519618, "grad_norm": 0.4435911774635315, "learning_rate": 9.457963345144821e-07, "loss": 0.0006, "num_input_tokens_seen": 31084040, "step": 30965 }, { "epoch": 16.420996818663838, "grad_norm": 0.07056523114442825, "learning_rate": 9.444427343830182e-07, "loss": 0.0011, "num_input_tokens_seen": 31089256, "step": 30970 }, { "epoch": 16.423647932131495, "grad_norm": 0.38863807916641235, "learning_rate": 9.43090002535752e-07, "loss": 0.0006, "num_input_tokens_seen": 31093768, "step": 30975 }, { "epoch": 16.42629904559915, "grad_norm": 1.3587654829025269, "learning_rate": 9.417381392623004e-07, "loss": 0.0052, "num_input_tokens_seen": 31098888, "step": 30980 }, { "epoch": 16.428950159066808, "grad_norm": 0.06603863835334778, "learning_rate": 9.403871448520924e-07, "loss": 0.0021, "num_input_tokens_seen": 31103752, "step": 30985 }, { "epoch": 16.431601272534465, "grad_norm": 0.521932065486908, "learning_rate": 9.390370195943765e-07, "loss": 0.0025, "num_input_tokens_seen": 31109192, "step": 30990 }, { "epoch": 16.43425238600212, "grad_norm": 0.14959952235221863, "learning_rate": 9.376877637782056e-07, "loss": 0.0007, "num_input_tokens_seen": 31114504, "step": 30995 }, { "epoch": 16.43690349946978, "grad_norm": 0.026667434722185135, "learning_rate": 9.363393776924556e-07, "loss": 0.0082, "num_input_tokens_seen": 31119496, "step": 31000 }, { "epoch": 16.439554612937435, "grad_norm": 0.8563617467880249, "learning_rate": 9.349918616258113e-07, "loss": 0.0196, "num_input_tokens_seen": 31124264, "step": 31005 }, { "epoch": 16.44220572640509, "grad_norm": 0.5406785607337952, "learning_rate": 9.33645215866773e-07, "loss": 0.0025, "num_input_tokens_seen": 31129576, "step": 31010 }, { "epoch": 16.444856839872745, "grad_norm": 6.094087600708008, "learning_rate": 9.322994407036529e-07, "loss": 0.0018, "num_input_tokens_seen": 31135496, "step": 31015 }, { "epoch": 16.447507953340402, "grad_norm": 0.2534612715244293, "learning_rate": 9.309545364245787e-07, "loss": 0.0052, "num_input_tokens_seen": 31140264, "step": 31020 }, { "epoch": 16.45015906680806, "grad_norm": 1.3009225130081177, "learning_rate": 9.296105033174891e-07, "loss": 0.001, "num_input_tokens_seen": 31144712, "step": 31025 }, { "epoch": 16.452810180275716, "grad_norm": 1.2354576587677002, "learning_rate": 9.282673416701421e-07, "loss": 0.002, "num_input_tokens_seen": 31148584, "step": 31030 }, { "epoch": 16.455461293743372, "grad_norm": 0.1915193498134613, "learning_rate": 9.269250517701e-07, "loss": 0.107, "num_input_tokens_seen": 31153128, "step": 31035 }, { "epoch": 16.45811240721103, "grad_norm": 5.4221601486206055, "learning_rate": 9.255836339047474e-07, "loss": 0.2002, "num_input_tokens_seen": 31158696, "step": 31040 }, { "epoch": 16.460763520678686, "grad_norm": 0.3507980704307556, "learning_rate": 9.242430883612774e-07, "loss": 0.0007, "num_input_tokens_seen": 31163240, "step": 31045 }, { "epoch": 16.463414634146343, "grad_norm": 0.01973758637905121, "learning_rate": 9.22903415426697e-07, "loss": 0.0008, "num_input_tokens_seen": 31168744, "step": 31050 }, { "epoch": 16.466065747614, "grad_norm": 90.51017761230469, "learning_rate": 9.215646153878266e-07, "loss": 0.0745, "num_input_tokens_seen": 31173960, "step": 31055 }, { "epoch": 16.468716861081653, "grad_norm": 0.9486221075057983, "learning_rate": 9.202266885313e-07, "loss": 0.0021, "num_input_tokens_seen": 31178568, "step": 31060 }, { "epoch": 16.47136797454931, "grad_norm": 108.90013122558594, "learning_rate": 9.188896351435633e-07, "loss": 0.0655, "num_input_tokens_seen": 31182952, "step": 31065 }, { "epoch": 16.474019088016966, "grad_norm": 1.8347725868225098, "learning_rate": 9.175534555108767e-07, "loss": 0.0031, "num_input_tokens_seen": 31187144, "step": 31070 }, { "epoch": 16.476670201484623, "grad_norm": 0.10783708840608597, "learning_rate": 9.162181499193129e-07, "loss": 0.0025, "num_input_tokens_seen": 31192328, "step": 31075 }, { "epoch": 16.47932131495228, "grad_norm": 0.6531946659088135, "learning_rate": 9.148837186547554e-07, "loss": 0.0008, "num_input_tokens_seen": 31199048, "step": 31080 }, { "epoch": 16.481972428419937, "grad_norm": 0.13951101899147034, "learning_rate": 9.135501620029064e-07, "loss": 0.0008, "num_input_tokens_seen": 31204360, "step": 31085 }, { "epoch": 16.484623541887593, "grad_norm": 0.26323845982551575, "learning_rate": 9.122174802492723e-07, "loss": 0.0034, "num_input_tokens_seen": 31209288, "step": 31090 }, { "epoch": 16.48727465535525, "grad_norm": 0.45453840494155884, "learning_rate": 9.108856736791805e-07, "loss": 0.0036, "num_input_tokens_seen": 31214952, "step": 31095 }, { "epoch": 16.489925768822907, "grad_norm": 0.2351197749376297, "learning_rate": 9.095547425777657e-07, "loss": 0.0018, "num_input_tokens_seen": 31220424, "step": 31100 }, { "epoch": 16.49257688229056, "grad_norm": 0.2131776362657547, "learning_rate": 9.082246872299766e-07, "loss": 0.0005, "num_input_tokens_seen": 31225224, "step": 31105 }, { "epoch": 16.495227995758217, "grad_norm": 3.197845697402954, "learning_rate": 9.068955079205755e-07, "loss": 0.0029, "num_input_tokens_seen": 31229672, "step": 31110 }, { "epoch": 16.497879109225874, "grad_norm": 1.8515230417251587, "learning_rate": 9.055672049341351e-07, "loss": 0.1626, "num_input_tokens_seen": 31234888, "step": 31115 }, { "epoch": 16.50053022269353, "grad_norm": 2.6886444091796875, "learning_rate": 9.042397785550405e-07, "loss": 0.0097, "num_input_tokens_seen": 31240520, "step": 31120 }, { "epoch": 16.503181336161187, "grad_norm": 4.167555809020996, "learning_rate": 9.029132290674946e-07, "loss": 0.0016, "num_input_tokens_seen": 31248168, "step": 31125 }, { "epoch": 16.505832449628844, "grad_norm": 0.2243511825799942, "learning_rate": 9.015875567555032e-07, "loss": 0.0198, "num_input_tokens_seen": 31252904, "step": 31130 }, { "epoch": 16.5084835630965, "grad_norm": 1.3695008754730225, "learning_rate": 9.002627619028925e-07, "loss": 0.0045, "num_input_tokens_seen": 31258984, "step": 31135 }, { "epoch": 16.511134676564158, "grad_norm": 0.0639789029955864, "learning_rate": 8.989388447932967e-07, "loss": 0.1414, "num_input_tokens_seen": 31263432, "step": 31140 }, { "epoch": 16.513785790031815, "grad_norm": 37.89881134033203, "learning_rate": 8.976158057101625e-07, "loss": 0.015, "num_input_tokens_seen": 31268296, "step": 31145 }, { "epoch": 16.51643690349947, "grad_norm": 1.1760174036026, "learning_rate": 8.962936449367493e-07, "loss": 0.0017, "num_input_tokens_seen": 31272552, "step": 31150 }, { "epoch": 16.519088016967125, "grad_norm": 56.70499038696289, "learning_rate": 8.949723627561285e-07, "loss": 0.0246, "num_input_tokens_seen": 31278952, "step": 31155 }, { "epoch": 16.52173913043478, "grad_norm": 0.019881300628185272, "learning_rate": 8.936519594511822e-07, "loss": 0.0005, "num_input_tokens_seen": 31283240, "step": 31160 }, { "epoch": 16.524390243902438, "grad_norm": 54.12568664550781, "learning_rate": 8.923324353046087e-07, "loss": 0.0212, "num_input_tokens_seen": 31289288, "step": 31165 }, { "epoch": 16.527041357370095, "grad_norm": 0.0941949188709259, "learning_rate": 8.910137905989091e-07, "loss": 0.0009, "num_input_tokens_seen": 31294632, "step": 31170 }, { "epoch": 16.52969247083775, "grad_norm": 1.6589008569717407, "learning_rate": 8.896960256164061e-07, "loss": 0.2359, "num_input_tokens_seen": 31299368, "step": 31175 }, { "epoch": 16.53234358430541, "grad_norm": 8.164690017700195, "learning_rate": 8.883791406392289e-07, "loss": 0.0065, "num_input_tokens_seen": 31304200, "step": 31180 }, { "epoch": 16.534994697773065, "grad_norm": 4.078899383544922, "learning_rate": 8.870631359493182e-07, "loss": 0.1804, "num_input_tokens_seen": 31309704, "step": 31185 }, { "epoch": 16.537645811240722, "grad_norm": 10.460439682006836, "learning_rate": 8.857480118284278e-07, "loss": 0.0226, "num_input_tokens_seen": 31316264, "step": 31190 }, { "epoch": 16.54029692470838, "grad_norm": 0.09012394398450851, "learning_rate": 8.844337685581222e-07, "loss": 0.0008, "num_input_tokens_seen": 31320712, "step": 31195 }, { "epoch": 16.542948038176036, "grad_norm": 0.037302009761333466, "learning_rate": 8.831204064197773e-07, "loss": 0.0002, "num_input_tokens_seen": 31324680, "step": 31200 }, { "epoch": 16.54559915164369, "grad_norm": 0.17648033797740936, "learning_rate": 8.818079256945805e-07, "loss": 0.0007, "num_input_tokens_seen": 31329992, "step": 31205 }, { "epoch": 16.548250265111346, "grad_norm": 11.558615684509277, "learning_rate": 8.804963266635308e-07, "loss": 0.0183, "num_input_tokens_seen": 31335208, "step": 31210 }, { "epoch": 16.550901378579002, "grad_norm": 0.11107278615236282, "learning_rate": 8.79185609607437e-07, "loss": 0.0479, "num_input_tokens_seen": 31340488, "step": 31215 }, { "epoch": 16.55355249204666, "grad_norm": 0.07202862203121185, "learning_rate": 8.778757748069228e-07, "loss": 0.0059, "num_input_tokens_seen": 31346024, "step": 31220 }, { "epoch": 16.556203605514316, "grad_norm": 0.04953519627451897, "learning_rate": 8.765668225424167e-07, "loss": 0.0061, "num_input_tokens_seen": 31351560, "step": 31225 }, { "epoch": 16.558854718981973, "grad_norm": 5.218474864959717, "learning_rate": 8.752587530941653e-07, "loss": 0.0075, "num_input_tokens_seen": 31356552, "step": 31230 }, { "epoch": 16.56150583244963, "grad_norm": 0.03765767812728882, "learning_rate": 8.739515667422211e-07, "loss": 0.021, "num_input_tokens_seen": 31363848, "step": 31235 }, { "epoch": 16.564156945917286, "grad_norm": 0.12136346846818924, "learning_rate": 8.726452637664501e-07, "loss": 0.0018, "num_input_tokens_seen": 31370280, "step": 31240 }, { "epoch": 16.566808059384943, "grad_norm": 0.16174940764904022, "learning_rate": 8.713398444465276e-07, "loss": 0.0148, "num_input_tokens_seen": 31376968, "step": 31245 }, { "epoch": 16.569459172852596, "grad_norm": 0.7729526162147522, "learning_rate": 8.700353090619412e-07, "loss": 0.0006, "num_input_tokens_seen": 31382440, "step": 31250 }, { "epoch": 16.572110286320253, "grad_norm": 1.0848747491836548, "learning_rate": 8.687316578919864e-07, "loss": 0.0028, "num_input_tokens_seen": 31387240, "step": 31255 }, { "epoch": 16.57476139978791, "grad_norm": 0.12946192920207977, "learning_rate": 8.674288912157757e-07, "loss": 0.0007, "num_input_tokens_seen": 31391816, "step": 31260 }, { "epoch": 16.577412513255567, "grad_norm": 0.3993734121322632, "learning_rate": 8.661270093122231e-07, "loss": 0.0006, "num_input_tokens_seen": 31397640, "step": 31265 }, { "epoch": 16.580063626723224, "grad_norm": 0.041739560663700104, "learning_rate": 8.648260124600621e-07, "loss": 0.0017, "num_input_tokens_seen": 31402120, "step": 31270 }, { "epoch": 16.58271474019088, "grad_norm": 84.15068817138672, "learning_rate": 8.635259009378311e-07, "loss": 0.0738, "num_input_tokens_seen": 31407720, "step": 31275 }, { "epoch": 16.585365853658537, "grad_norm": 0.741231381893158, "learning_rate": 8.622266750238806e-07, "loss": 0.0022, "num_input_tokens_seen": 31412008, "step": 31280 }, { "epoch": 16.588016967126194, "grad_norm": 0.06665418297052383, "learning_rate": 8.609283349963715e-07, "loss": 0.0003, "num_input_tokens_seen": 31418728, "step": 31285 }, { "epoch": 16.59066808059385, "grad_norm": 0.18348029255867004, "learning_rate": 8.596308811332749e-07, "loss": 0.0007, "num_input_tokens_seen": 31423208, "step": 31290 }, { "epoch": 16.593319194061507, "grad_norm": 0.10176081955432892, "learning_rate": 8.583343137123712e-07, "loss": 0.001, "num_input_tokens_seen": 31427624, "step": 31295 }, { "epoch": 16.59597030752916, "grad_norm": 0.2257511168718338, "learning_rate": 8.570386330112557e-07, "loss": 0.128, "num_input_tokens_seen": 31432168, "step": 31300 }, { "epoch": 16.598621420996817, "grad_norm": 0.2643584907054901, "learning_rate": 8.557438393073253e-07, "loss": 0.0012, "num_input_tokens_seen": 31436648, "step": 31305 }, { "epoch": 16.601272534464474, "grad_norm": 0.2010776549577713, "learning_rate": 8.544499328777961e-07, "loss": 0.003, "num_input_tokens_seen": 31441768, "step": 31310 }, { "epoch": 16.60392364793213, "grad_norm": 0.15915510058403015, "learning_rate": 8.531569139996881e-07, "loss": 0.0088, "num_input_tokens_seen": 31447464, "step": 31315 }, { "epoch": 16.606574761399788, "grad_norm": 97.13009643554688, "learning_rate": 8.518647829498338e-07, "loss": 0.0491, "num_input_tokens_seen": 31451656, "step": 31320 }, { "epoch": 16.609225874867445, "grad_norm": 6.1372456550598145, "learning_rate": 8.505735400048748e-07, "loss": 0.0023, "num_input_tokens_seen": 31455944, "step": 31325 }, { "epoch": 16.6118769883351, "grad_norm": 0.09040149301290512, "learning_rate": 8.492831854412631e-07, "loss": 0.0032, "num_input_tokens_seen": 31461128, "step": 31330 }, { "epoch": 16.614528101802758, "grad_norm": 2.468916893005371, "learning_rate": 8.479937195352583e-07, "loss": 0.0086, "num_input_tokens_seen": 31466408, "step": 31335 }, { "epoch": 16.617179215270415, "grad_norm": 0.39170825481414795, "learning_rate": 8.467051425629363e-07, "loss": 0.0197, "num_input_tokens_seen": 31472744, "step": 31340 }, { "epoch": 16.619830328738068, "grad_norm": 0.12060344964265823, "learning_rate": 8.454174548001742e-07, "loss": 0.0004, "num_input_tokens_seen": 31478472, "step": 31345 }, { "epoch": 16.622481442205725, "grad_norm": 0.35544225573539734, "learning_rate": 8.44130656522662e-07, "loss": 0.0005, "num_input_tokens_seen": 31482184, "step": 31350 }, { "epoch": 16.62513255567338, "grad_norm": 0.5896862745285034, "learning_rate": 8.428447480059044e-07, "loss": 0.0264, "num_input_tokens_seen": 31486920, "step": 31355 }, { "epoch": 16.62778366914104, "grad_norm": 0.8802456855773926, "learning_rate": 8.415597295252048e-07, "loss": 0.0013, "num_input_tokens_seen": 31492296, "step": 31360 }, { "epoch": 16.630434782608695, "grad_norm": 0.21200740337371826, "learning_rate": 8.402756013556873e-07, "loss": 0.0082, "num_input_tokens_seen": 31496776, "step": 31365 }, { "epoch": 16.633085896076352, "grad_norm": 0.9337728023529053, "learning_rate": 8.389923637722786e-07, "loss": 0.0012, "num_input_tokens_seen": 31501256, "step": 31370 }, { "epoch": 16.63573700954401, "grad_norm": 0.01683204062283039, "learning_rate": 8.377100170497166e-07, "loss": 0.0005, "num_input_tokens_seen": 31505672, "step": 31375 }, { "epoch": 16.638388123011666, "grad_norm": 0.10974689573049545, "learning_rate": 8.364285614625484e-07, "loss": 0.0019, "num_input_tokens_seen": 31510472, "step": 31380 }, { "epoch": 16.641039236479322, "grad_norm": 0.010011928156018257, "learning_rate": 8.3514799728513e-07, "loss": 0.0011, "num_input_tokens_seen": 31514376, "step": 31385 }, { "epoch": 16.64369034994698, "grad_norm": 0.1908581703901291, "learning_rate": 8.338683247916257e-07, "loss": 0.0003, "num_input_tokens_seen": 31519432, "step": 31390 }, { "epoch": 16.646341463414632, "grad_norm": 0.18703970313072205, "learning_rate": 8.325895442560139e-07, "loss": 0.0006, "num_input_tokens_seen": 31524424, "step": 31395 }, { "epoch": 16.64899257688229, "grad_norm": 0.25955694913864136, "learning_rate": 8.313116559520734e-07, "loss": 0.0008, "num_input_tokens_seen": 31529416, "step": 31400 }, { "epoch": 16.651643690349946, "grad_norm": 7.594599723815918, "learning_rate": 8.300346601534004e-07, "loss": 0.0099, "num_input_tokens_seen": 31534376, "step": 31405 }, { "epoch": 16.654294803817603, "grad_norm": 0.2278538942337036, "learning_rate": 8.28758557133395e-07, "loss": 0.0006, "num_input_tokens_seen": 31539432, "step": 31410 }, { "epoch": 16.65694591728526, "grad_norm": 0.12674997746944427, "learning_rate": 8.274833471652671e-07, "loss": 0.098, "num_input_tokens_seen": 31544456, "step": 31415 }, { "epoch": 16.659597030752916, "grad_norm": 0.04253588616847992, "learning_rate": 8.262090305220349e-07, "loss": 0.0087, "num_input_tokens_seen": 31549352, "step": 31420 }, { "epoch": 16.662248144220573, "grad_norm": 0.15102028846740723, "learning_rate": 8.249356074765303e-07, "loss": 0.0028, "num_input_tokens_seen": 31554024, "step": 31425 }, { "epoch": 16.66489925768823, "grad_norm": 0.033330149948596954, "learning_rate": 8.236630783013843e-07, "loss": 0.0008, "num_input_tokens_seen": 31558344, "step": 31430 }, { "epoch": 16.667550371155887, "grad_norm": 0.1841123402118683, "learning_rate": 8.223914432690478e-07, "loss": 0.0039, "num_input_tokens_seen": 31563048, "step": 31435 }, { "epoch": 16.670201484623544, "grad_norm": 0.012545344419777393, "learning_rate": 8.211207026517693e-07, "loss": 0.0003, "num_input_tokens_seen": 31568328, "step": 31440 }, { "epoch": 16.672852598091197, "grad_norm": 0.01275283470749855, "learning_rate": 8.198508567216141e-07, "loss": 0.022, "num_input_tokens_seen": 31572808, "step": 31445 }, { "epoch": 16.675503711558854, "grad_norm": 1.9457777738571167, "learning_rate": 8.185819057504518e-07, "loss": 0.0011, "num_input_tokens_seen": 31577224, "step": 31450 }, { "epoch": 16.67815482502651, "grad_norm": 0.042216476052999496, "learning_rate": 8.173138500099625e-07, "loss": 0.0038, "num_input_tokens_seen": 31582120, "step": 31455 }, { "epoch": 16.680805938494167, "grad_norm": 93.54676055908203, "learning_rate": 8.160466897716313e-07, "loss": 0.0486, "num_input_tokens_seen": 31587336, "step": 31460 }, { "epoch": 16.683457051961824, "grad_norm": 25.233449935913086, "learning_rate": 8.147804253067581e-07, "loss": 0.0269, "num_input_tokens_seen": 31592072, "step": 31465 }, { "epoch": 16.68610816542948, "grad_norm": 76.46216583251953, "learning_rate": 8.135150568864419e-07, "loss": 0.1955, "num_input_tokens_seen": 31596296, "step": 31470 }, { "epoch": 16.688759278897138, "grad_norm": 0.17091266810894012, "learning_rate": 8.122505847815981e-07, "loss": 0.223, "num_input_tokens_seen": 31601832, "step": 31475 }, { "epoch": 16.691410392364794, "grad_norm": 0.6560613512992859, "learning_rate": 8.109870092629473e-07, "loss": 0.0024, "num_input_tokens_seen": 31606824, "step": 31480 }, { "epoch": 16.69406150583245, "grad_norm": 3.1806697845458984, "learning_rate": 8.097243306010138e-07, "loss": 0.002, "num_input_tokens_seen": 31611048, "step": 31485 }, { "epoch": 16.696712619300108, "grad_norm": 0.1937568336725235, "learning_rate": 8.084625490661375e-07, "loss": 0.0006, "num_input_tokens_seen": 31615560, "step": 31490 }, { "epoch": 16.69936373276776, "grad_norm": 0.21650896966457367, "learning_rate": 8.072016649284608e-07, "loss": 0.0016, "num_input_tokens_seen": 31620104, "step": 31495 }, { "epoch": 16.702014846235418, "grad_norm": 1.0127758979797363, "learning_rate": 8.059416784579366e-07, "loss": 0.001, "num_input_tokens_seen": 31625064, "step": 31500 }, { "epoch": 16.704665959703075, "grad_norm": 1.3441115617752075, "learning_rate": 8.046825899243243e-07, "loss": 0.0038, "num_input_tokens_seen": 31629160, "step": 31505 }, { "epoch": 16.70731707317073, "grad_norm": 0.06926140189170837, "learning_rate": 8.034243995971914e-07, "loss": 0.0024, "num_input_tokens_seen": 31633736, "step": 31510 }, { "epoch": 16.70996818663839, "grad_norm": 0.423385888338089, "learning_rate": 8.02167107745912e-07, "loss": 0.0004, "num_input_tokens_seen": 31639400, "step": 31515 }, { "epoch": 16.712619300106045, "grad_norm": 0.47247257828712463, "learning_rate": 8.009107146396728e-07, "loss": 0.0005, "num_input_tokens_seen": 31645448, "step": 31520 }, { "epoch": 16.715270413573702, "grad_norm": 0.7766616940498352, "learning_rate": 7.996552205474594e-07, "loss": 0.0021, "num_input_tokens_seen": 31650856, "step": 31525 }, { "epoch": 16.71792152704136, "grad_norm": 0.7019651532173157, "learning_rate": 7.984006257380727e-07, "loss": 0.0033, "num_input_tokens_seen": 31656808, "step": 31530 }, { "epoch": 16.720572640509015, "grad_norm": 90.33267974853516, "learning_rate": 7.971469304801182e-07, "loss": 0.139, "num_input_tokens_seen": 31661416, "step": 31535 }, { "epoch": 16.72322375397667, "grad_norm": 0.05469132214784622, "learning_rate": 7.958941350420079e-07, "loss": 0.0002, "num_input_tokens_seen": 31666728, "step": 31540 }, { "epoch": 16.725874867444325, "grad_norm": 0.2622954249382019, "learning_rate": 7.946422396919618e-07, "loss": 0.0023, "num_input_tokens_seen": 31671592, "step": 31545 }, { "epoch": 16.728525980911982, "grad_norm": 3.1336803436279297, "learning_rate": 7.933912446980085e-07, "loss": 0.0019, "num_input_tokens_seen": 31676392, "step": 31550 }, { "epoch": 16.73117709437964, "grad_norm": 0.06748545169830322, "learning_rate": 7.921411503279808e-07, "loss": 0.0013, "num_input_tokens_seen": 31683112, "step": 31555 }, { "epoch": 16.733828207847296, "grad_norm": 0.05736095458269119, "learning_rate": 7.908919568495238e-07, "loss": 0.0345, "num_input_tokens_seen": 31687688, "step": 31560 }, { "epoch": 16.736479321314953, "grad_norm": 0.18412500619888306, "learning_rate": 7.896436645300831e-07, "loss": 0.0031, "num_input_tokens_seen": 31692136, "step": 31565 }, { "epoch": 16.73913043478261, "grad_norm": 0.1344471573829651, "learning_rate": 7.883962736369172e-07, "loss": 0.0005, "num_input_tokens_seen": 31697256, "step": 31570 }, { "epoch": 16.741781548250266, "grad_norm": 20.8675479888916, "learning_rate": 7.871497844370885e-07, "loss": 0.0066, "num_input_tokens_seen": 31702184, "step": 31575 }, { "epoch": 16.744432661717923, "grad_norm": 0.038867514580488205, "learning_rate": 7.859041971974668e-07, "loss": 0.0183, "num_input_tokens_seen": 31707048, "step": 31580 }, { "epoch": 16.74708377518558, "grad_norm": 0.05948690325021744, "learning_rate": 7.8465951218473e-07, "loss": 0.0004, "num_input_tokens_seen": 31711784, "step": 31585 }, { "epoch": 16.749734888653233, "grad_norm": 7.567947864532471, "learning_rate": 7.834157296653606e-07, "loss": 0.0092, "num_input_tokens_seen": 31717224, "step": 31590 }, { "epoch": 16.75238600212089, "grad_norm": 44.84437561035156, "learning_rate": 7.821728499056486e-07, "loss": 0.0128, "num_input_tokens_seen": 31721384, "step": 31595 }, { "epoch": 16.755037115588546, "grad_norm": 0.02967134490609169, "learning_rate": 7.809308731716953e-07, "loss": 0.049, "num_input_tokens_seen": 31725544, "step": 31600 }, { "epoch": 16.757688229056203, "grad_norm": 0.14825165271759033, "learning_rate": 7.79689799729399e-07, "loss": 0.1383, "num_input_tokens_seen": 31731112, "step": 31605 }, { "epoch": 16.76033934252386, "grad_norm": 0.394603431224823, "learning_rate": 7.784496298444743e-07, "loss": 0.0252, "num_input_tokens_seen": 31736168, "step": 31610 }, { "epoch": 16.762990455991517, "grad_norm": 0.4769801199436188, "learning_rate": 7.772103637824391e-07, "loss": 0.003, "num_input_tokens_seen": 31740328, "step": 31615 }, { "epoch": 16.765641569459174, "grad_norm": 0.11342757940292358, "learning_rate": 7.75972001808612e-07, "loss": 0.0853, "num_input_tokens_seen": 31745704, "step": 31620 }, { "epoch": 16.76829268292683, "grad_norm": 0.43568581342697144, "learning_rate": 7.747345441881276e-07, "loss": 0.0021, "num_input_tokens_seen": 31751464, "step": 31625 }, { "epoch": 16.770943796394487, "grad_norm": 0.39413002133369446, "learning_rate": 7.734979911859209e-07, "loss": 0.0148, "num_input_tokens_seen": 31756136, "step": 31630 }, { "epoch": 16.77359490986214, "grad_norm": 0.06707032769918442, "learning_rate": 7.722623430667347e-07, "loss": 0.0437, "num_input_tokens_seen": 31760552, "step": 31635 }, { "epoch": 16.776246023329797, "grad_norm": 0.053310833871364594, "learning_rate": 7.710276000951177e-07, "loss": 0.0026, "num_input_tokens_seen": 31765608, "step": 31640 }, { "epoch": 16.778897136797454, "grad_norm": 2.7113804817199707, "learning_rate": 7.697937625354257e-07, "loss": 0.0084, "num_input_tokens_seen": 31770376, "step": 31645 }, { "epoch": 16.78154825026511, "grad_norm": 0.07408607751131058, "learning_rate": 7.685608306518183e-07, "loss": 0.0022, "num_input_tokens_seen": 31776296, "step": 31650 }, { "epoch": 16.784199363732768, "grad_norm": 3.7618062496185303, "learning_rate": 7.673288047082672e-07, "loss": 0.0025, "num_input_tokens_seen": 31781736, "step": 31655 }, { "epoch": 16.786850477200424, "grad_norm": 3.7014966011047363, "learning_rate": 7.660976849685409e-07, "loss": 0.0043, "num_input_tokens_seen": 31787464, "step": 31660 }, { "epoch": 16.78950159066808, "grad_norm": 82.58625793457031, "learning_rate": 7.648674716962217e-07, "loss": 0.0905, "num_input_tokens_seen": 31793320, "step": 31665 }, { "epoch": 16.792152704135738, "grad_norm": 0.07067511230707169, "learning_rate": 7.636381651546954e-07, "loss": 0.0006, "num_input_tokens_seen": 31798376, "step": 31670 }, { "epoch": 16.794803817603395, "grad_norm": 0.5137271881103516, "learning_rate": 7.624097656071522e-07, "loss": 0.0004, "num_input_tokens_seen": 31803976, "step": 31675 }, { "epoch": 16.79745493107105, "grad_norm": 1.6702412366867065, "learning_rate": 7.611822733165897e-07, "loss": 0.0009, "num_input_tokens_seen": 31807592, "step": 31680 }, { "epoch": 16.800106044538705, "grad_norm": 0.6513623595237732, "learning_rate": 7.599556885458104e-07, "loss": 0.0009, "num_input_tokens_seen": 31812488, "step": 31685 }, { "epoch": 16.80275715800636, "grad_norm": 0.011332050897181034, "learning_rate": 7.58730011557422e-07, "loss": 0.0013, "num_input_tokens_seen": 31817608, "step": 31690 }, { "epoch": 16.80540827147402, "grad_norm": 1.2274421453475952, "learning_rate": 7.575052426138424e-07, "loss": 0.0007, "num_input_tokens_seen": 31823304, "step": 31695 }, { "epoch": 16.808059384941675, "grad_norm": 1.803842544555664, "learning_rate": 7.562813819772863e-07, "loss": 0.0014, "num_input_tokens_seen": 31828648, "step": 31700 }, { "epoch": 16.810710498409332, "grad_norm": 0.30652889609336853, "learning_rate": 7.550584299097824e-07, "loss": 0.003, "num_input_tokens_seen": 31833000, "step": 31705 }, { "epoch": 16.81336161187699, "grad_norm": 0.0737709105014801, "learning_rate": 7.538363866731613e-07, "loss": 0.0003, "num_input_tokens_seen": 31837928, "step": 31710 }, { "epoch": 16.816012725344645, "grad_norm": 0.6250998973846436, "learning_rate": 7.526152525290586e-07, "loss": 0.0032, "num_input_tokens_seen": 31842440, "step": 31715 }, { "epoch": 16.818663838812302, "grad_norm": 0.026379426941275597, "learning_rate": 7.513950277389159e-07, "loss": 0.0011, "num_input_tokens_seen": 31846984, "step": 31720 }, { "epoch": 16.82131495227996, "grad_norm": 0.003694418352097273, "learning_rate": 7.501757125639808e-07, "loss": 0.0013, "num_input_tokens_seen": 31852008, "step": 31725 }, { "epoch": 16.823966065747612, "grad_norm": 103.09038543701172, "learning_rate": 7.489573072653034e-07, "loss": 0.0827, "num_input_tokens_seen": 31857032, "step": 31730 }, { "epoch": 16.82661717921527, "grad_norm": 0.05305144190788269, "learning_rate": 7.477398121037449e-07, "loss": 0.002, "num_input_tokens_seen": 31862280, "step": 31735 }, { "epoch": 16.829268292682926, "grad_norm": 124.76818084716797, "learning_rate": 7.465232273399636e-07, "loss": 0.0767, "num_input_tokens_seen": 31866856, "step": 31740 }, { "epoch": 16.831919406150583, "grad_norm": 2.944146156311035, "learning_rate": 7.453075532344301e-07, "loss": 0.0009, "num_input_tokens_seen": 31871368, "step": 31745 }, { "epoch": 16.83457051961824, "grad_norm": 0.20103071630001068, "learning_rate": 7.440927900474171e-07, "loss": 0.0008, "num_input_tokens_seen": 31875944, "step": 31750 }, { "epoch": 16.837221633085896, "grad_norm": 0.10829750448465347, "learning_rate": 7.428789380389989e-07, "loss": 0.0026, "num_input_tokens_seen": 31880456, "step": 31755 }, { "epoch": 16.839872746553553, "grad_norm": 1.605456829071045, "learning_rate": 7.416659974690616e-07, "loss": 0.0713, "num_input_tokens_seen": 31886696, "step": 31760 }, { "epoch": 16.84252386002121, "grad_norm": 10.724626541137695, "learning_rate": 7.404539685972911e-07, "loss": 0.002, "num_input_tokens_seen": 31890664, "step": 31765 }, { "epoch": 16.845174973488866, "grad_norm": 80.82330322265625, "learning_rate": 7.392428516831796e-07, "loss": 0.0429, "num_input_tokens_seen": 31896776, "step": 31770 }, { "epoch": 16.847826086956523, "grad_norm": 0.663247287273407, "learning_rate": 7.380326469860244e-07, "loss": 0.0013, "num_input_tokens_seen": 31900968, "step": 31775 }, { "epoch": 16.850477200424177, "grad_norm": 0.3864412307739258, "learning_rate": 7.368233547649273e-07, "loss": 0.0005, "num_input_tokens_seen": 31905672, "step": 31780 }, { "epoch": 16.853128313891833, "grad_norm": 62.14983367919922, "learning_rate": 7.356149752787934e-07, "loss": 0.0254, "num_input_tokens_seen": 31910536, "step": 31785 }, { "epoch": 16.85577942735949, "grad_norm": 5.471789360046387, "learning_rate": 7.344075087863362e-07, "loss": 0.0021, "num_input_tokens_seen": 31915976, "step": 31790 }, { "epoch": 16.858430540827147, "grad_norm": 0.21974313259124756, "learning_rate": 7.332009555460673e-07, "loss": 0.0946, "num_input_tokens_seen": 31920648, "step": 31795 }, { "epoch": 16.861081654294804, "grad_norm": 0.19783851504325867, "learning_rate": 7.319953158163101e-07, "loss": 0.0024, "num_input_tokens_seen": 31925256, "step": 31800 }, { "epoch": 16.86373276776246, "grad_norm": 0.10270474851131439, "learning_rate": 7.307905898551876e-07, "loss": 0.0469, "num_input_tokens_seen": 31930728, "step": 31805 }, { "epoch": 16.866383881230117, "grad_norm": 8.903040885925293, "learning_rate": 7.295867779206283e-07, "loss": 0.067, "num_input_tokens_seen": 31934984, "step": 31810 }, { "epoch": 16.869034994697774, "grad_norm": 0.06059956178069115, "learning_rate": 7.283838802703652e-07, "loss": 0.0008, "num_input_tokens_seen": 31939176, "step": 31815 }, { "epoch": 16.87168610816543, "grad_norm": 0.004977944772690535, "learning_rate": 7.27181897161936e-07, "loss": 0.0007, "num_input_tokens_seen": 31943624, "step": 31820 }, { "epoch": 16.874337221633088, "grad_norm": 0.5199215412139893, "learning_rate": 7.259808288526798e-07, "loss": 0.0011, "num_input_tokens_seen": 31949096, "step": 31825 }, { "epoch": 16.87698833510074, "grad_norm": 0.083762988448143, "learning_rate": 7.247806755997472e-07, "loss": 0.0086, "num_input_tokens_seen": 31953000, "step": 31830 }, { "epoch": 16.879639448568398, "grad_norm": 0.18547266721725464, "learning_rate": 7.235814376600824e-07, "loss": 0.0026, "num_input_tokens_seen": 31957096, "step": 31835 }, { "epoch": 16.882290562036054, "grad_norm": 0.02004767768085003, "learning_rate": 7.223831152904426e-07, "loss": 0.0023, "num_input_tokens_seen": 31962312, "step": 31840 }, { "epoch": 16.88494167550371, "grad_norm": 2.009864568710327, "learning_rate": 7.211857087473845e-07, "loss": 0.0008, "num_input_tokens_seen": 31966664, "step": 31845 }, { "epoch": 16.887592788971368, "grad_norm": 0.058220718055963516, "learning_rate": 7.199892182872698e-07, "loss": 0.0004, "num_input_tokens_seen": 31972488, "step": 31850 }, { "epoch": 16.890243902439025, "grad_norm": 0.17949935793876648, "learning_rate": 7.187936441662635e-07, "loss": 0.0002, "num_input_tokens_seen": 31977608, "step": 31855 }, { "epoch": 16.89289501590668, "grad_norm": 17.506324768066406, "learning_rate": 7.175989866403349e-07, "loss": 0.0045, "num_input_tokens_seen": 31982696, "step": 31860 }, { "epoch": 16.89554612937434, "grad_norm": 3.030412197113037, "learning_rate": 7.16405245965257e-07, "loss": 0.0128, "num_input_tokens_seen": 31988776, "step": 31865 }, { "epoch": 16.898197242841995, "grad_norm": 0.10835520178079605, "learning_rate": 7.152124223966084e-07, "loss": 0.0009, "num_input_tokens_seen": 31995336, "step": 31870 }, { "epoch": 16.900848356309652, "grad_norm": 26.99602699279785, "learning_rate": 7.140205161897662e-07, "loss": 0.0146, "num_input_tokens_seen": 31999560, "step": 31875 }, { "epoch": 16.903499469777305, "grad_norm": 0.030483515933156013, "learning_rate": 7.128295275999175e-07, "loss": 0.0006, "num_input_tokens_seen": 32004168, "step": 31880 }, { "epoch": 16.906150583244962, "grad_norm": 0.04307787865400314, "learning_rate": 7.116394568820484e-07, "loss": 0.0003, "num_input_tokens_seen": 32009640, "step": 31885 }, { "epoch": 16.90880169671262, "grad_norm": 0.05591915547847748, "learning_rate": 7.104503042909494e-07, "loss": 0.0007, "num_input_tokens_seen": 32015016, "step": 31890 }, { "epoch": 16.911452810180275, "grad_norm": 0.031292565166950226, "learning_rate": 7.092620700812164e-07, "loss": 0.0255, "num_input_tokens_seen": 32019688, "step": 31895 }, { "epoch": 16.914103923647932, "grad_norm": 0.03765101358294487, "learning_rate": 7.080747545072464e-07, "loss": 0.0002, "num_input_tokens_seen": 32024840, "step": 31900 }, { "epoch": 16.91675503711559, "grad_norm": 0.7035855650901794, "learning_rate": 7.068883578232405e-07, "loss": 0.0007, "num_input_tokens_seen": 32030280, "step": 31905 }, { "epoch": 16.919406150583246, "grad_norm": 0.034831494092941284, "learning_rate": 7.057028802832033e-07, "loss": 0.0016, "num_input_tokens_seen": 32035816, "step": 31910 }, { "epoch": 16.922057264050903, "grad_norm": 0.05304217338562012, "learning_rate": 7.045183221409424e-07, "loss": 0.0006, "num_input_tokens_seen": 32043144, "step": 31915 }, { "epoch": 16.92470837751856, "grad_norm": 0.11379516124725342, "learning_rate": 7.033346836500671e-07, "loss": 0.0038, "num_input_tokens_seen": 32047880, "step": 31920 }, { "epoch": 16.927359490986213, "grad_norm": 0.18468376994132996, "learning_rate": 7.021519650639952e-07, "loss": 0.0054, "num_input_tokens_seen": 32052936, "step": 31925 }, { "epoch": 16.93001060445387, "grad_norm": 1.0521602630615234, "learning_rate": 7.009701666359392e-07, "loss": 0.0089, "num_input_tokens_seen": 32057320, "step": 31930 }, { "epoch": 16.932661717921526, "grad_norm": 0.21271252632141113, "learning_rate": 6.997892886189223e-07, "loss": 0.002, "num_input_tokens_seen": 32061736, "step": 31935 }, { "epoch": 16.935312831389183, "grad_norm": 0.2869245707988739, "learning_rate": 6.98609331265766e-07, "loss": 0.143, "num_input_tokens_seen": 32067304, "step": 31940 }, { "epoch": 16.93796394485684, "grad_norm": 0.37159934639930725, "learning_rate": 6.974302948290962e-07, "loss": 0.013, "num_input_tokens_seen": 32071784, "step": 31945 }, { "epoch": 16.940615058324497, "grad_norm": 11.118718147277832, "learning_rate": 6.962521795613419e-07, "loss": 0.0137, "num_input_tokens_seen": 32076104, "step": 31950 }, { "epoch": 16.943266171792153, "grad_norm": 2.8946237564086914, "learning_rate": 6.950749857147337e-07, "loss": 0.1246, "num_input_tokens_seen": 32081160, "step": 31955 }, { "epoch": 16.94591728525981, "grad_norm": 0.043193817138671875, "learning_rate": 6.938987135413055e-07, "loss": 0.0038, "num_input_tokens_seen": 32086184, "step": 31960 }, { "epoch": 16.948568398727467, "grad_norm": 0.10641390085220337, "learning_rate": 6.92723363292897e-07, "loss": 0.0002, "num_input_tokens_seen": 32092168, "step": 31965 }, { "epoch": 16.951219512195124, "grad_norm": 0.006201256066560745, "learning_rate": 6.91548935221143e-07, "loss": 0.0006, "num_input_tokens_seen": 32096392, "step": 31970 }, { "epoch": 16.953870625662777, "grad_norm": 109.00922393798828, "learning_rate": 6.90375429577489e-07, "loss": 0.0828, "num_input_tokens_seen": 32102120, "step": 31975 }, { "epoch": 16.956521739130434, "grad_norm": 0.17797471582889557, "learning_rate": 6.892028466131783e-07, "loss": 0.0007, "num_input_tokens_seen": 32106248, "step": 31980 }, { "epoch": 16.95917285259809, "grad_norm": 0.1308346837759018, "learning_rate": 6.880311865792583e-07, "loss": 0.0003, "num_input_tokens_seen": 32111144, "step": 31985 }, { "epoch": 16.961823966065747, "grad_norm": 0.2239963412284851, "learning_rate": 6.868604497265774e-07, "loss": 0.0007, "num_input_tokens_seen": 32117704, "step": 31990 }, { "epoch": 16.964475079533404, "grad_norm": 2.5768513679504395, "learning_rate": 6.856906363057886e-07, "loss": 0.001, "num_input_tokens_seen": 32122024, "step": 31995 }, { "epoch": 16.96712619300106, "grad_norm": 0.08401335030794144, "learning_rate": 6.845217465673437e-07, "loss": 0.0008, "num_input_tokens_seen": 32128200, "step": 32000 }, { "epoch": 16.969777306468718, "grad_norm": 25.324148178100586, "learning_rate": 6.83353780761502e-07, "loss": 0.016, "num_input_tokens_seen": 32132360, "step": 32005 }, { "epoch": 16.972428419936374, "grad_norm": 0.007579927798360586, "learning_rate": 6.821867391383191e-07, "loss": 0.0007, "num_input_tokens_seen": 32139240, "step": 32010 }, { "epoch": 16.97507953340403, "grad_norm": 0.028124982491135597, "learning_rate": 6.810206219476573e-07, "loss": 0.0014, "num_input_tokens_seen": 32143304, "step": 32015 }, { "epoch": 16.977730646871684, "grad_norm": 0.025183582678437233, "learning_rate": 6.79855429439179e-07, "loss": 0.001, "num_input_tokens_seen": 32149320, "step": 32020 }, { "epoch": 16.98038176033934, "grad_norm": 0.25725042819976807, "learning_rate": 6.786911618623482e-07, "loss": 0.0011, "num_input_tokens_seen": 32153864, "step": 32025 }, { "epoch": 16.983032873806998, "grad_norm": 0.02032061293721199, "learning_rate": 6.77527819466432e-07, "loss": 0.0046, "num_input_tokens_seen": 32159368, "step": 32030 }, { "epoch": 16.985683987274655, "grad_norm": 0.22320552170276642, "learning_rate": 6.763654025004996e-07, "loss": 0.0007, "num_input_tokens_seen": 32166376, "step": 32035 }, { "epoch": 16.98833510074231, "grad_norm": 4.408520221710205, "learning_rate": 6.752039112134196e-07, "loss": 0.0025, "num_input_tokens_seen": 32171432, "step": 32040 }, { "epoch": 16.99098621420997, "grad_norm": 0.8382477760314941, "learning_rate": 6.740433458538647e-07, "loss": 0.0061, "num_input_tokens_seen": 32175304, "step": 32045 }, { "epoch": 16.993637327677625, "grad_norm": 0.021572621539235115, "learning_rate": 6.72883706670312e-07, "loss": 0.0002, "num_input_tokens_seen": 32180552, "step": 32050 }, { "epoch": 16.996288441145282, "grad_norm": 0.5612688660621643, "learning_rate": 6.717249939110326e-07, "loss": 0.0011, "num_input_tokens_seen": 32187208, "step": 32055 }, { "epoch": 16.99893955461294, "grad_norm": 0.03647293895483017, "learning_rate": 6.705672078241071e-07, "loss": 0.0012, "num_input_tokens_seen": 32192104, "step": 32060 }, { "epoch": 17.001590668080595, "grad_norm": 0.00788184441626072, "learning_rate": 6.694103486574132e-07, "loss": 0.0145, "num_input_tokens_seen": 32196168, "step": 32065 }, { "epoch": 17.00424178154825, "grad_norm": 0.2968560755252838, "learning_rate": 6.682544166586319e-07, "loss": 0.0017, "num_input_tokens_seen": 32201352, "step": 32070 }, { "epoch": 17.006892895015906, "grad_norm": 0.05626164376735687, "learning_rate": 6.67099412075245e-07, "loss": 0.0003, "num_input_tokens_seen": 32206696, "step": 32075 }, { "epoch": 17.009544008483562, "grad_norm": 0.5058049559593201, "learning_rate": 6.659453351545353e-07, "loss": 0.002, "num_input_tokens_seen": 32212712, "step": 32080 }, { "epoch": 17.01219512195122, "grad_norm": 55.382381439208984, "learning_rate": 6.64792186143588e-07, "loss": 0.0212, "num_input_tokens_seen": 32217544, "step": 32085 }, { "epoch": 17.014846235418876, "grad_norm": 0.6743733882904053, "learning_rate": 6.636399652892911e-07, "loss": 0.0006, "num_input_tokens_seen": 32222184, "step": 32090 }, { "epoch": 17.017497348886533, "grad_norm": 0.09341730177402496, "learning_rate": 6.624886728383284e-07, "loss": 0.0008, "num_input_tokens_seen": 32226728, "step": 32095 }, { "epoch": 17.02014846235419, "grad_norm": 0.9385693669319153, "learning_rate": 6.613383090371922e-07, "loss": 0.0047, "num_input_tokens_seen": 32231368, "step": 32100 }, { "epoch": 17.022799575821846, "grad_norm": 0.14189939200878143, "learning_rate": 6.601888741321705e-07, "loss": 0.0004, "num_input_tokens_seen": 32237352, "step": 32105 }, { "epoch": 17.025450689289503, "grad_norm": 0.009373114444315434, "learning_rate": 6.590403683693552e-07, "loss": 0.0006, "num_input_tokens_seen": 32242888, "step": 32110 }, { "epoch": 17.028101802757156, "grad_norm": 0.14634168148040771, "learning_rate": 6.578927919946376e-07, "loss": 0.0011, "num_input_tokens_seen": 32247944, "step": 32115 }, { "epoch": 17.030752916224813, "grad_norm": 0.02932203933596611, "learning_rate": 6.56746145253711e-07, "loss": 0.0303, "num_input_tokens_seen": 32252424, "step": 32120 }, { "epoch": 17.03340402969247, "grad_norm": 0.7045866847038269, "learning_rate": 6.556004283920686e-07, "loss": 0.0036, "num_input_tokens_seen": 32258440, "step": 32125 }, { "epoch": 17.036055143160127, "grad_norm": 0.063214510679245, "learning_rate": 6.544556416550085e-07, "loss": 0.0006, "num_input_tokens_seen": 32262504, "step": 32130 }, { "epoch": 17.038706256627783, "grad_norm": 1.5076178312301636, "learning_rate": 6.533117852876219e-07, "loss": 0.0008, "num_input_tokens_seen": 32266536, "step": 32135 }, { "epoch": 17.04135737009544, "grad_norm": 0.03690936043858528, "learning_rate": 6.521688595348092e-07, "loss": 0.0004, "num_input_tokens_seen": 32271080, "step": 32140 }, { "epoch": 17.044008483563097, "grad_norm": 0.18485534191131592, "learning_rate": 6.510268646412665e-07, "loss": 0.0885, "num_input_tokens_seen": 32276328, "step": 32145 }, { "epoch": 17.046659597030754, "grad_norm": 0.07835591584444046, "learning_rate": 6.498858008514924e-07, "loss": 0.0004, "num_input_tokens_seen": 32280456, "step": 32150 }, { "epoch": 17.04931071049841, "grad_norm": 0.22871242463588715, "learning_rate": 6.487456684097848e-07, "loss": 0.0006, "num_input_tokens_seen": 32285416, "step": 32155 }, { "epoch": 17.051961823966067, "grad_norm": 82.59245300292969, "learning_rate": 6.476064675602439e-07, "loss": 0.0885, "num_input_tokens_seen": 32290696, "step": 32160 }, { "epoch": 17.05461293743372, "grad_norm": 0.01976058818399906, "learning_rate": 6.464681985467686e-07, "loss": 0.009, "num_input_tokens_seen": 32295880, "step": 32165 }, { "epoch": 17.057264050901377, "grad_norm": 0.23423515260219574, "learning_rate": 6.453308616130604e-07, "loss": 0.0004, "num_input_tokens_seen": 32301608, "step": 32170 }, { "epoch": 17.059915164369034, "grad_norm": 1.698960542678833, "learning_rate": 6.441944570026199e-07, "loss": 0.0005, "num_input_tokens_seen": 32306792, "step": 32175 }, { "epoch": 17.06256627783669, "grad_norm": 0.005348666571080685, "learning_rate": 6.430589849587465e-07, "loss": 0.0003, "num_input_tokens_seen": 32312136, "step": 32180 }, { "epoch": 17.065217391304348, "grad_norm": 0.061139270663261414, "learning_rate": 6.41924445724546e-07, "loss": 0.0014, "num_input_tokens_seen": 32317128, "step": 32185 }, { "epoch": 17.067868504772004, "grad_norm": 0.48924848437309265, "learning_rate": 6.407908395429152e-07, "loss": 0.1942, "num_input_tokens_seen": 32321960, "step": 32190 }, { "epoch": 17.07051961823966, "grad_norm": 1.0429891347885132, "learning_rate": 6.396581666565604e-07, "loss": 0.0009, "num_input_tokens_seen": 32327112, "step": 32195 }, { "epoch": 17.073170731707318, "grad_norm": 0.14190766215324402, "learning_rate": 6.385264273079817e-07, "loss": 0.002, "num_input_tokens_seen": 32332488, "step": 32200 }, { "epoch": 17.075821845174975, "grad_norm": 1.021202564239502, "learning_rate": 6.373956217394828e-07, "loss": 0.0952, "num_input_tokens_seen": 32337768, "step": 32205 }, { "epoch": 17.07847295864263, "grad_norm": 3.8912503719329834, "learning_rate": 6.362657501931646e-07, "loss": 0.0012, "num_input_tokens_seen": 32342312, "step": 32210 }, { "epoch": 17.081124072110285, "grad_norm": 0.07039526104927063, "learning_rate": 6.351368129109314e-07, "loss": 0.0022, "num_input_tokens_seen": 32347304, "step": 32215 }, { "epoch": 17.08377518557794, "grad_norm": 0.5308434963226318, "learning_rate": 6.340088101344833e-07, "loss": 0.0027, "num_input_tokens_seen": 32352680, "step": 32220 }, { "epoch": 17.0864262990456, "grad_norm": 0.01229461096227169, "learning_rate": 6.328817421053263e-07, "loss": 0.0529, "num_input_tokens_seen": 32358344, "step": 32225 }, { "epoch": 17.089077412513255, "grad_norm": 0.24286049604415894, "learning_rate": 6.317556090647592e-07, "loss": 0.0003, "num_input_tokens_seen": 32363016, "step": 32230 }, { "epoch": 17.091728525980912, "grad_norm": 0.08454722911119461, "learning_rate": 6.306304112538858e-07, "loss": 0.0534, "num_input_tokens_seen": 32367464, "step": 32235 }, { "epoch": 17.09437963944857, "grad_norm": 0.9253056049346924, "learning_rate": 6.295061489136089e-07, "loss": 0.0027, "num_input_tokens_seen": 32372232, "step": 32240 }, { "epoch": 17.097030752916226, "grad_norm": 0.2815212309360504, "learning_rate": 6.283828222846283e-07, "loss": 0.0015, "num_input_tokens_seen": 32376872, "step": 32245 }, { "epoch": 17.099681866383882, "grad_norm": 0.03580230847001076, "learning_rate": 6.272604316074465e-07, "loss": 0.0002, "num_input_tokens_seen": 32382920, "step": 32250 }, { "epoch": 17.10233297985154, "grad_norm": 0.5184340476989746, "learning_rate": 6.261389771223636e-07, "loss": 0.0005, "num_input_tokens_seen": 32387976, "step": 32255 }, { "epoch": 17.104984093319192, "grad_norm": 0.1535530984401703, "learning_rate": 6.250184590694797e-07, "loss": 0.0091, "num_input_tokens_seen": 32392904, "step": 32260 }, { "epoch": 17.10763520678685, "grad_norm": 0.17128972709178925, "learning_rate": 6.238988776886978e-07, "loss": 0.0004, "num_input_tokens_seen": 32398440, "step": 32265 }, { "epoch": 17.110286320254506, "grad_norm": 24.262577056884766, "learning_rate": 6.227802332197125e-07, "loss": 0.0101, "num_input_tokens_seen": 32403400, "step": 32270 }, { "epoch": 17.112937433722163, "grad_norm": 0.4075717628002167, "learning_rate": 6.21662525902027e-07, "loss": 0.0648, "num_input_tokens_seen": 32408168, "step": 32275 }, { "epoch": 17.11558854718982, "grad_norm": 0.08742259442806244, "learning_rate": 6.205457559749372e-07, "loss": 0.0019, "num_input_tokens_seen": 32412808, "step": 32280 }, { "epoch": 17.118239660657476, "grad_norm": 0.17856146395206451, "learning_rate": 6.194299236775414e-07, "loss": 0.0021, "num_input_tokens_seen": 32417704, "step": 32285 }, { "epoch": 17.120890774125133, "grad_norm": 5.9212870597839355, "learning_rate": 6.183150292487361e-07, "loss": 0.0019, "num_input_tokens_seen": 32422216, "step": 32290 }, { "epoch": 17.12354188759279, "grad_norm": 0.23564781248569489, "learning_rate": 6.172010729272171e-07, "loss": 0.0065, "num_input_tokens_seen": 32427464, "step": 32295 }, { "epoch": 17.126193001060447, "grad_norm": 0.35632380843162537, "learning_rate": 6.160880549514786e-07, "loss": 0.0246, "num_input_tokens_seen": 32432744, "step": 32300 }, { "epoch": 17.128844114528103, "grad_norm": 0.018910272046923637, "learning_rate": 6.149759755598184e-07, "loss": 0.0003, "num_input_tokens_seen": 32439464, "step": 32305 }, { "epoch": 17.131495227995757, "grad_norm": 0.10208132863044739, "learning_rate": 6.138648349903264e-07, "loss": 0.004, "num_input_tokens_seen": 32445768, "step": 32310 }, { "epoch": 17.134146341463413, "grad_norm": 0.03275886923074722, "learning_rate": 6.127546334808949e-07, "loss": 0.0008, "num_input_tokens_seen": 32451304, "step": 32315 }, { "epoch": 17.13679745493107, "grad_norm": 1.4575610160827637, "learning_rate": 6.116453712692177e-07, "loss": 0.0242, "num_input_tokens_seen": 32455496, "step": 32320 }, { "epoch": 17.139448568398727, "grad_norm": 0.3431788980960846, "learning_rate": 6.10537048592782e-07, "loss": 0.0005, "num_input_tokens_seen": 32460552, "step": 32325 }, { "epoch": 17.142099681866384, "grad_norm": 8.088644027709961, "learning_rate": 6.094296656888787e-07, "loss": 0.003, "num_input_tokens_seen": 32465768, "step": 32330 }, { "epoch": 17.14475079533404, "grad_norm": 0.005173864308744669, "learning_rate": 6.083232227945951e-07, "loss": 0.0104, "num_input_tokens_seen": 32470856, "step": 32335 }, { "epoch": 17.147401908801697, "grad_norm": 0.5917767286300659, "learning_rate": 6.072177201468172e-07, "loss": 0.0008, "num_input_tokens_seen": 32476744, "step": 32340 }, { "epoch": 17.150053022269354, "grad_norm": 2.6710519790649414, "learning_rate": 6.06113157982231e-07, "loss": 0.0013, "num_input_tokens_seen": 32482152, "step": 32345 }, { "epoch": 17.15270413573701, "grad_norm": 0.10047642141580582, "learning_rate": 6.050095365373204e-07, "loss": 0.0055, "num_input_tokens_seen": 32487080, "step": 32350 }, { "epoch": 17.155355249204668, "grad_norm": 0.2624676823616028, "learning_rate": 6.039068560483663e-07, "loss": 0.0014, "num_input_tokens_seen": 32490920, "step": 32355 }, { "epoch": 17.15800636267232, "grad_norm": 0.04302990809082985, "learning_rate": 6.028051167514526e-07, "loss": 0.0011, "num_input_tokens_seen": 32494952, "step": 32360 }, { "epoch": 17.160657476139978, "grad_norm": 3.5717623233795166, "learning_rate": 6.017043188824551e-07, "loss": 0.0025, "num_input_tokens_seen": 32500040, "step": 32365 }, { "epoch": 17.163308589607635, "grad_norm": 0.3435358703136444, "learning_rate": 6.006044626770551e-07, "loss": 0.0008, "num_input_tokens_seen": 32505000, "step": 32370 }, { "epoch": 17.16595970307529, "grad_norm": 0.17680980265140533, "learning_rate": 5.995055483707274e-07, "loss": 0.0826, "num_input_tokens_seen": 32509736, "step": 32375 }, { "epoch": 17.168610816542948, "grad_norm": 1.1031464338302612, "learning_rate": 5.984075761987468e-07, "loss": 0.0102, "num_input_tokens_seen": 32514088, "step": 32380 }, { "epoch": 17.171261930010605, "grad_norm": 0.13401994109153748, "learning_rate": 5.973105463961864e-07, "loss": 0.0361, "num_input_tokens_seen": 32518792, "step": 32385 }, { "epoch": 17.17391304347826, "grad_norm": 0.03580159693956375, "learning_rate": 5.962144591979174e-07, "loss": 0.0008, "num_input_tokens_seen": 32523784, "step": 32390 }, { "epoch": 17.17656415694592, "grad_norm": 0.15794241428375244, "learning_rate": 5.95119314838608e-07, "loss": 0.0003, "num_input_tokens_seen": 32529416, "step": 32395 }, { "epoch": 17.179215270413575, "grad_norm": 3.505105972290039, "learning_rate": 5.940251135527292e-07, "loss": 0.0128, "num_input_tokens_seen": 32533640, "step": 32400 }, { "epoch": 17.18186638388123, "grad_norm": 5.557766437530518, "learning_rate": 5.929318555745422e-07, "loss": 0.0504, "num_input_tokens_seen": 32538312, "step": 32405 }, { "epoch": 17.184517497348885, "grad_norm": 0.06113162264227867, "learning_rate": 5.918395411381133e-07, "loss": 0.0002, "num_input_tokens_seen": 32543624, "step": 32410 }, { "epoch": 17.187168610816542, "grad_norm": 0.18823526799678802, "learning_rate": 5.907481704773044e-07, "loss": 0.001, "num_input_tokens_seen": 32548808, "step": 32415 }, { "epoch": 17.1898197242842, "grad_norm": 1.1125997304916382, "learning_rate": 5.896577438257739e-07, "loss": 0.0011, "num_input_tokens_seen": 32553608, "step": 32420 }, { "epoch": 17.192470837751856, "grad_norm": 0.133754163980484, "learning_rate": 5.885682614169796e-07, "loss": 0.0379, "num_input_tokens_seen": 32557768, "step": 32425 }, { "epoch": 17.195121951219512, "grad_norm": 0.21999254822731018, "learning_rate": 5.874797234841773e-07, "loss": 0.0008, "num_input_tokens_seen": 32562280, "step": 32430 }, { "epoch": 17.19777306468717, "grad_norm": 5.14351749420166, "learning_rate": 5.863921302604186e-07, "loss": 0.0019, "num_input_tokens_seen": 32566792, "step": 32435 }, { "epoch": 17.200424178154826, "grad_norm": 0.08214795589447021, "learning_rate": 5.853054819785581e-07, "loss": 0.0007, "num_input_tokens_seen": 32571784, "step": 32440 }, { "epoch": 17.203075291622483, "grad_norm": 0.021498875692486763, "learning_rate": 5.8421977887124e-07, "loss": 0.001, "num_input_tokens_seen": 32575656, "step": 32445 }, { "epoch": 17.20572640509014, "grad_norm": 0.36743250489234924, "learning_rate": 5.831350211709119e-07, "loss": 0.0005, "num_input_tokens_seen": 32580168, "step": 32450 }, { "epoch": 17.208377518557793, "grad_norm": 0.0620095431804657, "learning_rate": 5.820512091098196e-07, "loss": 0.0004, "num_input_tokens_seen": 32584392, "step": 32455 }, { "epoch": 17.21102863202545, "grad_norm": 0.09323293715715408, "learning_rate": 5.809683429200008e-07, "loss": 0.0007, "num_input_tokens_seen": 32588680, "step": 32460 }, { "epoch": 17.213679745493106, "grad_norm": 0.9325125813484192, "learning_rate": 5.798864228332979e-07, "loss": 0.0011, "num_input_tokens_seen": 32595720, "step": 32465 }, { "epoch": 17.216330858960763, "grad_norm": 78.46977996826172, "learning_rate": 5.78805449081345e-07, "loss": 0.0592, "num_input_tokens_seen": 32600328, "step": 32470 }, { "epoch": 17.21898197242842, "grad_norm": 0.4186374247074127, "learning_rate": 5.777254218955769e-07, "loss": 0.0006, "num_input_tokens_seen": 32604584, "step": 32475 }, { "epoch": 17.221633085896077, "grad_norm": 0.024964919313788414, "learning_rate": 5.766463415072237e-07, "loss": 0.0016, "num_input_tokens_seen": 32609384, "step": 32480 }, { "epoch": 17.224284199363733, "grad_norm": 0.6437589526176453, "learning_rate": 5.755682081473141e-07, "loss": 0.0005, "num_input_tokens_seen": 32614792, "step": 32485 }, { "epoch": 17.22693531283139, "grad_norm": 0.2350834161043167, "learning_rate": 5.744910220466726e-07, "loss": 0.0012, "num_input_tokens_seen": 32619848, "step": 32490 }, { "epoch": 17.229586426299047, "grad_norm": 0.014743015170097351, "learning_rate": 5.734147834359249e-07, "loss": 0.0019, "num_input_tokens_seen": 32625032, "step": 32495 }, { "epoch": 17.2322375397667, "grad_norm": 0.049476057291030884, "learning_rate": 5.723394925454867e-07, "loss": 0.001, "num_input_tokens_seen": 32630248, "step": 32500 }, { "epoch": 17.234888653234357, "grad_norm": 0.17505164444446564, "learning_rate": 5.712651496055782e-07, "loss": 0.0004, "num_input_tokens_seen": 32634376, "step": 32505 }, { "epoch": 17.237539766702014, "grad_norm": 87.54244995117188, "learning_rate": 5.701917548462122e-07, "loss": 0.2631, "num_input_tokens_seen": 32638344, "step": 32510 }, { "epoch": 17.24019088016967, "grad_norm": 0.01393040083348751, "learning_rate": 5.691193084972002e-07, "loss": 0.0002, "num_input_tokens_seen": 32642536, "step": 32515 }, { "epoch": 17.242841993637327, "grad_norm": 0.7881969809532166, "learning_rate": 5.680478107881493e-07, "loss": 0.0037, "num_input_tokens_seen": 32646600, "step": 32520 }, { "epoch": 17.245493107104984, "grad_norm": 0.09312053769826889, "learning_rate": 5.669772619484654e-07, "loss": 0.0004, "num_input_tokens_seen": 32651560, "step": 32525 }, { "epoch": 17.24814422057264, "grad_norm": 0.3376500904560089, "learning_rate": 5.659076622073484e-07, "loss": 0.0778, "num_input_tokens_seen": 32655784, "step": 32530 }, { "epoch": 17.250795334040298, "grad_norm": 0.3276366889476776, "learning_rate": 5.648390117938001e-07, "loss": 0.0003, "num_input_tokens_seen": 32660744, "step": 32535 }, { "epoch": 17.253446447507955, "grad_norm": 23.036758422851562, "learning_rate": 5.637713109366117e-07, "loss": 0.0053, "num_input_tokens_seen": 32665928, "step": 32540 }, { "epoch": 17.25609756097561, "grad_norm": 1.6385784149169922, "learning_rate": 5.627045598643782e-07, "loss": 0.0007, "num_input_tokens_seen": 32671048, "step": 32545 }, { "epoch": 17.258748674443265, "grad_norm": 0.10930468887090683, "learning_rate": 5.616387588054873e-07, "loss": 0.0011, "num_input_tokens_seen": 32677384, "step": 32550 }, { "epoch": 17.26139978791092, "grad_norm": 0.059589795768260956, "learning_rate": 5.60573907988124e-07, "loss": 0.0029, "num_input_tokens_seen": 32682344, "step": 32555 }, { "epoch": 17.264050901378578, "grad_norm": 0.06784655153751373, "learning_rate": 5.595100076402704e-07, "loss": 0.0007, "num_input_tokens_seen": 32686824, "step": 32560 }, { "epoch": 17.266702014846235, "grad_norm": 0.12667153775691986, "learning_rate": 5.584470579897045e-07, "loss": 0.0005, "num_input_tokens_seen": 32690856, "step": 32565 }, { "epoch": 17.26935312831389, "grad_norm": 0.12805508077144623, "learning_rate": 5.573850592640001e-07, "loss": 0.0006, "num_input_tokens_seen": 32696104, "step": 32570 }, { "epoch": 17.27200424178155, "grad_norm": 0.020580587908625603, "learning_rate": 5.56324011690531e-07, "loss": 0.0002, "num_input_tokens_seen": 32701224, "step": 32575 }, { "epoch": 17.274655355249205, "grad_norm": 1.014084815979004, "learning_rate": 5.552639154964623e-07, "loss": 0.0021, "num_input_tokens_seen": 32706856, "step": 32580 }, { "epoch": 17.277306468716862, "grad_norm": 0.01086399145424366, "learning_rate": 5.542047709087578e-07, "loss": 0.0012, "num_input_tokens_seen": 32712552, "step": 32585 }, { "epoch": 17.27995758218452, "grad_norm": 0.12627801299095154, "learning_rate": 5.5314657815418e-07, "loss": 0.0005, "num_input_tokens_seen": 32716648, "step": 32590 }, { "epoch": 17.282608695652176, "grad_norm": 0.46634912490844727, "learning_rate": 5.520893374592823e-07, "loss": 0.0015, "num_input_tokens_seen": 32720296, "step": 32595 }, { "epoch": 17.28525980911983, "grad_norm": 53.737239837646484, "learning_rate": 5.51033049050419e-07, "loss": 0.021, "num_input_tokens_seen": 32724200, "step": 32600 }, { "epoch": 17.287910922587486, "grad_norm": 0.607399046421051, "learning_rate": 5.499777131537376e-07, "loss": 0.0003, "num_input_tokens_seen": 32729448, "step": 32605 }, { "epoch": 17.290562036055142, "grad_norm": 0.042249664664268494, "learning_rate": 5.489233299951841e-07, "loss": 0.0011, "num_input_tokens_seen": 32734312, "step": 32610 }, { "epoch": 17.2932131495228, "grad_norm": 1.9889427423477173, "learning_rate": 5.478698998004967e-07, "loss": 0.0017, "num_input_tokens_seen": 32739784, "step": 32615 }, { "epoch": 17.295864262990456, "grad_norm": 0.22921407222747803, "learning_rate": 5.468174227952161e-07, "loss": 0.0021, "num_input_tokens_seen": 32744168, "step": 32620 }, { "epoch": 17.298515376458113, "grad_norm": 0.4058148264884949, "learning_rate": 5.457658992046705e-07, "loss": 0.0007, "num_input_tokens_seen": 32749192, "step": 32625 }, { "epoch": 17.30116648992577, "grad_norm": 0.3097795248031616, "learning_rate": 5.447153292539919e-07, "loss": 0.0046, "num_input_tokens_seen": 32755816, "step": 32630 }, { "epoch": 17.303817603393426, "grad_norm": 0.1273161917924881, "learning_rate": 5.436657131681016e-07, "loss": 0.0006, "num_input_tokens_seen": 32760648, "step": 32635 }, { "epoch": 17.306468716861083, "grad_norm": 0.22763724625110626, "learning_rate": 5.426170511717216e-07, "loss": 0.0003, "num_input_tokens_seen": 32765256, "step": 32640 }, { "epoch": 17.309119830328736, "grad_norm": 0.09574916958808899, "learning_rate": 5.415693434893677e-07, "loss": 0.0002, "num_input_tokens_seen": 32770472, "step": 32645 }, { "epoch": 17.311770943796393, "grad_norm": 0.14815792441368103, "learning_rate": 5.405225903453504e-07, "loss": 0.0007, "num_input_tokens_seen": 32775944, "step": 32650 }, { "epoch": 17.31442205726405, "grad_norm": 0.10966114699840546, "learning_rate": 5.394767919637761e-07, "loss": 0.0005, "num_input_tokens_seen": 32780680, "step": 32655 }, { "epoch": 17.317073170731707, "grad_norm": 0.02567199431359768, "learning_rate": 5.384319485685507e-07, "loss": 0.0254, "num_input_tokens_seen": 32785928, "step": 32660 }, { "epoch": 17.319724284199363, "grad_norm": 0.21418297290802002, "learning_rate": 5.373880603833686e-07, "loss": 0.0003, "num_input_tokens_seen": 32791368, "step": 32665 }, { "epoch": 17.32237539766702, "grad_norm": 0.004048480652272701, "learning_rate": 5.363451276317261e-07, "loss": 0.0001, "num_input_tokens_seen": 32795592, "step": 32670 }, { "epoch": 17.325026511134677, "grad_norm": 0.07750613242387772, "learning_rate": 5.353031505369116e-07, "loss": 0.0016, "num_input_tokens_seen": 32801480, "step": 32675 }, { "epoch": 17.327677624602334, "grad_norm": 0.23767083883285522, "learning_rate": 5.342621293220096e-07, "loss": 0.0004, "num_input_tokens_seen": 32806408, "step": 32680 }, { "epoch": 17.33032873806999, "grad_norm": 0.012560905888676643, "learning_rate": 5.332220642099001e-07, "loss": 0.0008, "num_input_tokens_seen": 32811400, "step": 32685 }, { "epoch": 17.332979851537647, "grad_norm": 0.8253949880599976, "learning_rate": 5.321829554232583e-07, "loss": 0.0299, "num_input_tokens_seen": 32816040, "step": 32690 }, { "epoch": 17.3356309650053, "grad_norm": 1.0115340948104858, "learning_rate": 5.311448031845534e-07, "loss": 0.0016, "num_input_tokens_seen": 32820040, "step": 32695 }, { "epoch": 17.338282078472957, "grad_norm": 0.09602826833724976, "learning_rate": 5.30107607716055e-07, "loss": 0.0008, "num_input_tokens_seen": 32824968, "step": 32700 }, { "epoch": 17.340933191940614, "grad_norm": 0.22183391451835632, "learning_rate": 5.290713692398186e-07, "loss": 0.0004, "num_input_tokens_seen": 32829544, "step": 32705 }, { "epoch": 17.34358430540827, "grad_norm": 5.19976282119751, "learning_rate": 5.280360879777041e-07, "loss": 0.0014, "num_input_tokens_seen": 32835080, "step": 32710 }, { "epoch": 17.346235418875928, "grad_norm": 0.6723707914352417, "learning_rate": 5.270017641513608e-07, "loss": 0.0017, "num_input_tokens_seen": 32839464, "step": 32715 }, { "epoch": 17.348886532343585, "grad_norm": 0.1780579537153244, "learning_rate": 5.259683979822356e-07, "loss": 0.0012, "num_input_tokens_seen": 32845128, "step": 32720 }, { "epoch": 17.35153764581124, "grad_norm": 0.14571227133274078, "learning_rate": 5.24935989691569e-07, "loss": 0.0004, "num_input_tokens_seen": 32851720, "step": 32725 }, { "epoch": 17.354188759278898, "grad_norm": 0.07402221113443375, "learning_rate": 5.239045395003967e-07, "loss": 0.0007, "num_input_tokens_seen": 32856328, "step": 32730 }, { "epoch": 17.356839872746555, "grad_norm": 0.5451220273971558, "learning_rate": 5.228740476295502e-07, "loss": 0.0025, "num_input_tokens_seen": 32862344, "step": 32735 }, { "epoch": 17.35949098621421, "grad_norm": 0.09660356491804123, "learning_rate": 5.218445142996542e-07, "loss": 0.104, "num_input_tokens_seen": 32867816, "step": 32740 }, { "epoch": 17.362142099681865, "grad_norm": 0.44020524621009827, "learning_rate": 5.208159397311297e-07, "loss": 0.0024, "num_input_tokens_seen": 32872840, "step": 32745 }, { "epoch": 17.36479321314952, "grad_norm": 0.34228208661079407, "learning_rate": 5.197883241441903e-07, "loss": 0.0011, "num_input_tokens_seen": 32877544, "step": 32750 }, { "epoch": 17.36744432661718, "grad_norm": 0.053475867956876755, "learning_rate": 5.187616677588497e-07, "loss": 0.0007, "num_input_tokens_seen": 32882696, "step": 32755 }, { "epoch": 17.370095440084835, "grad_norm": 0.019271424040198326, "learning_rate": 5.177359707949076e-07, "loss": 0.0146, "num_input_tokens_seen": 32887368, "step": 32760 }, { "epoch": 17.372746553552492, "grad_norm": 46.52153396606445, "learning_rate": 5.167112334719665e-07, "loss": 0.0522, "num_input_tokens_seen": 32892648, "step": 32765 }, { "epoch": 17.37539766702015, "grad_norm": 0.08793789893388748, "learning_rate": 5.156874560094183e-07, "loss": 0.0007, "num_input_tokens_seen": 32897224, "step": 32770 }, { "epoch": 17.378048780487806, "grad_norm": 0.44800737500190735, "learning_rate": 5.146646386264515e-07, "loss": 0.0007, "num_input_tokens_seen": 32902568, "step": 32775 }, { "epoch": 17.380699893955462, "grad_norm": 0.05676779896020889, "learning_rate": 5.136427815420491e-07, "loss": 0.0003, "num_input_tokens_seen": 32908040, "step": 32780 }, { "epoch": 17.38335100742312, "grad_norm": 0.1476745456457138, "learning_rate": 5.12621884974987e-07, "loss": 0.001, "num_input_tokens_seen": 32912296, "step": 32785 }, { "epoch": 17.386002120890772, "grad_norm": 0.021385133266448975, "learning_rate": 5.116019491438362e-07, "loss": 0.0709, "num_input_tokens_seen": 32917032, "step": 32790 }, { "epoch": 17.38865323435843, "grad_norm": 0.011581776663661003, "learning_rate": 5.105829742669654e-07, "loss": 0.0032, "num_input_tokens_seen": 32921640, "step": 32795 }, { "epoch": 17.391304347826086, "grad_norm": 0.07401379942893982, "learning_rate": 5.095649605625303e-07, "loss": 0.0006, "num_input_tokens_seen": 32926664, "step": 32800 }, { "epoch": 17.393955461293743, "grad_norm": 0.7545833587646484, "learning_rate": 5.085479082484873e-07, "loss": 0.0005, "num_input_tokens_seen": 32931272, "step": 32805 }, { "epoch": 17.3966065747614, "grad_norm": 0.2597194015979767, "learning_rate": 5.075318175425848e-07, "loss": 0.0006, "num_input_tokens_seen": 32936456, "step": 32810 }, { "epoch": 17.399257688229056, "grad_norm": 0.07741229981184006, "learning_rate": 5.06516688662364e-07, "loss": 0.0004, "num_input_tokens_seen": 32940840, "step": 32815 }, { "epoch": 17.401908801696713, "grad_norm": 0.5216916799545288, "learning_rate": 5.055025218251625e-07, "loss": 0.0006, "num_input_tokens_seen": 32945736, "step": 32820 }, { "epoch": 17.40455991516437, "grad_norm": 33.90635299682617, "learning_rate": 5.044893172481097e-07, "loss": 0.01, "num_input_tokens_seen": 32950408, "step": 32825 }, { "epoch": 17.407211028632027, "grad_norm": 0.2298741340637207, "learning_rate": 5.034770751481294e-07, "loss": 0.0005, "num_input_tokens_seen": 32955528, "step": 32830 }, { "epoch": 17.409862142099684, "grad_norm": 0.5528882145881653, "learning_rate": 5.024657957419432e-07, "loss": 0.0007, "num_input_tokens_seen": 32959464, "step": 32835 }, { "epoch": 17.412513255567337, "grad_norm": 0.03197187930345535, "learning_rate": 5.014554792460596e-07, "loss": 0.0004, "num_input_tokens_seen": 32964040, "step": 32840 }, { "epoch": 17.415164369034994, "grad_norm": 0.23500461876392365, "learning_rate": 5.004461258767873e-07, "loss": 0.0008, "num_input_tokens_seen": 32968936, "step": 32845 }, { "epoch": 17.41781548250265, "grad_norm": 2.579470634460449, "learning_rate": 4.99437735850225e-07, "loss": 0.0109, "num_input_tokens_seen": 32973128, "step": 32850 }, { "epoch": 17.420466595970307, "grad_norm": 0.3599911034107208, "learning_rate": 4.984303093822668e-07, "loss": 0.1202, "num_input_tokens_seen": 32977928, "step": 32855 }, { "epoch": 17.423117709437964, "grad_norm": 2.417663097381592, "learning_rate": 4.974238466886e-07, "loss": 0.002, "num_input_tokens_seen": 32983432, "step": 32860 }, { "epoch": 17.42576882290562, "grad_norm": 0.3011065721511841, "learning_rate": 4.964183479847056e-07, "loss": 0.001, "num_input_tokens_seen": 32987656, "step": 32865 }, { "epoch": 17.428419936373277, "grad_norm": 0.011659051291644573, "learning_rate": 4.954138134858583e-07, "loss": 0.0053, "num_input_tokens_seen": 32993000, "step": 32870 }, { "epoch": 17.431071049840934, "grad_norm": 2.2319347858428955, "learning_rate": 4.944102434071263e-07, "loss": 0.0019, "num_input_tokens_seen": 32999208, "step": 32875 }, { "epoch": 17.43372216330859, "grad_norm": 0.5226317644119263, "learning_rate": 4.934076379633707e-07, "loss": 0.0004, "num_input_tokens_seen": 33004616, "step": 32880 }, { "epoch": 17.436373276776244, "grad_norm": 0.010022146627306938, "learning_rate": 4.924059973692468e-07, "loss": 0.0003, "num_input_tokens_seen": 33009352, "step": 32885 }, { "epoch": 17.4390243902439, "grad_norm": 0.3593021631240845, "learning_rate": 4.914053218392051e-07, "loss": 0.0004, "num_input_tokens_seen": 33014536, "step": 32890 }, { "epoch": 17.441675503711558, "grad_norm": 0.13365064561367035, "learning_rate": 4.90405611587485e-07, "loss": 0.0011, "num_input_tokens_seen": 33018824, "step": 32895 }, { "epoch": 17.444326617179215, "grad_norm": 0.08741237968206406, "learning_rate": 4.894068668281232e-07, "loss": 0.0005, "num_input_tokens_seen": 33024200, "step": 32900 }, { "epoch": 17.44697773064687, "grad_norm": 2.0343165397644043, "learning_rate": 4.884090877749486e-07, "loss": 0.0012, "num_input_tokens_seen": 33029640, "step": 32905 }, { "epoch": 17.449628844114528, "grad_norm": 0.016437308862805367, "learning_rate": 4.874122746415827e-07, "loss": 0.0053, "num_input_tokens_seen": 33034728, "step": 32910 }, { "epoch": 17.452279957582185, "grad_norm": 0.6677172780036926, "learning_rate": 4.864164276414407e-07, "loss": 0.0004, "num_input_tokens_seen": 33040264, "step": 32915 }, { "epoch": 17.45493107104984, "grad_norm": 0.40395164489746094, "learning_rate": 4.854215469877305e-07, "loss": 0.0028, "num_input_tokens_seen": 33044584, "step": 32920 }, { "epoch": 17.4575821845175, "grad_norm": 0.25548255443573, "learning_rate": 4.844276328934527e-07, "loss": 0.0015, "num_input_tokens_seen": 33050472, "step": 32925 }, { "epoch": 17.460233297985155, "grad_norm": 4.1540422439575195, "learning_rate": 4.834346855714045e-07, "loss": 0.0017, "num_input_tokens_seen": 33055304, "step": 32930 }, { "epoch": 17.46288441145281, "grad_norm": 0.11884895712137222, "learning_rate": 4.824427052341696e-07, "loss": 0.0033, "num_input_tokens_seen": 33059400, "step": 32935 }, { "epoch": 17.465535524920465, "grad_norm": 0.9259098172187805, "learning_rate": 4.814516920941309e-07, "loss": 0.0009, "num_input_tokens_seen": 33064776, "step": 32940 }, { "epoch": 17.468186638388122, "grad_norm": 0.015140670351684093, "learning_rate": 4.804616463634615e-07, "loss": 0.0024, "num_input_tokens_seen": 33069960, "step": 32945 }, { "epoch": 17.47083775185578, "grad_norm": 0.35730698704719543, "learning_rate": 4.794725682541268e-07, "loss": 0.0002, "num_input_tokens_seen": 33074152, "step": 32950 }, { "epoch": 17.473488865323436, "grad_norm": 3.068587303161621, "learning_rate": 4.784844579778863e-07, "loss": 0.0012, "num_input_tokens_seen": 33077960, "step": 32955 }, { "epoch": 17.476139978791092, "grad_norm": 0.19166041910648346, "learning_rate": 4.77497315746292e-07, "loss": 0.1105, "num_input_tokens_seen": 33082984, "step": 32960 }, { "epoch": 17.47879109225875, "grad_norm": 0.1433809995651245, "learning_rate": 4.7651114177068694e-07, "loss": 0.0023, "num_input_tokens_seen": 33088872, "step": 32965 }, { "epoch": 17.481442205726406, "grad_norm": 1.3905638456344604, "learning_rate": 4.7552593626221165e-07, "loss": 0.0006, "num_input_tokens_seen": 33093704, "step": 32970 }, { "epoch": 17.484093319194063, "grad_norm": 0.2087976336479187, "learning_rate": 4.745416994317925e-07, "loss": 0.0006, "num_input_tokens_seen": 33098600, "step": 32975 }, { "epoch": 17.48674443266172, "grad_norm": 0.23021742701530457, "learning_rate": 4.7355843149015424e-07, "loss": 0.0003, "num_input_tokens_seen": 33103400, "step": 32980 }, { "epoch": 17.489395546129373, "grad_norm": 0.1857633739709854, "learning_rate": 4.7257613264781123e-07, "loss": 0.0003, "num_input_tokens_seen": 33107720, "step": 32985 }, { "epoch": 17.49204665959703, "grad_norm": 1.8139036893844604, "learning_rate": 4.7159480311507134e-07, "loss": 0.0337, "num_input_tokens_seen": 33111912, "step": 32990 }, { "epoch": 17.494697773064686, "grad_norm": 0.13189247250556946, "learning_rate": 4.706144431020343e-07, "loss": 0.0004, "num_input_tokens_seen": 33117448, "step": 32995 }, { "epoch": 17.497348886532343, "grad_norm": 0.041968997567892075, "learning_rate": 4.6963505281859277e-07, "loss": 0.0008, "num_input_tokens_seen": 33122312, "step": 33000 }, { "epoch": 17.5, "grad_norm": 0.5497397780418396, "learning_rate": 4.686566324744318e-07, "loss": 0.0009, "num_input_tokens_seen": 33128776, "step": 33005 }, { "epoch": 17.502651113467657, "grad_norm": 5.314942836761475, "learning_rate": 4.676791822790283e-07, "loss": 0.0018, "num_input_tokens_seen": 33134504, "step": 33010 }, { "epoch": 17.505302226935314, "grad_norm": 0.13244274258613586, "learning_rate": 4.6670270244165206e-07, "loss": 0.0003, "num_input_tokens_seen": 33140520, "step": 33015 }, { "epoch": 17.50795334040297, "grad_norm": 37.7933464050293, "learning_rate": 4.657271931713636e-07, "loss": 0.0351, "num_input_tokens_seen": 33145160, "step": 33020 }, { "epoch": 17.510604453870627, "grad_norm": 0.016498122364282608, "learning_rate": 4.647526546770204e-07, "loss": 0.0003, "num_input_tokens_seen": 33150472, "step": 33025 }, { "epoch": 17.51325556733828, "grad_norm": 0.23886778950691223, "learning_rate": 4.637790871672643e-07, "loss": 0.0004, "num_input_tokens_seen": 33154760, "step": 33030 }, { "epoch": 17.515906680805937, "grad_norm": 0.07885546237230301, "learning_rate": 4.628064908505364e-07, "loss": 0.0017, "num_input_tokens_seen": 33159080, "step": 33035 }, { "epoch": 17.518557794273594, "grad_norm": 0.8546721339225769, "learning_rate": 4.618348659350663e-07, "loss": 0.0008, "num_input_tokens_seen": 33164008, "step": 33040 }, { "epoch": 17.52120890774125, "grad_norm": 0.22514007985591888, "learning_rate": 4.6086421262887595e-07, "loss": 0.0004, "num_input_tokens_seen": 33168968, "step": 33045 }, { "epoch": 17.523860021208908, "grad_norm": 0.25970977544784546, "learning_rate": 4.598945311397801e-07, "loss": 0.0003, "num_input_tokens_seen": 33173416, "step": 33050 }, { "epoch": 17.526511134676564, "grad_norm": 0.11462642252445221, "learning_rate": 4.5892582167538503e-07, "loss": 0.0003, "num_input_tokens_seen": 33178056, "step": 33055 }, { "epoch": 17.52916224814422, "grad_norm": 0.011864442378282547, "learning_rate": 4.5795808444308753e-07, "loss": 0.0002, "num_input_tokens_seen": 33182536, "step": 33060 }, { "epoch": 17.531813361611878, "grad_norm": 0.03741854429244995, "learning_rate": 4.5699131965008083e-07, "loss": 0.0007, "num_input_tokens_seen": 33187208, "step": 33065 }, { "epoch": 17.534464475079535, "grad_norm": 26.75981903076172, "learning_rate": 4.560255275033426e-07, "loss": 0.0093, "num_input_tokens_seen": 33192360, "step": 33070 }, { "epoch": 17.53711558854719, "grad_norm": 1.3627650737762451, "learning_rate": 4.5506070820964973e-07, "loss": 0.0006, "num_input_tokens_seen": 33197096, "step": 33075 }, { "epoch": 17.539766702014845, "grad_norm": 0.19458691775798798, "learning_rate": 4.5409686197556646e-07, "loss": 0.0008, "num_input_tokens_seen": 33201992, "step": 33080 }, { "epoch": 17.5424178154825, "grad_norm": 0.7117621302604675, "learning_rate": 4.5313398900744934e-07, "loss": 0.0008, "num_input_tokens_seen": 33205672, "step": 33085 }, { "epoch": 17.54506892895016, "grad_norm": 0.1266908198595047, "learning_rate": 4.521720895114479e-07, "loss": 0.0005, "num_input_tokens_seen": 33210504, "step": 33090 }, { "epoch": 17.547720042417815, "grad_norm": 0.0891411304473877, "learning_rate": 4.512111636935007e-07, "loss": 0.0011, "num_input_tokens_seen": 33215560, "step": 33095 }, { "epoch": 17.550371155885472, "grad_norm": 0.11362579464912415, "learning_rate": 4.5025121175933994e-07, "loss": 0.0003, "num_input_tokens_seen": 33221160, "step": 33100 }, { "epoch": 17.55302226935313, "grad_norm": 0.44821450114250183, "learning_rate": 4.4929223391449115e-07, "loss": 0.0007, "num_input_tokens_seen": 33225352, "step": 33105 }, { "epoch": 17.555673382820785, "grad_norm": 0.08621164411306381, "learning_rate": 4.483342303642652e-07, "loss": 0.0005, "num_input_tokens_seen": 33229736, "step": 33110 }, { "epoch": 17.558324496288442, "grad_norm": 0.025370849296450615, "learning_rate": 4.473772013137706e-07, "loss": 0.0003, "num_input_tokens_seen": 33236424, "step": 33115 }, { "epoch": 17.5609756097561, "grad_norm": 0.16114887595176697, "learning_rate": 4.4642114696790473e-07, "loss": 0.0016, "num_input_tokens_seen": 33241160, "step": 33120 }, { "epoch": 17.563626723223756, "grad_norm": 74.55868530273438, "learning_rate": 4.4546606753135547e-07, "loss": 0.0222, "num_input_tokens_seen": 33246696, "step": 33125 }, { "epoch": 17.56627783669141, "grad_norm": 0.1410301774740219, "learning_rate": 4.445119632086031e-07, "loss": 0.0004, "num_input_tokens_seen": 33252104, "step": 33130 }, { "epoch": 17.568928950159066, "grad_norm": 5.619335174560547, "learning_rate": 4.435588342039188e-07, "loss": 0.0161, "num_input_tokens_seen": 33257416, "step": 33135 }, { "epoch": 17.571580063626723, "grad_norm": 7.171798229217529, "learning_rate": 4.426066807213653e-07, "loss": 0.0025, "num_input_tokens_seen": 33262312, "step": 33140 }, { "epoch": 17.57423117709438, "grad_norm": 0.19349579513072968, "learning_rate": 4.4165550296479566e-07, "loss": 0.0002, "num_input_tokens_seen": 33267304, "step": 33145 }, { "epoch": 17.576882290562036, "grad_norm": 0.8592365384101868, "learning_rate": 4.407053011378554e-07, "loss": 0.0007, "num_input_tokens_seen": 33272008, "step": 33150 }, { "epoch": 17.579533404029693, "grad_norm": 0.7833792567253113, "learning_rate": 4.3975607544397837e-07, "loss": 0.0013, "num_input_tokens_seen": 33277256, "step": 33155 }, { "epoch": 17.58218451749735, "grad_norm": 0.16090045869350433, "learning_rate": 4.388078260863943e-07, "loss": 0.0011, "num_input_tokens_seen": 33281896, "step": 33160 }, { "epoch": 17.584835630965006, "grad_norm": 0.09413482248783112, "learning_rate": 4.3786055326811793e-07, "loss": 0.0084, "num_input_tokens_seen": 33288456, "step": 33165 }, { "epoch": 17.587486744432663, "grad_norm": 0.19191916286945343, "learning_rate": 4.369142571919599e-07, "loss": 0.0002, "num_input_tokens_seen": 33292872, "step": 33170 }, { "epoch": 17.590137857900316, "grad_norm": 0.5552329421043396, "learning_rate": 4.3596893806051867e-07, "loss": 0.0029, "num_input_tokens_seen": 33297768, "step": 33175 }, { "epoch": 17.592788971367973, "grad_norm": 0.20447058975696564, "learning_rate": 4.350245960761851e-07, "loss": 0.0006, "num_input_tokens_seen": 33302408, "step": 33180 }, { "epoch": 17.59544008483563, "grad_norm": 44.11792755126953, "learning_rate": 4.340812314411397e-07, "loss": 0.0088, "num_input_tokens_seen": 33309064, "step": 33185 }, { "epoch": 17.598091198303287, "grad_norm": 0.17334610223770142, "learning_rate": 4.3313884435735586e-07, "loss": 0.0005, "num_input_tokens_seen": 33314184, "step": 33190 }, { "epoch": 17.600742311770944, "grad_norm": 0.052625447511672974, "learning_rate": 4.3219743502659395e-07, "loss": 0.0036, "num_input_tokens_seen": 33319368, "step": 33195 }, { "epoch": 17.6033934252386, "grad_norm": 0.31228166818618774, "learning_rate": 4.3125700365041044e-07, "loss": 0.0004, "num_input_tokens_seen": 33324104, "step": 33200 }, { "epoch": 17.606044538706257, "grad_norm": 118.21589660644531, "learning_rate": 4.303175504301449e-07, "loss": 0.1788, "num_input_tokens_seen": 33329480, "step": 33205 }, { "epoch": 17.608695652173914, "grad_norm": 0.14867275953292847, "learning_rate": 4.293790755669353e-07, "loss": 0.0005, "num_input_tokens_seen": 33334248, "step": 33210 }, { "epoch": 17.61134676564157, "grad_norm": 3.731327772140503, "learning_rate": 4.284415792617058e-07, "loss": 0.0011, "num_input_tokens_seen": 33342312, "step": 33215 }, { "epoch": 17.613997879109228, "grad_norm": 0.7274947166442871, "learning_rate": 4.27505061715171e-07, "loss": 0.0028, "num_input_tokens_seen": 33347432, "step": 33220 }, { "epoch": 17.61664899257688, "grad_norm": 0.0067623634822666645, "learning_rate": 4.265695231278372e-07, "loss": 0.0008, "num_input_tokens_seen": 33352200, "step": 33225 }, { "epoch": 17.619300106044538, "grad_norm": 4.7649383544921875, "learning_rate": 4.2563496370000236e-07, "loss": 0.0052, "num_input_tokens_seen": 33356680, "step": 33230 }, { "epoch": 17.621951219512194, "grad_norm": 0.11327718943357468, "learning_rate": 4.247013836317504e-07, "loss": 0.0054, "num_input_tokens_seen": 33361768, "step": 33235 }, { "epoch": 17.62460233297985, "grad_norm": 1.4809846878051758, "learning_rate": 4.237687831229609e-07, "loss": 0.0007, "num_input_tokens_seen": 33366952, "step": 33240 }, { "epoch": 17.627253446447508, "grad_norm": 2.281461715698242, "learning_rate": 4.228371623732996e-07, "loss": 0.0007, "num_input_tokens_seen": 33372360, "step": 33245 }, { "epoch": 17.629904559915165, "grad_norm": 0.03125125542283058, "learning_rate": 4.219065215822249e-07, "loss": 0.0004, "num_input_tokens_seen": 33377672, "step": 33250 }, { "epoch": 17.63255567338282, "grad_norm": 49.9979248046875, "learning_rate": 4.2097686094898447e-07, "loss": 0.05, "num_input_tokens_seen": 33382216, "step": 33255 }, { "epoch": 17.63520678685048, "grad_norm": 0.11940865218639374, "learning_rate": 4.2004818067261533e-07, "loss": 0.0006, "num_input_tokens_seen": 33387528, "step": 33260 }, { "epoch": 17.637857900318135, "grad_norm": 0.06002857908606529, "learning_rate": 4.1912048095194613e-07, "loss": 0.0001, "num_input_tokens_seen": 33392776, "step": 33265 }, { "epoch": 17.64050901378579, "grad_norm": 0.07746526598930359, "learning_rate": 4.1819376198559637e-07, "loss": 0.0022, "num_input_tokens_seen": 33397992, "step": 33270 }, { "epoch": 17.643160127253445, "grad_norm": 0.7485097646713257, "learning_rate": 4.1726802397197174e-07, "loss": 0.0008, "num_input_tokens_seen": 33404712, "step": 33275 }, { "epoch": 17.645811240721102, "grad_norm": 0.006529390811920166, "learning_rate": 4.16343267109271e-07, "loss": 0.0005, "num_input_tokens_seen": 33409832, "step": 33280 }, { "epoch": 17.64846235418876, "grad_norm": 5.031668663024902, "learning_rate": 4.15419491595484e-07, "loss": 0.0018, "num_input_tokens_seen": 33416168, "step": 33285 }, { "epoch": 17.651113467656415, "grad_norm": 0.18040642142295837, "learning_rate": 4.144966976283854e-07, "loss": 0.0027, "num_input_tokens_seen": 33420968, "step": 33290 }, { "epoch": 17.653764581124072, "grad_norm": 0.14531949162483215, "learning_rate": 4.1357488540554545e-07, "loss": 0.0013, "num_input_tokens_seen": 33425480, "step": 33295 }, { "epoch": 17.65641569459173, "grad_norm": 0.08197051286697388, "learning_rate": 4.1265405512432133e-07, "loss": 0.0003, "num_input_tokens_seen": 33430440, "step": 33300 }, { "epoch": 17.659066808059386, "grad_norm": 0.6515100002288818, "learning_rate": 4.1173420698186027e-07, "loss": 0.0006, "num_input_tokens_seen": 33435784, "step": 33305 }, { "epoch": 17.661717921527043, "grad_norm": 0.7997526526451111, "learning_rate": 4.1081534117509924e-07, "loss": 0.0005, "num_input_tokens_seen": 33440552, "step": 33310 }, { "epoch": 17.6643690349947, "grad_norm": 16.556791305541992, "learning_rate": 4.098974579007647e-07, "loss": 0.005, "num_input_tokens_seen": 33446216, "step": 33315 }, { "epoch": 17.667020148462353, "grad_norm": 0.5069971084594727, "learning_rate": 4.089805573553729e-07, "loss": 0.0789, "num_input_tokens_seen": 33450568, "step": 33320 }, { "epoch": 17.66967126193001, "grad_norm": 0.11825944483280182, "learning_rate": 4.080646397352317e-07, "loss": 0.0006, "num_input_tokens_seen": 33456328, "step": 33325 }, { "epoch": 17.672322375397666, "grad_norm": 0.17608119547367096, "learning_rate": 4.0714970523643313e-07, "loss": 0.0007, "num_input_tokens_seen": 33461192, "step": 33330 }, { "epoch": 17.674973488865323, "grad_norm": 0.09729219228029251, "learning_rate": 4.0623575405486616e-07, "loss": 0.0896, "num_input_tokens_seen": 33465960, "step": 33335 }, { "epoch": 17.67762460233298, "grad_norm": 0.0943775326013565, "learning_rate": 4.0532278638620306e-07, "loss": 0.0004, "num_input_tokens_seen": 33471048, "step": 33340 }, { "epoch": 17.680275715800637, "grad_norm": 0.03928414732217789, "learning_rate": 4.04410802425908e-07, "loss": 0.0015, "num_input_tokens_seen": 33475816, "step": 33345 }, { "epoch": 17.682926829268293, "grad_norm": 0.4235309064388275, "learning_rate": 4.034998023692355e-07, "loss": 0.0004, "num_input_tokens_seen": 33481480, "step": 33350 }, { "epoch": 17.68557794273595, "grad_norm": 1.0025110244750977, "learning_rate": 4.025897864112266e-07, "loss": 0.0008, "num_input_tokens_seen": 33486248, "step": 33355 }, { "epoch": 17.688229056203607, "grad_norm": 0.039649598300457, "learning_rate": 4.016807547467139e-07, "loss": 0.0004, "num_input_tokens_seen": 33492296, "step": 33360 }, { "epoch": 17.690880169671264, "grad_norm": 0.41042646765708923, "learning_rate": 4.007727075703205e-07, "loss": 0.0004, "num_input_tokens_seen": 33497352, "step": 33365 }, { "epoch": 17.693531283138917, "grad_norm": 0.04453027993440628, "learning_rate": 3.9986564507645433e-07, "loss": 0.0212, "num_input_tokens_seen": 33503304, "step": 33370 }, { "epoch": 17.696182396606574, "grad_norm": 8.218605041503906, "learning_rate": 3.989595674593161e-07, "loss": 0.0019, "num_input_tokens_seen": 33508296, "step": 33375 }, { "epoch": 17.69883351007423, "grad_norm": 0.5407078266143799, "learning_rate": 3.980544749128956e-07, "loss": 0.0005, "num_input_tokens_seen": 33513448, "step": 33380 }, { "epoch": 17.701484623541887, "grad_norm": 0.06839022785425186, "learning_rate": 3.971503676309696e-07, "loss": 0.0003, "num_input_tokens_seen": 33518952, "step": 33385 }, { "epoch": 17.704135737009544, "grad_norm": 0.23775345087051392, "learning_rate": 3.962472458071054e-07, "loss": 0.0006, "num_input_tokens_seen": 33523240, "step": 33390 }, { "epoch": 17.7067868504772, "grad_norm": 34.64407730102539, "learning_rate": 3.9534510963465886e-07, "loss": 0.0082, "num_input_tokens_seen": 33527496, "step": 33395 }, { "epoch": 17.709437963944858, "grad_norm": 0.02094118483364582, "learning_rate": 3.9444395930677505e-07, "loss": 0.0005, "num_input_tokens_seen": 33533128, "step": 33400 }, { "epoch": 17.712089077412514, "grad_norm": 0.16866780817508698, "learning_rate": 3.9354379501638894e-07, "loss": 0.0004, "num_input_tokens_seen": 33538536, "step": 33405 }, { "epoch": 17.71474019088017, "grad_norm": 0.20016136765480042, "learning_rate": 3.92644616956222e-07, "loss": 0.0033, "num_input_tokens_seen": 33543880, "step": 33410 }, { "epoch": 17.717391304347824, "grad_norm": 0.022674396634101868, "learning_rate": 3.9174642531878524e-07, "loss": 0.0177, "num_input_tokens_seen": 33548744, "step": 33415 }, { "epoch": 17.72004241781548, "grad_norm": 0.05591066926717758, "learning_rate": 3.908492202963826e-07, "loss": 0.0017, "num_input_tokens_seen": 33553448, "step": 33420 }, { "epoch": 17.722693531283138, "grad_norm": 0.7220686078071594, "learning_rate": 3.8995300208109875e-07, "loss": 0.0004, "num_input_tokens_seen": 33557896, "step": 33425 }, { "epoch": 17.725344644750795, "grad_norm": 0.1842021644115448, "learning_rate": 3.890577708648147e-07, "loss": 0.0004, "num_input_tokens_seen": 33565576, "step": 33430 }, { "epoch": 17.72799575821845, "grad_norm": 0.005175804253667593, "learning_rate": 3.881635268391959e-07, "loss": 0.0123, "num_input_tokens_seen": 33569480, "step": 33435 }, { "epoch": 17.73064687168611, "grad_norm": 0.22270707786083221, "learning_rate": 3.8727027019569873e-07, "loss": 0.0004, "num_input_tokens_seen": 33574536, "step": 33440 }, { "epoch": 17.733297985153765, "grad_norm": 1.2470476627349854, "learning_rate": 3.8637800112556633e-07, "loss": 0.0011, "num_input_tokens_seen": 33579112, "step": 33445 }, { "epoch": 17.735949098621422, "grad_norm": 0.008742423728108406, "learning_rate": 3.8548671981983086e-07, "loss": 0.0002, "num_input_tokens_seen": 33583720, "step": 33450 }, { "epoch": 17.73860021208908, "grad_norm": 0.11541569977998734, "learning_rate": 3.8459642646931305e-07, "loss": 0.0006, "num_input_tokens_seen": 33588264, "step": 33455 }, { "epoch": 17.741251325556735, "grad_norm": 0.11795613169670105, "learning_rate": 3.837071212646248e-07, "loss": 0.0015, "num_input_tokens_seen": 33592840, "step": 33460 }, { "epoch": 17.74390243902439, "grad_norm": 0.6372877359390259, "learning_rate": 3.828188043961606e-07, "loss": 0.0009, "num_input_tokens_seen": 33597096, "step": 33465 }, { "epoch": 17.746553552492045, "grad_norm": 3.3562211990356445, "learning_rate": 3.819314760541093e-07, "loss": 0.0016, "num_input_tokens_seen": 33602888, "step": 33470 }, { "epoch": 17.749204665959702, "grad_norm": 14.845193862915039, "learning_rate": 3.8104513642844454e-07, "loss": 0.0682, "num_input_tokens_seen": 33607432, "step": 33475 }, { "epoch": 17.75185577942736, "grad_norm": 1.067484736442566, "learning_rate": 3.801597857089295e-07, "loss": 0.0005, "num_input_tokens_seen": 33612808, "step": 33480 }, { "epoch": 17.754506892895016, "grad_norm": 0.22854666411876678, "learning_rate": 3.792754240851154e-07, "loss": 0.0214, "num_input_tokens_seen": 33617640, "step": 33485 }, { "epoch": 17.757158006362673, "grad_norm": 0.05082547664642334, "learning_rate": 3.7839205174634187e-07, "loss": 0.0004, "num_input_tokens_seen": 33621992, "step": 33490 }, { "epoch": 17.75980911983033, "grad_norm": 0.01279719639569521, "learning_rate": 3.775096688817359e-07, "loss": 0.0005, "num_input_tokens_seen": 33628168, "step": 33495 }, { "epoch": 17.762460233297986, "grad_norm": 0.21985022723674774, "learning_rate": 3.766282756802153e-07, "loss": 0.0888, "num_input_tokens_seen": 33633032, "step": 33500 }, { "epoch": 17.765111346765643, "grad_norm": 0.057437166571617126, "learning_rate": 3.7574787233048136e-07, "loss": 0.0892, "num_input_tokens_seen": 33637896, "step": 33505 }, { "epoch": 17.7677624602333, "grad_norm": 0.4685029089450836, "learning_rate": 3.748684590210283e-07, "loss": 0.1147, "num_input_tokens_seen": 33643720, "step": 33510 }, { "epoch": 17.770413573700953, "grad_norm": 0.2653786838054657, "learning_rate": 3.739900359401355e-07, "loss": 0.0002, "num_input_tokens_seen": 33648360, "step": 33515 }, { "epoch": 17.77306468716861, "grad_norm": 0.4009827971458435, "learning_rate": 3.7311260327587085e-07, "loss": 0.0826, "num_input_tokens_seen": 33653064, "step": 33520 }, { "epoch": 17.775715800636267, "grad_norm": 1.7932926416397095, "learning_rate": 3.722361612160902e-07, "loss": 0.066, "num_input_tokens_seen": 33659560, "step": 33525 }, { "epoch": 17.778366914103923, "grad_norm": 0.5638794898986816, "learning_rate": 3.7136070994843845e-07, "loss": 0.0004, "num_input_tokens_seen": 33663592, "step": 33530 }, { "epoch": 17.78101802757158, "grad_norm": 0.3187740743160248, "learning_rate": 3.7048624966034506e-07, "loss": 0.0004, "num_input_tokens_seen": 33668296, "step": 33535 }, { "epoch": 17.783669141039237, "grad_norm": 0.0676000788807869, "learning_rate": 3.6961278053903373e-07, "loss": 0.0001, "num_input_tokens_seen": 33673096, "step": 33540 }, { "epoch": 17.786320254506894, "grad_norm": 0.05669226869940758, "learning_rate": 3.687403027715075e-07, "loss": 0.0015, "num_input_tokens_seen": 33678472, "step": 33545 }, { "epoch": 17.78897136797455, "grad_norm": 0.4310091733932495, "learning_rate": 3.6786881654456486e-07, "loss": 0.005, "num_input_tokens_seen": 33683688, "step": 33550 }, { "epoch": 17.791622481442207, "grad_norm": 0.08502934128046036, "learning_rate": 3.6699832204478813e-07, "loss": 0.0008, "num_input_tokens_seen": 33687848, "step": 33555 }, { "epoch": 17.79427359490986, "grad_norm": 0.10194911062717438, "learning_rate": 3.6612881945854606e-07, "loss": 0.0003, "num_input_tokens_seen": 33692840, "step": 33560 }, { "epoch": 17.796924708377517, "grad_norm": 0.15982061624526978, "learning_rate": 3.652603089719986e-07, "loss": 0.0013, "num_input_tokens_seen": 33697864, "step": 33565 }, { "epoch": 17.799575821845174, "grad_norm": 1.3649799823760986, "learning_rate": 3.643927907710909e-07, "loss": 0.001, "num_input_tokens_seen": 33701992, "step": 33570 }, { "epoch": 17.80222693531283, "grad_norm": 0.029417825862765312, "learning_rate": 3.6352626504155663e-07, "loss": 0.0374, "num_input_tokens_seen": 33707336, "step": 33575 }, { "epoch": 17.804878048780488, "grad_norm": 0.36315929889678955, "learning_rate": 3.626607319689168e-07, "loss": 0.0009, "num_input_tokens_seen": 33711528, "step": 33580 }, { "epoch": 17.807529162248144, "grad_norm": 0.11280899494886398, "learning_rate": 3.6179619173847993e-07, "loss": 0.0002, "num_input_tokens_seen": 33716616, "step": 33585 }, { "epoch": 17.8101802757158, "grad_norm": 0.04415303096175194, "learning_rate": 3.6093264453534007e-07, "loss": 0.0016, "num_input_tokens_seen": 33722152, "step": 33590 }, { "epoch": 17.812831389183458, "grad_norm": 0.16346022486686707, "learning_rate": 3.6007009054438323e-07, "loss": 0.0007, "num_input_tokens_seen": 33726856, "step": 33595 }, { "epoch": 17.815482502651115, "grad_norm": 0.12958502769470215, "learning_rate": 3.592085299502773e-07, "loss": 0.0003, "num_input_tokens_seen": 33732328, "step": 33600 }, { "epoch": 17.81813361611877, "grad_norm": 0.20870016515254974, "learning_rate": 3.58347962937482e-07, "loss": 0.0021, "num_input_tokens_seen": 33737384, "step": 33605 }, { "epoch": 17.820784729586425, "grad_norm": 0.0776522308588028, "learning_rate": 3.57488389690242e-07, "loss": 0.0005, "num_input_tokens_seen": 33742696, "step": 33610 }, { "epoch": 17.82343584305408, "grad_norm": 0.014811696484684944, "learning_rate": 3.566298103925897e-07, "loss": 0.0066, "num_input_tokens_seen": 33746536, "step": 33615 }, { "epoch": 17.82608695652174, "grad_norm": 0.34146541357040405, "learning_rate": 3.557722252283441e-07, "loss": 0.0005, "num_input_tokens_seen": 33751272, "step": 33620 }, { "epoch": 17.828738069989395, "grad_norm": 0.42031794786453247, "learning_rate": 3.549156343811128e-07, "loss": 0.0008, "num_input_tokens_seen": 33756328, "step": 33625 }, { "epoch": 17.831389183457052, "grad_norm": 0.018042711541056633, "learning_rate": 3.5406003803428844e-07, "loss": 0.0046, "num_input_tokens_seen": 33761800, "step": 33630 }, { "epoch": 17.83404029692471, "grad_norm": 0.341525673866272, "learning_rate": 3.532054363710541e-07, "loss": 0.0009, "num_input_tokens_seen": 33766856, "step": 33635 }, { "epoch": 17.836691410392365, "grad_norm": 2.3324975967407227, "learning_rate": 3.5235182957437444e-07, "loss": 0.0009, "num_input_tokens_seen": 33771208, "step": 33640 }, { "epoch": 17.839342523860022, "grad_norm": 0.4942433834075928, "learning_rate": 3.5149921782700767e-07, "loss": 0.003, "num_input_tokens_seen": 33777256, "step": 33645 }, { "epoch": 17.84199363732768, "grad_norm": 0.8276246190071106, "learning_rate": 3.506476013114946e-07, "loss": 0.0009, "num_input_tokens_seen": 33781736, "step": 33650 }, { "epoch": 17.844644750795332, "grad_norm": 0.024136563763022423, "learning_rate": 3.497969802101636e-07, "loss": 0.0002, "num_input_tokens_seen": 33786024, "step": 33655 }, { "epoch": 17.84729586426299, "grad_norm": 6.798972129821777, "learning_rate": 3.4894735470513084e-07, "loss": 0.0241, "num_input_tokens_seen": 33790504, "step": 33660 }, { "epoch": 17.849946977730646, "grad_norm": 0.1961205005645752, "learning_rate": 3.480987249782991e-07, "loss": 0.0201, "num_input_tokens_seen": 33794696, "step": 33665 }, { "epoch": 17.852598091198303, "grad_norm": 0.061733148992061615, "learning_rate": 3.4725109121135635e-07, "loss": 0.0003, "num_input_tokens_seen": 33798952, "step": 33670 }, { "epoch": 17.85524920466596, "grad_norm": 0.37651312351226807, "learning_rate": 3.4640445358578187e-07, "loss": 0.0092, "num_input_tokens_seen": 33803912, "step": 33675 }, { "epoch": 17.857900318133616, "grad_norm": 90.27084350585938, "learning_rate": 3.4555881228283463e-07, "loss": 0.0767, "num_input_tokens_seen": 33808968, "step": 33680 }, { "epoch": 17.860551431601273, "grad_norm": 0.11707368493080139, "learning_rate": 3.4471416748356745e-07, "loss": 0.0008, "num_input_tokens_seen": 33814760, "step": 33685 }, { "epoch": 17.86320254506893, "grad_norm": 0.29431870579719543, "learning_rate": 3.438705193688163e-07, "loss": 0.0006, "num_input_tokens_seen": 33819112, "step": 33690 }, { "epoch": 17.865853658536587, "grad_norm": 0.5002292990684509, "learning_rate": 3.430278681192012e-07, "loss": 0.0044, "num_input_tokens_seen": 33824520, "step": 33695 }, { "epoch": 17.868504772004243, "grad_norm": 0.10883209109306335, "learning_rate": 3.421862139151344e-07, "loss": 0.0024, "num_input_tokens_seen": 33830152, "step": 33700 }, { "epoch": 17.871155885471897, "grad_norm": 5.958187103271484, "learning_rate": 3.413455569368107e-07, "loss": 0.002, "num_input_tokens_seen": 33835944, "step": 33705 }, { "epoch": 17.873806998939553, "grad_norm": 0.22431203722953796, "learning_rate": 3.405058973642128e-07, "loss": 0.0003, "num_input_tokens_seen": 33841704, "step": 33710 }, { "epoch": 17.87645811240721, "grad_norm": 0.13133737444877625, "learning_rate": 3.3966723537710965e-07, "loss": 0.0377, "num_input_tokens_seen": 33846952, "step": 33715 }, { "epoch": 17.879109225874867, "grad_norm": 0.07209129631519318, "learning_rate": 3.388295711550571e-07, "loss": 0.0006, "num_input_tokens_seen": 33851624, "step": 33720 }, { "epoch": 17.881760339342524, "grad_norm": 15.115715026855469, "learning_rate": 3.37992904877395e-07, "loss": 0.0042, "num_input_tokens_seen": 33856168, "step": 33725 }, { "epoch": 17.88441145281018, "grad_norm": 0.04739414155483246, "learning_rate": 3.3715723672325397e-07, "loss": 0.0006, "num_input_tokens_seen": 33861064, "step": 33730 }, { "epoch": 17.887062566277837, "grad_norm": 0.8437890410423279, "learning_rate": 3.363225668715464e-07, "loss": 0.0004, "num_input_tokens_seen": 33866760, "step": 33735 }, { "epoch": 17.889713679745494, "grad_norm": 0.016026698052883148, "learning_rate": 3.3548889550097384e-07, "loss": 0.0003, "num_input_tokens_seen": 33871112, "step": 33740 }, { "epoch": 17.89236479321315, "grad_norm": 0.005745567847043276, "learning_rate": 3.34656222790023e-07, "loss": 0.0012, "num_input_tokens_seen": 33875432, "step": 33745 }, { "epoch": 17.895015906680804, "grad_norm": 0.05900577828288078, "learning_rate": 3.338245489169667e-07, "loss": 0.0043, "num_input_tokens_seen": 33879592, "step": 33750 }, { "epoch": 17.89766702014846, "grad_norm": 0.15674905478954315, "learning_rate": 3.3299387405986426e-07, "loss": 0.0018, "num_input_tokens_seen": 33884200, "step": 33755 }, { "epoch": 17.900318133616118, "grad_norm": 0.20513398945331573, "learning_rate": 3.321641983965618e-07, "loss": 0.0007, "num_input_tokens_seen": 33890344, "step": 33760 }, { "epoch": 17.902969247083774, "grad_norm": 0.16812963783740997, "learning_rate": 3.313355221046888e-07, "loss": 0.1628, "num_input_tokens_seen": 33895016, "step": 33765 }, { "epoch": 17.90562036055143, "grad_norm": 141.32154846191406, "learning_rate": 3.305078453616656e-07, "loss": 0.1749, "num_input_tokens_seen": 33900360, "step": 33770 }, { "epoch": 17.908271474019088, "grad_norm": 0.45040786266326904, "learning_rate": 3.296811683446921e-07, "loss": 0.0007, "num_input_tokens_seen": 33906920, "step": 33775 }, { "epoch": 17.910922587486745, "grad_norm": 0.19131390750408173, "learning_rate": 3.288554912307601e-07, "loss": 0.0003, "num_input_tokens_seen": 33911912, "step": 33780 }, { "epoch": 17.9135737009544, "grad_norm": 0.02693093940615654, "learning_rate": 3.2803081419664483e-07, "loss": 0.0085, "num_input_tokens_seen": 33916520, "step": 33785 }, { "epoch": 17.91622481442206, "grad_norm": 0.17833274602890015, "learning_rate": 3.272071374189062e-07, "loss": 0.0006, "num_input_tokens_seen": 33921000, "step": 33790 }, { "epoch": 17.918875927889715, "grad_norm": 0.1767842322587967, "learning_rate": 3.26384461073892e-07, "loss": 0.0003, "num_input_tokens_seen": 33925768, "step": 33795 }, { "epoch": 17.92152704135737, "grad_norm": 3.6986303329467773, "learning_rate": 3.2556278533773576e-07, "loss": 0.0021, "num_input_tokens_seen": 33930792, "step": 33800 }, { "epoch": 17.924178154825025, "grad_norm": 0.17549599707126617, "learning_rate": 3.247421103863546e-07, "loss": 0.0004, "num_input_tokens_seen": 33936168, "step": 33805 }, { "epoch": 17.926829268292682, "grad_norm": 0.15294033288955688, "learning_rate": 3.2392243639545406e-07, "loss": 0.0003, "num_input_tokens_seen": 33941608, "step": 33810 }, { "epoch": 17.92948038176034, "grad_norm": 0.14208543300628662, "learning_rate": 3.2310376354052377e-07, "loss": 0.0229, "num_input_tokens_seen": 33947048, "step": 33815 }, { "epoch": 17.932131495227996, "grad_norm": 0.22407634556293488, "learning_rate": 3.222860919968396e-07, "loss": 0.0013, "num_input_tokens_seen": 33951784, "step": 33820 }, { "epoch": 17.934782608695652, "grad_norm": 0.1309361606836319, "learning_rate": 3.214694219394626e-07, "loss": 0.0002, "num_input_tokens_seen": 33956040, "step": 33825 }, { "epoch": 17.93743372216331, "grad_norm": 0.42577844858169556, "learning_rate": 3.2065375354324025e-07, "loss": 0.0015, "num_input_tokens_seen": 33960808, "step": 33830 }, { "epoch": 17.940084835630966, "grad_norm": 0.18366529047489166, "learning_rate": 3.198390869828044e-07, "loss": 0.0007, "num_input_tokens_seen": 33965448, "step": 33835 }, { "epoch": 17.942735949098623, "grad_norm": 0.2240009754896164, "learning_rate": 3.190254224325734e-07, "loss": 0.0039, "num_input_tokens_seen": 33970120, "step": 33840 }, { "epoch": 17.94538706256628, "grad_norm": 2.3978099822998047, "learning_rate": 3.1821276006675064e-07, "loss": 0.0013, "num_input_tokens_seen": 33974632, "step": 33845 }, { "epoch": 17.948038176033933, "grad_norm": 105.21907043457031, "learning_rate": 3.174011000593241e-07, "loss": 0.083, "num_input_tokens_seen": 33979944, "step": 33850 }, { "epoch": 17.95068928950159, "grad_norm": 0.051683101803064346, "learning_rate": 3.165904425840705e-07, "loss": 0.0082, "num_input_tokens_seen": 33984584, "step": 33855 }, { "epoch": 17.953340402969246, "grad_norm": 0.4343051016330719, "learning_rate": 3.1578078781454636e-07, "loss": 0.0047, "num_input_tokens_seen": 33989288, "step": 33860 }, { "epoch": 17.955991516436903, "grad_norm": 0.21038182079792023, "learning_rate": 3.149721359240987e-07, "loss": 0.0007, "num_input_tokens_seen": 33994888, "step": 33865 }, { "epoch": 17.95864262990456, "grad_norm": 1.3305954933166504, "learning_rate": 3.141644870858579e-07, "loss": 0.0007, "num_input_tokens_seen": 34002888, "step": 33870 }, { "epoch": 17.961293743372217, "grad_norm": 0.22461216151714325, "learning_rate": 3.133578414727384e-07, "loss": 0.0004, "num_input_tokens_seen": 34008104, "step": 33875 }, { "epoch": 17.963944856839873, "grad_norm": 0.07625286281108856, "learning_rate": 3.1255219925744096e-07, "loss": 0.0004, "num_input_tokens_seen": 34013992, "step": 33880 }, { "epoch": 17.96659597030753, "grad_norm": 0.24675661325454712, "learning_rate": 3.117475606124526e-07, "loss": 0.0005, "num_input_tokens_seen": 34020712, "step": 33885 }, { "epoch": 17.969247083775187, "grad_norm": 0.418140709400177, "learning_rate": 3.1094392571004215e-07, "loss": 0.0022, "num_input_tokens_seen": 34026632, "step": 33890 }, { "epoch": 17.971898197242844, "grad_norm": 0.13611014187335968, "learning_rate": 3.1014129472226926e-07, "loss": 0.0003, "num_input_tokens_seen": 34031336, "step": 33895 }, { "epoch": 17.974549310710497, "grad_norm": 80.20295715332031, "learning_rate": 3.093396678209709e-07, "loss": 0.114, "num_input_tokens_seen": 34035944, "step": 33900 }, { "epoch": 17.977200424178154, "grad_norm": 0.03205568715929985, "learning_rate": 3.0853904517777645e-07, "loss": 0.0002, "num_input_tokens_seen": 34040936, "step": 33905 }, { "epoch": 17.97985153764581, "grad_norm": 0.9221096038818359, "learning_rate": 3.077394269640965e-07, "loss": 0.0004, "num_input_tokens_seen": 34045608, "step": 33910 }, { "epoch": 17.982502651113467, "grad_norm": 79.13353729248047, "learning_rate": 3.0694081335112645e-07, "loss": 0.0286, "num_input_tokens_seen": 34051176, "step": 33915 }, { "epoch": 17.985153764581124, "grad_norm": 5.588302135467529, "learning_rate": 3.061432045098478e-07, "loss": 0.0013, "num_input_tokens_seen": 34056488, "step": 33920 }, { "epoch": 17.98780487804878, "grad_norm": 0.4703427255153656, "learning_rate": 3.053466006110267e-07, "loss": 0.0007, "num_input_tokens_seen": 34060744, "step": 33925 }, { "epoch": 17.990455991516438, "grad_norm": 39.78013610839844, "learning_rate": 3.045510018252129e-07, "loss": 0.0121, "num_input_tokens_seen": 34065256, "step": 33930 }, { "epoch": 17.993107104984094, "grad_norm": 0.8303263187408447, "learning_rate": 3.03756408322744e-07, "loss": 0.0045, "num_input_tokens_seen": 34069736, "step": 33935 }, { "epoch": 17.99575821845175, "grad_norm": 3.8148953914642334, "learning_rate": 3.0296282027373836e-07, "loss": 0.0011, "num_input_tokens_seen": 34075304, "step": 33940 }, { "epoch": 17.998409331919405, "grad_norm": 0.36157214641571045, "learning_rate": 3.021702378481023e-07, "loss": 0.0006, "num_input_tokens_seen": 34079560, "step": 33945 }, { "epoch": 18.0, "eval_loss": 1.048458218574524, "eval_runtime": 29.4141, "eval_samples_per_second": 64.119, "eval_steps_per_second": 16.047, "num_input_tokens_seen": 34080816, "step": 33948 }, { "epoch": 18.00106044538706, "grad_norm": 0.12243968993425369, "learning_rate": 3.013786612155251e-07, "loss": 0.0005, "num_input_tokens_seen": 34082448, "step": 33950 }, { "epoch": 18.003711558854718, "grad_norm": 0.14370237290859222, "learning_rate": 3.005880905454822e-07, "loss": 0.0028, "num_input_tokens_seen": 34087312, "step": 33955 }, { "epoch": 18.006362672322375, "grad_norm": 0.20920860767364502, "learning_rate": 2.997985260072317e-07, "loss": 0.0153, "num_input_tokens_seen": 34092848, "step": 33960 }, { "epoch": 18.00901378579003, "grad_norm": 0.11160128563642502, "learning_rate": 2.990099677698172e-07, "loss": 0.0016, "num_input_tokens_seen": 34096912, "step": 33965 }, { "epoch": 18.01166489925769, "grad_norm": 0.11227761954069138, "learning_rate": 2.9822241600206746e-07, "loss": 0.0009, "num_input_tokens_seen": 34102512, "step": 33970 }, { "epoch": 18.014316012725345, "grad_norm": 0.02927277237176895, "learning_rate": 2.974358708725955e-07, "loss": 0.043, "num_input_tokens_seen": 34107536, "step": 33975 }, { "epoch": 18.016967126193002, "grad_norm": 0.5426338315010071, "learning_rate": 2.9665033254979827e-07, "loss": 0.0006, "num_input_tokens_seen": 34112176, "step": 33980 }, { "epoch": 18.01961823966066, "grad_norm": 0.019786983728408813, "learning_rate": 2.9586580120185613e-07, "loss": 0.0003, "num_input_tokens_seen": 34117648, "step": 33985 }, { "epoch": 18.022269353128316, "grad_norm": 0.02979441173374653, "learning_rate": 2.9508227699673873e-07, "loss": 0.0002, "num_input_tokens_seen": 34123184, "step": 33990 }, { "epoch": 18.02492046659597, "grad_norm": 0.05219928175210953, "learning_rate": 2.942997601021924e-07, "loss": 0.0002, "num_input_tokens_seen": 34127792, "step": 33995 }, { "epoch": 18.027571580063626, "grad_norm": 0.31150901317596436, "learning_rate": 2.935182506857542e-07, "loss": 0.0007, "num_input_tokens_seen": 34132880, "step": 34000 }, { "epoch": 18.030222693531282, "grad_norm": 0.0964513048529625, "learning_rate": 2.9273774891474317e-07, "loss": 0.0002, "num_input_tokens_seen": 34137744, "step": 34005 }, { "epoch": 18.03287380699894, "grad_norm": 0.038790661841630936, "learning_rate": 2.919582549562627e-07, "loss": 0.0003, "num_input_tokens_seen": 34142704, "step": 34010 }, { "epoch": 18.035524920466596, "grad_norm": 0.0848008543252945, "learning_rate": 2.911797689772e-07, "loss": 0.0004, "num_input_tokens_seen": 34148016, "step": 34015 }, { "epoch": 18.038176033934253, "grad_norm": 0.08739137649536133, "learning_rate": 2.9040229114422727e-07, "loss": 0.0004, "num_input_tokens_seen": 34152688, "step": 34020 }, { "epoch": 18.04082714740191, "grad_norm": 0.6157678365707397, "learning_rate": 2.89625821623799e-07, "loss": 0.0008, "num_input_tokens_seen": 34157744, "step": 34025 }, { "epoch": 18.043478260869566, "grad_norm": 88.80220031738281, "learning_rate": 2.88850360582158e-07, "loss": 0.0288, "num_input_tokens_seen": 34162896, "step": 34030 }, { "epoch": 18.046129374337223, "grad_norm": 3.253140687942505, "learning_rate": 2.8807590818532573e-07, "loss": 0.0012, "num_input_tokens_seen": 34168720, "step": 34035 }, { "epoch": 18.048780487804876, "grad_norm": 0.4585299491882324, "learning_rate": 2.8730246459911236e-07, "loss": 0.0289, "num_input_tokens_seen": 34173584, "step": 34040 }, { "epoch": 18.051431601272533, "grad_norm": 0.13974854350090027, "learning_rate": 2.865300299891094e-07, "loss": 0.0003, "num_input_tokens_seen": 34178384, "step": 34045 }, { "epoch": 18.05408271474019, "grad_norm": 0.2567118704319, "learning_rate": 2.857586045206934e-07, "loss": 0.0003, "num_input_tokens_seen": 34182512, "step": 34050 }, { "epoch": 18.056733828207847, "grad_norm": 0.10200773924589157, "learning_rate": 2.8498818835902387e-07, "loss": 0.0002, "num_input_tokens_seen": 34187440, "step": 34055 }, { "epoch": 18.059384941675503, "grad_norm": 0.025363333523273468, "learning_rate": 2.842187816690456e-07, "loss": 0.0044, "num_input_tokens_seen": 34192336, "step": 34060 }, { "epoch": 18.06203605514316, "grad_norm": 0.026635216549038887, "learning_rate": 2.8345038461548514e-07, "loss": 0.0003, "num_input_tokens_seen": 34196944, "step": 34065 }, { "epoch": 18.064687168610817, "grad_norm": 0.04310522973537445, "learning_rate": 2.8268299736285696e-07, "loss": 0.0005, "num_input_tokens_seen": 34201776, "step": 34070 }, { "epoch": 18.067338282078474, "grad_norm": 0.2259070724248886, "learning_rate": 2.8191662007545364e-07, "loss": 0.0003, "num_input_tokens_seen": 34207248, "step": 34075 }, { "epoch": 18.06998939554613, "grad_norm": 14.866901397705078, "learning_rate": 2.811512529173566e-07, "loss": 0.0034, "num_input_tokens_seen": 34212272, "step": 34080 }, { "epoch": 18.072640509013787, "grad_norm": 0.287080317735672, "learning_rate": 2.8038689605242864e-07, "loss": 0.0267, "num_input_tokens_seen": 34216880, "step": 34085 }, { "epoch": 18.07529162248144, "grad_norm": 0.0627133697271347, "learning_rate": 2.7962354964431617e-07, "loss": 0.0008, "num_input_tokens_seen": 34221872, "step": 34090 }, { "epoch": 18.077942735949097, "grad_norm": 0.06801319867372513, "learning_rate": 2.7886121385644947e-07, "loss": 0.0002, "num_input_tokens_seen": 34226000, "step": 34095 }, { "epoch": 18.080593849416754, "grad_norm": 0.1555173546075821, "learning_rate": 2.780998888520431e-07, "loss": 0.001, "num_input_tokens_seen": 34230960, "step": 34100 }, { "epoch": 18.08324496288441, "grad_norm": 0.1999565213918686, "learning_rate": 2.7733957479409435e-07, "loss": 0.0067, "num_input_tokens_seen": 34236560, "step": 34105 }, { "epoch": 18.085896076352068, "grad_norm": 0.11454518139362335, "learning_rate": 2.765802718453847e-07, "loss": 0.0002, "num_input_tokens_seen": 34240752, "step": 34110 }, { "epoch": 18.088547189819725, "grad_norm": 0.5025004744529724, "learning_rate": 2.7582198016847917e-07, "loss": 0.0649, "num_input_tokens_seen": 34245648, "step": 34115 }, { "epoch": 18.09119830328738, "grad_norm": 0.059346754103899, "learning_rate": 2.7506469992572504e-07, "loss": 0.0014, "num_input_tokens_seen": 34251376, "step": 34120 }, { "epoch": 18.093849416755038, "grad_norm": 1.7369468212127686, "learning_rate": 2.74308431279256e-07, "loss": 0.0007, "num_input_tokens_seen": 34255408, "step": 34125 }, { "epoch": 18.096500530222695, "grad_norm": 0.03265099227428436, "learning_rate": 2.7355317439098473e-07, "loss": 0.0002, "num_input_tokens_seen": 34259696, "step": 34130 }, { "epoch": 18.09915164369035, "grad_norm": 0.05662322789430618, "learning_rate": 2.727989294226113e-07, "loss": 0.0006, "num_input_tokens_seen": 34265520, "step": 34135 }, { "epoch": 18.101802757158005, "grad_norm": 0.24000117182731628, "learning_rate": 2.7204569653561716e-07, "loss": 0.0007, "num_input_tokens_seen": 34270224, "step": 34140 }, { "epoch": 18.10445387062566, "grad_norm": 0.004498111549764872, "learning_rate": 2.712934758912683e-07, "loss": 0.0002, "num_input_tokens_seen": 34275376, "step": 34145 }, { "epoch": 18.10710498409332, "grad_norm": 0.0070731304585933685, "learning_rate": 2.70542267650612e-07, "loss": 0.0006, "num_input_tokens_seen": 34279728, "step": 34150 }, { "epoch": 18.109756097560975, "grad_norm": 0.16176609694957733, "learning_rate": 2.697920719744801e-07, "loss": 0.0005, "num_input_tokens_seen": 34284304, "step": 34155 }, { "epoch": 18.112407211028632, "grad_norm": 0.07923263311386108, "learning_rate": 2.690428890234881e-07, "loss": 0.0002, "num_input_tokens_seen": 34289392, "step": 34160 }, { "epoch": 18.11505832449629, "grad_norm": 3.448798418045044, "learning_rate": 2.682947189580343e-07, "loss": 0.0014, "num_input_tokens_seen": 34293648, "step": 34165 }, { "epoch": 18.117709437963946, "grad_norm": 0.13594254851341248, "learning_rate": 2.6754756193829887e-07, "loss": 0.0043, "num_input_tokens_seen": 34298096, "step": 34170 }, { "epoch": 18.120360551431602, "grad_norm": 0.5618029236793518, "learning_rate": 2.6680141812424733e-07, "loss": 0.0288, "num_input_tokens_seen": 34302832, "step": 34175 }, { "epoch": 18.12301166489926, "grad_norm": 0.08476969599723816, "learning_rate": 2.6605628767562674e-07, "loss": 0.0004, "num_input_tokens_seen": 34307440, "step": 34180 }, { "epoch": 18.125662778366912, "grad_norm": 0.7286912202835083, "learning_rate": 2.653121707519668e-07, "loss": 0.0006, "num_input_tokens_seen": 34312240, "step": 34185 }, { "epoch": 18.12831389183457, "grad_norm": 0.1604926437139511, "learning_rate": 2.6456906751258225e-07, "loss": 0.0005, "num_input_tokens_seen": 34316368, "step": 34190 }, { "epoch": 18.130965005302226, "grad_norm": 2.8093695640563965, "learning_rate": 2.638269781165692e-07, "loss": 0.0072, "num_input_tokens_seen": 34321552, "step": 34195 }, { "epoch": 18.133616118769883, "grad_norm": 0.21750523149967194, "learning_rate": 2.630859027228055e-07, "loss": 0.0547, "num_input_tokens_seen": 34326032, "step": 34200 }, { "epoch": 18.13626723223754, "grad_norm": 0.005629533901810646, "learning_rate": 2.623458414899566e-07, "loss": 0.0009, "num_input_tokens_seen": 34330800, "step": 34205 }, { "epoch": 18.138918345705196, "grad_norm": 0.20579810440540314, "learning_rate": 2.616067945764644e-07, "loss": 0.0014, "num_input_tokens_seen": 34336304, "step": 34210 }, { "epoch": 18.141569459172853, "grad_norm": 1.0928959846496582, "learning_rate": 2.6086876214055923e-07, "loss": 0.043, "num_input_tokens_seen": 34340656, "step": 34215 }, { "epoch": 18.14422057264051, "grad_norm": 0.5035173296928406, "learning_rate": 2.6013174434025066e-07, "loss": 0.001, "num_input_tokens_seen": 34345008, "step": 34220 }, { "epoch": 18.146871686108167, "grad_norm": 0.24499449133872986, "learning_rate": 2.593957413333331e-07, "loss": 0.0004, "num_input_tokens_seen": 34350672, "step": 34225 }, { "epoch": 18.149522799575823, "grad_norm": 107.38072967529297, "learning_rate": 2.5866075327738216e-07, "loss": 0.0764, "num_input_tokens_seen": 34356720, "step": 34230 }, { "epoch": 18.152173913043477, "grad_norm": 0.024090446531772614, "learning_rate": 2.579267803297569e-07, "loss": 0.0003, "num_input_tokens_seen": 34361840, "step": 34235 }, { "epoch": 18.154825026511134, "grad_norm": 0.1717439889907837, "learning_rate": 2.571938226475995e-07, "loss": 0.0016, "num_input_tokens_seen": 34366992, "step": 34240 }, { "epoch": 18.15747613997879, "grad_norm": 0.029782157391309738, "learning_rate": 2.5646188038783383e-07, "loss": 0.0002, "num_input_tokens_seen": 34372560, "step": 34245 }, { "epoch": 18.160127253446447, "grad_norm": 0.2293500006198883, "learning_rate": 2.5573095370716673e-07, "loss": 0.0004, "num_input_tokens_seen": 34377456, "step": 34250 }, { "epoch": 18.162778366914104, "grad_norm": 0.14066843688488007, "learning_rate": 2.5500104276208747e-07, "loss": 0.0005, "num_input_tokens_seen": 34383952, "step": 34255 }, { "epoch": 18.16542948038176, "grad_norm": 0.7071813941001892, "learning_rate": 2.5427214770887e-07, "loss": 0.1319, "num_input_tokens_seen": 34388016, "step": 34260 }, { "epoch": 18.168080593849417, "grad_norm": 0.03210264444351196, "learning_rate": 2.5354426870356606e-07, "loss": 0.0006, "num_input_tokens_seen": 34395184, "step": 34265 }, { "epoch": 18.170731707317074, "grad_norm": 0.03337731212377548, "learning_rate": 2.52817405902015e-07, "loss": 0.0011, "num_input_tokens_seen": 34401744, "step": 34270 }, { "epoch": 18.17338282078473, "grad_norm": 0.18361908197402954, "learning_rate": 2.520915594598344e-07, "loss": 0.0002, "num_input_tokens_seen": 34406832, "step": 34275 }, { "epoch": 18.176033934252388, "grad_norm": 0.08717986941337585, "learning_rate": 2.5136672953242733e-07, "loss": 0.0005, "num_input_tokens_seen": 34412016, "step": 34280 }, { "epoch": 18.17868504772004, "grad_norm": 0.03881412371993065, "learning_rate": 2.5064291627497793e-07, "loss": 0.0009, "num_input_tokens_seen": 34417200, "step": 34285 }, { "epoch": 18.181336161187698, "grad_norm": 0.1392364203929901, "learning_rate": 2.499201198424517e-07, "loss": 0.0004, "num_input_tokens_seen": 34422960, "step": 34290 }, { "epoch": 18.183987274655355, "grad_norm": 0.21963807940483093, "learning_rate": 2.4919834038959756e-07, "loss": 0.0003, "num_input_tokens_seen": 34427664, "step": 34295 }, { "epoch": 18.18663838812301, "grad_norm": 0.42501989006996155, "learning_rate": 2.4847757807094866e-07, "loss": 0.0102, "num_input_tokens_seen": 34435568, "step": 34300 }, { "epoch": 18.189289501590668, "grad_norm": 0.15589064359664917, "learning_rate": 2.4775783304081536e-07, "loss": 0.0003, "num_input_tokens_seen": 34439728, "step": 34305 }, { "epoch": 18.191940615058325, "grad_norm": 2.05368971824646, "learning_rate": 2.4703910545329556e-07, "loss": 0.0007, "num_input_tokens_seen": 34445936, "step": 34310 }, { "epoch": 18.19459172852598, "grad_norm": 0.33417075872421265, "learning_rate": 2.463213954622656e-07, "loss": 0.0004, "num_input_tokens_seen": 34451344, "step": 34315 }, { "epoch": 18.19724284199364, "grad_norm": 0.2701321542263031, "learning_rate": 2.4560470322138597e-07, "loss": 0.0004, "num_input_tokens_seen": 34456176, "step": 34320 }, { "epoch": 18.199893955461295, "grad_norm": 0.09430430084466934, "learning_rate": 2.448890288840977e-07, "loss": 0.0003, "num_input_tokens_seen": 34460944, "step": 34325 }, { "epoch": 18.20254506892895, "grad_norm": 0.3168300688266754, "learning_rate": 2.441743726036272e-07, "loss": 0.0002, "num_input_tokens_seen": 34465712, "step": 34330 }, { "epoch": 18.205196182396605, "grad_norm": 3.2153584957122803, "learning_rate": 2.43460734532977e-07, "loss": 0.0022, "num_input_tokens_seen": 34471600, "step": 34335 }, { "epoch": 18.207847295864262, "grad_norm": 0.027346257120370865, "learning_rate": 2.427481148249383e-07, "loss": 0.0002, "num_input_tokens_seen": 34476560, "step": 34340 }, { "epoch": 18.21049840933192, "grad_norm": 3.4827895164489746, "learning_rate": 2.4203651363207845e-07, "loss": 0.0026, "num_input_tokens_seen": 34482192, "step": 34345 }, { "epoch": 18.213149522799576, "grad_norm": 0.3394591808319092, "learning_rate": 2.4132593110675174e-07, "loss": 0.0005, "num_input_tokens_seen": 34486224, "step": 34350 }, { "epoch": 18.215800636267232, "grad_norm": 0.04479480907320976, "learning_rate": 2.4061636740109027e-07, "loss": 0.0003, "num_input_tokens_seen": 34491056, "step": 34355 }, { "epoch": 18.21845174973489, "grad_norm": 0.005177754908800125, "learning_rate": 2.3990782266701094e-07, "loss": 0.0003, "num_input_tokens_seen": 34495536, "step": 34360 }, { "epoch": 18.221102863202546, "grad_norm": 0.06751420348882675, "learning_rate": 2.3920029705621016e-07, "loss": 0.0003, "num_input_tokens_seen": 34499792, "step": 34365 }, { "epoch": 18.223753976670203, "grad_norm": 0.030064096674323082, "learning_rate": 2.38493790720169e-07, "loss": 0.0004, "num_input_tokens_seen": 34503408, "step": 34370 }, { "epoch": 18.22640509013786, "grad_norm": 0.23497749865055084, "learning_rate": 2.3778830381014694e-07, "loss": 0.0002, "num_input_tokens_seen": 34508016, "step": 34375 }, { "epoch": 18.229056203605513, "grad_norm": 0.17398785054683685, "learning_rate": 2.3708383647718768e-07, "loss": 0.0947, "num_input_tokens_seen": 34513552, "step": 34380 }, { "epoch": 18.23170731707317, "grad_norm": 0.07104342430830002, "learning_rate": 2.3638038887211667e-07, "loss": 0.0012, "num_input_tokens_seen": 34518960, "step": 34385 }, { "epoch": 18.234358430540826, "grad_norm": 0.211492657661438, "learning_rate": 2.3567796114553788e-07, "loss": 0.0003, "num_input_tokens_seen": 34524528, "step": 34390 }, { "epoch": 18.237009544008483, "grad_norm": 0.9976763725280762, "learning_rate": 2.3497655344784153e-07, "loss": 0.0147, "num_input_tokens_seen": 34529264, "step": 34395 }, { "epoch": 18.23966065747614, "grad_norm": 0.004736852832138538, "learning_rate": 2.3427616592919587e-07, "loss": 0.0009, "num_input_tokens_seen": 34534704, "step": 34400 }, { "epoch": 18.242311770943797, "grad_norm": 3.2152693271636963, "learning_rate": 2.3357679873955263e-07, "loss": 0.002, "num_input_tokens_seen": 34540400, "step": 34405 }, { "epoch": 18.244962884411454, "grad_norm": 0.023134060204029083, "learning_rate": 2.3287845202864478e-07, "loss": 0.0177, "num_input_tokens_seen": 34545136, "step": 34410 }, { "epoch": 18.24761399787911, "grad_norm": 0.1182427778840065, "learning_rate": 2.3218112594598552e-07, "loss": 0.0002, "num_input_tokens_seen": 34549648, "step": 34415 }, { "epoch": 18.250265111346767, "grad_norm": 0.10949502140283585, "learning_rate": 2.3148482064087042e-07, "loss": 0.0003, "num_input_tokens_seen": 34554992, "step": 34420 }, { "epoch": 18.25291622481442, "grad_norm": 0.025109807029366493, "learning_rate": 2.3078953626237854e-07, "loss": 0.0002, "num_input_tokens_seen": 34562256, "step": 34425 }, { "epoch": 18.255567338282077, "grad_norm": 1.2286428213119507, "learning_rate": 2.3009527295936585e-07, "loss": 0.0008, "num_input_tokens_seen": 34568528, "step": 34430 }, { "epoch": 18.258218451749734, "grad_norm": 0.24165552854537964, "learning_rate": 2.2940203088047453e-07, "loss": 0.0003, "num_input_tokens_seen": 34573104, "step": 34435 }, { "epoch": 18.26086956521739, "grad_norm": 0.15010400116443634, "learning_rate": 2.2870981017412474e-07, "loss": 0.0003, "num_input_tokens_seen": 34577936, "step": 34440 }, { "epoch": 18.263520678685047, "grad_norm": 0.2899935245513916, "learning_rate": 2.2801861098851962e-07, "loss": 0.0007, "num_input_tokens_seen": 34583536, "step": 34445 }, { "epoch": 18.266171792152704, "grad_norm": 0.6419328451156616, "learning_rate": 2.2732843347164302e-07, "loss": 0.0007, "num_input_tokens_seen": 34589392, "step": 34450 }, { "epoch": 18.26882290562036, "grad_norm": 0.1934429407119751, "learning_rate": 2.266392777712595e-07, "loss": 0.0006, "num_input_tokens_seen": 34594352, "step": 34455 }, { "epoch": 18.271474019088018, "grad_norm": 3.4650278091430664, "learning_rate": 2.2595114403491602e-07, "loss": 0.0012, "num_input_tokens_seen": 34599632, "step": 34460 }, { "epoch": 18.274125132555675, "grad_norm": 0.028930241242051125, "learning_rate": 2.2526403240994087e-07, "loss": 0.2127, "num_input_tokens_seen": 34604944, "step": 34465 }, { "epoch": 18.27677624602333, "grad_norm": 0.5659445524215698, "learning_rate": 2.2457794304344082e-07, "loss": 0.0007, "num_input_tokens_seen": 34609008, "step": 34470 }, { "epoch": 18.279427359490985, "grad_norm": 53.51893997192383, "learning_rate": 2.2389287608230837e-07, "loss": 0.0218, "num_input_tokens_seen": 34613392, "step": 34475 }, { "epoch": 18.28207847295864, "grad_norm": 7.461734771728516, "learning_rate": 2.2320883167321283e-07, "loss": 0.004, "num_input_tokens_seen": 34618352, "step": 34480 }, { "epoch": 18.284729586426298, "grad_norm": 0.09051552414894104, "learning_rate": 2.2252580996260708e-07, "loss": 0.0003, "num_input_tokens_seen": 34622384, "step": 34485 }, { "epoch": 18.287380699893955, "grad_norm": 0.01893831603229046, "learning_rate": 2.218438110967236e-07, "loss": 0.0006, "num_input_tokens_seen": 34626672, "step": 34490 }, { "epoch": 18.29003181336161, "grad_norm": 0.0829814076423645, "learning_rate": 2.2116283522157777e-07, "loss": 0.0479, "num_input_tokens_seen": 34631184, "step": 34495 }, { "epoch": 18.29268292682927, "grad_norm": 0.5838547348976135, "learning_rate": 2.2048288248296246e-07, "loss": 0.0014, "num_input_tokens_seen": 34636432, "step": 34500 }, { "epoch": 18.295334040296925, "grad_norm": 3.3808085918426514, "learning_rate": 2.198039530264573e-07, "loss": 0.0026, "num_input_tokens_seen": 34641616, "step": 34505 }, { "epoch": 18.297985153764582, "grad_norm": 0.012084943242371082, "learning_rate": 2.1912604699741548e-07, "loss": 0.0006, "num_input_tokens_seen": 34646064, "step": 34510 }, { "epoch": 18.30063626723224, "grad_norm": 0.013668750412762165, "learning_rate": 2.1844916454097754e-07, "loss": 0.0288, "num_input_tokens_seen": 34651920, "step": 34515 }, { "epoch": 18.303287380699896, "grad_norm": 0.4023955464363098, "learning_rate": 2.1777330580206203e-07, "loss": 0.0565, "num_input_tokens_seen": 34656784, "step": 34520 }, { "epoch": 18.30593849416755, "grad_norm": 0.03083338588476181, "learning_rate": 2.1709847092536651e-07, "loss": 0.0005, "num_input_tokens_seen": 34664176, "step": 34525 }, { "epoch": 18.308589607635206, "grad_norm": 0.13413168489933014, "learning_rate": 2.1642466005537376e-07, "loss": 0.0004, "num_input_tokens_seen": 34669104, "step": 34530 }, { "epoch": 18.311240721102862, "grad_norm": 0.2958189845085144, "learning_rate": 2.1575187333634395e-07, "loss": 0.0003, "num_input_tokens_seen": 34674416, "step": 34535 }, { "epoch": 18.31389183457052, "grad_norm": 0.220766082406044, "learning_rate": 2.150801109123185e-07, "loss": 0.0015, "num_input_tokens_seen": 34679760, "step": 34540 }, { "epoch": 18.316542948038176, "grad_norm": 0.15438225865364075, "learning_rate": 2.1440937292712073e-07, "loss": 0.0043, "num_input_tokens_seen": 34683408, "step": 34545 }, { "epoch": 18.319194061505833, "grad_norm": 0.37120240926742554, "learning_rate": 2.1373965952435348e-07, "loss": 0.0003, "num_input_tokens_seen": 34688496, "step": 34550 }, { "epoch": 18.32184517497349, "grad_norm": 0.06249212101101875, "learning_rate": 2.130709708473999e-07, "loss": 0.0011, "num_input_tokens_seen": 34693328, "step": 34555 }, { "epoch": 18.324496288441146, "grad_norm": 0.1080620214343071, "learning_rate": 2.1240330703942714e-07, "loss": 0.0004, "num_input_tokens_seen": 34697968, "step": 34560 }, { "epoch": 18.327147401908803, "grad_norm": 1.4435890913009644, "learning_rate": 2.1173666824337692e-07, "loss": 0.0008, "num_input_tokens_seen": 34702480, "step": 34565 }, { "epoch": 18.329798515376456, "grad_norm": 0.059447452425956726, "learning_rate": 2.110710546019773e-07, "loss": 0.0002, "num_input_tokens_seen": 34706864, "step": 34570 }, { "epoch": 18.332449628844113, "grad_norm": 0.19496861100196838, "learning_rate": 2.104064662577332e-07, "loss": 0.0209, "num_input_tokens_seen": 34711600, "step": 34575 }, { "epoch": 18.33510074231177, "grad_norm": 0.13962283730506897, "learning_rate": 2.0974290335293136e-07, "loss": 0.0003, "num_input_tokens_seen": 34716912, "step": 34580 }, { "epoch": 18.337751855779427, "grad_norm": 0.4185008704662323, "learning_rate": 2.090803660296392e-07, "loss": 0.0008, "num_input_tokens_seen": 34723088, "step": 34585 }, { "epoch": 18.340402969247084, "grad_norm": 0.04683922231197357, "learning_rate": 2.0841885442970445e-07, "loss": 0.0019, "num_input_tokens_seen": 34727792, "step": 34590 }, { "epoch": 18.34305408271474, "grad_norm": 0.7010002136230469, "learning_rate": 2.0775836869475375e-07, "loss": 0.0013, "num_input_tokens_seen": 34733072, "step": 34595 }, { "epoch": 18.345705196182397, "grad_norm": 0.2515735626220703, "learning_rate": 2.070989089661979e-07, "loss": 0.0004, "num_input_tokens_seen": 34739952, "step": 34600 }, { "epoch": 18.348356309650054, "grad_norm": 0.010430839844048023, "learning_rate": 2.0644047538522226e-07, "loss": 0.0006, "num_input_tokens_seen": 34744304, "step": 34605 }, { "epoch": 18.35100742311771, "grad_norm": 0.14725010097026825, "learning_rate": 2.05783068092798e-07, "loss": 0.0005, "num_input_tokens_seen": 34749296, "step": 34610 }, { "epoch": 18.353658536585368, "grad_norm": 0.07762020081281662, "learning_rate": 2.0512668722967366e-07, "loss": 0.0035, "num_input_tokens_seen": 34753776, "step": 34615 }, { "epoch": 18.35630965005302, "grad_norm": 0.034985750913619995, "learning_rate": 2.04471332936379e-07, "loss": 0.0007, "num_input_tokens_seen": 34759312, "step": 34620 }, { "epoch": 18.358960763520678, "grad_norm": 0.06971141695976257, "learning_rate": 2.038170053532229e-07, "loss": 0.001, "num_input_tokens_seen": 34763792, "step": 34625 }, { "epoch": 18.361611876988334, "grad_norm": 0.7589962482452393, "learning_rate": 2.0316370462029556e-07, "loss": 0.0005, "num_input_tokens_seen": 34768432, "step": 34630 }, { "epoch": 18.36426299045599, "grad_norm": 0.17943772673606873, "learning_rate": 2.025114308774667e-07, "loss": 0.0006, "num_input_tokens_seen": 34773008, "step": 34635 }, { "epoch": 18.366914103923648, "grad_norm": 0.02123720571398735, "learning_rate": 2.0186018426438803e-07, "loss": 0.0008, "num_input_tokens_seen": 34777904, "step": 34640 }, { "epoch": 18.369565217391305, "grad_norm": 0.032240644097328186, "learning_rate": 2.0120996492048684e-07, "loss": 0.0005, "num_input_tokens_seen": 34782640, "step": 34645 }, { "epoch": 18.37221633085896, "grad_norm": 0.04768355190753937, "learning_rate": 2.0056077298497566e-07, "loss": 0.0004, "num_input_tokens_seen": 34787440, "step": 34650 }, { "epoch": 18.37486744432662, "grad_norm": 0.11570463329553604, "learning_rate": 1.9991260859684502e-07, "loss": 0.0003, "num_input_tokens_seen": 34793840, "step": 34655 }, { "epoch": 18.377518557794275, "grad_norm": 0.385094553232193, "learning_rate": 1.9926547189486277e-07, "loss": 0.0003, "num_input_tokens_seen": 34797616, "step": 34660 }, { "epoch": 18.380169671261932, "grad_norm": 4.847239017486572, "learning_rate": 1.98619363017582e-07, "loss": 0.0027, "num_input_tokens_seen": 34802672, "step": 34665 }, { "epoch": 18.382820784729585, "grad_norm": 0.037091583013534546, "learning_rate": 1.979742821033309e-07, "loss": 0.0365, "num_input_tokens_seen": 34806832, "step": 34670 }, { "epoch": 18.385471898197242, "grad_norm": 0.26952898502349854, "learning_rate": 1.9733022929022072e-07, "loss": 0.0004, "num_input_tokens_seen": 34811248, "step": 34675 }, { "epoch": 18.3881230116649, "grad_norm": 0.0583210289478302, "learning_rate": 1.9668720471614112e-07, "loss": 0.0002, "num_input_tokens_seen": 34816304, "step": 34680 }, { "epoch": 18.390774125132555, "grad_norm": 0.10728736966848373, "learning_rate": 1.9604520851876196e-07, "loss": 0.0016, "num_input_tokens_seen": 34820688, "step": 34685 }, { "epoch": 18.393425238600212, "grad_norm": 0.894619882106781, "learning_rate": 1.9540424083553222e-07, "loss": 0.0151, "num_input_tokens_seen": 34824880, "step": 34690 }, { "epoch": 18.39607635206787, "grad_norm": 0.03298120200634003, "learning_rate": 1.9476430180368322e-07, "loss": 0.0009, "num_input_tokens_seen": 34829264, "step": 34695 }, { "epoch": 18.398727465535526, "grad_norm": 0.11918888241052628, "learning_rate": 1.9412539156022258e-07, "loss": 0.0038, "num_input_tokens_seen": 34835120, "step": 34700 }, { "epoch": 18.401378579003183, "grad_norm": 0.05557185038924217, "learning_rate": 1.9348751024193978e-07, "loss": 0.0005, "num_input_tokens_seen": 34839760, "step": 34705 }, { "epoch": 18.40402969247084, "grad_norm": 0.11173209547996521, "learning_rate": 1.9285065798540447e-07, "loss": 0.0386, "num_input_tokens_seen": 34844464, "step": 34710 }, { "epoch": 18.406680805938493, "grad_norm": 0.04970468580722809, "learning_rate": 1.922148349269637e-07, "loss": 0.021, "num_input_tokens_seen": 34849712, "step": 34715 }, { "epoch": 18.40933191940615, "grad_norm": 0.07025222480297089, "learning_rate": 1.9158004120274632e-07, "loss": 0.0006, "num_input_tokens_seen": 34853360, "step": 34720 }, { "epoch": 18.411983032873806, "grad_norm": 50.90599822998047, "learning_rate": 1.9094627694865918e-07, "loss": 0.0146, "num_input_tokens_seen": 34858096, "step": 34725 }, { "epoch": 18.414634146341463, "grad_norm": 0.0916537195444107, "learning_rate": 1.903135423003899e-07, "loss": 0.0004, "num_input_tokens_seen": 34863088, "step": 34730 }, { "epoch": 18.41728525980912, "grad_norm": 1.9616962671279907, "learning_rate": 1.896818373934073e-07, "loss": 0.0007, "num_input_tokens_seen": 34867216, "step": 34735 }, { "epoch": 18.419936373276776, "grad_norm": 0.03823867812752724, "learning_rate": 1.8905116236295375e-07, "loss": 0.1723, "num_input_tokens_seen": 34871984, "step": 34740 }, { "epoch": 18.422587486744433, "grad_norm": 0.033980440348386765, "learning_rate": 1.8842151734405844e-07, "loss": 0.0058, "num_input_tokens_seen": 34876656, "step": 34745 }, { "epoch": 18.42523860021209, "grad_norm": 0.11153030395507812, "learning_rate": 1.877929024715258e-07, "loss": 0.0006, "num_input_tokens_seen": 34881360, "step": 34750 }, { "epoch": 18.427889713679747, "grad_norm": 0.11797656118869781, "learning_rate": 1.8716531787994042e-07, "loss": 0.0003, "num_input_tokens_seen": 34885264, "step": 34755 }, { "epoch": 18.430540827147404, "grad_norm": 0.15488146245479584, "learning_rate": 1.8653876370366641e-07, "loss": 0.0004, "num_input_tokens_seen": 34890928, "step": 34760 }, { "epoch": 18.433191940615057, "grad_norm": 0.6536217927932739, "learning_rate": 1.8591324007684762e-07, "loss": 0.0005, "num_input_tokens_seen": 34896176, "step": 34765 }, { "epoch": 18.435843054082714, "grad_norm": 0.5523231029510498, "learning_rate": 1.8528874713340639e-07, "loss": 0.0002, "num_input_tokens_seen": 34901424, "step": 34770 }, { "epoch": 18.43849416755037, "grad_norm": 0.027913017198443413, "learning_rate": 1.8466528500704683e-07, "loss": 0.0002, "num_input_tokens_seen": 34906064, "step": 34775 }, { "epoch": 18.441145281018027, "grad_norm": 0.323184072971344, "learning_rate": 1.8404285383124775e-07, "loss": 0.0012, "num_input_tokens_seen": 34910448, "step": 34780 }, { "epoch": 18.443796394485684, "grad_norm": 0.027456680312752724, "learning_rate": 1.8342145373927256e-07, "loss": 0.0002, "num_input_tokens_seen": 34915376, "step": 34785 }, { "epoch": 18.44644750795334, "grad_norm": 0.29533886909484863, "learning_rate": 1.828010848641598e-07, "loss": 0.0017, "num_input_tokens_seen": 34920752, "step": 34790 }, { "epoch": 18.449098621420998, "grad_norm": 0.09936046600341797, "learning_rate": 1.8218174733872995e-07, "loss": 0.0005, "num_input_tokens_seen": 34925648, "step": 34795 }, { "epoch": 18.451749734888654, "grad_norm": 0.16122029721736908, "learning_rate": 1.8156344129558078e-07, "loss": 0.0003, "num_input_tokens_seen": 34930896, "step": 34800 }, { "epoch": 18.45440084835631, "grad_norm": 0.047885552048683167, "learning_rate": 1.8094616686709032e-07, "loss": 0.0003, "num_input_tokens_seen": 34935248, "step": 34805 }, { "epoch": 18.457051961823964, "grad_norm": 0.005566049367189407, "learning_rate": 1.803299241854156e-07, "loss": 0.0002, "num_input_tokens_seen": 34940912, "step": 34810 }, { "epoch": 18.45970307529162, "grad_norm": 0.17135192453861237, "learning_rate": 1.7971471338249224e-07, "loss": 0.0036, "num_input_tokens_seen": 34947088, "step": 34815 }, { "epoch": 18.462354188759278, "grad_norm": 2.478428363800049, "learning_rate": 1.791005345900354e-07, "loss": 0.0007, "num_input_tokens_seen": 34951728, "step": 34820 }, { "epoch": 18.465005302226935, "grad_norm": 0.30566632747650146, "learning_rate": 1.7848738793953824e-07, "loss": 0.0004, "num_input_tokens_seen": 34957200, "step": 34825 }, { "epoch": 18.46765641569459, "grad_norm": 0.030162177979946136, "learning_rate": 1.7787527356227575e-07, "loss": 0.0003, "num_input_tokens_seen": 34962032, "step": 34830 }, { "epoch": 18.47030752916225, "grad_norm": 0.025040697306394577, "learning_rate": 1.7726419158929808e-07, "loss": 0.0006, "num_input_tokens_seen": 34967792, "step": 34835 }, { "epoch": 18.472958642629905, "grad_norm": 0.07802180200815201, "learning_rate": 1.7665414215143784e-07, "loss": 0.141, "num_input_tokens_seen": 34972496, "step": 34840 }, { "epoch": 18.475609756097562, "grad_norm": 0.05666434019804001, "learning_rate": 1.7604512537930386e-07, "loss": 0.0103, "num_input_tokens_seen": 34977232, "step": 34845 }, { "epoch": 18.47826086956522, "grad_norm": 0.03094208985567093, "learning_rate": 1.7543714140328572e-07, "loss": 0.0016, "num_input_tokens_seen": 34981616, "step": 34850 }, { "epoch": 18.480911983032875, "grad_norm": 0.24600233137607574, "learning_rate": 1.7483019035355098e-07, "loss": 0.0004, "num_input_tokens_seen": 34988464, "step": 34855 }, { "epoch": 18.48356309650053, "grad_norm": 0.03652401641011238, "learning_rate": 1.742242723600468e-07, "loss": 0.0008, "num_input_tokens_seen": 34993264, "step": 34860 }, { "epoch": 18.486214209968185, "grad_norm": 0.07800775021314621, "learning_rate": 1.736193875524972e-07, "loss": 0.0535, "num_input_tokens_seen": 34997584, "step": 34865 }, { "epoch": 18.488865323435842, "grad_norm": 0.40812814235687256, "learning_rate": 1.7301553606040798e-07, "loss": 0.0004, "num_input_tokens_seen": 35002864, "step": 34870 }, { "epoch": 18.4915164369035, "grad_norm": 0.058048028498888016, "learning_rate": 1.724127180130608e-07, "loss": 0.1699, "num_input_tokens_seen": 35007152, "step": 34875 }, { "epoch": 18.494167550371156, "grad_norm": 99.39777374267578, "learning_rate": 1.7181093353951906e-07, "loss": 0.038, "num_input_tokens_seen": 35012208, "step": 34880 }, { "epoch": 18.496818663838813, "grad_norm": 0.02465776726603508, "learning_rate": 1.712101827686219e-07, "loss": 0.0012, "num_input_tokens_seen": 35016816, "step": 34885 }, { "epoch": 18.49946977730647, "grad_norm": 0.2798537313938141, "learning_rate": 1.7061046582898922e-07, "loss": 0.0044, "num_input_tokens_seen": 35022288, "step": 34890 }, { "epoch": 18.502120890774126, "grad_norm": 0.12476921081542969, "learning_rate": 1.7001178284901887e-07, "loss": 0.0005, "num_input_tokens_seen": 35026864, "step": 34895 }, { "epoch": 18.504772004241783, "grad_norm": 0.022746525704860687, "learning_rate": 1.6941413395688665e-07, "loss": 0.0001, "num_input_tokens_seen": 35031024, "step": 34900 }, { "epoch": 18.507423117709436, "grad_norm": 0.12023778259754181, "learning_rate": 1.688175192805469e-07, "loss": 0.0002, "num_input_tokens_seen": 35037328, "step": 34905 }, { "epoch": 18.510074231177093, "grad_norm": 0.2363535761833191, "learning_rate": 1.6822193894773632e-07, "loss": 0.0004, "num_input_tokens_seen": 35041616, "step": 34910 }, { "epoch": 18.51272534464475, "grad_norm": 0.28268900513648987, "learning_rate": 1.6762739308596343e-07, "loss": 0.0004, "num_input_tokens_seen": 35046896, "step": 34915 }, { "epoch": 18.515376458112407, "grad_norm": 0.20321938395500183, "learning_rate": 1.6703388182252146e-07, "loss": 0.0005, "num_input_tokens_seen": 35052176, "step": 34920 }, { "epoch": 18.518027571580063, "grad_norm": 0.45699554681777954, "learning_rate": 1.6644140528447873e-07, "loss": 0.0009, "num_input_tokens_seen": 35057392, "step": 34925 }, { "epoch": 18.52067868504772, "grad_norm": 0.117817223072052, "learning_rate": 1.6584996359868322e-07, "loss": 0.1192, "num_input_tokens_seen": 35061744, "step": 34930 }, { "epoch": 18.523329798515377, "grad_norm": 0.05800371989607811, "learning_rate": 1.6525955689176033e-07, "loss": 0.0008, "num_input_tokens_seen": 35066704, "step": 34935 }, { "epoch": 18.525980911983034, "grad_norm": 0.0400686040520668, "learning_rate": 1.6467018529011557e-07, "loss": 0.0003, "num_input_tokens_seen": 35071664, "step": 34940 }, { "epoch": 18.52863202545069, "grad_norm": 0.037918515503406525, "learning_rate": 1.6408184891993083e-07, "loss": 0.0286, "num_input_tokens_seen": 35076240, "step": 34945 }, { "epoch": 18.531283138918347, "grad_norm": 0.6878930330276489, "learning_rate": 1.6349454790716757e-07, "loss": 0.0036, "num_input_tokens_seen": 35081232, "step": 34950 }, { "epoch": 18.533934252386, "grad_norm": 0.3964201807975769, "learning_rate": 1.6290828237756685e-07, "loss": 0.0004, "num_input_tokens_seen": 35086992, "step": 34955 }, { "epoch": 18.536585365853657, "grad_norm": 7.736043930053711, "learning_rate": 1.6232305245664493e-07, "loss": 0.0017, "num_input_tokens_seen": 35092304, "step": 34960 }, { "epoch": 18.539236479321314, "grad_norm": 0.1734420359134674, "learning_rate": 1.6173885826969826e-07, "loss": 0.0002, "num_input_tokens_seen": 35096304, "step": 34965 }, { "epoch": 18.54188759278897, "grad_norm": 0.2361048460006714, "learning_rate": 1.611556999418018e-07, "loss": 0.0037, "num_input_tokens_seen": 35100976, "step": 34970 }, { "epoch": 18.544538706256628, "grad_norm": 0.16394557058811188, "learning_rate": 1.6057357759780845e-07, "loss": 0.0004, "num_input_tokens_seen": 35106032, "step": 34975 }, { "epoch": 18.547189819724284, "grad_norm": 0.07589258998632431, "learning_rate": 1.5999249136234852e-07, "loss": 0.0005, "num_input_tokens_seen": 35110928, "step": 34980 }, { "epoch": 18.54984093319194, "grad_norm": 2.6645395755767822, "learning_rate": 1.594124413598308e-07, "loss": 0.0011, "num_input_tokens_seen": 35117264, "step": 34985 }, { "epoch": 18.552492046659598, "grad_norm": 0.023346664384007454, "learning_rate": 1.5883342771444266e-07, "loss": 0.0028, "num_input_tokens_seen": 35122192, "step": 34990 }, { "epoch": 18.555143160127255, "grad_norm": 0.07027685642242432, "learning_rate": 1.5825545055015047e-07, "loss": 0.0013, "num_input_tokens_seen": 35127280, "step": 34995 }, { "epoch": 18.55779427359491, "grad_norm": 0.055769916623830795, "learning_rate": 1.5767850999069523e-07, "loss": 0.0003, "num_input_tokens_seen": 35132336, "step": 35000 }, { "epoch": 18.560445387062565, "grad_norm": 0.046856824308633804, "learning_rate": 1.5710260615960094e-07, "loss": 0.003, "num_input_tokens_seen": 35138064, "step": 35005 }, { "epoch": 18.56309650053022, "grad_norm": 0.058241281658411026, "learning_rate": 1.5652773918016617e-07, "loss": 0.0012, "num_input_tokens_seen": 35143696, "step": 35010 }, { "epoch": 18.56574761399788, "grad_norm": 0.287348210811615, "learning_rate": 1.559539091754686e-07, "loss": 0.0003, "num_input_tokens_seen": 35148624, "step": 35015 }, { "epoch": 18.568398727465535, "grad_norm": 0.006848040968179703, "learning_rate": 1.5538111626836273e-07, "loss": 0.0015, "num_input_tokens_seen": 35153072, "step": 35020 }, { "epoch": 18.571049840933192, "grad_norm": 0.30787375569343567, "learning_rate": 1.5480936058148322e-07, "loss": 0.0015, "num_input_tokens_seen": 35157232, "step": 35025 }, { "epoch": 18.57370095440085, "grad_norm": 0.8470640182495117, "learning_rate": 1.542386422372405e-07, "loss": 0.0012, "num_input_tokens_seen": 35161744, "step": 35030 }, { "epoch": 18.576352067868505, "grad_norm": 1.4730370044708252, "learning_rate": 1.5366896135782572e-07, "loss": 0.0102, "num_input_tokens_seen": 35166352, "step": 35035 }, { "epoch": 18.579003181336162, "grad_norm": 1.0774286985397339, "learning_rate": 1.5310031806520298e-07, "loss": 0.0004, "num_input_tokens_seen": 35170192, "step": 35040 }, { "epoch": 18.58165429480382, "grad_norm": 0.1844485104084015, "learning_rate": 1.525327124811199e-07, "loss": 0.0003, "num_input_tokens_seen": 35174736, "step": 35045 }, { "epoch": 18.584305408271476, "grad_norm": 0.794007420539856, "learning_rate": 1.5196614472709869e-07, "loss": 0.0023, "num_input_tokens_seen": 35179440, "step": 35050 }, { "epoch": 18.58695652173913, "grad_norm": 0.21912498772144318, "learning_rate": 1.5140061492444013e-07, "loss": 0.0004, "num_input_tokens_seen": 35184272, "step": 35055 }, { "epoch": 18.589607635206786, "grad_norm": 0.08309329301118851, "learning_rate": 1.508361231942218e-07, "loss": 0.0004, "num_input_tokens_seen": 35189840, "step": 35060 }, { "epoch": 18.592258748674443, "grad_norm": 0.19509738683700562, "learning_rate": 1.5027266965730093e-07, "loss": 0.0003, "num_input_tokens_seen": 35194128, "step": 35065 }, { "epoch": 18.5949098621421, "grad_norm": 0.06442868709564209, "learning_rate": 1.4971025443431099e-07, "loss": 0.0003, "num_input_tokens_seen": 35198032, "step": 35070 }, { "epoch": 18.597560975609756, "grad_norm": 80.73402404785156, "learning_rate": 1.4914887764566345e-07, "loss": 0.1131, "num_input_tokens_seen": 35202864, "step": 35075 }, { "epoch": 18.600212089077413, "grad_norm": 0.15652889013290405, "learning_rate": 1.4858853941154827e-07, "loss": 0.0004, "num_input_tokens_seen": 35208304, "step": 35080 }, { "epoch": 18.60286320254507, "grad_norm": 0.8548992276191711, "learning_rate": 1.4802923985193063e-07, "loss": 0.0012, "num_input_tokens_seen": 35213456, "step": 35085 }, { "epoch": 18.605514316012727, "grad_norm": 0.5022298097610474, "learning_rate": 1.4747097908655805e-07, "loss": 0.0011, "num_input_tokens_seen": 35218992, "step": 35090 }, { "epoch": 18.608165429480383, "grad_norm": 0.050531838089227676, "learning_rate": 1.4691375723494993e-07, "loss": 0.0002, "num_input_tokens_seen": 35224304, "step": 35095 }, { "epoch": 18.610816542948037, "grad_norm": 0.2352805882692337, "learning_rate": 1.4635757441640752e-07, "loss": 0.0005, "num_input_tokens_seen": 35229360, "step": 35100 }, { "epoch": 18.613467656415693, "grad_norm": 0.1159658133983612, "learning_rate": 1.4580243075000778e-07, "loss": 0.0002, "num_input_tokens_seen": 35234384, "step": 35105 }, { "epoch": 18.61611876988335, "grad_norm": 0.24781520664691925, "learning_rate": 1.4524832635460563e-07, "loss": 0.0004, "num_input_tokens_seen": 35238192, "step": 35110 }, { "epoch": 18.618769883351007, "grad_norm": 0.24108350276947021, "learning_rate": 1.4469526134883282e-07, "loss": 0.0003, "num_input_tokens_seen": 35242672, "step": 35115 }, { "epoch": 18.621420996818664, "grad_norm": 2.434784173965454, "learning_rate": 1.4414323585109969e-07, "loss": 0.0008, "num_input_tokens_seen": 35247024, "step": 35120 }, { "epoch": 18.62407211028632, "grad_norm": 0.3572576642036438, "learning_rate": 1.4359224997959277e-07, "loss": 0.0004, "num_input_tokens_seen": 35251440, "step": 35125 }, { "epoch": 18.626723223753977, "grad_norm": 0.024890022352337837, "learning_rate": 1.4304230385227823e-07, "loss": 0.0009, "num_input_tokens_seen": 35256368, "step": 35130 }, { "epoch": 18.629374337221634, "grad_norm": 0.004686796106398106, "learning_rate": 1.4249339758689639e-07, "loss": 0.0002, "num_input_tokens_seen": 35260784, "step": 35135 }, { "epoch": 18.63202545068929, "grad_norm": 0.07706810534000397, "learning_rate": 1.419455313009671e-07, "loss": 0.0004, "num_input_tokens_seen": 35266640, "step": 35140 }, { "epoch": 18.634676564156948, "grad_norm": 0.4850817918777466, "learning_rate": 1.4139870511178767e-07, "loss": 0.0037, "num_input_tokens_seen": 35270992, "step": 35145 }, { "epoch": 18.6373276776246, "grad_norm": 0.27616921067237854, "learning_rate": 1.4085291913643217e-07, "loss": 0.0004, "num_input_tokens_seen": 35275792, "step": 35150 }, { "epoch": 18.639978791092258, "grad_norm": 0.041073840111494064, "learning_rate": 1.4030817349175107e-07, "loss": 0.0005, "num_input_tokens_seen": 35281616, "step": 35155 }, { "epoch": 18.642629904559914, "grad_norm": 0.3151928186416626, "learning_rate": 1.3976446829437384e-07, "loss": 0.0004, "num_input_tokens_seen": 35286928, "step": 35160 }, { "epoch": 18.64528101802757, "grad_norm": 0.09218204766511917, "learning_rate": 1.3922180366070516e-07, "loss": 0.001, "num_input_tokens_seen": 35294256, "step": 35165 }, { "epoch": 18.647932131495228, "grad_norm": 0.1704653948545456, "learning_rate": 1.3868017970693037e-07, "loss": 0.0003, "num_input_tokens_seen": 35300912, "step": 35170 }, { "epoch": 18.650583244962885, "grad_norm": 0.03404321148991585, "learning_rate": 1.3813959654900677e-07, "loss": 0.0208, "num_input_tokens_seen": 35306960, "step": 35175 }, { "epoch": 18.65323435843054, "grad_norm": 0.27128222584724426, "learning_rate": 1.37600054302674e-07, "loss": 0.0006, "num_input_tokens_seen": 35311056, "step": 35180 }, { "epoch": 18.6558854718982, "grad_norm": 0.1943739354610443, "learning_rate": 1.3706155308344625e-07, "loss": 0.0002, "num_input_tokens_seen": 35315472, "step": 35185 }, { "epoch": 18.658536585365855, "grad_norm": 0.04858281463384628, "learning_rate": 1.3652409300661472e-07, "loss": 0.0002, "num_input_tokens_seen": 35323312, "step": 35190 }, { "epoch": 18.66118769883351, "grad_norm": 0.01831815391778946, "learning_rate": 1.3598767418724846e-07, "loss": 0.071, "num_input_tokens_seen": 35327696, "step": 35195 }, { "epoch": 18.663838812301165, "grad_norm": 0.582061767578125, "learning_rate": 1.3545229674019332e-07, "loss": 0.0003, "num_input_tokens_seen": 35331888, "step": 35200 }, { "epoch": 18.666489925768822, "grad_norm": 1.7346090078353882, "learning_rate": 1.3491796078007214e-07, "loss": 0.0006, "num_input_tokens_seen": 35337168, "step": 35205 }, { "epoch": 18.66914103923648, "grad_norm": 0.004081866703927517, "learning_rate": 1.3438466642128555e-07, "loss": 0.0003, "num_input_tokens_seen": 35342928, "step": 35210 }, { "epoch": 18.671792152704136, "grad_norm": 0.06329939514398575, "learning_rate": 1.3385241377800894e-07, "loss": 0.0006, "num_input_tokens_seen": 35347088, "step": 35215 }, { "epoch": 18.674443266171792, "grad_norm": 0.289583295583725, "learning_rate": 1.3332120296419727e-07, "loss": 0.1555, "num_input_tokens_seen": 35351664, "step": 35220 }, { "epoch": 18.67709437963945, "grad_norm": 0.13036903738975525, "learning_rate": 1.3279103409358174e-07, "loss": 0.0008, "num_input_tokens_seen": 35356432, "step": 35225 }, { "epoch": 18.679745493107106, "grad_norm": 0.510209321975708, "learning_rate": 1.3226190727966882e-07, "loss": 0.0005, "num_input_tokens_seen": 35360400, "step": 35230 }, { "epoch": 18.682396606574763, "grad_norm": 0.0807209238409996, "learning_rate": 1.3173382263574452e-07, "loss": 0.0001, "num_input_tokens_seen": 35365616, "step": 35235 }, { "epoch": 18.68504772004242, "grad_norm": 0.016091343015432358, "learning_rate": 1.312067802748701e-07, "loss": 0.0064, "num_input_tokens_seen": 35371920, "step": 35240 }, { "epoch": 18.687698833510073, "grad_norm": 58.94381332397461, "learning_rate": 1.3068078030988306e-07, "loss": 0.0186, "num_input_tokens_seen": 35377232, "step": 35245 }, { "epoch": 18.69034994697773, "grad_norm": 0.6522610783576965, "learning_rate": 1.301558228534e-07, "loss": 0.0082, "num_input_tokens_seen": 35382864, "step": 35250 }, { "epoch": 18.693001060445386, "grad_norm": 0.22188495099544525, "learning_rate": 1.2963190801781156e-07, "loss": 0.0006, "num_input_tokens_seen": 35387824, "step": 35255 }, { "epoch": 18.695652173913043, "grad_norm": 0.019523875787854195, "learning_rate": 1.291090359152869e-07, "loss": 0.0004, "num_input_tokens_seen": 35393456, "step": 35260 }, { "epoch": 18.6983032873807, "grad_norm": 0.12266885489225388, "learning_rate": 1.2858720665777259e-07, "loss": 0.0004, "num_input_tokens_seen": 35398256, "step": 35265 }, { "epoch": 18.700954400848357, "grad_norm": 0.39914780855178833, "learning_rate": 1.2806642035698925e-07, "loss": 0.107, "num_input_tokens_seen": 35404176, "step": 35270 }, { "epoch": 18.703605514316013, "grad_norm": 0.12981998920440674, "learning_rate": 1.2754667712443712e-07, "loss": 0.0002, "num_input_tokens_seen": 35409232, "step": 35275 }, { "epoch": 18.70625662778367, "grad_norm": 0.06649404764175415, "learning_rate": 1.2702797707139168e-07, "loss": 0.0021, "num_input_tokens_seen": 35414512, "step": 35280 }, { "epoch": 18.708907741251327, "grad_norm": 22.789478302001953, "learning_rate": 1.265103203089052e-07, "loss": 0.0082, "num_input_tokens_seen": 35418640, "step": 35285 }, { "epoch": 18.71155885471898, "grad_norm": 0.10457810014486313, "learning_rate": 1.259937069478062e-07, "loss": 0.0003, "num_input_tokens_seen": 35426640, "step": 35290 }, { "epoch": 18.714209968186637, "grad_norm": 0.11419461667537689, "learning_rate": 1.2547813709870073e-07, "loss": 0.0003, "num_input_tokens_seen": 35431216, "step": 35295 }, { "epoch": 18.716861081654294, "grad_norm": 0.05913982540369034, "learning_rate": 1.2496361087196983e-07, "loss": 0.0007, "num_input_tokens_seen": 35437392, "step": 35300 }, { "epoch": 18.71951219512195, "grad_norm": 0.1255359798669815, "learning_rate": 1.2445012837777491e-07, "loss": 0.0002, "num_input_tokens_seen": 35442352, "step": 35305 }, { "epoch": 18.722163308589607, "grad_norm": 0.02712845802307129, "learning_rate": 1.2393768972604802e-07, "loss": 0.0647, "num_input_tokens_seen": 35446864, "step": 35310 }, { "epoch": 18.724814422057264, "grad_norm": 0.01869412511587143, "learning_rate": 1.2342629502650305e-07, "loss": 0.0013, "num_input_tokens_seen": 35451504, "step": 35315 }, { "epoch": 18.72746553552492, "grad_norm": 0.24948187172412872, "learning_rate": 1.2291594438862742e-07, "loss": 0.0004, "num_input_tokens_seen": 35456368, "step": 35320 }, { "epoch": 18.730116648992578, "grad_norm": 0.18194647133350372, "learning_rate": 1.2240663792168596e-07, "loss": 0.0002, "num_input_tokens_seen": 35461104, "step": 35325 }, { "epoch": 18.732767762460234, "grad_norm": 0.0021061748266220093, "learning_rate": 1.218983757347203e-07, "loss": 0.0001, "num_input_tokens_seen": 35465936, "step": 35330 }, { "epoch": 18.73541887592789, "grad_norm": 0.027499616146087646, "learning_rate": 1.2139115793654788e-07, "loss": 0.0041, "num_input_tokens_seen": 35471696, "step": 35335 }, { "epoch": 18.738069989395544, "grad_norm": 0.011785385198891163, "learning_rate": 1.2088498463576126e-07, "loss": 0.0003, "num_input_tokens_seen": 35475984, "step": 35340 }, { "epoch": 18.7407211028632, "grad_norm": 0.12631571292877197, "learning_rate": 1.2037985594073377e-07, "loss": 0.0005, "num_input_tokens_seen": 35480656, "step": 35345 }, { "epoch": 18.743372216330858, "grad_norm": 0.09414409846067429, "learning_rate": 1.1987577195960942e-07, "loss": 0.0003, "num_input_tokens_seen": 35485712, "step": 35350 }, { "epoch": 18.746023329798515, "grad_norm": 0.18629106879234314, "learning_rate": 1.1937273280031246e-07, "loss": 0.0007, "num_input_tokens_seen": 35490928, "step": 35355 }, { "epoch": 18.74867444326617, "grad_norm": 0.15400545299053192, "learning_rate": 1.1887073857054232e-07, "loss": 0.0004, "num_input_tokens_seen": 35495984, "step": 35360 }, { "epoch": 18.75132555673383, "grad_norm": 0.10598859190940857, "learning_rate": 1.183697893777741e-07, "loss": 0.0025, "num_input_tokens_seen": 35501200, "step": 35365 }, { "epoch": 18.753976670201485, "grad_norm": 0.020732497796416283, "learning_rate": 1.178698853292598e-07, "loss": 0.0773, "num_input_tokens_seen": 35507824, "step": 35370 }, { "epoch": 18.756627783669142, "grad_norm": 0.3130567669868469, "learning_rate": 1.1737102653202825e-07, "loss": 0.0002, "num_input_tokens_seen": 35513808, "step": 35375 }, { "epoch": 18.7592788971368, "grad_norm": 0.12947732210159302, "learning_rate": 1.1687321309288346e-07, "loss": 0.0003, "num_input_tokens_seen": 35519248, "step": 35380 }, { "epoch": 18.761930010604456, "grad_norm": 1.1235274076461792, "learning_rate": 1.1637644511840574e-07, "loss": 0.0006, "num_input_tokens_seen": 35525264, "step": 35385 }, { "epoch": 18.76458112407211, "grad_norm": 0.01754387468099594, "learning_rate": 1.1588072271495222e-07, "loss": 0.0003, "num_input_tokens_seen": 35529424, "step": 35390 }, { "epoch": 18.767232237539766, "grad_norm": 0.45980823040008545, "learning_rate": 1.1538604598865466e-07, "loss": 0.0005, "num_input_tokens_seen": 35534480, "step": 35395 }, { "epoch": 18.769883351007422, "grad_norm": 0.06364826112985611, "learning_rate": 1.1489241504542392e-07, "loss": 0.0005, "num_input_tokens_seen": 35538928, "step": 35400 }, { "epoch": 18.77253446447508, "grad_norm": 0.06750509142875671, "learning_rate": 1.1439982999094323e-07, "loss": 0.0003, "num_input_tokens_seen": 35543440, "step": 35405 }, { "epoch": 18.775185577942736, "grad_norm": 0.4768710136413574, "learning_rate": 1.139082909306749e-07, "loss": 0.001, "num_input_tokens_seen": 35548592, "step": 35410 }, { "epoch": 18.777836691410393, "grad_norm": 0.6781412959098816, "learning_rate": 1.1341779796985641e-07, "loss": 0.0009, "num_input_tokens_seen": 35554224, "step": 35415 }, { "epoch": 18.78048780487805, "grad_norm": 0.04614400118589401, "learning_rate": 1.1292835121349988e-07, "loss": 0.0018, "num_input_tokens_seen": 35558192, "step": 35420 }, { "epoch": 18.783138918345706, "grad_norm": 0.8714584708213806, "learning_rate": 1.1243995076639535e-07, "loss": 0.0031, "num_input_tokens_seen": 35562800, "step": 35425 }, { "epoch": 18.785790031813363, "grad_norm": 0.00795360654592514, "learning_rate": 1.119525967331081e-07, "loss": 0.0003, "num_input_tokens_seen": 35567696, "step": 35430 }, { "epoch": 18.78844114528102, "grad_norm": 4.8584465980529785, "learning_rate": 1.1146628921797853e-07, "loss": 0.0304, "num_input_tokens_seen": 35572464, "step": 35435 }, { "epoch": 18.791092258748673, "grad_norm": 0.1519463062286377, "learning_rate": 1.1098102832512558e-07, "loss": 0.0003, "num_input_tokens_seen": 35577904, "step": 35440 }, { "epoch": 18.79374337221633, "grad_norm": 1.3057538270950317, "learning_rate": 1.1049681415844004e-07, "loss": 0.0007, "num_input_tokens_seen": 35583344, "step": 35445 }, { "epoch": 18.796394485683987, "grad_norm": 0.024865521118044853, "learning_rate": 1.1001364682159232e-07, "loss": 0.0002, "num_input_tokens_seen": 35588112, "step": 35450 }, { "epoch": 18.799045599151643, "grad_norm": 0.0439445823431015, "learning_rate": 1.0953152641802744e-07, "loss": 0.0815, "num_input_tokens_seen": 35592144, "step": 35455 }, { "epoch": 18.8016967126193, "grad_norm": 68.63256072998047, "learning_rate": 1.0905045305096562e-07, "loss": 0.0147, "num_input_tokens_seen": 35597552, "step": 35460 }, { "epoch": 18.804347826086957, "grad_norm": 0.07049928605556488, "learning_rate": 1.0857042682340336e-07, "loss": 0.0016, "num_input_tokens_seen": 35601680, "step": 35465 }, { "epoch": 18.806998939554614, "grad_norm": 0.42023414373397827, "learning_rate": 1.0809144783811287e-07, "loss": 0.0006, "num_input_tokens_seen": 35606512, "step": 35470 }, { "epoch": 18.80965005302227, "grad_norm": 0.015887653455138206, "learning_rate": 1.0761351619764216e-07, "loss": 0.0002, "num_input_tokens_seen": 35612176, "step": 35475 }, { "epoch": 18.812301166489927, "grad_norm": 0.011526861228048801, "learning_rate": 1.0713663200431712e-07, "loss": 0.0002, "num_input_tokens_seen": 35616560, "step": 35480 }, { "epoch": 18.81495227995758, "grad_norm": 2.1166694164276123, "learning_rate": 1.0666079536023443e-07, "loss": 0.001, "num_input_tokens_seen": 35621968, "step": 35485 }, { "epoch": 18.817603393425237, "grad_norm": 0.4420107901096344, "learning_rate": 1.0618600636727095e-07, "loss": 0.0004, "num_input_tokens_seen": 35626480, "step": 35490 }, { "epoch": 18.820254506892894, "grad_norm": 0.09829074889421463, "learning_rate": 1.0571226512707867e-07, "loss": 0.0082, "num_input_tokens_seen": 35631856, "step": 35495 }, { "epoch": 18.82290562036055, "grad_norm": 0.3711022734642029, "learning_rate": 1.0523957174108257e-07, "loss": 0.0006, "num_input_tokens_seen": 35636400, "step": 35500 }, { "epoch": 18.825556733828208, "grad_norm": 0.03800485283136368, "learning_rate": 1.0476792631048616e-07, "loss": 0.0009, "num_input_tokens_seen": 35642480, "step": 35505 }, { "epoch": 18.828207847295864, "grad_norm": 0.008043020963668823, "learning_rate": 1.0429732893626698e-07, "loss": 0.0043, "num_input_tokens_seen": 35646736, "step": 35510 }, { "epoch": 18.83085896076352, "grad_norm": 0.4522470533847809, "learning_rate": 1.038277797191789e-07, "loss": 0.0003, "num_input_tokens_seen": 35652464, "step": 35515 }, { "epoch": 18.833510074231178, "grad_norm": 0.05017547309398651, "learning_rate": 1.0335927875975094e-07, "loss": 0.0005, "num_input_tokens_seen": 35657680, "step": 35520 }, { "epoch": 18.836161187698835, "grad_norm": 0.04035225510597229, "learning_rate": 1.0289182615828896e-07, "loss": 0.0002, "num_input_tokens_seen": 35662416, "step": 35525 }, { "epoch": 18.83881230116649, "grad_norm": 0.05426698550581932, "learning_rate": 1.0242542201487182e-07, "loss": 0.0003, "num_input_tokens_seen": 35668784, "step": 35530 }, { "epoch": 18.841463414634145, "grad_norm": 0.01921435259282589, "learning_rate": 1.0196006642935685e-07, "loss": 0.0006, "num_input_tokens_seen": 35672624, "step": 35535 }, { "epoch": 18.8441145281018, "grad_norm": 0.11485132575035095, "learning_rate": 1.0149575950137436e-07, "loss": 0.0003, "num_input_tokens_seen": 35678672, "step": 35540 }, { "epoch": 18.84676564156946, "grad_norm": 0.11557607352733612, "learning_rate": 1.0103250133033149e-07, "loss": 0.0003, "num_input_tokens_seen": 35684080, "step": 35545 }, { "epoch": 18.849416755037115, "grad_norm": 0.017445877194404602, "learning_rate": 1.0057029201541114e-07, "loss": 0.0774, "num_input_tokens_seen": 35689232, "step": 35550 }, { "epoch": 18.852067868504772, "grad_norm": 0.1476573795080185, "learning_rate": 1.001091316555708e-07, "loss": 0.0002, "num_input_tokens_seen": 35693968, "step": 35555 }, { "epoch": 18.85471898197243, "grad_norm": 0.5463324785232544, "learning_rate": 9.964902034954315e-08, "loss": 0.0002, "num_input_tokens_seen": 35698992, "step": 35560 }, { "epoch": 18.857370095440086, "grad_norm": 0.5244244337081909, "learning_rate": 9.918995819583776e-08, "loss": 0.0003, "num_input_tokens_seen": 35704208, "step": 35565 }, { "epoch": 18.860021208907742, "grad_norm": 0.24025343358516693, "learning_rate": 9.873194529273767e-08, "loss": 0.0004, "num_input_tokens_seen": 35708944, "step": 35570 }, { "epoch": 18.8626723223754, "grad_norm": 0.12838207185268402, "learning_rate": 9.82749817383033e-08, "loss": 0.0006, "num_input_tokens_seen": 35713648, "step": 35575 }, { "epoch": 18.865323435843052, "grad_norm": 0.1440495252609253, "learning_rate": 9.781906763036863e-08, "loss": 0.0004, "num_input_tokens_seen": 35720304, "step": 35580 }, { "epoch": 18.86797454931071, "grad_norm": 0.23629292845726013, "learning_rate": 9.736420306654393e-08, "loss": 0.0004, "num_input_tokens_seen": 35724976, "step": 35585 }, { "epoch": 18.870625662778366, "grad_norm": 0.11501304805278778, "learning_rate": 9.69103881442146e-08, "loss": 0.0003, "num_input_tokens_seen": 35729968, "step": 35590 }, { "epoch": 18.873276776246023, "grad_norm": 0.07956239581108093, "learning_rate": 9.645762296054129e-08, "loss": 0.0005, "num_input_tokens_seen": 35735504, "step": 35595 }, { "epoch": 18.87592788971368, "grad_norm": 0.17858950793743134, "learning_rate": 9.600590761245865e-08, "loss": 0.0005, "num_input_tokens_seen": 35740016, "step": 35600 }, { "epoch": 18.878579003181336, "grad_norm": 0.014265432953834534, "learning_rate": 9.555524219667989e-08, "loss": 0.0006, "num_input_tokens_seen": 35745008, "step": 35605 }, { "epoch": 18.881230116648993, "grad_norm": 0.10393023490905762, "learning_rate": 9.510562680968893e-08, "loss": 0.0003, "num_input_tokens_seen": 35750128, "step": 35610 }, { "epoch": 18.88388123011665, "grad_norm": 10.869929313659668, "learning_rate": 9.465706154774933e-08, "loss": 0.0032, "num_input_tokens_seen": 35755120, "step": 35615 }, { "epoch": 18.886532343584307, "grad_norm": 0.6036926507949829, "learning_rate": 9.420954650689706e-08, "loss": 0.0006, "num_input_tokens_seen": 35760080, "step": 35620 }, { "epoch": 18.889183457051963, "grad_norm": 0.7888486981391907, "learning_rate": 9.376308178294324e-08, "loss": 0.0003, "num_input_tokens_seen": 35764784, "step": 35625 }, { "epoch": 18.891834570519617, "grad_norm": 14.807917594909668, "learning_rate": 9.33176674714753e-08, "loss": 0.0053, "num_input_tokens_seen": 35769872, "step": 35630 }, { "epoch": 18.894485683987273, "grad_norm": 0.04058544710278511, "learning_rate": 9.287330366785529e-08, "loss": 0.0002, "num_input_tokens_seen": 35774736, "step": 35635 }, { "epoch": 18.89713679745493, "grad_norm": 0.03862453252077103, "learning_rate": 9.242999046722045e-08, "loss": 0.0001, "num_input_tokens_seen": 35779664, "step": 35640 }, { "epoch": 18.899787910922587, "grad_norm": 0.029505446553230286, "learning_rate": 9.198772796448263e-08, "loss": 0.0003, "num_input_tokens_seen": 35784272, "step": 35645 }, { "epoch": 18.902439024390244, "grad_norm": 0.030078640207648277, "learning_rate": 9.154651625432942e-08, "loss": 0.0004, "num_input_tokens_seen": 35790224, "step": 35650 }, { "epoch": 18.9050901378579, "grad_norm": 0.18892130255699158, "learning_rate": 9.110635543122249e-08, "loss": 0.0018, "num_input_tokens_seen": 35794768, "step": 35655 }, { "epoch": 18.907741251325557, "grad_norm": 0.06252609938383102, "learning_rate": 9.066724558940032e-08, "loss": 0.0008, "num_input_tokens_seen": 35799184, "step": 35660 }, { "epoch": 18.910392364793214, "grad_norm": 0.09751640260219574, "learning_rate": 9.022918682287385e-08, "loss": 0.0004, "num_input_tokens_seen": 35804304, "step": 35665 }, { "epoch": 18.91304347826087, "grad_norm": 0.00882764533162117, "learning_rate": 8.979217922543138e-08, "loss": 0.0497, "num_input_tokens_seen": 35808912, "step": 35670 }, { "epoch": 18.915694591728524, "grad_norm": 0.15895606577396393, "learning_rate": 8.935622289063417e-08, "loss": 0.0011, "num_input_tokens_seen": 35814032, "step": 35675 }, { "epoch": 18.91834570519618, "grad_norm": 0.02980027161538601, "learning_rate": 8.892131791182034e-08, "loss": 0.0002, "num_input_tokens_seen": 35818672, "step": 35680 }, { "epoch": 18.920996818663838, "grad_norm": 2.655317544937134, "learning_rate": 8.848746438210099e-08, "loss": 0.0014, "num_input_tokens_seen": 35822192, "step": 35685 }, { "epoch": 18.923647932131495, "grad_norm": 0.01590161770582199, "learning_rate": 8.805466239436345e-08, "loss": 0.0003, "num_input_tokens_seen": 35828432, "step": 35690 }, { "epoch": 18.92629904559915, "grad_norm": 0.7522031664848328, "learning_rate": 8.762291204126915e-08, "loss": 0.0003, "num_input_tokens_seen": 35834160, "step": 35695 }, { "epoch": 18.928950159066808, "grad_norm": 0.26890578866004944, "learning_rate": 8.71922134152553e-08, "loss": 0.0081, "num_input_tokens_seen": 35838384, "step": 35700 }, { "epoch": 18.931601272534465, "grad_norm": 0.7364397644996643, "learning_rate": 8.676256660853311e-08, "loss": 0.0007, "num_input_tokens_seen": 35843536, "step": 35705 }, { "epoch": 18.93425238600212, "grad_norm": 0.12578918039798737, "learning_rate": 8.633397171308844e-08, "loss": 0.0004, "num_input_tokens_seen": 35848080, "step": 35710 }, { "epoch": 18.93690349946978, "grad_norm": 0.2769991159439087, "learning_rate": 8.59064288206829e-08, "loss": 0.0002, "num_input_tokens_seen": 35852176, "step": 35715 }, { "epoch": 18.939554612937435, "grad_norm": 0.05005548894405365, "learning_rate": 8.547993802285215e-08, "loss": 0.0006, "num_input_tokens_seen": 35857552, "step": 35720 }, { "epoch": 18.94220572640509, "grad_norm": 0.027115315198898315, "learning_rate": 8.50544994109065e-08, "loss": 0.0019, "num_input_tokens_seen": 35862800, "step": 35725 }, { "epoch": 18.944856839872745, "grad_norm": 0.2986842691898346, "learning_rate": 8.463011307593139e-08, "loss": 0.0005, "num_input_tokens_seen": 35869040, "step": 35730 }, { "epoch": 18.947507953340402, "grad_norm": 0.10971657931804657, "learning_rate": 8.420677910878639e-08, "loss": 0.0005, "num_input_tokens_seen": 35874192, "step": 35735 }, { "epoch": 18.95015906680806, "grad_norm": 0.07157831639051437, "learning_rate": 8.378449760010787e-08, "loss": 0.0002, "num_input_tokens_seen": 35879152, "step": 35740 }, { "epoch": 18.952810180275716, "grad_norm": 0.1287173330783844, "learning_rate": 8.336326864030297e-08, "loss": 0.018, "num_input_tokens_seen": 35885104, "step": 35745 }, { "epoch": 18.955461293743372, "grad_norm": 0.01938123069703579, "learning_rate": 8.294309231955678e-08, "loss": 0.0149, "num_input_tokens_seen": 35890576, "step": 35750 }, { "epoch": 18.95811240721103, "grad_norm": 0.1317453384399414, "learning_rate": 8.252396872782786e-08, "loss": 0.0015, "num_input_tokens_seen": 35894160, "step": 35755 }, { "epoch": 18.960763520678686, "grad_norm": 0.12449346482753754, "learning_rate": 8.210589795485003e-08, "loss": 0.0209, "num_input_tokens_seen": 35899120, "step": 35760 }, { "epoch": 18.963414634146343, "grad_norm": 0.19189992547035217, "learning_rate": 8.168888009013054e-08, "loss": 0.0003, "num_input_tokens_seen": 35904496, "step": 35765 }, { "epoch": 18.966065747614, "grad_norm": 0.047307420521974564, "learning_rate": 8.127291522295133e-08, "loss": 0.0003, "num_input_tokens_seen": 35909968, "step": 35770 }, { "epoch": 18.968716861081653, "grad_norm": 0.03393010050058365, "learning_rate": 8.08580034423706e-08, "loss": 0.0009, "num_input_tokens_seen": 35914384, "step": 35775 }, { "epoch": 18.97136797454931, "grad_norm": 0.09614620357751846, "learning_rate": 8.044414483721897e-08, "loss": 0.0003, "num_input_tokens_seen": 35918512, "step": 35780 }, { "epoch": 18.974019088016966, "grad_norm": 0.11257315427064896, "learning_rate": 8.003133949610276e-08, "loss": 0.0003, "num_input_tokens_seen": 35923088, "step": 35785 }, { "epoch": 18.976670201484623, "grad_norm": 0.506405770778656, "learning_rate": 7.961958750740184e-08, "loss": 0.0003, "num_input_tokens_seen": 35928304, "step": 35790 }, { "epoch": 18.97932131495228, "grad_norm": 0.04936280846595764, "learning_rate": 7.920888895927292e-08, "loss": 0.0003, "num_input_tokens_seen": 35932656, "step": 35795 }, { "epoch": 18.981972428419937, "grad_norm": 2.0677921772003174, "learning_rate": 7.8799243939644e-08, "loss": 0.0022, "num_input_tokens_seen": 35937552, "step": 35800 }, { "epoch": 18.984623541887593, "grad_norm": 1.2111557722091675, "learning_rate": 7.839065253621935e-08, "loss": 0.0017, "num_input_tokens_seen": 35942448, "step": 35805 }, { "epoch": 18.98727465535525, "grad_norm": 0.020794058218598366, "learning_rate": 7.798311483647792e-08, "loss": 0.0016, "num_input_tokens_seen": 35947632, "step": 35810 }, { "epoch": 18.989925768822907, "grad_norm": 0.17546528577804565, "learning_rate": 7.757663092767153e-08, "loss": 0.01, "num_input_tokens_seen": 35952624, "step": 35815 }, { "epoch": 18.992576882290564, "grad_norm": 0.10870873183012009, "learning_rate": 7.717120089682784e-08, "loss": 0.0012, "num_input_tokens_seen": 35957264, "step": 35820 }, { "epoch": 18.995227995758217, "grad_norm": 0.005649686325341463, "learning_rate": 7.676682483074849e-08, "loss": 0.0004, "num_input_tokens_seen": 35962544, "step": 35825 }, { "epoch": 18.997879109225874, "grad_norm": 9.3494291305542, "learning_rate": 7.636350281600869e-08, "loss": 0.0021, "num_input_tokens_seen": 35968496, "step": 35830 }, { "epoch": 19.00053022269353, "grad_norm": 0.17800436913967133, "learning_rate": 7.59612349389599e-08, "loss": 0.0002, "num_input_tokens_seen": 35973072, "step": 35835 }, { "epoch": 19.003181336161187, "grad_norm": 0.02031058818101883, "learning_rate": 7.556002128572493e-08, "loss": 0.0145, "num_input_tokens_seen": 35977712, "step": 35840 }, { "epoch": 19.005832449628844, "grad_norm": 0.011374542489647865, "learning_rate": 7.515986194220393e-08, "loss": 0.0003, "num_input_tokens_seen": 35982640, "step": 35845 }, { "epoch": 19.0084835630965, "grad_norm": 0.2696273922920227, "learning_rate": 7.476075699406948e-08, "loss": 0.0002, "num_input_tokens_seen": 35988688, "step": 35850 }, { "epoch": 19.011134676564158, "grad_norm": 0.16870492696762085, "learning_rate": 7.43627065267688e-08, "loss": 0.0002, "num_input_tokens_seen": 35993680, "step": 35855 }, { "epoch": 19.013785790031815, "grad_norm": 0.07774430513381958, "learning_rate": 7.396571062552372e-08, "loss": 0.0006, "num_input_tokens_seen": 35998672, "step": 35860 }, { "epoch": 19.01643690349947, "grad_norm": 0.049422260373830795, "learning_rate": 7.356976937532956e-08, "loss": 0.0018, "num_input_tokens_seen": 36003024, "step": 35865 }, { "epoch": 19.019088016967125, "grad_norm": 0.32880014181137085, "learning_rate": 7.317488286095686e-08, "loss": 0.0016, "num_input_tokens_seen": 36006832, "step": 35870 }, { "epoch": 19.02173913043478, "grad_norm": 0.03607459366321564, "learning_rate": 7.278105116694967e-08, "loss": 0.0004, "num_input_tokens_seen": 36011632, "step": 35875 }, { "epoch": 19.024390243902438, "grad_norm": 0.05539098009467125, "learning_rate": 7.238827437762553e-08, "loss": 0.0001, "num_input_tokens_seen": 36017008, "step": 35880 }, { "epoch": 19.027041357370095, "grad_norm": 0.06413518637418747, "learning_rate": 7.199655257707828e-08, "loss": 0.0002, "num_input_tokens_seen": 36021616, "step": 35885 }, { "epoch": 19.02969247083775, "grad_norm": 0.0028415219858288765, "learning_rate": 7.16058858491736e-08, "loss": 0.0003, "num_input_tokens_seen": 36026768, "step": 35890 }, { "epoch": 19.03234358430541, "grad_norm": 0.11906395107507706, "learning_rate": 7.12162742775524e-08, "loss": 0.0004, "num_input_tokens_seen": 36031184, "step": 35895 }, { "epoch": 19.034994697773065, "grad_norm": 0.10433874279260635, "learning_rate": 7.082771794562959e-08, "loss": 0.0002, "num_input_tokens_seen": 36035728, "step": 35900 }, { "epoch": 19.037645811240722, "grad_norm": 0.005699817091226578, "learning_rate": 7.044021693659419e-08, "loss": 0.0005, "num_input_tokens_seen": 36040752, "step": 35905 }, { "epoch": 19.04029692470838, "grad_norm": 0.31805914640426636, "learning_rate": 7.005377133340874e-08, "loss": 0.0004, "num_input_tokens_seen": 36045616, "step": 35910 }, { "epoch": 19.042948038176036, "grad_norm": 0.4892261326313019, "learning_rate": 6.966838121881036e-08, "loss": 0.0006, "num_input_tokens_seen": 36049968, "step": 35915 }, { "epoch": 19.04559915164369, "grad_norm": 0.2417658269405365, "learning_rate": 6.92840466753103e-08, "loss": 0.0003, "num_input_tokens_seen": 36055760, "step": 35920 }, { "epoch": 19.048250265111346, "grad_norm": 0.16128304600715637, "learning_rate": 6.890076778519327e-08, "loss": 0.0002, "num_input_tokens_seen": 36060592, "step": 35925 }, { "epoch": 19.050901378579002, "grad_norm": 0.12544119358062744, "learning_rate": 6.851854463051921e-08, "loss": 0.0288, "num_input_tokens_seen": 36066416, "step": 35930 }, { "epoch": 19.05355249204666, "grad_norm": 1.0236440896987915, "learning_rate": 6.813737729311987e-08, "loss": 0.0005, "num_input_tokens_seen": 36071664, "step": 35935 }, { "epoch": 19.056203605514316, "grad_norm": 0.4399047791957855, "learning_rate": 6.775726585460218e-08, "loss": 0.0003, "num_input_tokens_seen": 36077328, "step": 35940 }, { "epoch": 19.058854718981973, "grad_norm": 0.4613317549228668, "learning_rate": 6.737821039634829e-08, "loss": 0.0004, "num_input_tokens_seen": 36081776, "step": 35945 }, { "epoch": 19.06150583244963, "grad_norm": 0.25252676010131836, "learning_rate": 6.700021099951215e-08, "loss": 0.0003, "num_input_tokens_seen": 36088208, "step": 35950 }, { "epoch": 19.064156945917286, "grad_norm": 0.30936405062675476, "learning_rate": 6.662326774502181e-08, "loss": 0.0003, "num_input_tokens_seen": 36092336, "step": 35955 }, { "epoch": 19.066808059384943, "grad_norm": 0.0935080423951149, "learning_rate": 6.624738071358105e-08, "loss": 0.033, "num_input_tokens_seen": 36097712, "step": 35960 }, { "epoch": 19.069459172852596, "grad_norm": 0.08289753645658493, "learning_rate": 6.58725499856655e-08, "loss": 0.0001, "num_input_tokens_seen": 36102096, "step": 35965 }, { "epoch": 19.072110286320253, "grad_norm": 0.012795647606253624, "learning_rate": 6.549877564152596e-08, "loss": 0.0004, "num_input_tokens_seen": 36108752, "step": 35970 }, { "epoch": 19.07476139978791, "grad_norm": 0.15753903985023499, "learning_rate": 6.512605776118563e-08, "loss": 0.0002, "num_input_tokens_seen": 36114224, "step": 35975 }, { "epoch": 19.077412513255567, "grad_norm": 0.01873520016670227, "learning_rate": 6.475439642444403e-08, "loss": 0.0002, "num_input_tokens_seen": 36118864, "step": 35980 }, { "epoch": 19.080063626723224, "grad_norm": 0.4563101828098297, "learning_rate": 6.43837917108714e-08, "loss": 0.0004, "num_input_tokens_seen": 36124176, "step": 35985 }, { "epoch": 19.08271474019088, "grad_norm": 0.5309697985649109, "learning_rate": 6.401424369981369e-08, "loss": 0.0025, "num_input_tokens_seen": 36129872, "step": 35990 }, { "epoch": 19.085365853658537, "grad_norm": 0.11451680213212967, "learning_rate": 6.36457524703904e-08, "loss": 0.0004, "num_input_tokens_seen": 36135248, "step": 35995 }, { "epoch": 19.088016967126194, "grad_norm": 0.03381827846169472, "learning_rate": 6.327831810149454e-08, "loss": 0.0003, "num_input_tokens_seen": 36139664, "step": 36000 }, { "epoch": 19.09066808059385, "grad_norm": 0.007747433613985777, "learning_rate": 6.291194067179207e-08, "loss": 0.0003, "num_input_tokens_seen": 36144912, "step": 36005 }, { "epoch": 19.093319194061507, "grad_norm": 1.1993694305419922, "learning_rate": 6.254662025972469e-08, "loss": 0.0537, "num_input_tokens_seen": 36149488, "step": 36010 }, { "epoch": 19.09597030752916, "grad_norm": 0.21734382212162018, "learning_rate": 6.218235694350538e-08, "loss": 0.0005, "num_input_tokens_seen": 36154000, "step": 36015 }, { "epoch": 19.098621420996817, "grad_norm": 0.01935366727411747, "learning_rate": 6.181915080112233e-08, "loss": 0.0004, "num_input_tokens_seen": 36158512, "step": 36020 }, { "epoch": 19.101272534464474, "grad_norm": 0.11276594549417496, "learning_rate": 6.145700191033776e-08, "loss": 0.0008, "num_input_tokens_seen": 36164496, "step": 36025 }, { "epoch": 19.10392364793213, "grad_norm": 0.32020366191864014, "learning_rate": 6.109591034868634e-08, "loss": 0.0002, "num_input_tokens_seen": 36168656, "step": 36030 }, { "epoch": 19.106574761399788, "grad_norm": 0.4305779039859772, "learning_rate": 6.07358761934762e-08, "loss": 0.0003, "num_input_tokens_seen": 36173744, "step": 36035 }, { "epoch": 19.109225874867445, "grad_norm": 0.1057097390294075, "learning_rate": 6.037689952179071e-08, "loss": 0.0006, "num_input_tokens_seen": 36178736, "step": 36040 }, { "epoch": 19.1118769883351, "grad_norm": 0.25160452723503113, "learning_rate": 6.001898041048504e-08, "loss": 0.107, "num_input_tokens_seen": 36184208, "step": 36045 }, { "epoch": 19.114528101802758, "grad_norm": 0.011153261177241802, "learning_rate": 5.9662118936189e-08, "loss": 0.0006, "num_input_tokens_seen": 36189616, "step": 36050 }, { "epoch": 19.117179215270415, "grad_norm": 0.10465963929891586, "learning_rate": 5.9306315175305386e-08, "loss": 0.0023, "num_input_tokens_seen": 36195312, "step": 36055 }, { "epoch": 19.11983032873807, "grad_norm": 0.25828227400779724, "learning_rate": 5.8951569204011574e-08, "loss": 0.0002, "num_input_tokens_seen": 36200784, "step": 36060 }, { "epoch": 19.122481442205725, "grad_norm": 0.2339617908000946, "learning_rate": 5.8597881098257924e-08, "loss": 0.0003, "num_input_tokens_seen": 36205968, "step": 36065 }, { "epoch": 19.12513255567338, "grad_norm": 0.38830292224884033, "learning_rate": 5.824525093376665e-08, "loss": 0.0004, "num_input_tokens_seen": 36211024, "step": 36070 }, { "epoch": 19.12778366914104, "grad_norm": 0.0886198952794075, "learning_rate": 5.7893678786036244e-08, "loss": 0.013, "num_input_tokens_seen": 36215024, "step": 36075 }, { "epoch": 19.130434782608695, "grad_norm": 0.008599948137998581, "learning_rate": 5.754316473033705e-08, "loss": 0.0006, "num_input_tokens_seen": 36219728, "step": 36080 }, { "epoch": 19.133085896076352, "grad_norm": 0.42493659257888794, "learning_rate": 5.7193708841712934e-08, "loss": 0.0648, "num_input_tokens_seen": 36225200, "step": 36085 }, { "epoch": 19.13573700954401, "grad_norm": 0.0503908134996891, "learning_rate": 5.684531119498182e-08, "loss": 0.0002, "num_input_tokens_seen": 36229616, "step": 36090 }, { "epoch": 19.138388123011666, "grad_norm": 0.02239931933581829, "learning_rate": 5.6497971864735156e-08, "loss": 0.0209, "num_input_tokens_seen": 36233840, "step": 36095 }, { "epoch": 19.141039236479322, "grad_norm": 0.19169847667217255, "learning_rate": 5.615169092533623e-08, "loss": 0.0003, "num_input_tokens_seen": 36238320, "step": 36100 }, { "epoch": 19.14369034994698, "grad_norm": 0.15000483393669128, "learning_rate": 5.5806468450924634e-08, "loss": 0.0003, "num_input_tokens_seen": 36243408, "step": 36105 }, { "epoch": 19.146341463414632, "grad_norm": 0.0067916191183030605, "learning_rate": 5.5462304515409015e-08, "loss": 0.0007, "num_input_tokens_seen": 36248080, "step": 36110 }, { "epoch": 19.14899257688229, "grad_norm": 0.1416141241788864, "learning_rate": 5.5119199192475994e-08, "loss": 0.0003, "num_input_tokens_seen": 36252560, "step": 36115 }, { "epoch": 19.151643690349946, "grad_norm": 0.11035466939210892, "learning_rate": 5.4777152555583465e-08, "loss": 0.0005, "num_input_tokens_seen": 36257232, "step": 36120 }, { "epoch": 19.154294803817603, "grad_norm": 0.026182005181908607, "learning_rate": 5.4436164677961754e-08, "loss": 0.0004, "num_input_tokens_seen": 36262448, "step": 36125 }, { "epoch": 19.15694591728526, "grad_norm": 0.6224012970924377, "learning_rate": 5.409623563261579e-08, "loss": 0.0006, "num_input_tokens_seen": 36267184, "step": 36130 }, { "epoch": 19.159597030752916, "grad_norm": 0.1705203354358673, "learning_rate": 5.3757365492324576e-08, "loss": 0.001, "num_input_tokens_seen": 36271312, "step": 36135 }, { "epoch": 19.162248144220573, "grad_norm": 2.1881916522979736, "learning_rate": 5.34195543296373e-08, "loss": 0.0008, "num_input_tokens_seen": 36278032, "step": 36140 }, { "epoch": 19.16489925768823, "grad_norm": 0.18735982477664948, "learning_rate": 5.308280221688e-08, "loss": 0.0004, "num_input_tokens_seen": 36284880, "step": 36145 }, { "epoch": 19.167550371155887, "grad_norm": 0.25348323583602905, "learning_rate": 5.274710922615001e-08, "loss": 0.0247, "num_input_tokens_seen": 36289424, "step": 36150 }, { "epoch": 19.170201484623544, "grad_norm": 0.014740493148565292, "learning_rate": 5.2412475429318155e-08, "loss": 0.0002, "num_input_tokens_seen": 36293904, "step": 36155 }, { "epoch": 19.172852598091197, "grad_norm": 0.5798541903495789, "learning_rate": 5.2078900898028805e-08, "loss": 0.0004, "num_input_tokens_seen": 36299248, "step": 36160 }, { "epoch": 19.175503711558854, "grad_norm": 0.02032233402132988, "learning_rate": 5.174638570369983e-08, "loss": 0.0004, "num_input_tokens_seen": 36305680, "step": 36165 }, { "epoch": 19.17815482502651, "grad_norm": 0.2515001595020294, "learning_rate": 5.14149299175204e-08, "loss": 0.0002, "num_input_tokens_seen": 36310128, "step": 36170 }, { "epoch": 19.180805938494167, "grad_norm": 0.7783267498016357, "learning_rate": 5.108453361045651e-08, "loss": 0.0005, "num_input_tokens_seen": 36315696, "step": 36175 }, { "epoch": 19.183457051961824, "grad_norm": 0.06573736667633057, "learning_rate": 5.0755196853243264e-08, "loss": 0.0004, "num_input_tokens_seen": 36320048, "step": 36180 }, { "epoch": 19.18610816542948, "grad_norm": 0.08356355130672455, "learning_rate": 5.0426919716391465e-08, "loss": 0.0003, "num_input_tokens_seen": 36324720, "step": 36185 }, { "epoch": 19.188759278897138, "grad_norm": 0.0976066142320633, "learning_rate": 5.0099702270184905e-08, "loss": 0.0002, "num_input_tokens_seen": 36329744, "step": 36190 }, { "epoch": 19.191410392364794, "grad_norm": 0.17600686848163605, "learning_rate": 4.9773544584678646e-08, "loss": 0.0003, "num_input_tokens_seen": 36334128, "step": 36195 }, { "epoch": 19.19406150583245, "grad_norm": 0.05599088966846466, "learning_rate": 4.944844672970406e-08, "loss": 0.0002, "num_input_tokens_seen": 36338768, "step": 36200 }, { "epoch": 19.196712619300104, "grad_norm": 0.16151833534240723, "learning_rate": 4.912440877486213e-08, "loss": 0.0002, "num_input_tokens_seen": 36343536, "step": 36205 }, { "epoch": 19.19936373276776, "grad_norm": 1.574042558670044, "learning_rate": 4.880143078952904e-08, "loss": 0.0014, "num_input_tokens_seen": 36348592, "step": 36210 }, { "epoch": 19.202014846235418, "grad_norm": 0.10033988207578659, "learning_rate": 4.84795128428539e-08, "loss": 0.0083, "num_input_tokens_seen": 36354288, "step": 36215 }, { "epoch": 19.204665959703075, "grad_norm": 0.0498226024210453, "learning_rate": 4.815865500375827e-08, "loss": 0.0007, "num_input_tokens_seen": 36359408, "step": 36220 }, { "epoch": 19.20731707317073, "grad_norm": 0.05788154900074005, "learning_rate": 4.783885734093663e-08, "loss": 0.0074, "num_input_tokens_seen": 36365808, "step": 36225 }, { "epoch": 19.20996818663839, "grad_norm": 0.05492236092686653, "learning_rate": 4.75201199228581e-08, "loss": 0.0002, "num_input_tokens_seen": 36371952, "step": 36230 }, { "epoch": 19.212619300106045, "grad_norm": 0.4318368136882782, "learning_rate": 4.720244281776143e-08, "loss": 0.0067, "num_input_tokens_seen": 36376816, "step": 36235 }, { "epoch": 19.215270413573702, "grad_norm": 0.01775011233985424, "learning_rate": 4.6885826093662215e-08, "loss": 0.0002, "num_input_tokens_seen": 36380368, "step": 36240 }, { "epoch": 19.21792152704136, "grad_norm": 0.0535372830927372, "learning_rate": 4.657026981834623e-08, "loss": 0.0002, "num_input_tokens_seen": 36385168, "step": 36245 }, { "epoch": 19.220572640509015, "grad_norm": 0.03804754465818405, "learning_rate": 4.625577405937387e-08, "loss": 0.0003, "num_input_tokens_seen": 36390512, "step": 36250 }, { "epoch": 19.22322375397667, "grad_norm": 0.7029911279678345, "learning_rate": 4.594233888407795e-08, "loss": 0.0003, "num_input_tokens_seen": 36394992, "step": 36255 }, { "epoch": 19.225874867444325, "grad_norm": 0.02188223786652088, "learning_rate": 4.562996435956313e-08, "loss": 0.0003, "num_input_tokens_seen": 36399088, "step": 36260 }, { "epoch": 19.228525980911982, "grad_norm": 0.15560099482536316, "learning_rate": 4.531865055270923e-08, "loss": 0.0003, "num_input_tokens_seen": 36404432, "step": 36265 }, { "epoch": 19.23117709437964, "grad_norm": 0.06514623761177063, "learning_rate": 4.5008397530166835e-08, "loss": 0.0002, "num_input_tokens_seen": 36408752, "step": 36270 }, { "epoch": 19.233828207847296, "grad_norm": 0.10306866466999054, "learning_rate": 4.4699205358360586e-08, "loss": 0.0427, "num_input_tokens_seen": 36413360, "step": 36275 }, { "epoch": 19.236479321314953, "grad_norm": 0.3494683802127838, "learning_rate": 4.4391074103488084e-08, "loss": 0.0003, "num_input_tokens_seen": 36418512, "step": 36280 }, { "epoch": 19.23913043478261, "grad_norm": 0.14027948677539825, "learning_rate": 4.408400383151878e-08, "loss": 0.0004, "num_input_tokens_seen": 36423664, "step": 36285 }, { "epoch": 19.241781548250266, "grad_norm": 0.25505512952804565, "learning_rate": 4.377799460819621e-08, "loss": 0.0946, "num_input_tokens_seen": 36428976, "step": 36290 }, { "epoch": 19.244432661717923, "grad_norm": 0.009455871768295765, "learning_rate": 4.347304649903572e-08, "loss": 0.0002, "num_input_tokens_seen": 36433648, "step": 36295 }, { "epoch": 19.24708377518558, "grad_norm": 0.05580651015043259, "learning_rate": 4.3169159569326216e-08, "loss": 0.0004, "num_input_tokens_seen": 36438512, "step": 36300 }, { "epoch": 19.249734888653233, "grad_norm": 0.09814029186964035, "learning_rate": 4.2866333884128416e-08, "loss": 0.0003, "num_input_tokens_seen": 36443408, "step": 36305 }, { "epoch": 19.25238600212089, "grad_norm": 0.040143728256225586, "learning_rate": 4.25645695082777e-08, "loss": 0.0254, "num_input_tokens_seen": 36448144, "step": 36310 }, { "epoch": 19.255037115588546, "grad_norm": 0.03373926877975464, "learning_rate": 4.226386650638015e-08, "loss": 0.0016, "num_input_tokens_seen": 36453360, "step": 36315 }, { "epoch": 19.257688229056203, "grad_norm": 0.12957563996315002, "learning_rate": 4.1964224942815376e-08, "loss": 0.0002, "num_input_tokens_seen": 36458704, "step": 36320 }, { "epoch": 19.26033934252386, "grad_norm": 0.00954545196145773, "learning_rate": 4.1665644881736525e-08, "loss": 0.0002, "num_input_tokens_seen": 36464720, "step": 36325 }, { "epoch": 19.262990455991517, "grad_norm": 0.20740443468093872, "learning_rate": 4.1368126387067995e-08, "loss": 0.0011, "num_input_tokens_seen": 36469840, "step": 36330 }, { "epoch": 19.265641569459174, "grad_norm": 0.25781407952308655, "learning_rate": 4.107166952250885e-08, "loss": 0.0082, "num_input_tokens_seen": 36474480, "step": 36335 }, { "epoch": 19.26829268292683, "grad_norm": 0.04858660697937012, "learning_rate": 4.077627435152831e-08, "loss": 0.0002, "num_input_tokens_seen": 36479536, "step": 36340 }, { "epoch": 19.270943796394487, "grad_norm": 0.07849906384944916, "learning_rate": 4.0481940937371346e-08, "loss": 0.0003, "num_input_tokens_seen": 36483728, "step": 36345 }, { "epoch": 19.27359490986214, "grad_norm": 0.11292898654937744, "learning_rate": 4.0188669343052546e-08, "loss": 0.0003, "num_input_tokens_seen": 36488912, "step": 36350 }, { "epoch": 19.276246023329797, "grad_norm": 0.0783720538020134, "learning_rate": 3.9896459631361127e-08, "loss": 0.0005, "num_input_tokens_seen": 36493392, "step": 36355 }, { "epoch": 19.278897136797454, "grad_norm": 0.0028045938815921545, "learning_rate": 3.9605311864858166e-08, "loss": 0.0003, "num_input_tokens_seen": 36498448, "step": 36360 }, { "epoch": 19.28154825026511, "grad_norm": 0.1533506214618683, "learning_rate": 3.9315226105878236e-08, "loss": 0.0005, "num_input_tokens_seen": 36504720, "step": 36365 }, { "epoch": 19.284199363732768, "grad_norm": 0.1854933798313141, "learning_rate": 3.9026202416527214e-08, "loss": 0.0017, "num_input_tokens_seen": 36511440, "step": 36370 }, { "epoch": 19.286850477200424, "grad_norm": 0.04832182824611664, "learning_rate": 3.873824085868505e-08, "loss": 0.0002, "num_input_tokens_seen": 36516400, "step": 36375 }, { "epoch": 19.28950159066808, "grad_norm": 0.12291865050792694, "learning_rate": 3.845134149400298e-08, "loss": 0.0006, "num_input_tokens_seen": 36521040, "step": 36380 }, { "epoch": 19.292152704135738, "grad_norm": 0.15490293502807617, "learning_rate": 3.816550438390576e-08, "loss": 0.0009, "num_input_tokens_seen": 36525488, "step": 36385 }, { "epoch": 19.294803817603395, "grad_norm": 0.11819058656692505, "learning_rate": 3.788072958959055e-08, "loss": 0.0004, "num_input_tokens_seen": 36530384, "step": 36390 }, { "epoch": 19.29745493107105, "grad_norm": 0.007884564809501171, "learning_rate": 3.759701717202635e-08, "loss": 0.0005, "num_input_tokens_seen": 36534832, "step": 36395 }, { "epoch": 19.300106044538705, "grad_norm": 0.17625674605369568, "learning_rate": 3.7314367191955136e-08, "loss": 0.0002, "num_input_tokens_seen": 36539792, "step": 36400 }, { "epoch": 19.30275715800636, "grad_norm": 0.027629174292087555, "learning_rate": 3.703277970989239e-08, "loss": 0.0006, "num_input_tokens_seen": 36544560, "step": 36405 }, { "epoch": 19.30540827147402, "grad_norm": 0.08686850965023041, "learning_rate": 3.675225478612432e-08, "loss": 0.0002, "num_input_tokens_seen": 36548912, "step": 36410 }, { "epoch": 19.308059384941675, "grad_norm": 0.07036706805229187, "learning_rate": 3.647279248071123e-08, "loss": 0.0003, "num_input_tokens_seen": 36553616, "step": 36415 }, { "epoch": 19.310710498409332, "grad_norm": 0.3981592357158661, "learning_rate": 3.619439285348525e-08, "loss": 0.0004, "num_input_tokens_seen": 36558640, "step": 36420 }, { "epoch": 19.31336161187699, "grad_norm": 0.04839986190199852, "learning_rate": 3.5917055964050374e-08, "loss": 0.0019, "num_input_tokens_seen": 36564144, "step": 36425 }, { "epoch": 19.316012725344645, "grad_norm": 0.011575030162930489, "learning_rate": 3.5640781871784105e-08, "loss": 0.0008, "num_input_tokens_seen": 36569456, "step": 36430 }, { "epoch": 19.318663838812302, "grad_norm": 0.07278327643871307, "learning_rate": 3.536557063583634e-08, "loss": 0.0003, "num_input_tokens_seen": 36574384, "step": 36435 }, { "epoch": 19.32131495227996, "grad_norm": 0.2600579261779785, "learning_rate": 3.50914223151283e-08, "loss": 0.0004, "num_input_tokens_seen": 36579152, "step": 36440 }, { "epoch": 19.323966065747612, "grad_norm": 0.007763793226331472, "learning_rate": 3.481833696835524e-08, "loss": 0.0002, "num_input_tokens_seen": 36584336, "step": 36445 }, { "epoch": 19.32661717921527, "grad_norm": 0.18896417319774628, "learning_rate": 3.454631465398373e-08, "loss": 0.0005, "num_input_tokens_seen": 36588272, "step": 36450 }, { "epoch": 19.329268292682926, "grad_norm": 0.9632337689399719, "learning_rate": 3.427535543025273e-08, "loss": 0.0005, "num_input_tokens_seen": 36592720, "step": 36455 }, { "epoch": 19.331919406150583, "grad_norm": 0.1806144118309021, "learning_rate": 3.400545935517474e-08, "loss": 0.0004, "num_input_tokens_seen": 36596976, "step": 36460 }, { "epoch": 19.33457051961824, "grad_norm": 0.013968930579721928, "learning_rate": 3.373662648653242e-08, "loss": 0.0286, "num_input_tokens_seen": 36602192, "step": 36465 }, { "epoch": 19.337221633085896, "grad_norm": 0.2638152539730072, "learning_rate": 3.346885688188306e-08, "loss": 0.0003, "num_input_tokens_seen": 36606256, "step": 36470 }, { "epoch": 19.339872746553553, "grad_norm": 1.0394833087921143, "learning_rate": 3.32021505985558e-08, "loss": 0.0005, "num_input_tokens_seen": 36610832, "step": 36475 }, { "epoch": 19.34252386002121, "grad_norm": 0.13310447335243225, "learning_rate": 3.293650769365109e-08, "loss": 0.0005, "num_input_tokens_seen": 36615312, "step": 36480 }, { "epoch": 19.345174973488866, "grad_norm": 0.029815999791026115, "learning_rate": 3.2671928224042306e-08, "loss": 0.0005, "num_input_tokens_seen": 36619728, "step": 36485 }, { "epoch": 19.347826086956523, "grad_norm": 0.24977707862854004, "learning_rate": 3.240841224637581e-08, "loss": 0.0011, "num_input_tokens_seen": 36625200, "step": 36490 }, { "epoch": 19.350477200424177, "grad_norm": 0.15320147573947906, "learning_rate": 3.214595981706925e-08, "loss": 0.0147, "num_input_tokens_seen": 36630576, "step": 36495 }, { "epoch": 19.353128313891833, "grad_norm": 0.018605317920446396, "learning_rate": 3.188457099231324e-08, "loss": 0.0004, "num_input_tokens_seen": 36635440, "step": 36500 }, { "epoch": 19.35577942735949, "grad_norm": 0.02089683711528778, "learning_rate": 3.162424582807022e-08, "loss": 0.0002, "num_input_tokens_seen": 36641136, "step": 36505 }, { "epoch": 19.358430540827147, "grad_norm": 0.045435525476932526, "learning_rate": 3.136498438007507e-08, "loss": 0.0003, "num_input_tokens_seen": 36646448, "step": 36510 }, { "epoch": 19.361081654294804, "grad_norm": 0.00921781174838543, "learning_rate": 3.11067867038356e-08, "loss": 0.0001, "num_input_tokens_seen": 36650032, "step": 36515 }, { "epoch": 19.36373276776246, "grad_norm": 0.18426232039928436, "learning_rate": 3.084965285463037e-08, "loss": 0.0003, "num_input_tokens_seen": 36655888, "step": 36520 }, { "epoch": 19.366383881230117, "grad_norm": 0.012581923045217991, "learning_rate": 3.059358288751202e-08, "loss": 0.0002, "num_input_tokens_seen": 36660272, "step": 36525 }, { "epoch": 19.369034994697774, "grad_norm": 0.24901317059993744, "learning_rate": 3.033857685730335e-08, "loss": 0.0121, "num_input_tokens_seen": 36665904, "step": 36530 }, { "epoch": 19.37168610816543, "grad_norm": 0.05926108360290527, "learning_rate": 3.0084634818601244e-08, "loss": 0.0012, "num_input_tokens_seen": 36671280, "step": 36535 }, { "epoch": 19.374337221633088, "grad_norm": 0.09372241795063019, "learning_rate": 2.983175682577444e-08, "loss": 0.0003, "num_input_tokens_seen": 36676272, "step": 36540 }, { "epoch": 19.37698833510074, "grad_norm": 0.4468676447868347, "learning_rate": 2.9579942932962404e-08, "loss": 0.0229, "num_input_tokens_seen": 36680976, "step": 36545 }, { "epoch": 19.379639448568398, "grad_norm": 0.2156054973602295, "learning_rate": 2.932919319407812e-08, "loss": 0.0003, "num_input_tokens_seen": 36685584, "step": 36550 }, { "epoch": 19.382290562036054, "grad_norm": 0.041035450994968414, "learning_rate": 2.9079507662806982e-08, "loss": 0.0121, "num_input_tokens_seen": 36691152, "step": 36555 }, { "epoch": 19.38494167550371, "grad_norm": 0.32664525508880615, "learning_rate": 2.8830886392605672e-08, "loss": 0.0508, "num_input_tokens_seen": 36695824, "step": 36560 }, { "epoch": 19.387592788971368, "grad_norm": 0.01331184059381485, "learning_rate": 2.858332943670328e-08, "loss": 0.0534, "num_input_tokens_seen": 36702736, "step": 36565 }, { "epoch": 19.390243902439025, "grad_norm": 0.26731958985328674, "learning_rate": 2.8336836848100736e-08, "loss": 0.0009, "num_input_tokens_seen": 36707600, "step": 36570 }, { "epoch": 19.39289501590668, "grad_norm": 0.19157546758651733, "learning_rate": 2.8091408679572496e-08, "loss": 0.0003, "num_input_tokens_seen": 36712848, "step": 36575 }, { "epoch": 19.39554612937434, "grad_norm": 0.1541663408279419, "learning_rate": 2.7847044983663195e-08, "loss": 0.0005, "num_input_tokens_seen": 36717584, "step": 36580 }, { "epoch": 19.398197242841995, "grad_norm": 0.14242802560329437, "learning_rate": 2.7603745812690984e-08, "loss": 0.0002, "num_input_tokens_seen": 36723440, "step": 36585 }, { "epoch": 19.40084835630965, "grad_norm": 0.28462719917297363, "learning_rate": 2.7361511218744753e-08, "loss": 0.0003, "num_input_tokens_seen": 36729488, "step": 36590 }, { "epoch": 19.403499469777305, "grad_norm": 0.13563820719718933, "learning_rate": 2.712034125368801e-08, "loss": 0.0003, "num_input_tokens_seen": 36734000, "step": 36595 }, { "epoch": 19.406150583244962, "grad_norm": 0.21322154998779297, "learning_rate": 2.6880235969152234e-08, "loss": 0.012, "num_input_tokens_seen": 36740208, "step": 36600 }, { "epoch": 19.40880169671262, "grad_norm": 0.08109987527132034, "learning_rate": 2.6641195416545197e-08, "loss": 0.0005, "num_input_tokens_seen": 36744912, "step": 36605 }, { "epoch": 19.411452810180275, "grad_norm": 0.061873871833086014, "learning_rate": 2.6403219647044288e-08, "loss": 0.0004, "num_input_tokens_seen": 36749392, "step": 36610 }, { "epoch": 19.414103923647932, "grad_norm": 128.4134521484375, "learning_rate": 2.6166308711599307e-08, "loss": 0.1353, "num_input_tokens_seen": 36753392, "step": 36615 }, { "epoch": 19.41675503711559, "grad_norm": 0.08045631647109985, "learning_rate": 2.5930462660932466e-08, "loss": 0.0004, "num_input_tokens_seen": 36758576, "step": 36620 }, { "epoch": 19.419406150583246, "grad_norm": 0.3191989064216614, "learning_rate": 2.5695681545537808e-08, "loss": 0.0003, "num_input_tokens_seen": 36763408, "step": 36625 }, { "epoch": 19.422057264050903, "grad_norm": 0.14142537117004395, "learning_rate": 2.5461965415681246e-08, "loss": 0.0003, "num_input_tokens_seen": 36768624, "step": 36630 }, { "epoch": 19.42470837751856, "grad_norm": 0.039291542023420334, "learning_rate": 2.5229314321401078e-08, "loss": 0.0008, "num_input_tokens_seen": 36773136, "step": 36635 }, { "epoch": 19.427359490986213, "grad_norm": 0.40696755051612854, "learning_rate": 2.4997728312506353e-08, "loss": 0.0006, "num_input_tokens_seen": 36778000, "step": 36640 }, { "epoch": 19.43001060445387, "grad_norm": 0.06174511834979057, "learning_rate": 2.4767207438580186e-08, "loss": 0.0002, "num_input_tokens_seen": 36782160, "step": 36645 }, { "epoch": 19.432661717921526, "grad_norm": 0.0503145195543766, "learning_rate": 2.4537751748976434e-08, "loss": 0.0003, "num_input_tokens_seen": 36786992, "step": 36650 }, { "epoch": 19.435312831389183, "grad_norm": 0.14618588984012604, "learning_rate": 2.4309361292820245e-08, "loss": 0.0012, "num_input_tokens_seen": 36792592, "step": 36655 }, { "epoch": 19.43796394485684, "grad_norm": 0.0722031518816948, "learning_rate": 2.4082036119009166e-08, "loss": 0.0004, "num_input_tokens_seen": 36798480, "step": 36660 }, { "epoch": 19.440615058324497, "grad_norm": 0.017883814871311188, "learning_rate": 2.385577627621427e-08, "loss": 0.0014, "num_input_tokens_seen": 36803856, "step": 36665 }, { "epoch": 19.443266171792153, "grad_norm": 0.06020307168364525, "learning_rate": 2.363058181287625e-08, "loss": 0.0005, "num_input_tokens_seen": 36808976, "step": 36670 }, { "epoch": 19.44591728525981, "grad_norm": 0.1445922702550888, "learning_rate": 2.3406452777208765e-08, "loss": 0.0121, "num_input_tokens_seen": 36812816, "step": 36675 }, { "epoch": 19.448568398727467, "grad_norm": 0.06357580423355103, "learning_rate": 2.3183389217196762e-08, "loss": 0.0002, "num_input_tokens_seen": 36819440, "step": 36680 }, { "epoch": 19.451219512195124, "grad_norm": 0.023828132078051567, "learning_rate": 2.2961391180598702e-08, "loss": 0.0007, "num_input_tokens_seen": 36825424, "step": 36685 }, { "epoch": 19.453870625662777, "grad_norm": 0.3560827970504761, "learning_rate": 2.2740458714943236e-08, "loss": 0.0002, "num_input_tokens_seen": 36830928, "step": 36690 }, { "epoch": 19.456521739130434, "grad_norm": 0.1645759493112564, "learning_rate": 2.2520591867531415e-08, "loss": 0.0246, "num_input_tokens_seen": 36835824, "step": 36695 }, { "epoch": 19.45917285259809, "grad_norm": 0.16810210049152374, "learning_rate": 2.2301790685435587e-08, "loss": 0.0003, "num_input_tokens_seen": 36842672, "step": 36700 }, { "epoch": 19.461823966065747, "grad_norm": 0.08123607188463211, "learning_rate": 2.208405521550161e-08, "loss": 0.1382, "num_input_tokens_seen": 36847184, "step": 36705 }, { "epoch": 19.464475079533404, "grad_norm": 0.051660504192113876, "learning_rate": 2.1867385504344973e-08, "loss": 0.0005, "num_input_tokens_seen": 36852560, "step": 36710 }, { "epoch": 19.46712619300106, "grad_norm": 0.6497955918312073, "learning_rate": 2.1651781598355238e-08, "loss": 0.0006, "num_input_tokens_seen": 36857008, "step": 36715 }, { "epoch": 19.469777306468718, "grad_norm": 0.009193203411996365, "learning_rate": 2.143724354369159e-08, "loss": 0.0004, "num_input_tokens_seen": 36861744, "step": 36720 }, { "epoch": 19.472428419936374, "grad_norm": 0.014811666682362556, "learning_rate": 2.122377138628673e-08, "loss": 0.0001, "num_input_tokens_seen": 36866256, "step": 36725 }, { "epoch": 19.47507953340403, "grad_norm": 0.013625764288008213, "learning_rate": 2.10113651718441e-08, "loss": 0.0027, "num_input_tokens_seen": 36871152, "step": 36730 }, { "epoch": 19.477730646871684, "grad_norm": 0.4445054233074188, "learning_rate": 2.0800024945839548e-08, "loss": 0.0005, "num_input_tokens_seen": 36876752, "step": 36735 }, { "epoch": 19.48038176033934, "grad_norm": 0.4645351767539978, "learning_rate": 2.0589750753520765e-08, "loss": 0.0012, "num_input_tokens_seen": 36881488, "step": 36740 }, { "epoch": 19.483032873806998, "grad_norm": 0.2531084418296814, "learning_rate": 2.0380542639906186e-08, "loss": 0.0015, "num_input_tokens_seen": 36886800, "step": 36745 }, { "epoch": 19.485683987274655, "grad_norm": 0.238953098654747, "learning_rate": 2.0172400649787206e-08, "loss": 0.0005, "num_input_tokens_seen": 36892592, "step": 36750 }, { "epoch": 19.48833510074231, "grad_norm": 0.10425093024969101, "learning_rate": 1.996532482772595e-08, "loss": 0.0004, "num_input_tokens_seen": 36899952, "step": 36755 }, { "epoch": 19.49098621420997, "grad_norm": 0.2639995515346527, "learning_rate": 1.9759315218058074e-08, "loss": 0.0003, "num_input_tokens_seen": 36904304, "step": 36760 }, { "epoch": 19.493637327677625, "grad_norm": 0.17380906641483307, "learning_rate": 1.955437186488829e-08, "loss": 0.0002, "num_input_tokens_seen": 36909008, "step": 36765 }, { "epoch": 19.496288441145282, "grad_norm": 0.10388895124197006, "learning_rate": 1.9350494812094833e-08, "loss": 0.0002, "num_input_tokens_seen": 36914224, "step": 36770 }, { "epoch": 19.49893955461294, "grad_norm": 4.200565814971924, "learning_rate": 1.914768410332779e-08, "loss": 0.0014, "num_input_tokens_seen": 36919408, "step": 36775 }, { "epoch": 19.501590668080595, "grad_norm": 0.0895937904715538, "learning_rate": 1.894593978200743e-08, "loss": 0.0003, "num_input_tokens_seen": 36925360, "step": 36780 }, { "epoch": 19.50424178154825, "grad_norm": 0.7759642601013184, "learning_rate": 1.8745261891327527e-08, "loss": 0.0006, "num_input_tokens_seen": 36930416, "step": 36785 }, { "epoch": 19.506892895015906, "grad_norm": 0.08960514515638351, "learning_rate": 1.854565047425261e-08, "loss": 0.0429, "num_input_tokens_seen": 36936944, "step": 36790 }, { "epoch": 19.509544008483562, "grad_norm": 0.06980985403060913, "learning_rate": 1.8347105573518487e-08, "loss": 0.0002, "num_input_tokens_seen": 36942032, "step": 36795 }, { "epoch": 19.51219512195122, "grad_norm": 0.007276164833456278, "learning_rate": 1.8149627231633938e-08, "loss": 0.0004, "num_input_tokens_seen": 36946672, "step": 36800 }, { "epoch": 19.514846235418876, "grad_norm": 0.12650573253631592, "learning_rate": 1.7953215490877362e-08, "loss": 0.0006, "num_input_tokens_seen": 36951632, "step": 36805 }, { "epoch": 19.517497348886533, "grad_norm": 0.9223158955574036, "learning_rate": 1.7757870393301234e-08, "loss": 0.0008, "num_input_tokens_seen": 36956240, "step": 36810 }, { "epoch": 19.52014846235419, "grad_norm": 0.041636791080236435, "learning_rate": 1.7563591980727102e-08, "loss": 0.0246, "num_input_tokens_seen": 36960528, "step": 36815 }, { "epoch": 19.522799575821846, "grad_norm": 0.05002354457974434, "learning_rate": 1.7370380294750576e-08, "loss": 0.0002, "num_input_tokens_seen": 36966544, "step": 36820 }, { "epoch": 19.525450689289503, "grad_norm": 0.10345643013715744, "learning_rate": 1.7178235376737463e-08, "loss": 0.0004, "num_input_tokens_seen": 36971184, "step": 36825 }, { "epoch": 19.528101802757156, "grad_norm": 0.29890546202659607, "learning_rate": 1.698715726782596e-08, "loss": 0.2303, "num_input_tokens_seen": 36976400, "step": 36830 }, { "epoch": 19.530752916224813, "grad_norm": 0.8389151692390442, "learning_rate": 1.6797146008923904e-08, "loss": 0.0019, "num_input_tokens_seen": 36981584, "step": 36835 }, { "epoch": 19.53340402969247, "grad_norm": 0.11878835409879684, "learning_rate": 1.660820164071375e-08, "loss": 0.029, "num_input_tokens_seen": 36986800, "step": 36840 }, { "epoch": 19.536055143160127, "grad_norm": 0.013005951419472694, "learning_rate": 1.642032420364703e-08, "loss": 0.0002, "num_input_tokens_seen": 36991728, "step": 36845 }, { "epoch": 19.538706256627783, "grad_norm": 0.15882772207260132, "learning_rate": 1.6233513737948238e-08, "loss": 0.0002, "num_input_tokens_seen": 36996848, "step": 36850 }, { "epoch": 19.54135737009544, "grad_norm": 0.004648907575756311, "learning_rate": 1.6047770283613152e-08, "loss": 0.0012, "num_input_tokens_seen": 37002608, "step": 36855 }, { "epoch": 19.544008483563097, "grad_norm": 0.20600803196430206, "learning_rate": 1.5863093880408852e-08, "loss": 0.0019, "num_input_tokens_seen": 37007120, "step": 36860 }, { "epoch": 19.546659597030754, "grad_norm": 0.035502564162015915, "learning_rate": 1.5679484567873714e-08, "loss": 0.0003, "num_input_tokens_seen": 37011824, "step": 36865 }, { "epoch": 19.54931071049841, "grad_norm": 1.2142280340194702, "learning_rate": 1.5496942385318515e-08, "loss": 0.0021, "num_input_tokens_seen": 37017040, "step": 36870 }, { "epoch": 19.551961823966067, "grad_norm": 0.07595314085483551, "learning_rate": 1.5315467371824764e-08, "loss": 0.0003, "num_input_tokens_seen": 37022608, "step": 36875 }, { "epoch": 19.55461293743372, "grad_norm": 0.11158941686153412, "learning_rate": 1.5135059566245835e-08, "loss": 0.0004, "num_input_tokens_seen": 37029424, "step": 36880 }, { "epoch": 19.557264050901377, "grad_norm": 0.07186022400856018, "learning_rate": 1.4955719007206382e-08, "loss": 0.0176, "num_input_tokens_seen": 37033968, "step": 36885 }, { "epoch": 19.559915164369034, "grad_norm": 0.1618805229663849, "learning_rate": 1.4777445733102913e-08, "loss": 0.0002, "num_input_tokens_seen": 37038416, "step": 36890 }, { "epoch": 19.56256627783669, "grad_norm": 0.06249107047915459, "learning_rate": 1.4600239782103787e-08, "loss": 0.0003, "num_input_tokens_seen": 37043408, "step": 36895 }, { "epoch": 19.565217391304348, "grad_norm": 0.526393711566925, "learning_rate": 1.4424101192147544e-08, "loss": 0.0003, "num_input_tokens_seen": 37047792, "step": 36900 }, { "epoch": 19.567868504772004, "grad_norm": 0.054415512830019, "learning_rate": 1.424903000094513e-08, "loss": 0.0004, "num_input_tokens_seen": 37053040, "step": 36905 }, { "epoch": 19.57051961823966, "grad_norm": 0.22315837442874908, "learning_rate": 1.407502624597934e-08, "loss": 0.0002, "num_input_tokens_seen": 37058384, "step": 36910 }, { "epoch": 19.573170731707318, "grad_norm": 13.01669692993164, "learning_rate": 1.3902089964503152e-08, "loss": 0.003, "num_input_tokens_seen": 37063632, "step": 36915 }, { "epoch": 19.575821845174975, "grad_norm": 0.02594727836549282, "learning_rate": 1.3730221193542503e-08, "loss": 0.0008, "num_input_tokens_seen": 37068432, "step": 36920 }, { "epoch": 19.57847295864263, "grad_norm": 0.17680679261684418, "learning_rate": 1.3559419969893517e-08, "loss": 0.0002, "num_input_tokens_seen": 37072656, "step": 36925 }, { "epoch": 19.581124072110285, "grad_norm": 0.026116374880075455, "learning_rate": 1.3389686330124719e-08, "loss": 0.0266, "num_input_tokens_seen": 37077296, "step": 36930 }, { "epoch": 19.58377518557794, "grad_norm": 0.07513133436441422, "learning_rate": 1.3221020310575371e-08, "loss": 0.0002, "num_input_tokens_seen": 37081648, "step": 36935 }, { "epoch": 19.5864262990456, "grad_norm": 0.050677817314863205, "learning_rate": 1.3053421947356593e-08, "loss": 0.0003, "num_input_tokens_seen": 37086416, "step": 36940 }, { "epoch": 19.589077412513255, "grad_norm": 0.5919137001037598, "learning_rate": 1.2886891276350789e-08, "loss": 0.0828, "num_input_tokens_seen": 37091600, "step": 36945 }, { "epoch": 19.591728525980912, "grad_norm": 0.09644637256860733, "learning_rate": 1.2721428333211106e-08, "loss": 0.0002, "num_input_tokens_seen": 37096912, "step": 36950 }, { "epoch": 19.59437963944857, "grad_norm": 0.12515445053577423, "learning_rate": 1.2557033153363652e-08, "loss": 0.0003, "num_input_tokens_seen": 37101808, "step": 36955 }, { "epoch": 19.597030752916226, "grad_norm": 0.6491737961769104, "learning_rate": 1.239370577200416e-08, "loss": 0.0006, "num_input_tokens_seen": 37106288, "step": 36960 }, { "epoch": 19.599681866383882, "grad_norm": 18.084218978881836, "learning_rate": 1.2231446224101329e-08, "loss": 0.0068, "num_input_tokens_seen": 37111920, "step": 36965 }, { "epoch": 19.60233297985154, "grad_norm": 0.0181729793548584, "learning_rate": 1.2070254544394034e-08, "loss": 0.008, "num_input_tokens_seen": 37116656, "step": 36970 }, { "epoch": 19.604984093319196, "grad_norm": 0.09753050655126572, "learning_rate": 1.1910130767393557e-08, "loss": 0.0004, "num_input_tokens_seen": 37122256, "step": 36975 }, { "epoch": 19.60763520678685, "grad_norm": 0.3664814829826355, "learning_rate": 1.1751074927381367e-08, "loss": 0.0021, "num_input_tokens_seen": 37127600, "step": 36980 }, { "epoch": 19.610286320254506, "grad_norm": 0.15030346810817719, "learning_rate": 1.159308705841078e-08, "loss": 0.0003, "num_input_tokens_seen": 37131728, "step": 36985 }, { "epoch": 19.612937433722163, "grad_norm": 0.20416900515556335, "learning_rate": 1.1436167194306957e-08, "loss": 0.001, "num_input_tokens_seen": 37136144, "step": 36990 }, { "epoch": 19.61558854718982, "grad_norm": 0.7138712406158447, "learning_rate": 1.128031536866636e-08, "loss": 0.0008, "num_input_tokens_seen": 37141232, "step": 36995 }, { "epoch": 19.618239660657476, "grad_norm": 0.004124159924685955, "learning_rate": 1.1125531614855634e-08, "loss": 0.0015, "num_input_tokens_seen": 37146288, "step": 37000 }, { "epoch": 19.620890774125133, "grad_norm": 0.021559426560997963, "learning_rate": 1.0971815966014376e-08, "loss": 0.0005, "num_input_tokens_seen": 37150512, "step": 37005 }, { "epoch": 19.62354188759279, "grad_norm": 82.44065856933594, "learning_rate": 1.0819168455052375e-08, "loss": 0.0652, "num_input_tokens_seen": 37154832, "step": 37010 }, { "epoch": 19.626193001060447, "grad_norm": 3.5421206951141357, "learning_rate": 1.0667589114650712e-08, "loss": 0.0824, "num_input_tokens_seen": 37161200, "step": 37015 }, { "epoch": 19.628844114528103, "grad_norm": 0.0383320190012455, "learning_rate": 1.0517077977262869e-08, "loss": 0.0006, "num_input_tokens_seen": 37166064, "step": 37020 }, { "epoch": 19.631495227995757, "grad_norm": 0.3951767683029175, "learning_rate": 1.0367635075111959e-08, "loss": 0.0003, "num_input_tokens_seen": 37170768, "step": 37025 }, { "epoch": 19.634146341463413, "grad_norm": 0.020947130396962166, "learning_rate": 1.021926044019461e-08, "loss": 0.0006, "num_input_tokens_seen": 37175344, "step": 37030 }, { "epoch": 19.63679745493107, "grad_norm": 0.5744392275810242, "learning_rate": 1.0071954104275972e-08, "loss": 0.0007, "num_input_tokens_seen": 37180400, "step": 37035 }, { "epoch": 19.639448568398727, "grad_norm": 0.028343893587589264, "learning_rate": 9.9257160988947e-09, "loss": 0.0011, "num_input_tokens_seen": 37185776, "step": 37040 }, { "epoch": 19.642099681866384, "grad_norm": 0.033213041722774506, "learning_rate": 9.780546455360195e-09, "loss": 0.0001, "num_input_tokens_seen": 37190512, "step": 37045 }, { "epoch": 19.64475079533404, "grad_norm": 0.1638093739748001, "learning_rate": 9.636445204752042e-09, "loss": 0.0004, "num_input_tokens_seen": 37194768, "step": 37050 }, { "epoch": 19.647401908801697, "grad_norm": 0.10183592885732651, "learning_rate": 9.493412377923339e-09, "loss": 0.0003, "num_input_tokens_seen": 37201808, "step": 37055 }, { "epoch": 19.650053022269354, "grad_norm": 0.27984100580215454, "learning_rate": 9.3514480054957e-09, "loss": 0.0005, "num_input_tokens_seen": 37206992, "step": 37060 }, { "epoch": 19.65270413573701, "grad_norm": 0.34970027208328247, "learning_rate": 9.210552117863703e-09, "loss": 0.0003, "num_input_tokens_seen": 37211568, "step": 37065 }, { "epoch": 19.655355249204668, "grad_norm": 0.38920465111732483, "learning_rate": 9.070724745193215e-09, "loss": 0.0005, "num_input_tokens_seen": 37215760, "step": 37070 }, { "epoch": 19.65800636267232, "grad_norm": 0.33870822191238403, "learning_rate": 8.931965917420294e-09, "loss": 0.0004, "num_input_tokens_seen": 37220208, "step": 37075 }, { "epoch": 19.660657476139978, "grad_norm": 0.1998567432165146, "learning_rate": 8.794275664253393e-09, "loss": 0.0003, "num_input_tokens_seen": 37225328, "step": 37080 }, { "epoch": 19.663308589607635, "grad_norm": 0.5263547301292419, "learning_rate": 8.657654015171157e-09, "loss": 0.0004, "num_input_tokens_seen": 37229936, "step": 37085 }, { "epoch": 19.66595970307529, "grad_norm": 0.13172774016857147, "learning_rate": 8.522100999424632e-09, "loss": 0.0002, "num_input_tokens_seen": 37234480, "step": 37090 }, { "epoch": 19.668610816542948, "grad_norm": 0.8880034685134888, "learning_rate": 8.387616646034491e-09, "loss": 0.0085, "num_input_tokens_seen": 37240688, "step": 37095 }, { "epoch": 19.671261930010605, "grad_norm": 2.4005014896392822, "learning_rate": 8.254200983794369e-09, "loss": 0.0126, "num_input_tokens_seen": 37245264, "step": 37100 }, { "epoch": 19.67391304347826, "grad_norm": 0.006305407732725143, "learning_rate": 8.12185404126753e-09, "loss": 0.0002, "num_input_tokens_seen": 37250768, "step": 37105 }, { "epoch": 19.67656415694592, "grad_norm": 0.15328016877174377, "learning_rate": 7.990575846789083e-09, "loss": 0.0004, "num_input_tokens_seen": 37256176, "step": 37110 }, { "epoch": 19.679215270413575, "grad_norm": 0.4437200725078583, "learning_rate": 7.860366428465439e-09, "loss": 0.0003, "num_input_tokens_seen": 37260752, "step": 37115 }, { "epoch": 19.68186638388123, "grad_norm": 0.03146916255354881, "learning_rate": 7.731225814174847e-09, "loss": 0.0001, "num_input_tokens_seen": 37266000, "step": 37120 }, { "epoch": 19.684517497348885, "grad_norm": 0.4881077706813812, "learning_rate": 7.603154031564641e-09, "loss": 0.0004, "num_input_tokens_seen": 37271088, "step": 37125 }, { "epoch": 19.687168610816542, "grad_norm": 0.05769888311624527, "learning_rate": 7.476151108056218e-09, "loss": 0.0003, "num_input_tokens_seen": 37276400, "step": 37130 }, { "epoch": 19.6898197242842, "grad_norm": 0.25878235697746277, "learning_rate": 7.350217070838939e-09, "loss": 0.0005, "num_input_tokens_seen": 37281488, "step": 37135 }, { "epoch": 19.692470837751856, "grad_norm": 0.4410582184791565, "learning_rate": 7.225351946876236e-09, "loss": 0.0005, "num_input_tokens_seen": 37286256, "step": 37140 }, { "epoch": 19.695121951219512, "grad_norm": 0.014117765240371227, "learning_rate": 7.101555762900614e-09, "loss": 0.0009, "num_input_tokens_seen": 37292752, "step": 37145 }, { "epoch": 19.69777306468717, "grad_norm": 0.013103253208100796, "learning_rate": 6.978828545416983e-09, "loss": 0.0005, "num_input_tokens_seen": 37297072, "step": 37150 }, { "epoch": 19.700424178154826, "grad_norm": 0.08747631311416626, "learning_rate": 6.857170320700435e-09, "loss": 0.0065, "num_input_tokens_seen": 37301136, "step": 37155 }, { "epoch": 19.703075291622483, "grad_norm": 0.018440691754221916, "learning_rate": 6.736581114798469e-09, "loss": 0.0002, "num_input_tokens_seen": 37306224, "step": 37160 }, { "epoch": 19.70572640509014, "grad_norm": 0.2850801646709442, "learning_rate": 6.617060953528209e-09, "loss": 0.0024, "num_input_tokens_seen": 37310864, "step": 37165 }, { "epoch": 19.708377518557793, "grad_norm": 0.1383548229932785, "learning_rate": 6.498609862478633e-09, "loss": 0.0003, "num_input_tokens_seen": 37316272, "step": 37170 }, { "epoch": 19.71102863202545, "grad_norm": 0.022707674652338028, "learning_rate": 6.381227867010009e-09, "loss": 0.0002, "num_input_tokens_seen": 37321616, "step": 37175 }, { "epoch": 19.713679745493106, "grad_norm": 0.08413643389940262, "learning_rate": 6.264914992253901e-09, "loss": 0.0887, "num_input_tokens_seen": 37325552, "step": 37180 }, { "epoch": 19.716330858960763, "grad_norm": 72.75627899169922, "learning_rate": 6.149671263112056e-09, "loss": 0.0356, "num_input_tokens_seen": 37329648, "step": 37185 }, { "epoch": 19.71898197242842, "grad_norm": 0.04499148577451706, "learning_rate": 6.0354967042580705e-09, "loss": 0.0015, "num_input_tokens_seen": 37335088, "step": 37190 }, { "epoch": 19.721633085896077, "grad_norm": 0.040433503687381744, "learning_rate": 5.92239134013628e-09, "loss": 0.0003, "num_input_tokens_seen": 37340240, "step": 37195 }, { "epoch": 19.724284199363733, "grad_norm": 0.04505469650030136, "learning_rate": 5.810355194961759e-09, "loss": 0.0003, "num_input_tokens_seen": 37344368, "step": 37200 }, { "epoch": 19.72693531283139, "grad_norm": 0.013597697019577026, "learning_rate": 5.699388292722541e-09, "loss": 0.0002, "num_input_tokens_seen": 37350736, "step": 37205 }, { "epoch": 19.729586426299047, "grad_norm": 0.19490903615951538, "learning_rate": 5.5894906571751786e-09, "loss": 0.0007, "num_input_tokens_seen": 37355440, "step": 37210 }, { "epoch": 19.7322375397667, "grad_norm": 0.32378652691841125, "learning_rate": 5.480662311848628e-09, "loss": 0.0009, "num_input_tokens_seen": 37360240, "step": 37215 }, { "epoch": 19.734888653234357, "grad_norm": 0.1181790828704834, "learning_rate": 5.3729032800431404e-09, "loss": 0.001, "num_input_tokens_seen": 37364464, "step": 37220 }, { "epoch": 19.737539766702014, "grad_norm": 0.055508434772491455, "learning_rate": 5.2662135848297045e-09, "loss": 0.0003, "num_input_tokens_seen": 37369136, "step": 37225 }, { "epoch": 19.74019088016967, "grad_norm": 0.14405451714992523, "learning_rate": 5.160593249049495e-09, "loss": 0.0012, "num_input_tokens_seen": 37373392, "step": 37230 }, { "epoch": 19.742841993637327, "grad_norm": 0.2147468775510788, "learning_rate": 5.056042295316643e-09, "loss": 0.0053, "num_input_tokens_seen": 37377648, "step": 37235 }, { "epoch": 19.745493107104984, "grad_norm": 0.14186425507068634, "learning_rate": 4.952560746014356e-09, "loss": 0.0655, "num_input_tokens_seen": 37381648, "step": 37240 }, { "epoch": 19.74814422057264, "grad_norm": 0.03061920590698719, "learning_rate": 4.850148623298246e-09, "loss": 0.0002, "num_input_tokens_seen": 37386032, "step": 37245 }, { "epoch": 19.750795334040298, "grad_norm": 0.2589547634124756, "learning_rate": 4.748805949094104e-09, "loss": 0.0003, "num_input_tokens_seen": 37390064, "step": 37250 }, { "epoch": 19.753446447507955, "grad_norm": 74.24169921875, "learning_rate": 4.648532745099577e-09, "loss": 0.0453, "num_input_tokens_seen": 37394512, "step": 37255 }, { "epoch": 19.75609756097561, "grad_norm": 0.02741970308125019, "learning_rate": 4.54932903278249e-09, "loss": 0.0099, "num_input_tokens_seen": 37399088, "step": 37260 }, { "epoch": 19.758748674443265, "grad_norm": 0.3114045262336731, "learning_rate": 4.451194833382522e-09, "loss": 0.0003, "num_input_tokens_seen": 37404720, "step": 37265 }, { "epoch": 19.76139978791092, "grad_norm": 0.008838639594614506, "learning_rate": 4.354130167908976e-09, "loss": 0.0004, "num_input_tokens_seen": 37409072, "step": 37270 }, { "epoch": 19.764050901378578, "grad_norm": 0.14183636009693146, "learning_rate": 4.258135057144674e-09, "loss": 0.0053, "num_input_tokens_seen": 37414064, "step": 37275 }, { "epoch": 19.766702014846235, "grad_norm": 0.12429627031087875, "learning_rate": 4.163209521640954e-09, "loss": 0.001, "num_input_tokens_seen": 37418480, "step": 37280 }, { "epoch": 19.76935312831389, "grad_norm": 0.19593748450279236, "learning_rate": 4.069353581721003e-09, "loss": 0.0007, "num_input_tokens_seen": 37423888, "step": 37285 }, { "epoch": 19.77200424178155, "grad_norm": 0.19673608243465424, "learning_rate": 3.976567257479858e-09, "loss": 0.1131, "num_input_tokens_seen": 37428304, "step": 37290 }, { "epoch": 19.774655355249205, "grad_norm": 0.16054685413837433, "learning_rate": 3.884850568782184e-09, "loss": 0.1722, "num_input_tokens_seen": 37432432, "step": 37295 }, { "epoch": 19.777306468716862, "grad_norm": 0.039293959736824036, "learning_rate": 3.794203535264496e-09, "loss": 0.0003, "num_input_tokens_seen": 37437232, "step": 37300 }, { "epoch": 19.77995758218452, "grad_norm": 0.2465868443250656, "learning_rate": 3.7046261763340474e-09, "loss": 0.0017, "num_input_tokens_seen": 37442320, "step": 37305 }, { "epoch": 19.782608695652176, "grad_norm": 1.5833773612976074, "learning_rate": 3.6161185111693864e-09, "loss": 0.0006, "num_input_tokens_seen": 37446864, "step": 37310 }, { "epoch": 19.78525980911983, "grad_norm": 0.2616364359855652, "learning_rate": 3.5286805587192443e-09, "loss": 0.0027, "num_input_tokens_seen": 37452272, "step": 37315 }, { "epoch": 19.787910922587486, "grad_norm": 0.09571138024330139, "learning_rate": 3.442312337704201e-09, "loss": 0.0007, "num_input_tokens_seen": 37457296, "step": 37320 }, { "epoch": 19.790562036055142, "grad_norm": 0.6696109175682068, "learning_rate": 3.357013866615022e-09, "loss": 0.0004, "num_input_tokens_seen": 37461520, "step": 37325 }, { "epoch": 19.7932131495228, "grad_norm": 0.3529033660888672, "learning_rate": 3.2727851637148753e-09, "loss": 0.0002, "num_input_tokens_seen": 37465200, "step": 37330 }, { "epoch": 19.795864262990456, "grad_norm": 0.03888006508350372, "learning_rate": 3.189626247036004e-09, "loss": 0.0002, "num_input_tokens_seen": 37470064, "step": 37335 }, { "epoch": 19.798515376458113, "grad_norm": 1.034056544303894, "learning_rate": 3.107537134383054e-09, "loss": 0.0006, "num_input_tokens_seen": 37474512, "step": 37340 }, { "epoch": 19.80116648992577, "grad_norm": 0.2928358316421509, "learning_rate": 3.026517843330856e-09, "loss": 0.0005, "num_input_tokens_seen": 37478864, "step": 37345 }, { "epoch": 19.803817603393426, "grad_norm": 0.13830062747001648, "learning_rate": 2.9465683912249798e-09, "loss": 0.0002, "num_input_tokens_seen": 37483568, "step": 37350 }, { "epoch": 19.806468716861083, "grad_norm": 0.047974903136491776, "learning_rate": 2.8676887951833985e-09, "loss": 0.0003, "num_input_tokens_seen": 37487824, "step": 37355 }, { "epoch": 19.809119830328736, "grad_norm": 26.382625579833984, "learning_rate": 2.78987907209316e-09, "loss": 0.0052, "num_input_tokens_seen": 37493392, "step": 37360 }, { "epoch": 19.811770943796393, "grad_norm": 0.13002628087997437, "learning_rate": 2.7131392386131605e-09, "loss": 0.0003, "num_input_tokens_seen": 37498128, "step": 37365 }, { "epoch": 19.81442205726405, "grad_norm": 0.02851617895066738, "learning_rate": 2.6374693111741457e-09, "loss": 0.0001, "num_input_tokens_seen": 37503728, "step": 37370 }, { "epoch": 19.817073170731707, "grad_norm": 0.13088937103748322, "learning_rate": 2.5628693059759345e-09, "loss": 0.0008, "num_input_tokens_seen": 37509328, "step": 37375 }, { "epoch": 19.819724284199363, "grad_norm": 0.002869706368073821, "learning_rate": 2.489339238990196e-09, "loss": 0.0001, "num_input_tokens_seen": 37514832, "step": 37380 }, { "epoch": 19.82237539766702, "grad_norm": 0.23076362907886505, "learning_rate": 2.4168791259598924e-09, "loss": 0.0005, "num_input_tokens_seen": 37519472, "step": 37385 }, { "epoch": 19.825026511134677, "grad_norm": 0.2850032150745392, "learning_rate": 2.34548898239817e-09, "loss": 0.1008, "num_input_tokens_seen": 37524336, "step": 37390 }, { "epoch": 19.827677624602334, "grad_norm": 0.04904108867049217, "learning_rate": 2.2751688235900237e-09, "loss": 0.0004, "num_input_tokens_seen": 37529584, "step": 37395 }, { "epoch": 19.83032873806999, "grad_norm": 0.19765359163284302, "learning_rate": 2.205918664590634e-09, "loss": 0.0026, "num_input_tokens_seen": 37535152, "step": 37400 }, { "epoch": 19.832979851537647, "grad_norm": 0.09094229340553284, "learning_rate": 2.137738520225918e-09, "loss": 0.0003, "num_input_tokens_seen": 37539920, "step": 37405 }, { "epoch": 19.8356309650053, "grad_norm": 0.2901199162006378, "learning_rate": 2.070628405093089e-09, "loss": 0.0004, "num_input_tokens_seen": 37545648, "step": 37410 }, { "epoch": 19.838282078472957, "grad_norm": 0.0041197617538273335, "learning_rate": 2.004588333560653e-09, "loss": 0.0043, "num_input_tokens_seen": 37551056, "step": 37415 }, { "epoch": 19.840933191940614, "grad_norm": 0.04316798225045204, "learning_rate": 1.9396183197678554e-09, "loss": 0.0009, "num_input_tokens_seen": 37556944, "step": 37420 }, { "epoch": 19.84358430540827, "grad_norm": 6.289717197418213, "learning_rate": 1.8757183776235697e-09, "loss": 0.0027, "num_input_tokens_seen": 37561904, "step": 37425 }, { "epoch": 19.846235418875928, "grad_norm": 0.10315154492855072, "learning_rate": 1.8128885208096303e-09, "loss": 0.0002, "num_input_tokens_seen": 37566608, "step": 37430 }, { "epoch": 19.848886532343585, "grad_norm": 0.028828654438257217, "learning_rate": 1.7511287627769435e-09, "loss": 0.0002, "num_input_tokens_seen": 37571536, "step": 37435 }, { "epoch": 19.85153764581124, "grad_norm": 5.690290451049805, "learning_rate": 1.690439116748266e-09, "loss": 0.0014, "num_input_tokens_seen": 37578064, "step": 37440 }, { "epoch": 19.854188759278898, "grad_norm": 2.0798940658569336, "learning_rate": 1.6308195957182028e-09, "loss": 0.0007, "num_input_tokens_seen": 37583536, "step": 37445 }, { "epoch": 19.856839872746555, "grad_norm": 0.3390583395957947, "learning_rate": 1.5722702124493228e-09, "loss": 0.0003, "num_input_tokens_seen": 37588464, "step": 37450 }, { "epoch": 19.85949098621421, "grad_norm": 0.05487004294991493, "learning_rate": 1.5147909794782644e-09, "loss": 0.0004, "num_input_tokens_seen": 37592976, "step": 37455 }, { "epoch": 19.862142099681865, "grad_norm": 0.9447763562202454, "learning_rate": 1.4583819091107398e-09, "loss": 0.0012, "num_input_tokens_seen": 37597616, "step": 37460 }, { "epoch": 19.86479321314952, "grad_norm": 0.15409979224205017, "learning_rate": 1.4030430134237549e-09, "loss": 0.0004, "num_input_tokens_seen": 37602928, "step": 37465 }, { "epoch": 19.86744432661718, "grad_norm": 0.9019076824188232, "learning_rate": 1.3487743042656098e-09, "loss": 0.0004, "num_input_tokens_seen": 37607504, "step": 37470 }, { "epoch": 19.870095440084835, "grad_norm": 0.467871755361557, "learning_rate": 1.2955757932542334e-09, "loss": 0.0007, "num_input_tokens_seen": 37613680, "step": 37475 }, { "epoch": 19.872746553552492, "grad_norm": 0.1013883724808693, "learning_rate": 1.2434474917799587e-09, "loss": 0.0003, "num_input_tokens_seen": 37619312, "step": 37480 }, { "epoch": 19.87539766702015, "grad_norm": 0.024483198300004005, "learning_rate": 1.192389411003303e-09, "loss": 0.0002, "num_input_tokens_seen": 37623344, "step": 37485 }, { "epoch": 19.878048780487806, "grad_norm": 7.42764139175415, "learning_rate": 1.1424015618549666e-09, "loss": 0.0026, "num_input_tokens_seen": 37628016, "step": 37490 }, { "epoch": 19.880699893955462, "grad_norm": 0.16721375286579132, "learning_rate": 1.093483955038055e-09, "loss": 0.0122, "num_input_tokens_seen": 37633488, "step": 37495 }, { "epoch": 19.88335100742312, "grad_norm": 0.0219559483230114, "learning_rate": 1.0456366010258566e-09, "loss": 0.0002, "num_input_tokens_seen": 37638032, "step": 37500 }, { "epoch": 19.886002120890772, "grad_norm": 0.12455183267593384, "learning_rate": 9.988595100612896e-10, "loss": 0.0004, "num_input_tokens_seen": 37642704, "step": 37505 }, { "epoch": 19.88865323435843, "grad_norm": 0.1943138837814331, "learning_rate": 9.531526921602307e-10, "loss": 0.0004, "num_input_tokens_seen": 37647952, "step": 37510 }, { "epoch": 19.891304347826086, "grad_norm": 21.992998123168945, "learning_rate": 9.085161571076307e-10, "loss": 0.0054, "num_input_tokens_seen": 37652592, "step": 37515 }, { "epoch": 19.893955461293743, "grad_norm": 0.08774004876613617, "learning_rate": 8.649499144608442e-10, "loss": 0.0005, "num_input_tokens_seen": 37657104, "step": 37520 }, { "epoch": 19.8966065747614, "grad_norm": 0.4070051312446594, "learning_rate": 8.224539735468551e-10, "loss": 0.0005, "num_input_tokens_seen": 37661968, "step": 37525 }, { "epoch": 19.899257688229056, "grad_norm": 0.007396903354674578, "learning_rate": 7.810283434633859e-10, "loss": 0.0022, "num_input_tokens_seen": 37667280, "step": 37530 }, { "epoch": 19.901908801696713, "grad_norm": 0.35566744208335876, "learning_rate": 7.406730330805633e-10, "loss": 0.0005, "num_input_tokens_seen": 37672048, "step": 37535 }, { "epoch": 19.90455991516437, "grad_norm": 15.639396667480469, "learning_rate": 7.013880510375882e-10, "loss": 0.0055, "num_input_tokens_seen": 37676272, "step": 37540 }, { "epoch": 19.907211028632027, "grad_norm": 0.01000971533358097, "learning_rate": 6.631734057460648e-10, "loss": 0.0005, "num_input_tokens_seen": 37681104, "step": 37545 }, { "epoch": 19.909862142099684, "grad_norm": 0.5278067588806152, "learning_rate": 6.260291053866718e-10, "loss": 0.0004, "num_input_tokens_seen": 37685904, "step": 37550 }, { "epoch": 19.912513255567337, "grad_norm": 0.03186510130763054, "learning_rate": 5.899551579130469e-10, "loss": 0.0001, "num_input_tokens_seen": 37691216, "step": 37555 }, { "epoch": 19.915164369034994, "grad_norm": 0.5912235975265503, "learning_rate": 5.549515710473463e-10, "loss": 0.0008, "num_input_tokens_seen": 37696432, "step": 37560 }, { "epoch": 19.91781548250265, "grad_norm": 0.11610807478427887, "learning_rate": 5.210183522841305e-10, "loss": 0.0006, "num_input_tokens_seen": 37703024, "step": 37565 }, { "epoch": 19.920466595970307, "grad_norm": 0.07896365225315094, "learning_rate": 4.881555088892543e-10, "loss": 0.0005, "num_input_tokens_seen": 37707568, "step": 37570 }, { "epoch": 19.923117709437964, "grad_norm": 0.060092031955718994, "learning_rate": 4.563630478970904e-10, "loss": 0.0032, "num_input_tokens_seen": 37713424, "step": 37575 }, { "epoch": 19.92576882290562, "grad_norm": 0.4247131645679474, "learning_rate": 4.2564097611552667e-10, "loss": 0.0003, "num_input_tokens_seen": 37718032, "step": 37580 }, { "epoch": 19.928419936373277, "grad_norm": 0.09207753837108612, "learning_rate": 3.959893001220794e-10, "loss": 0.0003, "num_input_tokens_seen": 37722864, "step": 37585 }, { "epoch": 19.931071049840934, "grad_norm": 0.21952815353870392, "learning_rate": 3.674080262638935e-10, "loss": 0.0003, "num_input_tokens_seen": 37728208, "step": 37590 }, { "epoch": 19.93372216330859, "grad_norm": 0.12078266590833664, "learning_rate": 3.398971606616286e-10, "loss": 0.0067, "num_input_tokens_seen": 37732656, "step": 37595 }, { "epoch": 19.936373276776244, "grad_norm": 0.2819594144821167, "learning_rate": 3.1345670920390757e-10, "loss": 0.0004, "num_input_tokens_seen": 37737552, "step": 37600 }, { "epoch": 19.9390243902439, "grad_norm": 27.734560012817383, "learning_rate": 2.880866775528679e-10, "loss": 0.0068, "num_input_tokens_seen": 37743472, "step": 37605 }, { "epoch": 19.941675503711558, "grad_norm": 0.02806836925446987, "learning_rate": 2.637870711391655e-10, "loss": 0.0002, "num_input_tokens_seen": 37747984, "step": 37610 }, { "epoch": 19.944326617179215, "grad_norm": 1.1578953266143799, "learning_rate": 2.405578951658605e-10, "loss": 0.0007, "num_input_tokens_seen": 37753072, "step": 37615 }, { "epoch": 19.94697773064687, "grad_norm": 0.28613367676734924, "learning_rate": 2.1839915460564188e-10, "loss": 0.0005, "num_input_tokens_seen": 37757776, "step": 37620 }, { "epoch": 19.949628844114528, "grad_norm": 0.21875706315040588, "learning_rate": 1.9731085420360286e-10, "loss": 0.0005, "num_input_tokens_seen": 37762768, "step": 37625 }, { "epoch": 19.952279957582185, "grad_norm": 0.25540995597839355, "learning_rate": 1.7729299847391023e-10, "loss": 0.0082, "num_input_tokens_seen": 37769200, "step": 37630 }, { "epoch": 19.95493107104984, "grad_norm": 0.08775386959314346, "learning_rate": 1.5834559170258001e-10, "loss": 0.0002, "num_input_tokens_seen": 37773136, "step": 37635 }, { "epoch": 19.9575821845175, "grad_norm": 0.09225066006183624, "learning_rate": 1.40468637945812e-10, "loss": 0.0021, "num_input_tokens_seen": 37778672, "step": 37640 }, { "epoch": 19.960233297985155, "grad_norm": 0.41996029019355774, "learning_rate": 1.2366214103221031e-10, "loss": 0.0003, "num_input_tokens_seen": 37784752, "step": 37645 }, { "epoch": 19.96288441145281, "grad_norm": 0.09879252314567566, "learning_rate": 1.079261045583424e-10, "loss": 0.0004, "num_input_tokens_seen": 37789008, "step": 37650 }, { "epoch": 19.965535524920465, "grad_norm": 0.6078483462333679, "learning_rate": 9.326053189484541e-11, "loss": 0.0003, "num_input_tokens_seen": 37793200, "step": 37655 }, { "epoch": 19.968186638388122, "grad_norm": 0.04600002244114876, "learning_rate": 7.966542618031981e-11, "loss": 0.0024, "num_input_tokens_seen": 37798320, "step": 37660 }, { "epoch": 19.97083775185578, "grad_norm": 0.15984079241752625, "learning_rate": 6.714079032577037e-11, "loss": 0.0002, "num_input_tokens_seen": 37803408, "step": 37665 }, { "epoch": 19.973488865323436, "grad_norm": 0.28483283519744873, "learning_rate": 5.568662701349592e-11, "loss": 0.0003, "num_input_tokens_seen": 37808752, "step": 37670 }, { "epoch": 19.976139978791092, "grad_norm": 0.17435666918754578, "learning_rate": 4.53029386948689e-11, "loss": 0.0004, "num_input_tokens_seen": 37814832, "step": 37675 }, { "epoch": 19.97879109225875, "grad_norm": 0.2643859088420868, "learning_rate": 3.5989727593110924e-11, "loss": 0.0005, "num_input_tokens_seen": 37820208, "step": 37680 }, { "epoch": 19.981442205726406, "grad_norm": 0.10419491678476334, "learning_rate": 2.7746995702737643e-11, "loss": 0.0003, "num_input_tokens_seen": 37825104, "step": 37685 }, { "epoch": 19.984093319194063, "grad_norm": 2.0504345893859863, "learning_rate": 2.057474478789345e-11, "loss": 0.0006, "num_input_tokens_seen": 37830288, "step": 37690 }, { "epoch": 19.98674443266172, "grad_norm": 0.3369556665420532, "learning_rate": 1.4472976384571903e-11, "loss": 0.0003, "num_input_tokens_seen": 37834192, "step": 37695 }, { "epoch": 19.989395546129373, "grad_norm": 0.07337664812803268, "learning_rate": 9.441691798950381e-12, "loss": 0.0428, "num_input_tokens_seen": 37838160, "step": 37700 }, { "epoch": 19.99204665959703, "grad_norm": 0.6208184361457825, "learning_rate": 5.480892107945224e-12, "loss": 0.0005, "num_input_tokens_seen": 37844176, "step": 37705 }, { "epoch": 19.994697773064686, "grad_norm": 0.5100275278091431, "learning_rate": 2.590578159766821e-12, "loss": 0.0008, "num_input_tokens_seen": 37849872, "step": 37710 }, { "epoch": 19.997348886532343, "grad_norm": 2.208310127258301, "learning_rate": 7.707505739196208e-13, "loss": 0.0129, "num_input_tokens_seen": 37854928, "step": 37715 }, { "epoch": 20.0, "grad_norm": 0.054887861013412476, "learning_rate": 2.1409738426569905e-14, "loss": 0.0009, "num_input_tokens_seen": 37859408, "step": 37720 }, { "epoch": 20.0, "eval_loss": 1.073240041732788, "eval_runtime": 29.4465, "eval_samples_per_second": 64.048, "eval_steps_per_second": 16.029, "num_input_tokens_seen": 37859408, "step": 37720 }, { "epoch": 20.0, "num_input_tokens_seen": 37859408, "step": 37720, "total_flos": 1.704792030999085e+18, "train_loss": 0.21434799538800403, "train_runtime": 5698.4239, "train_samples_per_second": 26.467, "train_steps_per_second": 6.619 } ], "logging_steps": 5, "max_steps": 37720, "num_input_tokens_seen": 37859408, "num_train_epochs": 20, "save_steps": 3772, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.704792030999085e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }