{ "best_global_step": 12892, "best_metric": 0.23131409287452698, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_piqa_123_1762638012/checkpoint-12892", "epoch": 20.0, "eval_steps": 6446, "global_step": 64460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015513496742165685, "grad_norm": 210.12213134765625, "learning_rate": 6.205398696866274e-09, "loss": 8.8099, "num_input_tokens_seen": 3136, "step": 5 }, { "epoch": 0.003102699348433137, "grad_norm": 232.9886474609375, "learning_rate": 1.3962147067949117e-08, "loss": 8.8799, "num_input_tokens_seen": 5856, "step": 10 }, { "epoch": 0.004654049022649705, "grad_norm": 229.80706787109375, "learning_rate": 2.1718895439031957e-08, "loss": 8.9107, "num_input_tokens_seen": 8960, "step": 15 }, { "epoch": 0.006205398696866274, "grad_norm": 244.54818725585938, "learning_rate": 2.9475643810114802e-08, "loss": 8.8672, "num_input_tokens_seen": 12096, "step": 20 }, { "epoch": 0.007756748371082842, "grad_norm": 206.6392364501953, "learning_rate": 3.7232392181197643e-08, "loss": 8.7785, "num_input_tokens_seen": 14656, "step": 25 }, { "epoch": 0.00930809804529941, "grad_norm": 241.5086669921875, "learning_rate": 4.498914055228049e-08, "loss": 8.4772, "num_input_tokens_seen": 16736, "step": 30 }, { "epoch": 0.01085944771951598, "grad_norm": 192.2936248779297, "learning_rate": 5.274588892336333e-08, "loss": 8.6017, "num_input_tokens_seen": 19392, "step": 35 }, { "epoch": 0.012410797393732548, "grad_norm": 226.30606079101562, "learning_rate": 6.050263729444618e-08, "loss": 8.6932, "num_input_tokens_seen": 22368, "step": 40 }, { "epoch": 0.013962147067949116, "grad_norm": 186.1767578125, "learning_rate": 6.825938566552902e-08, "loss": 8.5863, "num_input_tokens_seen": 25856, "step": 45 }, { "epoch": 0.015513496742165685, "grad_norm": 223.8227081298828, "learning_rate": 7.601613403661185e-08, "loss": 8.3081, "num_input_tokens_seen": 29696, "step": 50 }, { "epoch": 0.017064846416382253, "grad_norm": 220.96453857421875, "learning_rate": 8.377288240769469e-08, "loss": 8.108, "num_input_tokens_seen": 32224, "step": 55 }, { "epoch": 0.01861619609059882, "grad_norm": 217.817626953125, "learning_rate": 9.152963077877753e-08, "loss": 7.9731, "num_input_tokens_seen": 35168, "step": 60 }, { "epoch": 0.02016754576481539, "grad_norm": 229.24411010742188, "learning_rate": 9.928637914986038e-08, "loss": 8.2764, "num_input_tokens_seen": 38784, "step": 65 }, { "epoch": 0.02171889543903196, "grad_norm": 206.79066467285156, "learning_rate": 1.0704312752094322e-07, "loss": 8.0645, "num_input_tokens_seen": 42304, "step": 70 }, { "epoch": 0.023270245113248527, "grad_norm": 193.42312622070312, "learning_rate": 1.1479987589202607e-07, "loss": 7.5826, "num_input_tokens_seen": 44960, "step": 75 }, { "epoch": 0.024821594787465096, "grad_norm": 208.83694458007812, "learning_rate": 1.225566242631089e-07, "loss": 7.5741, "num_input_tokens_seen": 47968, "step": 80 }, { "epoch": 0.026372944461681664, "grad_norm": 175.05616760253906, "learning_rate": 1.3031337263419176e-07, "loss": 7.1172, "num_input_tokens_seen": 50496, "step": 85 }, { "epoch": 0.027924294135898232, "grad_norm": 157.90528869628906, "learning_rate": 1.380701210052746e-07, "loss": 7.2512, "num_input_tokens_seen": 53888, "step": 90 }, { "epoch": 0.0294756438101148, "grad_norm": 167.39625549316406, "learning_rate": 1.4582686937635744e-07, "loss": 6.6914, "num_input_tokens_seen": 56224, "step": 95 }, { "epoch": 0.03102699348433137, "grad_norm": 155.56700134277344, "learning_rate": 1.535836177474403e-07, "loss": 6.8742, "num_input_tokens_seen": 59360, "step": 100 }, { "epoch": 0.032578343158547934, "grad_norm": 140.67820739746094, "learning_rate": 1.6134036611852314e-07, "loss": 6.4364, "num_input_tokens_seen": 62336, "step": 105 }, { "epoch": 0.034129692832764506, "grad_norm": 122.25746154785156, "learning_rate": 1.6909711448960597e-07, "loss": 6.1634, "num_input_tokens_seen": 65056, "step": 110 }, { "epoch": 0.03568104250698107, "grad_norm": 123.44780731201172, "learning_rate": 1.7685386286068882e-07, "loss": 5.875, "num_input_tokens_seen": 67776, "step": 115 }, { "epoch": 0.03723239218119764, "grad_norm": 105.608642578125, "learning_rate": 1.8461061123177164e-07, "loss": 5.9362, "num_input_tokens_seen": 70848, "step": 120 }, { "epoch": 0.03878374185541421, "grad_norm": 91.20225524902344, "learning_rate": 1.9236735960285452e-07, "loss": 5.6406, "num_input_tokens_seen": 73600, "step": 125 }, { "epoch": 0.04033509152963078, "grad_norm": 87.58357238769531, "learning_rate": 2.0012410797393735e-07, "loss": 5.6974, "num_input_tokens_seen": 77408, "step": 130 }, { "epoch": 0.041886441203847345, "grad_norm": 90.6343002319336, "learning_rate": 2.078808563450202e-07, "loss": 5.3866, "num_input_tokens_seen": 80896, "step": 135 }, { "epoch": 0.04343779087806392, "grad_norm": 87.1835708618164, "learning_rate": 2.1563760471610302e-07, "loss": 5.2915, "num_input_tokens_seen": 83584, "step": 140 }, { "epoch": 0.04498914055228048, "grad_norm": 80.73114013671875, "learning_rate": 2.2339435308718587e-07, "loss": 5.3407, "num_input_tokens_seen": 87072, "step": 145 }, { "epoch": 0.046540490226497054, "grad_norm": 89.09452819824219, "learning_rate": 2.311511014582687e-07, "loss": 4.7676, "num_input_tokens_seen": 89696, "step": 150 }, { "epoch": 0.04809183990071362, "grad_norm": 74.42778778076172, "learning_rate": 2.3890784982935155e-07, "loss": 5.1005, "num_input_tokens_seen": 93536, "step": 155 }, { "epoch": 0.04964318957493019, "grad_norm": 77.69342803955078, "learning_rate": 2.466645982004344e-07, "loss": 4.8372, "num_input_tokens_seen": 96896, "step": 160 }, { "epoch": 0.051194539249146756, "grad_norm": 78.32523345947266, "learning_rate": 2.5442134657151725e-07, "loss": 4.4734, "num_input_tokens_seen": 99584, "step": 165 }, { "epoch": 0.05274588892336333, "grad_norm": 88.0487060546875, "learning_rate": 2.621780949426001e-07, "loss": 4.7271, "num_input_tokens_seen": 106336, "step": 170 }, { "epoch": 0.05429723859757989, "grad_norm": 78.70283508300781, "learning_rate": 2.6993484331368296e-07, "loss": 4.1557, "num_input_tokens_seen": 109152, "step": 175 }, { "epoch": 0.055848588271796465, "grad_norm": 70.62953186035156, "learning_rate": 2.776915916847658e-07, "loss": 3.8712, "num_input_tokens_seen": 111488, "step": 180 }, { "epoch": 0.05739993794601303, "grad_norm": 70.9137954711914, "learning_rate": 2.854483400558486e-07, "loss": 3.8641, "num_input_tokens_seen": 114464, "step": 185 }, { "epoch": 0.0589512876202296, "grad_norm": 69.14026641845703, "learning_rate": 2.9320508842693143e-07, "loss": 3.7816, "num_input_tokens_seen": 117984, "step": 190 }, { "epoch": 0.06050263729444617, "grad_norm": 71.74274444580078, "learning_rate": 3.0096183679801426e-07, "loss": 3.3856, "num_input_tokens_seen": 120224, "step": 195 }, { "epoch": 0.06205398696866274, "grad_norm": 67.41858673095703, "learning_rate": 3.0871858516909713e-07, "loss": 3.4033, "num_input_tokens_seen": 122880, "step": 200 }, { "epoch": 0.0636053366428793, "grad_norm": 75.80384826660156, "learning_rate": 3.1647533354017996e-07, "loss": 3.342, "num_input_tokens_seen": 126080, "step": 205 }, { "epoch": 0.06515668631709587, "grad_norm": 55.78644943237305, "learning_rate": 3.242320819112628e-07, "loss": 2.863, "num_input_tokens_seen": 128576, "step": 210 }, { "epoch": 0.06670803599131245, "grad_norm": 52.645572662353516, "learning_rate": 3.3198883028234566e-07, "loss": 2.8464, "num_input_tokens_seen": 131712, "step": 215 }, { "epoch": 0.06825938566552901, "grad_norm": 52.57148742675781, "learning_rate": 3.397455786534285e-07, "loss": 2.5337, "num_input_tokens_seen": 134080, "step": 220 }, { "epoch": 0.06981073533974558, "grad_norm": 52.41255569458008, "learning_rate": 3.475023270245113e-07, "loss": 2.5682, "num_input_tokens_seen": 136992, "step": 225 }, { "epoch": 0.07136208501396214, "grad_norm": 77.54277038574219, "learning_rate": 3.5525907539559414e-07, "loss": 2.4665, "num_input_tokens_seen": 139552, "step": 230 }, { "epoch": 0.07291343468817872, "grad_norm": 52.097782135009766, "learning_rate": 3.6301582376667707e-07, "loss": 2.121, "num_input_tokens_seen": 141664, "step": 235 }, { "epoch": 0.07446478436239529, "grad_norm": 42.053466796875, "learning_rate": 3.707725721377599e-07, "loss": 2.0984, "num_input_tokens_seen": 143968, "step": 240 }, { "epoch": 0.07601613403661185, "grad_norm": 39.798423767089844, "learning_rate": 3.785293205088427e-07, "loss": 1.8893, "num_input_tokens_seen": 146176, "step": 245 }, { "epoch": 0.07756748371082842, "grad_norm": 58.81941604614258, "learning_rate": 3.8628606887992554e-07, "loss": 1.814, "num_input_tokens_seen": 148640, "step": 250 }, { "epoch": 0.079118833385045, "grad_norm": 54.32085418701172, "learning_rate": 3.940428172510084e-07, "loss": 1.7127, "num_input_tokens_seen": 151424, "step": 255 }, { "epoch": 0.08067018305926156, "grad_norm": 41.31128692626953, "learning_rate": 4.0179956562209125e-07, "loss": 1.6353, "num_input_tokens_seen": 154048, "step": 260 }, { "epoch": 0.08222153273347813, "grad_norm": 40.02718734741211, "learning_rate": 4.0955631399317407e-07, "loss": 1.4098, "num_input_tokens_seen": 156416, "step": 265 }, { "epoch": 0.08377288240769469, "grad_norm": 38.39036560058594, "learning_rate": 4.173130623642569e-07, "loss": 1.4388, "num_input_tokens_seen": 159104, "step": 270 }, { "epoch": 0.08532423208191127, "grad_norm": 31.04317855834961, "learning_rate": 4.2506981073533983e-07, "loss": 1.2588, "num_input_tokens_seen": 161568, "step": 275 }, { "epoch": 0.08687558175612783, "grad_norm": 34.87407302856445, "learning_rate": 4.3282655910642265e-07, "loss": 1.3577, "num_input_tokens_seen": 164672, "step": 280 }, { "epoch": 0.0884269314303444, "grad_norm": 32.8932991027832, "learning_rate": 4.405833074775055e-07, "loss": 1.1658, "num_input_tokens_seen": 167840, "step": 285 }, { "epoch": 0.08997828110456096, "grad_norm": 39.42401123046875, "learning_rate": 4.483400558485883e-07, "loss": 1.069, "num_input_tokens_seen": 170656, "step": 290 }, { "epoch": 0.09152963077877754, "grad_norm": 23.20857810974121, "learning_rate": 4.560968042196712e-07, "loss": 0.9351, "num_input_tokens_seen": 173376, "step": 295 }, { "epoch": 0.09308098045299411, "grad_norm": 47.013221740722656, "learning_rate": 4.63853552590754e-07, "loss": 0.9717, "num_input_tokens_seen": 176960, "step": 300 }, { "epoch": 0.09463233012721067, "grad_norm": 39.35919952392578, "learning_rate": 4.7161030096183683e-07, "loss": 0.9049, "num_input_tokens_seen": 179680, "step": 305 }, { "epoch": 0.09618367980142724, "grad_norm": 22.688934326171875, "learning_rate": 4.793670493329197e-07, "loss": 0.8444, "num_input_tokens_seen": 182912, "step": 310 }, { "epoch": 0.09773502947564382, "grad_norm": 27.92320442199707, "learning_rate": 4.871237977040025e-07, "loss": 0.7123, "num_input_tokens_seen": 186528, "step": 315 }, { "epoch": 0.09928637914986038, "grad_norm": 19.0408935546875, "learning_rate": 4.948805460750854e-07, "loss": 0.5901, "num_input_tokens_seen": 188640, "step": 320 }, { "epoch": 0.10083772882407695, "grad_norm": 25.27012825012207, "learning_rate": 5.026372944461682e-07, "loss": 0.5284, "num_input_tokens_seen": 190752, "step": 325 }, { "epoch": 0.10238907849829351, "grad_norm": 18.394014358520508, "learning_rate": 5.103940428172511e-07, "loss": 0.5626, "num_input_tokens_seen": 193504, "step": 330 }, { "epoch": 0.10394042817251008, "grad_norm": 19.656599044799805, "learning_rate": 5.181507911883339e-07, "loss": 0.4812, "num_input_tokens_seen": 195968, "step": 335 }, { "epoch": 0.10549177784672666, "grad_norm": 21.128849029541016, "learning_rate": 5.259075395594167e-07, "loss": 0.4644, "num_input_tokens_seen": 198624, "step": 340 }, { "epoch": 0.10704312752094322, "grad_norm": 35.683658599853516, "learning_rate": 5.336642879304996e-07, "loss": 0.4821, "num_input_tokens_seen": 201120, "step": 345 }, { "epoch": 0.10859447719515979, "grad_norm": 21.726837158203125, "learning_rate": 5.414210363015824e-07, "loss": 0.4086, "num_input_tokens_seen": 203456, "step": 350 }, { "epoch": 0.11014582686937635, "grad_norm": 28.539443969726562, "learning_rate": 5.491777846726652e-07, "loss": 0.5308, "num_input_tokens_seen": 206944, "step": 355 }, { "epoch": 0.11169717654359293, "grad_norm": 29.114961624145508, "learning_rate": 5.569345330437481e-07, "loss": 0.4011, "num_input_tokens_seen": 209632, "step": 360 }, { "epoch": 0.1132485262178095, "grad_norm": 61.74116897583008, "learning_rate": 5.646912814148309e-07, "loss": 0.4416, "num_input_tokens_seen": 213120, "step": 365 }, { "epoch": 0.11479987589202606, "grad_norm": 23.288484573364258, "learning_rate": 5.724480297859138e-07, "loss": 0.3943, "num_input_tokens_seen": 215648, "step": 370 }, { "epoch": 0.11635122556624262, "grad_norm": 20.939146041870117, "learning_rate": 5.802047781569966e-07, "loss": 0.4088, "num_input_tokens_seen": 219840, "step": 375 }, { "epoch": 0.1179025752404592, "grad_norm": 28.443706512451172, "learning_rate": 5.879615265280795e-07, "loss": 0.3649, "num_input_tokens_seen": 222144, "step": 380 }, { "epoch": 0.11945392491467577, "grad_norm": 16.870277404785156, "learning_rate": 5.957182748991623e-07, "loss": 0.3502, "num_input_tokens_seen": 224736, "step": 385 }, { "epoch": 0.12100527458889233, "grad_norm": 32.83539581298828, "learning_rate": 6.034750232702452e-07, "loss": 0.3671, "num_input_tokens_seen": 227744, "step": 390 }, { "epoch": 0.1225566242631089, "grad_norm": 24.96401596069336, "learning_rate": 6.11231771641328e-07, "loss": 0.311, "num_input_tokens_seen": 231424, "step": 395 }, { "epoch": 0.12410797393732548, "grad_norm": 29.840538024902344, "learning_rate": 6.189885200124108e-07, "loss": 0.3029, "num_input_tokens_seen": 234496, "step": 400 }, { "epoch": 0.12565932361154203, "grad_norm": 34.20615768432617, "learning_rate": 6.267452683834938e-07, "loss": 0.2854, "num_input_tokens_seen": 237056, "step": 405 }, { "epoch": 0.1272106732857586, "grad_norm": 22.80665397644043, "learning_rate": 6.345020167545765e-07, "loss": 0.3084, "num_input_tokens_seen": 239744, "step": 410 }, { "epoch": 0.1287620229599752, "grad_norm": 24.470134735107422, "learning_rate": 6.422587651256595e-07, "loss": 0.3168, "num_input_tokens_seen": 242528, "step": 415 }, { "epoch": 0.13031337263419174, "grad_norm": 28.278629302978516, "learning_rate": 6.500155134967421e-07, "loss": 0.4239, "num_input_tokens_seen": 246080, "step": 420 }, { "epoch": 0.13186472230840832, "grad_norm": 17.722511291503906, "learning_rate": 6.577722618678251e-07, "loss": 0.3253, "num_input_tokens_seen": 248928, "step": 425 }, { "epoch": 0.1334160719826249, "grad_norm": 18.382959365844727, "learning_rate": 6.65529010238908e-07, "loss": 0.2718, "num_input_tokens_seen": 252032, "step": 430 }, { "epoch": 0.13496742165684145, "grad_norm": 23.929231643676758, "learning_rate": 6.732857586099908e-07, "loss": 0.3391, "num_input_tokens_seen": 255072, "step": 435 }, { "epoch": 0.13651877133105803, "grad_norm": 22.811641693115234, "learning_rate": 6.810425069810736e-07, "loss": 0.3208, "num_input_tokens_seen": 257376, "step": 440 }, { "epoch": 0.13807012100527458, "grad_norm": 26.858150482177734, "learning_rate": 6.887992553521565e-07, "loss": 0.2555, "num_input_tokens_seen": 261344, "step": 445 }, { "epoch": 0.13962147067949116, "grad_norm": 21.210458755493164, "learning_rate": 6.965560037232393e-07, "loss": 0.3094, "num_input_tokens_seen": 263744, "step": 450 }, { "epoch": 0.14117282035370773, "grad_norm": 28.18222999572754, "learning_rate": 7.043127520943222e-07, "loss": 0.3539, "num_input_tokens_seen": 266368, "step": 455 }, { "epoch": 0.14272417002792429, "grad_norm": 29.02256202697754, "learning_rate": 7.120695004654049e-07, "loss": 0.3232, "num_input_tokens_seen": 268704, "step": 460 }, { "epoch": 0.14427551970214086, "grad_norm": 30.899425506591797, "learning_rate": 7.198262488364878e-07, "loss": 0.2834, "num_input_tokens_seen": 271584, "step": 465 }, { "epoch": 0.14582686937635744, "grad_norm": 17.563106536865234, "learning_rate": 7.275829972075707e-07, "loss": 0.2718, "num_input_tokens_seen": 273728, "step": 470 }, { "epoch": 0.147378219050574, "grad_norm": 12.533330917358398, "learning_rate": 7.353397455786535e-07, "loss": 0.3183, "num_input_tokens_seen": 275968, "step": 475 }, { "epoch": 0.14892956872479057, "grad_norm": 13.154541015625, "learning_rate": 7.430964939497363e-07, "loss": 0.2808, "num_input_tokens_seen": 278528, "step": 480 }, { "epoch": 0.15048091839900712, "grad_norm": 20.521808624267578, "learning_rate": 7.508532423208192e-07, "loss": 0.3304, "num_input_tokens_seen": 281792, "step": 485 }, { "epoch": 0.1520322680732237, "grad_norm": 14.83568286895752, "learning_rate": 7.58609990691902e-07, "loss": 0.2803, "num_input_tokens_seen": 284096, "step": 490 }, { "epoch": 0.15358361774744028, "grad_norm": 13.524412155151367, "learning_rate": 7.663667390629849e-07, "loss": 0.266, "num_input_tokens_seen": 286720, "step": 495 }, { "epoch": 0.15513496742165683, "grad_norm": 44.134883880615234, "learning_rate": 7.741234874340676e-07, "loss": 0.296, "num_input_tokens_seen": 290272, "step": 500 }, { "epoch": 0.1566863170958734, "grad_norm": 17.211517333984375, "learning_rate": 7.818802358051505e-07, "loss": 0.2932, "num_input_tokens_seen": 292992, "step": 505 }, { "epoch": 0.15823766677009, "grad_norm": 44.91914367675781, "learning_rate": 7.896369841762334e-07, "loss": 0.2966, "num_input_tokens_seen": 296128, "step": 510 }, { "epoch": 0.15978901644430654, "grad_norm": 23.775617599487305, "learning_rate": 7.973937325473162e-07, "loss": 0.3017, "num_input_tokens_seen": 299040, "step": 515 }, { "epoch": 0.16134036611852312, "grad_norm": 21.152624130249023, "learning_rate": 8.05150480918399e-07, "loss": 0.233, "num_input_tokens_seen": 301856, "step": 520 }, { "epoch": 0.16289171579273967, "grad_norm": 25.383792877197266, "learning_rate": 8.129072292894818e-07, "loss": 0.4155, "num_input_tokens_seen": 306656, "step": 525 }, { "epoch": 0.16444306546695625, "grad_norm": 15.416352272033691, "learning_rate": 8.206639776605647e-07, "loss": 0.2785, "num_input_tokens_seen": 309664, "step": 530 }, { "epoch": 0.16599441514117283, "grad_norm": 21.136539459228516, "learning_rate": 8.284207260316477e-07, "loss": 0.2969, "num_input_tokens_seen": 312768, "step": 535 }, { "epoch": 0.16754576481538938, "grad_norm": 13.924885749816895, "learning_rate": 8.361774744027303e-07, "loss": 0.2914, "num_input_tokens_seen": 316928, "step": 540 }, { "epoch": 0.16909711448960596, "grad_norm": 23.99787139892578, "learning_rate": 8.439342227738133e-07, "loss": 0.2562, "num_input_tokens_seen": 319488, "step": 545 }, { "epoch": 0.17064846416382254, "grad_norm": 13.350808143615723, "learning_rate": 8.516909711448962e-07, "loss": 0.2479, "num_input_tokens_seen": 321856, "step": 550 }, { "epoch": 0.1721998138380391, "grad_norm": 14.907549858093262, "learning_rate": 8.59447719515979e-07, "loss": 0.2673, "num_input_tokens_seen": 325536, "step": 555 }, { "epoch": 0.17375116351225567, "grad_norm": 22.984176635742188, "learning_rate": 8.672044678870619e-07, "loss": 0.278, "num_input_tokens_seen": 328928, "step": 560 }, { "epoch": 0.17530251318647222, "grad_norm": 24.472278594970703, "learning_rate": 8.749612162581446e-07, "loss": 0.2649, "num_input_tokens_seen": 332320, "step": 565 }, { "epoch": 0.1768538628606888, "grad_norm": 12.869365692138672, "learning_rate": 8.827179646292275e-07, "loss": 0.2517, "num_input_tokens_seen": 335104, "step": 570 }, { "epoch": 0.17840521253490538, "grad_norm": 23.488819122314453, "learning_rate": 8.904747130003104e-07, "loss": 0.2831, "num_input_tokens_seen": 338976, "step": 575 }, { "epoch": 0.17995656220912193, "grad_norm": 10.436274528503418, "learning_rate": 8.982314613713932e-07, "loss": 0.2574, "num_input_tokens_seen": 342688, "step": 580 }, { "epoch": 0.1815079118833385, "grad_norm": 25.300445556640625, "learning_rate": 9.05988209742476e-07, "loss": 0.2738, "num_input_tokens_seen": 345632, "step": 585 }, { "epoch": 0.1830592615575551, "grad_norm": 29.95111846923828, "learning_rate": 9.137449581135589e-07, "loss": 0.2512, "num_input_tokens_seen": 348832, "step": 590 }, { "epoch": 0.18461061123177164, "grad_norm": 10.846729278564453, "learning_rate": 9.215017064846417e-07, "loss": 0.2642, "num_input_tokens_seen": 351200, "step": 595 }, { "epoch": 0.18616196090598822, "grad_norm": 11.654212951660156, "learning_rate": 9.292584548557246e-07, "loss": 0.2423, "num_input_tokens_seen": 353952, "step": 600 }, { "epoch": 0.18771331058020477, "grad_norm": 25.873641967773438, "learning_rate": 9.370152032268073e-07, "loss": 0.2595, "num_input_tokens_seen": 357248, "step": 605 }, { "epoch": 0.18926466025442135, "grad_norm": 26.449050903320312, "learning_rate": 9.447719515978902e-07, "loss": 0.2758, "num_input_tokens_seen": 359840, "step": 610 }, { "epoch": 0.19081600992863793, "grad_norm": 15.381325721740723, "learning_rate": 9.525286999689731e-07, "loss": 0.3061, "num_input_tokens_seen": 365504, "step": 615 }, { "epoch": 0.19236735960285448, "grad_norm": 22.06991195678711, "learning_rate": 9.602854483400559e-07, "loss": 0.2767, "num_input_tokens_seen": 368064, "step": 620 }, { "epoch": 0.19391870927707106, "grad_norm": 34.28987503051758, "learning_rate": 9.680421967111388e-07, "loss": 0.2517, "num_input_tokens_seen": 370752, "step": 625 }, { "epoch": 0.19547005895128763, "grad_norm": 23.870922088623047, "learning_rate": 9.757989450822216e-07, "loss": 0.2757, "num_input_tokens_seen": 373664, "step": 630 }, { "epoch": 0.19702140862550419, "grad_norm": 18.459299087524414, "learning_rate": 9.835556934533044e-07, "loss": 0.2733, "num_input_tokens_seen": 376032, "step": 635 }, { "epoch": 0.19857275829972076, "grad_norm": 12.2029390335083, "learning_rate": 9.913124418243874e-07, "loss": 0.2611, "num_input_tokens_seen": 378560, "step": 640 }, { "epoch": 0.20012410797393732, "grad_norm": 13.917941093444824, "learning_rate": 9.990691901954701e-07, "loss": 0.2577, "num_input_tokens_seen": 381824, "step": 645 }, { "epoch": 0.2016754576481539, "grad_norm": 8.487762451171875, "learning_rate": 1.006825938566553e-06, "loss": 0.2434, "num_input_tokens_seen": 384096, "step": 650 }, { "epoch": 0.20322680732237047, "grad_norm": 10.986791610717773, "learning_rate": 1.014582686937636e-06, "loss": 0.241, "num_input_tokens_seen": 387712, "step": 655 }, { "epoch": 0.20477815699658702, "grad_norm": 11.024580001831055, "learning_rate": 1.0223394353087187e-06, "loss": 0.2621, "num_input_tokens_seen": 392448, "step": 660 }, { "epoch": 0.2063295066708036, "grad_norm": 31.208219528198242, "learning_rate": 1.0300961836798014e-06, "loss": 0.2543, "num_input_tokens_seen": 395200, "step": 665 }, { "epoch": 0.20788085634502015, "grad_norm": 17.659503936767578, "learning_rate": 1.0378529320508844e-06, "loss": 0.2575, "num_input_tokens_seen": 398176, "step": 670 }, { "epoch": 0.20943220601923673, "grad_norm": 16.195411682128906, "learning_rate": 1.0456096804219672e-06, "loss": 0.2525, "num_input_tokens_seen": 400544, "step": 675 }, { "epoch": 0.2109835556934533, "grad_norm": 13.823236465454102, "learning_rate": 1.05336642879305e-06, "loss": 0.2611, "num_input_tokens_seen": 404288, "step": 680 }, { "epoch": 0.21253490536766986, "grad_norm": 9.700652122497559, "learning_rate": 1.0611231771641327e-06, "loss": 0.2321, "num_input_tokens_seen": 406976, "step": 685 }, { "epoch": 0.21408625504188644, "grad_norm": 11.042900085449219, "learning_rate": 1.0688799255352157e-06, "loss": 0.2551, "num_input_tokens_seen": 409344, "step": 690 }, { "epoch": 0.21563760471610302, "grad_norm": 13.982616424560547, "learning_rate": 1.0766366739062987e-06, "loss": 0.2578, "num_input_tokens_seen": 412864, "step": 695 }, { "epoch": 0.21718895439031957, "grad_norm": 12.134109497070312, "learning_rate": 1.0843934222773813e-06, "loss": 0.2611, "num_input_tokens_seen": 415712, "step": 700 }, { "epoch": 0.21874030406453615, "grad_norm": 14.561785697937012, "learning_rate": 1.0921501706484643e-06, "loss": 0.232, "num_input_tokens_seen": 418400, "step": 705 }, { "epoch": 0.2202916537387527, "grad_norm": 13.653223037719727, "learning_rate": 1.0999069190195472e-06, "loss": 0.2414, "num_input_tokens_seen": 420928, "step": 710 }, { "epoch": 0.22184300341296928, "grad_norm": 13.556102752685547, "learning_rate": 1.10766366739063e-06, "loss": 0.2563, "num_input_tokens_seen": 424032, "step": 715 }, { "epoch": 0.22339435308718586, "grad_norm": 10.736146926879883, "learning_rate": 1.1154204157617128e-06, "loss": 0.2362, "num_input_tokens_seen": 427808, "step": 720 }, { "epoch": 0.2249457027614024, "grad_norm": 17.267044067382812, "learning_rate": 1.1231771641327956e-06, "loss": 0.2347, "num_input_tokens_seen": 430944, "step": 725 }, { "epoch": 0.226497052435619, "grad_norm": 12.67550277709961, "learning_rate": 1.1309339125038785e-06, "loss": 0.2423, "num_input_tokens_seen": 433920, "step": 730 }, { "epoch": 0.22804840210983557, "grad_norm": 17.456192016601562, "learning_rate": 1.1386906608749613e-06, "loss": 0.2535, "num_input_tokens_seen": 436544, "step": 735 }, { "epoch": 0.22959975178405212, "grad_norm": 13.504227638244629, "learning_rate": 1.146447409246044e-06, "loss": 0.2412, "num_input_tokens_seen": 439840, "step": 740 }, { "epoch": 0.2311511014582687, "grad_norm": 14.387642860412598, "learning_rate": 1.154204157617127e-06, "loss": 0.2447, "num_input_tokens_seen": 442912, "step": 745 }, { "epoch": 0.23270245113248525, "grad_norm": 6.868820667266846, "learning_rate": 1.1619609059882098e-06, "loss": 0.2481, "num_input_tokens_seen": 446144, "step": 750 }, { "epoch": 0.23425380080670183, "grad_norm": 10.836665153503418, "learning_rate": 1.1697176543592926e-06, "loss": 0.2684, "num_input_tokens_seen": 448768, "step": 755 }, { "epoch": 0.2358051504809184, "grad_norm": 6.5808916091918945, "learning_rate": 1.1774744027303756e-06, "loss": 0.2638, "num_input_tokens_seen": 452640, "step": 760 }, { "epoch": 0.23735650015513496, "grad_norm": 6.529001712799072, "learning_rate": 1.1852311511014584e-06, "loss": 0.2464, "num_input_tokens_seen": 455872, "step": 765 }, { "epoch": 0.23890784982935154, "grad_norm": 20.440860748291016, "learning_rate": 1.1929878994725411e-06, "loss": 0.246, "num_input_tokens_seen": 458624, "step": 770 }, { "epoch": 0.24045919950356812, "grad_norm": 16.609468460083008, "learning_rate": 1.2007446478436241e-06, "loss": 0.2245, "num_input_tokens_seen": 461312, "step": 775 }, { "epoch": 0.24201054917778467, "grad_norm": 9.65118408203125, "learning_rate": 1.208501396214707e-06, "loss": 0.2393, "num_input_tokens_seen": 463616, "step": 780 }, { "epoch": 0.24356189885200125, "grad_norm": 21.475479125976562, "learning_rate": 1.2162581445857897e-06, "loss": 0.241, "num_input_tokens_seen": 466464, "step": 785 }, { "epoch": 0.2451132485262178, "grad_norm": 6.9308671951293945, "learning_rate": 1.2240148929568727e-06, "loss": 0.2541, "num_input_tokens_seen": 469024, "step": 790 }, { "epoch": 0.24666459820043438, "grad_norm": 7.463472366333008, "learning_rate": 1.2317716413279554e-06, "loss": 0.2326, "num_input_tokens_seen": 471520, "step": 795 }, { "epoch": 0.24821594787465096, "grad_norm": 18.664093017578125, "learning_rate": 1.2395283896990382e-06, "loss": 0.2409, "num_input_tokens_seen": 474240, "step": 800 }, { "epoch": 0.2497672975488675, "grad_norm": 9.248006820678711, "learning_rate": 1.247285138070121e-06, "loss": 0.2246, "num_input_tokens_seen": 477120, "step": 805 }, { "epoch": 0.25131864722308406, "grad_norm": 12.786887168884277, "learning_rate": 1.255041886441204e-06, "loss": 0.2641, "num_input_tokens_seen": 479808, "step": 810 }, { "epoch": 0.25286999689730066, "grad_norm": 11.627196311950684, "learning_rate": 1.2627986348122867e-06, "loss": 0.264, "num_input_tokens_seen": 483104, "step": 815 }, { "epoch": 0.2544213465715172, "grad_norm": 10.910761833190918, "learning_rate": 1.2705553831833697e-06, "loss": 0.2397, "num_input_tokens_seen": 485376, "step": 820 }, { "epoch": 0.25597269624573377, "grad_norm": 10.993096351623535, "learning_rate": 1.2783121315544525e-06, "loss": 0.2461, "num_input_tokens_seen": 488512, "step": 825 }, { "epoch": 0.2575240459199504, "grad_norm": 6.278591632843018, "learning_rate": 1.2860688799255353e-06, "loss": 0.2503, "num_input_tokens_seen": 491712, "step": 830 }, { "epoch": 0.2590753955941669, "grad_norm": 7.413983345031738, "learning_rate": 1.2938256282966182e-06, "loss": 0.2347, "num_input_tokens_seen": 494400, "step": 835 }, { "epoch": 0.2606267452683835, "grad_norm": 11.030438423156738, "learning_rate": 1.301582376667701e-06, "loss": 0.3286, "num_input_tokens_seen": 499808, "step": 840 }, { "epoch": 0.2621780949426001, "grad_norm": 5.4713006019592285, "learning_rate": 1.3093391250387838e-06, "loss": 0.2409, "num_input_tokens_seen": 503648, "step": 845 }, { "epoch": 0.26372944461681663, "grad_norm": 8.904631614685059, "learning_rate": 1.3170958734098666e-06, "loss": 0.2522, "num_input_tokens_seen": 507040, "step": 850 }, { "epoch": 0.2652807942910332, "grad_norm": 7.805501461029053, "learning_rate": 1.3248526217809495e-06, "loss": 0.2283, "num_input_tokens_seen": 509760, "step": 855 }, { "epoch": 0.2668321439652498, "grad_norm": 13.922120094299316, "learning_rate": 1.3326093701520323e-06, "loss": 0.2292, "num_input_tokens_seen": 512992, "step": 860 }, { "epoch": 0.26838349363946634, "grad_norm": 5.608225345611572, "learning_rate": 1.340366118523115e-06, "loss": 0.2261, "num_input_tokens_seen": 515648, "step": 865 }, { "epoch": 0.2699348433136829, "grad_norm": 16.564374923706055, "learning_rate": 1.348122866894198e-06, "loss": 0.2295, "num_input_tokens_seen": 518368, "step": 870 }, { "epoch": 0.2714861929878995, "grad_norm": 8.206068992614746, "learning_rate": 1.3558796152652808e-06, "loss": 0.2492, "num_input_tokens_seen": 521952, "step": 875 }, { "epoch": 0.27303754266211605, "grad_norm": 9.24280071258545, "learning_rate": 1.3636363636363636e-06, "loss": 0.2231, "num_input_tokens_seen": 526688, "step": 880 }, { "epoch": 0.2745888923363326, "grad_norm": 13.82464599609375, "learning_rate": 1.3713931120074466e-06, "loss": 0.2818, "num_input_tokens_seen": 531104, "step": 885 }, { "epoch": 0.27614024201054915, "grad_norm": 4.867370128631592, "learning_rate": 1.3791498603785294e-06, "loss": 0.2431, "num_input_tokens_seen": 533472, "step": 890 }, { "epoch": 0.27769159168476576, "grad_norm": 8.546463012695312, "learning_rate": 1.3869066087496121e-06, "loss": 0.2129, "num_input_tokens_seen": 536000, "step": 895 }, { "epoch": 0.2792429413589823, "grad_norm": 14.465402603149414, "learning_rate": 1.3946633571206951e-06, "loss": 0.2338, "num_input_tokens_seen": 538272, "step": 900 }, { "epoch": 0.28079429103319886, "grad_norm": 14.635748863220215, "learning_rate": 1.402420105491778e-06, "loss": 0.2303, "num_input_tokens_seen": 541056, "step": 905 }, { "epoch": 0.28234564070741547, "grad_norm": 11.731718063354492, "learning_rate": 1.4101768538628607e-06, "loss": 0.2867, "num_input_tokens_seen": 543616, "step": 910 }, { "epoch": 0.283896990381632, "grad_norm": 9.276512145996094, "learning_rate": 1.4179336022339439e-06, "loss": 0.242, "num_input_tokens_seen": 546016, "step": 915 }, { "epoch": 0.28544834005584857, "grad_norm": 26.639772415161133, "learning_rate": 1.4256903506050264e-06, "loss": 0.2466, "num_input_tokens_seen": 548192, "step": 920 }, { "epoch": 0.2869996897300652, "grad_norm": 9.239564895629883, "learning_rate": 1.4334470989761092e-06, "loss": 0.2557, "num_input_tokens_seen": 550976, "step": 925 }, { "epoch": 0.28855103940428173, "grad_norm": 9.394201278686523, "learning_rate": 1.441203847347192e-06, "loss": 0.2316, "num_input_tokens_seen": 553568, "step": 930 }, { "epoch": 0.2901023890784983, "grad_norm": 12.830315589904785, "learning_rate": 1.4489605957182752e-06, "loss": 0.2329, "num_input_tokens_seen": 557280, "step": 935 }, { "epoch": 0.2916537387527149, "grad_norm": 7.062427520751953, "learning_rate": 1.456717344089358e-06, "loss": 0.238, "num_input_tokens_seen": 561696, "step": 940 }, { "epoch": 0.29320508842693144, "grad_norm": 6.5551838874816895, "learning_rate": 1.4644740924604405e-06, "loss": 0.2362, "num_input_tokens_seen": 564736, "step": 945 }, { "epoch": 0.294756438101148, "grad_norm": 7.343404293060303, "learning_rate": 1.4722308408315237e-06, "loss": 0.2275, "num_input_tokens_seen": 567872, "step": 950 }, { "epoch": 0.29630778777536454, "grad_norm": 9.1697998046875, "learning_rate": 1.4799875892026065e-06, "loss": 0.2568, "num_input_tokens_seen": 570816, "step": 955 }, { "epoch": 0.29785913744958115, "grad_norm": 17.60558319091797, "learning_rate": 1.4877443375736892e-06, "loss": 0.2255, "num_input_tokens_seen": 573344, "step": 960 }, { "epoch": 0.2994104871237977, "grad_norm": 7.344029426574707, "learning_rate": 1.4955010859447722e-06, "loss": 0.2325, "num_input_tokens_seen": 576768, "step": 965 }, { "epoch": 0.30096183679801425, "grad_norm": 10.749878883361816, "learning_rate": 1.503257834315855e-06, "loss": 0.2236, "num_input_tokens_seen": 579520, "step": 970 }, { "epoch": 0.30251318647223086, "grad_norm": 9.501692771911621, "learning_rate": 1.5110145826869378e-06, "loss": 0.2013, "num_input_tokens_seen": 583136, "step": 975 }, { "epoch": 0.3040645361464474, "grad_norm": 13.33872127532959, "learning_rate": 1.5187713310580207e-06, "loss": 0.2766, "num_input_tokens_seen": 586080, "step": 980 }, { "epoch": 0.30561588582066396, "grad_norm": 17.910507202148438, "learning_rate": 1.5265280794291035e-06, "loss": 0.3007, "num_input_tokens_seen": 588704, "step": 985 }, { "epoch": 0.30716723549488056, "grad_norm": 14.273642539978027, "learning_rate": 1.5342848278001863e-06, "loss": 0.2586, "num_input_tokens_seen": 592352, "step": 990 }, { "epoch": 0.3087185851690971, "grad_norm": 7.610750198364258, "learning_rate": 1.5420415761712693e-06, "loss": 0.248, "num_input_tokens_seen": 595424, "step": 995 }, { "epoch": 0.31026993484331367, "grad_norm": 9.563008308410645, "learning_rate": 1.549798324542352e-06, "loss": 0.2307, "num_input_tokens_seen": 598400, "step": 1000 }, { "epoch": 0.3118212845175303, "grad_norm": 7.235225200653076, "learning_rate": 1.5575550729134348e-06, "loss": 0.2431, "num_input_tokens_seen": 602592, "step": 1005 }, { "epoch": 0.3133726341917468, "grad_norm": 4.118921756744385, "learning_rate": 1.5653118212845176e-06, "loss": 0.243, "num_input_tokens_seen": 606080, "step": 1010 }, { "epoch": 0.3149239838659634, "grad_norm": 6.259126663208008, "learning_rate": 1.5730685696556006e-06, "loss": 0.2287, "num_input_tokens_seen": 609408, "step": 1015 }, { "epoch": 0.31647533354018, "grad_norm": 13.151688575744629, "learning_rate": 1.5808253180266833e-06, "loss": 0.254, "num_input_tokens_seen": 612256, "step": 1020 }, { "epoch": 0.31802668321439653, "grad_norm": 6.424307823181152, "learning_rate": 1.5885820663977661e-06, "loss": 0.2326, "num_input_tokens_seen": 615424, "step": 1025 }, { "epoch": 0.3195780328886131, "grad_norm": 4.947159290313721, "learning_rate": 1.596338814768849e-06, "loss": 0.2304, "num_input_tokens_seen": 618048, "step": 1030 }, { "epoch": 0.32112938256282964, "grad_norm": 8.16169548034668, "learning_rate": 1.6040955631399319e-06, "loss": 0.2409, "num_input_tokens_seen": 622880, "step": 1035 }, { "epoch": 0.32268073223704624, "grad_norm": 9.810173034667969, "learning_rate": 1.6118523115110146e-06, "loss": 0.233, "num_input_tokens_seen": 625088, "step": 1040 }, { "epoch": 0.3242320819112628, "grad_norm": 5.08217191696167, "learning_rate": 1.6196090598820976e-06, "loss": 0.2372, "num_input_tokens_seen": 628064, "step": 1045 }, { "epoch": 0.32578343158547934, "grad_norm": 6.386065483093262, "learning_rate": 1.6273658082531804e-06, "loss": 0.2355, "num_input_tokens_seen": 630688, "step": 1050 }, { "epoch": 0.32733478125969595, "grad_norm": 8.078049659729004, "learning_rate": 1.6351225566242632e-06, "loss": 0.2273, "num_input_tokens_seen": 635136, "step": 1055 }, { "epoch": 0.3288861309339125, "grad_norm": 6.199044704437256, "learning_rate": 1.6428793049953462e-06, "loss": 0.239, "num_input_tokens_seen": 637536, "step": 1060 }, { "epoch": 0.33043748060812905, "grad_norm": 4.943363189697266, "learning_rate": 1.650636053366429e-06, "loss": 0.2363, "num_input_tokens_seen": 641216, "step": 1065 }, { "epoch": 0.33198883028234566, "grad_norm": 6.648599147796631, "learning_rate": 1.6583928017375117e-06, "loss": 0.2167, "num_input_tokens_seen": 644000, "step": 1070 }, { "epoch": 0.3335401799565622, "grad_norm": 11.884827613830566, "learning_rate": 1.6661495501085947e-06, "loss": 0.2365, "num_input_tokens_seen": 647264, "step": 1075 }, { "epoch": 0.33509152963077876, "grad_norm": 12.438932418823242, "learning_rate": 1.6739062984796775e-06, "loss": 0.2133, "num_input_tokens_seen": 650016, "step": 1080 }, { "epoch": 0.33664287930499537, "grad_norm": 9.440153121948242, "learning_rate": 1.6816630468507602e-06, "loss": 0.2252, "num_input_tokens_seen": 653376, "step": 1085 }, { "epoch": 0.3381942289792119, "grad_norm": 15.00513744354248, "learning_rate": 1.689419795221843e-06, "loss": 0.2224, "num_input_tokens_seen": 657376, "step": 1090 }, { "epoch": 0.33974557865342847, "grad_norm": 8.06560230255127, "learning_rate": 1.697176543592926e-06, "loss": 0.2405, "num_input_tokens_seen": 660832, "step": 1095 }, { "epoch": 0.3412969283276451, "grad_norm": 19.56668472290039, "learning_rate": 1.7049332919640088e-06, "loss": 0.2433, "num_input_tokens_seen": 663616, "step": 1100 }, { "epoch": 0.34284827800186163, "grad_norm": 21.974489212036133, "learning_rate": 1.7126900403350915e-06, "loss": 0.261, "num_input_tokens_seen": 666528, "step": 1105 }, { "epoch": 0.3443996276760782, "grad_norm": 8.593753814697266, "learning_rate": 1.7204467887061745e-06, "loss": 0.279, "num_input_tokens_seen": 669856, "step": 1110 }, { "epoch": 0.34595097735029473, "grad_norm": 6.258231163024902, "learning_rate": 1.7282035370772573e-06, "loss": 0.2245, "num_input_tokens_seen": 672608, "step": 1115 }, { "epoch": 0.34750232702451134, "grad_norm": 12.395196914672852, "learning_rate": 1.73596028544834e-06, "loss": 0.2739, "num_input_tokens_seen": 676160, "step": 1120 }, { "epoch": 0.3490536766987279, "grad_norm": 31.162939071655273, "learning_rate": 1.743717033819423e-06, "loss": 0.2563, "num_input_tokens_seen": 679968, "step": 1125 }, { "epoch": 0.35060502637294444, "grad_norm": 8.557174682617188, "learning_rate": 1.7514737821905058e-06, "loss": 0.2405, "num_input_tokens_seen": 683840, "step": 1130 }, { "epoch": 0.35215637604716105, "grad_norm": 4.304966926574707, "learning_rate": 1.7592305305615886e-06, "loss": 0.2369, "num_input_tokens_seen": 686560, "step": 1135 }, { "epoch": 0.3537077257213776, "grad_norm": 14.767252922058105, "learning_rate": 1.7669872789326718e-06, "loss": 0.2442, "num_input_tokens_seen": 690368, "step": 1140 }, { "epoch": 0.35525907539559415, "grad_norm": 7.5551581382751465, "learning_rate": 1.7747440273037543e-06, "loss": 0.2482, "num_input_tokens_seen": 694784, "step": 1145 }, { "epoch": 0.35681042506981075, "grad_norm": 3.1296226978302, "learning_rate": 1.7825007756748371e-06, "loss": 0.2263, "num_input_tokens_seen": 697440, "step": 1150 }, { "epoch": 0.3583617747440273, "grad_norm": 6.068717956542969, "learning_rate": 1.7902575240459199e-06, "loss": 0.2533, "num_input_tokens_seen": 700160, "step": 1155 }, { "epoch": 0.35991312441824386, "grad_norm": 5.620510101318359, "learning_rate": 1.798014272417003e-06, "loss": 0.2512, "num_input_tokens_seen": 704064, "step": 1160 }, { "epoch": 0.36146447409246046, "grad_norm": 4.839707374572754, "learning_rate": 1.8057710207880856e-06, "loss": 0.2359, "num_input_tokens_seen": 707648, "step": 1165 }, { "epoch": 0.363015823766677, "grad_norm": 5.363236427307129, "learning_rate": 1.8135277691591684e-06, "loss": 0.2357, "num_input_tokens_seen": 710688, "step": 1170 }, { "epoch": 0.36456717344089357, "grad_norm": 5.4056715965271, "learning_rate": 1.8212845175302516e-06, "loss": 0.2287, "num_input_tokens_seen": 713536, "step": 1175 }, { "epoch": 0.3661185231151102, "grad_norm": 3.880324125289917, "learning_rate": 1.8290412659013344e-06, "loss": 0.2452, "num_input_tokens_seen": 716032, "step": 1180 }, { "epoch": 0.3676698727893267, "grad_norm": 3.9070112705230713, "learning_rate": 1.8367980142724172e-06, "loss": 0.2283, "num_input_tokens_seen": 718752, "step": 1185 }, { "epoch": 0.3692212224635433, "grad_norm": 3.3089842796325684, "learning_rate": 1.8445547626435001e-06, "loss": 0.2302, "num_input_tokens_seen": 721088, "step": 1190 }, { "epoch": 0.3707725721377598, "grad_norm": 6.660016059875488, "learning_rate": 1.852311511014583e-06, "loss": 0.2277, "num_input_tokens_seen": 723968, "step": 1195 }, { "epoch": 0.37232392181197643, "grad_norm": 7.880346298217773, "learning_rate": 1.8600682593856657e-06, "loss": 0.2272, "num_input_tokens_seen": 726784, "step": 1200 }, { "epoch": 0.373875271486193, "grad_norm": 8.268329620361328, "learning_rate": 1.8678250077567487e-06, "loss": 0.2347, "num_input_tokens_seen": 729312, "step": 1205 }, { "epoch": 0.37542662116040953, "grad_norm": 14.149624824523926, "learning_rate": 1.8755817561278314e-06, "loss": 0.2168, "num_input_tokens_seen": 732448, "step": 1210 }, { "epoch": 0.37697797083462614, "grad_norm": 23.32288360595703, "learning_rate": 1.8833385044989142e-06, "loss": 0.2543, "num_input_tokens_seen": 735744, "step": 1215 }, { "epoch": 0.3785293205088427, "grad_norm": 5.4793219566345215, "learning_rate": 1.8910952528699972e-06, "loss": 0.214, "num_input_tokens_seen": 738752, "step": 1220 }, { "epoch": 0.38008067018305924, "grad_norm": 21.19795036315918, "learning_rate": 1.89885200124108e-06, "loss": 0.2259, "num_input_tokens_seen": 741824, "step": 1225 }, { "epoch": 0.38163201985727585, "grad_norm": 6.3834028244018555, "learning_rate": 1.9066087496121627e-06, "loss": 0.2424, "num_input_tokens_seen": 745376, "step": 1230 }, { "epoch": 0.3831833695314924, "grad_norm": 5.95145845413208, "learning_rate": 1.9143654979832455e-06, "loss": 0.2268, "num_input_tokens_seen": 749440, "step": 1235 }, { "epoch": 0.38473471920570895, "grad_norm": 7.458003520965576, "learning_rate": 1.9221222463543285e-06, "loss": 0.238, "num_input_tokens_seen": 751936, "step": 1240 }, { "epoch": 0.38628606887992556, "grad_norm": 5.214407920837402, "learning_rate": 1.929878994725411e-06, "loss": 0.2239, "num_input_tokens_seen": 754496, "step": 1245 }, { "epoch": 0.3878374185541421, "grad_norm": 7.589509963989258, "learning_rate": 1.937635743096494e-06, "loss": 0.2312, "num_input_tokens_seen": 757632, "step": 1250 }, { "epoch": 0.38938876822835866, "grad_norm": 6.206959247589111, "learning_rate": 1.945392491467577e-06, "loss": 0.2133, "num_input_tokens_seen": 760352, "step": 1255 }, { "epoch": 0.39094011790257527, "grad_norm": 12.671391487121582, "learning_rate": 1.9531492398386596e-06, "loss": 0.2338, "num_input_tokens_seen": 763136, "step": 1260 }, { "epoch": 0.3924914675767918, "grad_norm": 13.642729759216309, "learning_rate": 1.9609059882097426e-06, "loss": 0.2832, "num_input_tokens_seen": 765472, "step": 1265 }, { "epoch": 0.39404281725100837, "grad_norm": 5.338411808013916, "learning_rate": 1.9686627365808256e-06, "loss": 0.2272, "num_input_tokens_seen": 767456, "step": 1270 }, { "epoch": 0.3955941669252249, "grad_norm": 5.1895856857299805, "learning_rate": 1.9764194849519085e-06, "loss": 0.2482, "num_input_tokens_seen": 770176, "step": 1275 }, { "epoch": 0.39714551659944153, "grad_norm": 8.718914985656738, "learning_rate": 1.984176233322991e-06, "loss": 0.2277, "num_input_tokens_seen": 772800, "step": 1280 }, { "epoch": 0.3986968662736581, "grad_norm": 9.778889656066895, "learning_rate": 1.991932981694074e-06, "loss": 0.2344, "num_input_tokens_seen": 775584, "step": 1285 }, { "epoch": 0.40024821594787463, "grad_norm": 7.117669105529785, "learning_rate": 1.999689730065157e-06, "loss": 0.2373, "num_input_tokens_seen": 778816, "step": 1290 }, { "epoch": 0.40179956562209124, "grad_norm": 5.664675712585449, "learning_rate": 2.0074464784362396e-06, "loss": 0.1973, "num_input_tokens_seen": 781376, "step": 1295 }, { "epoch": 0.4033509152963078, "grad_norm": 12.282574653625488, "learning_rate": 2.0152032268073226e-06, "loss": 0.2545, "num_input_tokens_seen": 785280, "step": 1300 }, { "epoch": 0.40490226497052434, "grad_norm": 7.163668632507324, "learning_rate": 2.0229599751784056e-06, "loss": 0.2488, "num_input_tokens_seen": 787712, "step": 1305 }, { "epoch": 0.40645361464474095, "grad_norm": 8.471031188964844, "learning_rate": 2.030716723549488e-06, "loss": 0.2549, "num_input_tokens_seen": 790464, "step": 1310 }, { "epoch": 0.4080049643189575, "grad_norm": 5.973827838897705, "learning_rate": 2.038473471920571e-06, "loss": 0.2497, "num_input_tokens_seen": 793312, "step": 1315 }, { "epoch": 0.40955631399317405, "grad_norm": 3.048070192337036, "learning_rate": 2.046230220291654e-06, "loss": 0.2414, "num_input_tokens_seen": 797024, "step": 1320 }, { "epoch": 0.41110766366739065, "grad_norm": 1.4210063219070435, "learning_rate": 2.0539869686627367e-06, "loss": 0.2368, "num_input_tokens_seen": 799456, "step": 1325 }, { "epoch": 0.4126590133416072, "grad_norm": 1.9492261409759521, "learning_rate": 2.0617437170338197e-06, "loss": 0.2364, "num_input_tokens_seen": 802272, "step": 1330 }, { "epoch": 0.41421036301582376, "grad_norm": 3.4835333824157715, "learning_rate": 2.0695004654049026e-06, "loss": 0.2367, "num_input_tokens_seen": 806656, "step": 1335 }, { "epoch": 0.4157617126900403, "grad_norm": 3.6267638206481934, "learning_rate": 2.077257213775985e-06, "loss": 0.2326, "num_input_tokens_seen": 809056, "step": 1340 }, { "epoch": 0.4173130623642569, "grad_norm": 1.4259048700332642, "learning_rate": 2.085013962147068e-06, "loss": 0.2446, "num_input_tokens_seen": 812192, "step": 1345 }, { "epoch": 0.41886441203847347, "grad_norm": 6.973081588745117, "learning_rate": 2.092770710518151e-06, "loss": 0.2355, "num_input_tokens_seen": 815104, "step": 1350 }, { "epoch": 0.42041576171269, "grad_norm": 3.5298213958740234, "learning_rate": 2.1005274588892337e-06, "loss": 0.2313, "num_input_tokens_seen": 817152, "step": 1355 }, { "epoch": 0.4219671113869066, "grad_norm": 5.003673076629639, "learning_rate": 2.1082842072603167e-06, "loss": 0.2506, "num_input_tokens_seen": 820096, "step": 1360 }, { "epoch": 0.4235184610611232, "grad_norm": 6.310431957244873, "learning_rate": 2.1160409556313997e-06, "loss": 0.2309, "num_input_tokens_seen": 824512, "step": 1365 }, { "epoch": 0.4250698107353397, "grad_norm": 9.483888626098633, "learning_rate": 2.1237977040024823e-06, "loss": 0.2073, "num_input_tokens_seen": 827712, "step": 1370 }, { "epoch": 0.42662116040955633, "grad_norm": 5.450127601623535, "learning_rate": 2.1315544523735652e-06, "loss": 0.2202, "num_input_tokens_seen": 830880, "step": 1375 }, { "epoch": 0.4281725100837729, "grad_norm": 14.250082015991211, "learning_rate": 2.1393112007446482e-06, "loss": 0.256, "num_input_tokens_seen": 832992, "step": 1380 }, { "epoch": 0.42972385975798943, "grad_norm": 10.682156562805176, "learning_rate": 2.147067949115731e-06, "loss": 0.2515, "num_input_tokens_seen": 835488, "step": 1385 }, { "epoch": 0.43127520943220604, "grad_norm": 14.897111892700195, "learning_rate": 2.1548246974868138e-06, "loss": 0.2384, "num_input_tokens_seen": 838432, "step": 1390 }, { "epoch": 0.4328265591064226, "grad_norm": 6.490475177764893, "learning_rate": 2.1625814458578963e-06, "loss": 0.2498, "num_input_tokens_seen": 840640, "step": 1395 }, { "epoch": 0.43437790878063914, "grad_norm": 10.086031913757324, "learning_rate": 2.1703381942289793e-06, "loss": 0.2233, "num_input_tokens_seen": 843968, "step": 1400 }, { "epoch": 0.43592925845485575, "grad_norm": 7.740044593811035, "learning_rate": 2.1780949426000623e-06, "loss": 0.2285, "num_input_tokens_seen": 846528, "step": 1405 }, { "epoch": 0.4374806081290723, "grad_norm": 7.130372047424316, "learning_rate": 2.185851690971145e-06, "loss": 0.2359, "num_input_tokens_seen": 849888, "step": 1410 }, { "epoch": 0.43903195780328885, "grad_norm": 6.2613091468811035, "learning_rate": 2.193608439342228e-06, "loss": 0.2437, "num_input_tokens_seen": 853760, "step": 1415 }, { "epoch": 0.4405833074775054, "grad_norm": 2.788583517074585, "learning_rate": 2.201365187713311e-06, "loss": 0.2225, "num_input_tokens_seen": 856064, "step": 1420 }, { "epoch": 0.442134657151722, "grad_norm": 2.486278533935547, "learning_rate": 2.2091219360843934e-06, "loss": 0.2396, "num_input_tokens_seen": 859104, "step": 1425 }, { "epoch": 0.44368600682593856, "grad_norm": 4.126947402954102, "learning_rate": 2.2168786844554764e-06, "loss": 0.2335, "num_input_tokens_seen": 861344, "step": 1430 }, { "epoch": 0.4452373565001551, "grad_norm": 7.570391654968262, "learning_rate": 2.2246354328265594e-06, "loss": 0.2324, "num_input_tokens_seen": 864032, "step": 1435 }, { "epoch": 0.4467887061743717, "grad_norm": 5.490358829498291, "learning_rate": 2.232392181197642e-06, "loss": 0.2307, "num_input_tokens_seen": 867744, "step": 1440 }, { "epoch": 0.44834005584858827, "grad_norm": 8.583271026611328, "learning_rate": 2.240148929568725e-06, "loss": 0.2382, "num_input_tokens_seen": 870240, "step": 1445 }, { "epoch": 0.4498914055228048, "grad_norm": 6.705873966217041, "learning_rate": 2.247905677939808e-06, "loss": 0.2316, "num_input_tokens_seen": 874016, "step": 1450 }, { "epoch": 0.45144275519702143, "grad_norm": 4.237557411193848, "learning_rate": 2.2556624263108904e-06, "loss": 0.2325, "num_input_tokens_seen": 877536, "step": 1455 }, { "epoch": 0.452994104871238, "grad_norm": 5.858123779296875, "learning_rate": 2.2634191746819734e-06, "loss": 0.2448, "num_input_tokens_seen": 880064, "step": 1460 }, { "epoch": 0.45454545454545453, "grad_norm": 2.1889233589172363, "learning_rate": 2.2711759230530564e-06, "loss": 0.2467, "num_input_tokens_seen": 882816, "step": 1465 }, { "epoch": 0.45609680421967114, "grad_norm": 2.0669639110565186, "learning_rate": 2.278932671424139e-06, "loss": 0.214, "num_input_tokens_seen": 885472, "step": 1470 }, { "epoch": 0.4576481538938877, "grad_norm": 3.9713544845581055, "learning_rate": 2.286689419795222e-06, "loss": 0.2447, "num_input_tokens_seen": 888736, "step": 1475 }, { "epoch": 0.45919950356810424, "grad_norm": 2.996089458465576, "learning_rate": 2.294446168166305e-06, "loss": 0.2317, "num_input_tokens_seen": 891904, "step": 1480 }, { "epoch": 0.46075085324232085, "grad_norm": 1.7253000736236572, "learning_rate": 2.3022029165373875e-06, "loss": 0.229, "num_input_tokens_seen": 894112, "step": 1485 }, { "epoch": 0.4623022029165374, "grad_norm": 3.95493483543396, "learning_rate": 2.3099596649084705e-06, "loss": 0.2345, "num_input_tokens_seen": 896320, "step": 1490 }, { "epoch": 0.46385355259075395, "grad_norm": 4.685126304626465, "learning_rate": 2.3177164132795535e-06, "loss": 0.2327, "num_input_tokens_seen": 899936, "step": 1495 }, { "epoch": 0.4654049022649705, "grad_norm": 1.8305084705352783, "learning_rate": 2.3254731616506365e-06, "loss": 0.2312, "num_input_tokens_seen": 902592, "step": 1500 }, { "epoch": 0.4669562519391871, "grad_norm": 2.1959455013275146, "learning_rate": 2.333229910021719e-06, "loss": 0.254, "num_input_tokens_seen": 905312, "step": 1505 }, { "epoch": 0.46850760161340366, "grad_norm": 3.7127625942230225, "learning_rate": 2.340986658392802e-06, "loss": 0.2333, "num_input_tokens_seen": 907648, "step": 1510 }, { "epoch": 0.4700589512876202, "grad_norm": 1.3566960096359253, "learning_rate": 2.348743406763885e-06, "loss": 0.2324, "num_input_tokens_seen": 911264, "step": 1515 }, { "epoch": 0.4716103009618368, "grad_norm": 3.809645175933838, "learning_rate": 2.3565001551349675e-06, "loss": 0.2299, "num_input_tokens_seen": 914144, "step": 1520 }, { "epoch": 0.47316165063605337, "grad_norm": 1.3711073398590088, "learning_rate": 2.3642569035060505e-06, "loss": 0.2365, "num_input_tokens_seen": 916832, "step": 1525 }, { "epoch": 0.4747130003102699, "grad_norm": 3.7817866802215576, "learning_rate": 2.3720136518771335e-06, "loss": 0.2272, "num_input_tokens_seen": 919424, "step": 1530 }, { "epoch": 0.4762643499844865, "grad_norm": 2.6864078044891357, "learning_rate": 2.379770400248216e-06, "loss": 0.2381, "num_input_tokens_seen": 922272, "step": 1535 }, { "epoch": 0.4778156996587031, "grad_norm": 4.747899055480957, "learning_rate": 2.387527148619299e-06, "loss": 0.231, "num_input_tokens_seen": 925760, "step": 1540 }, { "epoch": 0.4793670493329196, "grad_norm": 2.248534679412842, "learning_rate": 2.395283896990382e-06, "loss": 0.2391, "num_input_tokens_seen": 929024, "step": 1545 }, { "epoch": 0.48091839900713623, "grad_norm": 2.2839481830596924, "learning_rate": 2.4030406453614646e-06, "loss": 0.2252, "num_input_tokens_seen": 932000, "step": 1550 }, { "epoch": 0.4824697486813528, "grad_norm": 3.0511105060577393, "learning_rate": 2.4107973937325476e-06, "loss": 0.2324, "num_input_tokens_seen": 934784, "step": 1555 }, { "epoch": 0.48402109835556933, "grad_norm": 5.125743865966797, "learning_rate": 2.4185541421036306e-06, "loss": 0.2332, "num_input_tokens_seen": 937344, "step": 1560 }, { "epoch": 0.48557244802978594, "grad_norm": 5.940786838531494, "learning_rate": 2.426310890474713e-06, "loss": 0.2352, "num_input_tokens_seen": 942240, "step": 1565 }, { "epoch": 0.4871237977040025, "grad_norm": 4.468514442443848, "learning_rate": 2.434067638845796e-06, "loss": 0.2321, "num_input_tokens_seen": 944448, "step": 1570 }, { "epoch": 0.48867514737821904, "grad_norm": 6.121624946594238, "learning_rate": 2.441824387216879e-06, "loss": 0.2455, "num_input_tokens_seen": 948288, "step": 1575 }, { "epoch": 0.4902264970524356, "grad_norm": 3.8893847465515137, "learning_rate": 2.4495811355879617e-06, "loss": 0.2395, "num_input_tokens_seen": 950400, "step": 1580 }, { "epoch": 0.4917778467266522, "grad_norm": 1.8171436786651611, "learning_rate": 2.4573378839590446e-06, "loss": 0.2405, "num_input_tokens_seen": 952960, "step": 1585 }, { "epoch": 0.49332919640086875, "grad_norm": 1.6644997596740723, "learning_rate": 2.4650946323301276e-06, "loss": 0.2257, "num_input_tokens_seen": 956672, "step": 1590 }, { "epoch": 0.4948805460750853, "grad_norm": 4.369275093078613, "learning_rate": 2.47285138070121e-06, "loss": 0.2247, "num_input_tokens_seen": 960672, "step": 1595 }, { "epoch": 0.4964318957493019, "grad_norm": 3.028066873550415, "learning_rate": 2.480608129072293e-06, "loss": 0.2394, "num_input_tokens_seen": 963360, "step": 1600 }, { "epoch": 0.49798324542351846, "grad_norm": 3.692035436630249, "learning_rate": 2.488364877443376e-06, "loss": 0.2421, "num_input_tokens_seen": 966336, "step": 1605 }, { "epoch": 0.499534595097735, "grad_norm": 4.531917572021484, "learning_rate": 2.4961216258144587e-06, "loss": 0.251, "num_input_tokens_seen": 969408, "step": 1610 }, { "epoch": 0.5010859447719516, "grad_norm": 1.9402778148651123, "learning_rate": 2.5038783741855417e-06, "loss": 0.2314, "num_input_tokens_seen": 973152, "step": 1615 }, { "epoch": 0.5026372944461681, "grad_norm": 1.2798184156417847, "learning_rate": 2.5116351225566243e-06, "loss": 0.2396, "num_input_tokens_seen": 976096, "step": 1620 }, { "epoch": 0.5041886441203848, "grad_norm": 1.8992865085601807, "learning_rate": 2.5193918709277072e-06, "loss": 0.2343, "num_input_tokens_seen": 978912, "step": 1625 }, { "epoch": 0.5057399937946013, "grad_norm": 2.898108720779419, "learning_rate": 2.5271486192987902e-06, "loss": 0.2385, "num_input_tokens_seen": 982336, "step": 1630 }, { "epoch": 0.5072913434688179, "grad_norm": 2.496814489364624, "learning_rate": 2.534905367669873e-06, "loss": 0.233, "num_input_tokens_seen": 985440, "step": 1635 }, { "epoch": 0.5088426931430344, "grad_norm": 3.407593011856079, "learning_rate": 2.5426621160409558e-06, "loss": 0.2387, "num_input_tokens_seen": 988192, "step": 1640 }, { "epoch": 0.510394042817251, "grad_norm": 3.2547576427459717, "learning_rate": 2.5504188644120388e-06, "loss": 0.2315, "num_input_tokens_seen": 991232, "step": 1645 }, { "epoch": 0.5119453924914675, "grad_norm": 7.154918670654297, "learning_rate": 2.5581756127831213e-06, "loss": 0.2302, "num_input_tokens_seen": 993248, "step": 1650 }, { "epoch": 0.5134967421656842, "grad_norm": 3.8636813163757324, "learning_rate": 2.5659323611542043e-06, "loss": 0.23, "num_input_tokens_seen": 995808, "step": 1655 }, { "epoch": 0.5150480918399007, "grad_norm": 3.4568958282470703, "learning_rate": 2.573689109525287e-06, "loss": 0.2437, "num_input_tokens_seen": 999584, "step": 1660 }, { "epoch": 0.5165994415141173, "grad_norm": 1.2965117692947388, "learning_rate": 2.5814458578963703e-06, "loss": 0.2372, "num_input_tokens_seen": 1002336, "step": 1665 }, { "epoch": 0.5181507911883338, "grad_norm": 2.3122968673706055, "learning_rate": 2.589202606267453e-06, "loss": 0.2505, "num_input_tokens_seen": 1005152, "step": 1670 }, { "epoch": 0.5197021408625504, "grad_norm": 2.532331705093384, "learning_rate": 2.596959354638536e-06, "loss": 0.2317, "num_input_tokens_seen": 1009024, "step": 1675 }, { "epoch": 0.521253490536767, "grad_norm": 3.201425313949585, "learning_rate": 2.6047161030096184e-06, "loss": 0.2281, "num_input_tokens_seen": 1011328, "step": 1680 }, { "epoch": 0.5228048402109835, "grad_norm": 3.130385637283325, "learning_rate": 2.6124728513807014e-06, "loss": 0.2476, "num_input_tokens_seen": 1016160, "step": 1685 }, { "epoch": 0.5243561898852002, "grad_norm": 3.7169857025146484, "learning_rate": 2.620229599751784e-06, "loss": 0.2408, "num_input_tokens_seen": 1019168, "step": 1690 }, { "epoch": 0.5259075395594167, "grad_norm": 3.172335386276245, "learning_rate": 2.6279863481228673e-06, "loss": 0.2411, "num_input_tokens_seen": 1022720, "step": 1695 }, { "epoch": 0.5274588892336333, "grad_norm": 3.651167392730713, "learning_rate": 2.6357430964939503e-06, "loss": 0.2279, "num_input_tokens_seen": 1026144, "step": 1700 }, { "epoch": 0.5290102389078498, "grad_norm": 3.6725635528564453, "learning_rate": 2.643499844865033e-06, "loss": 0.2356, "num_input_tokens_seen": 1030496, "step": 1705 }, { "epoch": 0.5305615885820664, "grad_norm": 2.2014057636260986, "learning_rate": 2.6512565932361154e-06, "loss": 0.2263, "num_input_tokens_seen": 1033312, "step": 1710 }, { "epoch": 0.5321129382562829, "grad_norm": 2.5508811473846436, "learning_rate": 2.6590133416071984e-06, "loss": 0.2485, "num_input_tokens_seen": 1036800, "step": 1715 }, { "epoch": 0.5336642879304996, "grad_norm": 2.457003355026245, "learning_rate": 2.666770089978281e-06, "loss": 0.2256, "num_input_tokens_seen": 1039424, "step": 1720 }, { "epoch": 0.5352156376047161, "grad_norm": 4.534296989440918, "learning_rate": 2.6745268383493644e-06, "loss": 0.2266, "num_input_tokens_seen": 1042304, "step": 1725 }, { "epoch": 0.5367669872789327, "grad_norm": 3.369375705718994, "learning_rate": 2.6822835867204474e-06, "loss": 0.2377, "num_input_tokens_seen": 1045664, "step": 1730 }, { "epoch": 0.5383183369531492, "grad_norm": 2.4585113525390625, "learning_rate": 2.69004033509153e-06, "loss": 0.2251, "num_input_tokens_seen": 1048352, "step": 1735 }, { "epoch": 0.5398696866273658, "grad_norm": 1.3899656534194946, "learning_rate": 2.697797083462613e-06, "loss": 0.2342, "num_input_tokens_seen": 1052448, "step": 1740 }, { "epoch": 0.5414210363015823, "grad_norm": 2.820141315460205, "learning_rate": 2.7055538318336955e-06, "loss": 0.2379, "num_input_tokens_seen": 1055328, "step": 1745 }, { "epoch": 0.542972385975799, "grad_norm": 2.222752809524536, "learning_rate": 2.713310580204778e-06, "loss": 0.2278, "num_input_tokens_seen": 1057696, "step": 1750 }, { "epoch": 0.5445237356500156, "grad_norm": 1.0644205808639526, "learning_rate": 2.721067328575861e-06, "loss": 0.2332, "num_input_tokens_seen": 1061600, "step": 1755 }, { "epoch": 0.5460750853242321, "grad_norm": 1.0353666543960571, "learning_rate": 2.7288240769469444e-06, "loss": 0.2298, "num_input_tokens_seen": 1064576, "step": 1760 }, { "epoch": 0.5476264349984487, "grad_norm": 2.216329574584961, "learning_rate": 2.736580825318027e-06, "loss": 0.2363, "num_input_tokens_seen": 1068096, "step": 1765 }, { "epoch": 0.5491777846726652, "grad_norm": 1.358993649482727, "learning_rate": 2.74433757368911e-06, "loss": 0.2349, "num_input_tokens_seen": 1071264, "step": 1770 }, { "epoch": 0.5507291343468818, "grad_norm": 4.031161785125732, "learning_rate": 2.7520943220601925e-06, "loss": 0.2283, "num_input_tokens_seen": 1073888, "step": 1775 }, { "epoch": 0.5522804840210983, "grad_norm": 3.0650155544281006, "learning_rate": 2.7598510704312755e-06, "loss": 0.2406, "num_input_tokens_seen": 1077536, "step": 1780 }, { "epoch": 0.553831833695315, "grad_norm": 2.165518283843994, "learning_rate": 2.767607818802358e-06, "loss": 0.2225, "num_input_tokens_seen": 1080512, "step": 1785 }, { "epoch": 0.5553831833695315, "grad_norm": 3.69694447517395, "learning_rate": 2.7753645671734415e-06, "loss": 0.2498, "num_input_tokens_seen": 1082944, "step": 1790 }, { "epoch": 0.5569345330437481, "grad_norm": 5.3074870109558105, "learning_rate": 2.783121315544524e-06, "loss": 0.2285, "num_input_tokens_seen": 1086432, "step": 1795 }, { "epoch": 0.5584858827179646, "grad_norm": 4.583057880401611, "learning_rate": 2.790878063915607e-06, "loss": 0.2462, "num_input_tokens_seen": 1090176, "step": 1800 }, { "epoch": 0.5600372323921812, "grad_norm": 3.4440078735351562, "learning_rate": 2.7986348122866896e-06, "loss": 0.2333, "num_input_tokens_seen": 1092704, "step": 1805 }, { "epoch": 0.5615885820663977, "grad_norm": 6.187201499938965, "learning_rate": 2.8063915606577726e-06, "loss": 0.2295, "num_input_tokens_seen": 1095264, "step": 1810 }, { "epoch": 0.5631399317406144, "grad_norm": 4.240512371063232, "learning_rate": 2.814148309028855e-06, "loss": 0.2343, "num_input_tokens_seen": 1098336, "step": 1815 }, { "epoch": 0.5646912814148309, "grad_norm": 5.399506568908691, "learning_rate": 2.821905057399938e-06, "loss": 0.2405, "num_input_tokens_seen": 1101504, "step": 1820 }, { "epoch": 0.5662426310890475, "grad_norm": 5.346932888031006, "learning_rate": 2.829661805771021e-06, "loss": 0.2528, "num_input_tokens_seen": 1104320, "step": 1825 }, { "epoch": 0.567793980763264, "grad_norm": 1.8028559684753418, "learning_rate": 2.837418554142104e-06, "loss": 0.2313, "num_input_tokens_seen": 1106208, "step": 1830 }, { "epoch": 0.5693453304374806, "grad_norm": 1.6499934196472168, "learning_rate": 2.8451753025131866e-06, "loss": 0.2305, "num_input_tokens_seen": 1108992, "step": 1835 }, { "epoch": 0.5708966801116971, "grad_norm": 7.1818623542785645, "learning_rate": 2.8529320508842696e-06, "loss": 0.2475, "num_input_tokens_seen": 1112416, "step": 1840 }, { "epoch": 0.5724480297859137, "grad_norm": 2.6084342002868652, "learning_rate": 2.860688799255352e-06, "loss": 0.2329, "num_input_tokens_seen": 1116480, "step": 1845 }, { "epoch": 0.5739993794601304, "grad_norm": 3.6258704662323, "learning_rate": 2.868445547626435e-06, "loss": 0.2339, "num_input_tokens_seen": 1119392, "step": 1850 }, { "epoch": 0.5755507291343469, "grad_norm": 1.1508411169052124, "learning_rate": 2.876202295997518e-06, "loss": 0.2351, "num_input_tokens_seen": 1122880, "step": 1855 }, { "epoch": 0.5771020788085635, "grad_norm": 2.012174129486084, "learning_rate": 2.883959044368601e-06, "loss": 0.231, "num_input_tokens_seen": 1125760, "step": 1860 }, { "epoch": 0.57865342848278, "grad_norm": 3.3914191722869873, "learning_rate": 2.8917157927396837e-06, "loss": 0.2288, "num_input_tokens_seen": 1128736, "step": 1865 }, { "epoch": 0.5802047781569966, "grad_norm": 2.915160655975342, "learning_rate": 2.8994725411107667e-06, "loss": 0.2377, "num_input_tokens_seen": 1132224, "step": 1870 }, { "epoch": 0.5817561278312131, "grad_norm": 2.544297695159912, "learning_rate": 2.9072292894818492e-06, "loss": 0.2372, "num_input_tokens_seen": 1134816, "step": 1875 }, { "epoch": 0.5833074775054298, "grad_norm": 2.198301315307617, "learning_rate": 2.9149860378529322e-06, "loss": 0.2327, "num_input_tokens_seen": 1137760, "step": 1880 }, { "epoch": 0.5848588271796463, "grad_norm": 2.0347509384155273, "learning_rate": 2.9227427862240148e-06, "loss": 0.234, "num_input_tokens_seen": 1140800, "step": 1885 }, { "epoch": 0.5864101768538629, "grad_norm": 0.7615256905555725, "learning_rate": 2.930499534595098e-06, "loss": 0.2311, "num_input_tokens_seen": 1143680, "step": 1890 }, { "epoch": 0.5879615265280794, "grad_norm": 3.695526123046875, "learning_rate": 2.9382562829661807e-06, "loss": 0.2325, "num_input_tokens_seen": 1146240, "step": 1895 }, { "epoch": 0.589512876202296, "grad_norm": 2.567736864089966, "learning_rate": 2.9460130313372637e-06, "loss": 0.2268, "num_input_tokens_seen": 1149504, "step": 1900 }, { "epoch": 0.5910642258765125, "grad_norm": 2.246968984603882, "learning_rate": 2.9537697797083463e-06, "loss": 0.2286, "num_input_tokens_seen": 1151808, "step": 1905 }, { "epoch": 0.5926155755507291, "grad_norm": 0.8982674479484558, "learning_rate": 2.9615265280794293e-06, "loss": 0.2319, "num_input_tokens_seen": 1154464, "step": 1910 }, { "epoch": 0.5941669252249457, "grad_norm": 1.959923267364502, "learning_rate": 2.969283276450512e-06, "loss": 0.2347, "num_input_tokens_seen": 1157696, "step": 1915 }, { "epoch": 0.5957182748991623, "grad_norm": 2.1219558715820312, "learning_rate": 2.9770400248215952e-06, "loss": 0.2351, "num_input_tokens_seen": 1161056, "step": 1920 }, { "epoch": 0.5972696245733788, "grad_norm": 0.8644160628318787, "learning_rate": 2.9847967731926782e-06, "loss": 0.2319, "num_input_tokens_seen": 1163744, "step": 1925 }, { "epoch": 0.5988209742475954, "grad_norm": 0.7357949018478394, "learning_rate": 2.9925535215637608e-06, "loss": 0.2311, "num_input_tokens_seen": 1167136, "step": 1930 }, { "epoch": 0.600372323921812, "grad_norm": 3.1240928173065186, "learning_rate": 3.0003102699348433e-06, "loss": 0.2341, "num_input_tokens_seen": 1169600, "step": 1935 }, { "epoch": 0.6019236735960285, "grad_norm": 3.242189884185791, "learning_rate": 3.0080670183059263e-06, "loss": 0.2496, "num_input_tokens_seen": 1172384, "step": 1940 }, { "epoch": 0.6034750232702452, "grad_norm": 3.5512075424194336, "learning_rate": 3.015823766677009e-06, "loss": 0.2322, "num_input_tokens_seen": 1175168, "step": 1945 }, { "epoch": 0.6050263729444617, "grad_norm": 2.40012788772583, "learning_rate": 3.0235805150480923e-06, "loss": 0.232, "num_input_tokens_seen": 1177664, "step": 1950 }, { "epoch": 0.6065777226186783, "grad_norm": 3.8249266147613525, "learning_rate": 3.0313372634191753e-06, "loss": 0.226, "num_input_tokens_seen": 1180736, "step": 1955 }, { "epoch": 0.6081290722928948, "grad_norm": 1.347088098526001, "learning_rate": 3.039094011790258e-06, "loss": 0.223, "num_input_tokens_seen": 1183616, "step": 1960 }, { "epoch": 0.6096804219671114, "grad_norm": 1.5315206050872803, "learning_rate": 3.046850760161341e-06, "loss": 0.2396, "num_input_tokens_seen": 1185856, "step": 1965 }, { "epoch": 0.6112317716413279, "grad_norm": 2.856199264526367, "learning_rate": 3.0546075085324234e-06, "loss": 0.2204, "num_input_tokens_seen": 1188416, "step": 1970 }, { "epoch": 0.6127831213155446, "grad_norm": 2.7151172161102295, "learning_rate": 3.062364256903506e-06, "loss": 0.2295, "num_input_tokens_seen": 1191744, "step": 1975 }, { "epoch": 0.6143344709897611, "grad_norm": 5.963690757751465, "learning_rate": 3.070121005274589e-06, "loss": 0.2416, "num_input_tokens_seen": 1195072, "step": 1980 }, { "epoch": 0.6158858206639777, "grad_norm": 2.9934685230255127, "learning_rate": 3.0778777536456723e-06, "loss": 0.2278, "num_input_tokens_seen": 1197888, "step": 1985 }, { "epoch": 0.6174371703381942, "grad_norm": 2.317763566970825, "learning_rate": 3.085634502016755e-06, "loss": 0.2437, "num_input_tokens_seen": 1200640, "step": 1990 }, { "epoch": 0.6189885200124108, "grad_norm": 4.141030788421631, "learning_rate": 3.093391250387838e-06, "loss": 0.2328, "num_input_tokens_seen": 1203680, "step": 1995 }, { "epoch": 0.6205398696866273, "grad_norm": 3.2368087768554688, "learning_rate": 3.1011479987589204e-06, "loss": 0.2339, "num_input_tokens_seen": 1206080, "step": 2000 }, { "epoch": 0.6220912193608439, "grad_norm": 2.0397183895111084, "learning_rate": 3.1089047471300034e-06, "loss": 0.2301, "num_input_tokens_seen": 1208480, "step": 2005 }, { "epoch": 0.6236425690350605, "grad_norm": 5.417470932006836, "learning_rate": 3.116661495501086e-06, "loss": 0.238, "num_input_tokens_seen": 1211616, "step": 2010 }, { "epoch": 0.6251939187092771, "grad_norm": 1.865788459777832, "learning_rate": 3.1244182438721694e-06, "loss": 0.2379, "num_input_tokens_seen": 1214528, "step": 2015 }, { "epoch": 0.6267452683834936, "grad_norm": 3.3501365184783936, "learning_rate": 3.132174992243252e-06, "loss": 0.2335, "num_input_tokens_seen": 1217408, "step": 2020 }, { "epoch": 0.6282966180577102, "grad_norm": 2.799233913421631, "learning_rate": 3.139931740614335e-06, "loss": 0.2393, "num_input_tokens_seen": 1220480, "step": 2025 }, { "epoch": 0.6298479677319268, "grad_norm": 1.3711128234863281, "learning_rate": 3.1476884889854175e-06, "loss": 0.2392, "num_input_tokens_seen": 1224544, "step": 2030 }, { "epoch": 0.6313993174061433, "grad_norm": 2.9197793006896973, "learning_rate": 3.1554452373565005e-06, "loss": 0.2356, "num_input_tokens_seen": 1228416, "step": 2035 }, { "epoch": 0.63295066708036, "grad_norm": 3.0964438915252686, "learning_rate": 3.163201985727583e-06, "loss": 0.2347, "num_input_tokens_seen": 1231584, "step": 2040 }, { "epoch": 0.6345020167545765, "grad_norm": 2.5717833042144775, "learning_rate": 3.170958734098666e-06, "loss": 0.2441, "num_input_tokens_seen": 1234432, "step": 2045 }, { "epoch": 0.6360533664287931, "grad_norm": 0.765681266784668, "learning_rate": 3.178715482469749e-06, "loss": 0.2265, "num_input_tokens_seen": 1238112, "step": 2050 }, { "epoch": 0.6376047161030096, "grad_norm": 3.705872058868408, "learning_rate": 3.186472230840832e-06, "loss": 0.2004, "num_input_tokens_seen": 1241312, "step": 2055 }, { "epoch": 0.6391560657772262, "grad_norm": 2.3581342697143555, "learning_rate": 3.1942289792119146e-06, "loss": 0.2312, "num_input_tokens_seen": 1244256, "step": 2060 }, { "epoch": 0.6407074154514427, "grad_norm": 5.96222448348999, "learning_rate": 3.2019857275829975e-06, "loss": 0.2837, "num_input_tokens_seen": 1248256, "step": 2065 }, { "epoch": 0.6422587651256593, "grad_norm": 2.4061503410339355, "learning_rate": 3.20974247595408e-06, "loss": 0.2236, "num_input_tokens_seen": 1251456, "step": 2070 }, { "epoch": 0.6438101147998759, "grad_norm": 8.556083679199219, "learning_rate": 3.217499224325163e-06, "loss": 0.261, "num_input_tokens_seen": 1254112, "step": 2075 }, { "epoch": 0.6453614644740925, "grad_norm": 70.50948333740234, "learning_rate": 3.225255972696246e-06, "loss": 0.2581, "num_input_tokens_seen": 1257600, "step": 2080 }, { "epoch": 0.646912814148309, "grad_norm": 3.174952268600464, "learning_rate": 3.233012721067329e-06, "loss": 0.2391, "num_input_tokens_seen": 1260832, "step": 2085 }, { "epoch": 0.6484641638225256, "grad_norm": 2.676058053970337, "learning_rate": 3.2407694694384116e-06, "loss": 0.214, "num_input_tokens_seen": 1263648, "step": 2090 }, { "epoch": 0.6500155134967421, "grad_norm": 6.1069793701171875, "learning_rate": 3.2485262178094946e-06, "loss": 0.21, "num_input_tokens_seen": 1266976, "step": 2095 }, { "epoch": 0.6515668631709587, "grad_norm": 3.556453227996826, "learning_rate": 3.256282966180577e-06, "loss": 0.2339, "num_input_tokens_seen": 1269376, "step": 2100 }, { "epoch": 0.6531182128451753, "grad_norm": 3.921257257461548, "learning_rate": 3.26403971455166e-06, "loss": 0.2122, "num_input_tokens_seen": 1272352, "step": 2105 }, { "epoch": 0.6546695625193919, "grad_norm": 1.2407851219177246, "learning_rate": 3.2717964629227427e-06, "loss": 0.2519, "num_input_tokens_seen": 1275296, "step": 2110 }, { "epoch": 0.6562209121936085, "grad_norm": 3.6224849224090576, "learning_rate": 3.279553211293826e-06, "loss": 0.2381, "num_input_tokens_seen": 1278400, "step": 2115 }, { "epoch": 0.657772261867825, "grad_norm": 3.1856348514556885, "learning_rate": 3.2873099596649087e-06, "loss": 0.2312, "num_input_tokens_seen": 1281344, "step": 2120 }, { "epoch": 0.6593236115420416, "grad_norm": 2.265424966812134, "learning_rate": 3.2950667080359916e-06, "loss": 0.2309, "num_input_tokens_seen": 1283968, "step": 2125 }, { "epoch": 0.6608749612162581, "grad_norm": 3.3130927085876465, "learning_rate": 3.302823456407074e-06, "loss": 0.2258, "num_input_tokens_seen": 1287904, "step": 2130 }, { "epoch": 0.6624263108904748, "grad_norm": 4.975899696350098, "learning_rate": 3.310580204778157e-06, "loss": 0.2457, "num_input_tokens_seen": 1291104, "step": 2135 }, { "epoch": 0.6639776605646913, "grad_norm": 3.304588556289673, "learning_rate": 3.3183369531492398e-06, "loss": 0.2203, "num_input_tokens_seen": 1293920, "step": 2140 }, { "epoch": 0.6655290102389079, "grad_norm": 2.244102716445923, "learning_rate": 3.326093701520323e-06, "loss": 0.2152, "num_input_tokens_seen": 1298944, "step": 2145 }, { "epoch": 0.6670803599131244, "grad_norm": 6.391890525817871, "learning_rate": 3.333850449891406e-06, "loss": 0.2283, "num_input_tokens_seen": 1303520, "step": 2150 }, { "epoch": 0.668631709587341, "grad_norm": 4.264994144439697, "learning_rate": 3.3416071982624887e-06, "loss": 0.2005, "num_input_tokens_seen": 1307552, "step": 2155 }, { "epoch": 0.6701830592615575, "grad_norm": 6.629047393798828, "learning_rate": 3.3493639466335713e-06, "loss": 0.2569, "num_input_tokens_seen": 1310048, "step": 2160 }, { "epoch": 0.6717344089357741, "grad_norm": 4.686851501464844, "learning_rate": 3.3571206950046542e-06, "loss": 0.266, "num_input_tokens_seen": 1312896, "step": 2165 }, { "epoch": 0.6732857586099907, "grad_norm": 2.0745911598205566, "learning_rate": 3.364877443375737e-06, "loss": 0.216, "num_input_tokens_seen": 1316608, "step": 2170 }, { "epoch": 0.6748371082842073, "grad_norm": 2.6934573650360107, "learning_rate": 3.3726341917468202e-06, "loss": 0.2428, "num_input_tokens_seen": 1318752, "step": 2175 }, { "epoch": 0.6763884579584238, "grad_norm": 2.585860252380371, "learning_rate": 3.380390940117903e-06, "loss": 0.2448, "num_input_tokens_seen": 1321568, "step": 2180 }, { "epoch": 0.6779398076326404, "grad_norm": 1.4923806190490723, "learning_rate": 3.3881476884889858e-06, "loss": 0.2245, "num_input_tokens_seen": 1325248, "step": 2185 }, { "epoch": 0.6794911573068569, "grad_norm": 1.686444878578186, "learning_rate": 3.3959044368600687e-06, "loss": 0.24, "num_input_tokens_seen": 1327904, "step": 2190 }, { "epoch": 0.6810425069810735, "grad_norm": 4.857655048370361, "learning_rate": 3.4036611852311513e-06, "loss": 0.2219, "num_input_tokens_seen": 1330464, "step": 2195 }, { "epoch": 0.6825938566552902, "grad_norm": 0.9989470839500427, "learning_rate": 3.411417933602234e-06, "loss": 0.2585, "num_input_tokens_seen": 1333216, "step": 2200 }, { "epoch": 0.6841452063295067, "grad_norm": 2.9610815048217773, "learning_rate": 3.419174681973317e-06, "loss": 0.2373, "num_input_tokens_seen": 1335776, "step": 2205 }, { "epoch": 0.6856965560037233, "grad_norm": 1.1612213850021362, "learning_rate": 3.4269314303444003e-06, "loss": 0.2274, "num_input_tokens_seen": 1338496, "step": 2210 }, { "epoch": 0.6872479056779398, "grad_norm": 1.5798197984695435, "learning_rate": 3.434688178715483e-06, "loss": 0.2391, "num_input_tokens_seen": 1341280, "step": 2215 }, { "epoch": 0.6887992553521564, "grad_norm": 0.685645580291748, "learning_rate": 3.442444927086566e-06, "loss": 0.231, "num_input_tokens_seen": 1343680, "step": 2220 }, { "epoch": 0.6903506050263729, "grad_norm": 0.6415290236473083, "learning_rate": 3.4502016754576484e-06, "loss": 0.2296, "num_input_tokens_seen": 1346656, "step": 2225 }, { "epoch": 0.6919019547005895, "grad_norm": 2.8590645790100098, "learning_rate": 3.4579584238287313e-06, "loss": 0.2338, "num_input_tokens_seen": 1348800, "step": 2230 }, { "epoch": 0.6934533043748061, "grad_norm": 2.6378540992736816, "learning_rate": 3.465715172199814e-06, "loss": 0.2438, "num_input_tokens_seen": 1351200, "step": 2235 }, { "epoch": 0.6950046540490227, "grad_norm": 1.334587812423706, "learning_rate": 3.4734719205708973e-06, "loss": 0.2336, "num_input_tokens_seen": 1353472, "step": 2240 }, { "epoch": 0.6965560037232392, "grad_norm": 0.649857223033905, "learning_rate": 3.48122866894198e-06, "loss": 0.2247, "num_input_tokens_seen": 1355808, "step": 2245 }, { "epoch": 0.6981073533974558, "grad_norm": 0.4763175845146179, "learning_rate": 3.488985417313063e-06, "loss": 0.2415, "num_input_tokens_seen": 1358176, "step": 2250 }, { "epoch": 0.6996587030716723, "grad_norm": 1.5466073751449585, "learning_rate": 3.4967421656841454e-06, "loss": 0.2296, "num_input_tokens_seen": 1360736, "step": 2255 }, { "epoch": 0.7012100527458889, "grad_norm": 6.858870506286621, "learning_rate": 3.5044989140552284e-06, "loss": 0.2321, "num_input_tokens_seen": 1363136, "step": 2260 }, { "epoch": 0.7027614024201055, "grad_norm": 4.04569673538208, "learning_rate": 3.512255662426311e-06, "loss": 0.2303, "num_input_tokens_seen": 1365760, "step": 2265 }, { "epoch": 0.7043127520943221, "grad_norm": 2.432297468185425, "learning_rate": 3.520012410797394e-06, "loss": 0.2432, "num_input_tokens_seen": 1369152, "step": 2270 }, { "epoch": 0.7058641017685386, "grad_norm": 1.1394624710083008, "learning_rate": 3.527769159168477e-06, "loss": 0.3713, "num_input_tokens_seen": 1374560, "step": 2275 }, { "epoch": 0.7074154514427552, "grad_norm": 1.0410029888153076, "learning_rate": 3.53552590753956e-06, "loss": 0.2393, "num_input_tokens_seen": 1377824, "step": 2280 }, { "epoch": 0.7089668011169717, "grad_norm": 2.2353038787841797, "learning_rate": 3.5432826559106425e-06, "loss": 0.2371, "num_input_tokens_seen": 1383072, "step": 2285 }, { "epoch": 0.7105181507911883, "grad_norm": 1.7878090143203735, "learning_rate": 3.5510394042817255e-06, "loss": 0.232, "num_input_tokens_seen": 1385504, "step": 2290 }, { "epoch": 0.7120695004654048, "grad_norm": 1.3161629438400269, "learning_rate": 3.558796152652808e-06, "loss": 0.2285, "num_input_tokens_seen": 1388352, "step": 2295 }, { "epoch": 0.7136208501396215, "grad_norm": 1.4719855785369873, "learning_rate": 3.566552901023891e-06, "loss": 0.241, "num_input_tokens_seen": 1390912, "step": 2300 }, { "epoch": 0.7151721998138381, "grad_norm": 1.246730089187622, "learning_rate": 3.574309649394974e-06, "loss": 0.2286, "num_input_tokens_seen": 1394272, "step": 2305 }, { "epoch": 0.7167235494880546, "grad_norm": 0.6808872818946838, "learning_rate": 3.582066397766057e-06, "loss": 0.2315, "num_input_tokens_seen": 1397056, "step": 2310 }, { "epoch": 0.7182748991622712, "grad_norm": 1.2512898445129395, "learning_rate": 3.5898231461371395e-06, "loss": 0.2263, "num_input_tokens_seen": 1400000, "step": 2315 }, { "epoch": 0.7198262488364877, "grad_norm": 1.483925461769104, "learning_rate": 3.5975798945082225e-06, "loss": 0.2355, "num_input_tokens_seen": 1403488, "step": 2320 }, { "epoch": 0.7213775985107043, "grad_norm": 0.665830135345459, "learning_rate": 3.605336642879305e-06, "loss": 0.235, "num_input_tokens_seen": 1407424, "step": 2325 }, { "epoch": 0.7229289481849209, "grad_norm": 1.4121513366699219, "learning_rate": 3.613093391250388e-06, "loss": 0.2307, "num_input_tokens_seen": 1410560, "step": 2330 }, { "epoch": 0.7244802978591375, "grad_norm": 1.2322171926498413, "learning_rate": 3.620850139621471e-06, "loss": 0.224, "num_input_tokens_seen": 1413504, "step": 2335 }, { "epoch": 0.726031647533354, "grad_norm": 1.2869343757629395, "learning_rate": 3.628606887992554e-06, "loss": 0.2411, "num_input_tokens_seen": 1415904, "step": 2340 }, { "epoch": 0.7275829972075706, "grad_norm": 1.3832385540008545, "learning_rate": 3.6363636363636366e-06, "loss": 0.2269, "num_input_tokens_seen": 1419072, "step": 2345 }, { "epoch": 0.7291343468817871, "grad_norm": 1.0431855916976929, "learning_rate": 3.6441203847347196e-06, "loss": 0.2295, "num_input_tokens_seen": 1422208, "step": 2350 }, { "epoch": 0.7306856965560037, "grad_norm": 3.5643553733825684, "learning_rate": 3.651877133105802e-06, "loss": 0.2496, "num_input_tokens_seen": 1425376, "step": 2355 }, { "epoch": 0.7322370462302203, "grad_norm": 1.5635310411453247, "learning_rate": 3.659633881476885e-06, "loss": 0.227, "num_input_tokens_seen": 1427648, "step": 2360 }, { "epoch": 0.7337883959044369, "grad_norm": 1.0059112310409546, "learning_rate": 3.6673906298479677e-06, "loss": 0.2456, "num_input_tokens_seen": 1429824, "step": 2365 }, { "epoch": 0.7353397455786534, "grad_norm": 2.442960023880005, "learning_rate": 3.675147378219051e-06, "loss": 0.2329, "num_input_tokens_seen": 1433088, "step": 2370 }, { "epoch": 0.73689109525287, "grad_norm": 2.146904945373535, "learning_rate": 3.682904126590134e-06, "loss": 0.2306, "num_input_tokens_seen": 1435680, "step": 2375 }, { "epoch": 0.7384424449270865, "grad_norm": 2.038365602493286, "learning_rate": 3.6906608749612166e-06, "loss": 0.2369, "num_input_tokens_seen": 1438464, "step": 2380 }, { "epoch": 0.7399937946013031, "grad_norm": 0.4675410985946655, "learning_rate": 3.698417623332299e-06, "loss": 0.2342, "num_input_tokens_seen": 1440992, "step": 2385 }, { "epoch": 0.7415451442755197, "grad_norm": 1.4931837320327759, "learning_rate": 3.706174371703382e-06, "loss": 0.2203, "num_input_tokens_seen": 1444896, "step": 2390 }, { "epoch": 0.7430964939497363, "grad_norm": 1.9972243309020996, "learning_rate": 3.7139311200744647e-06, "loss": 0.2003, "num_input_tokens_seen": 1447392, "step": 2395 }, { "epoch": 0.7446478436239529, "grad_norm": 1.9932491779327393, "learning_rate": 3.721687868445548e-06, "loss": 0.235, "num_input_tokens_seen": 1449728, "step": 2400 }, { "epoch": 0.7461991932981694, "grad_norm": 1.106065273284912, "learning_rate": 3.729444616816631e-06, "loss": 0.1905, "num_input_tokens_seen": 1453440, "step": 2405 }, { "epoch": 0.747750542972386, "grad_norm": 5.655160903930664, "learning_rate": 3.7372013651877137e-06, "loss": 0.2628, "num_input_tokens_seen": 1455584, "step": 2410 }, { "epoch": 0.7493018926466025, "grad_norm": 1.1236636638641357, "learning_rate": 3.7449581135587967e-06, "loss": 0.2268, "num_input_tokens_seen": 1458368, "step": 2415 }, { "epoch": 0.7508532423208191, "grad_norm": 2.058396339416504, "learning_rate": 3.7527148619298792e-06, "loss": 0.2531, "num_input_tokens_seen": 1461376, "step": 2420 }, { "epoch": 0.7524045919950357, "grad_norm": 1.4200472831726074, "learning_rate": 3.7604716103009618e-06, "loss": 0.2272, "num_input_tokens_seen": 1465344, "step": 2425 }, { "epoch": 0.7539559416692523, "grad_norm": 1.2685987949371338, "learning_rate": 3.7682283586720448e-06, "loss": 0.2571, "num_input_tokens_seen": 1468928, "step": 2430 }, { "epoch": 0.7555072913434688, "grad_norm": 1.1850941181182861, "learning_rate": 3.775985107043128e-06, "loss": 0.247, "num_input_tokens_seen": 1471776, "step": 2435 }, { "epoch": 0.7570586410176854, "grad_norm": 1.4807466268539429, "learning_rate": 3.7837418554142107e-06, "loss": 0.2302, "num_input_tokens_seen": 1474592, "step": 2440 }, { "epoch": 0.7586099906919019, "grad_norm": 2.505568027496338, "learning_rate": 3.7914986037852937e-06, "loss": 0.2336, "num_input_tokens_seen": 1478080, "step": 2445 }, { "epoch": 0.7601613403661185, "grad_norm": 3.7647595405578613, "learning_rate": 3.7992553521563763e-06, "loss": 0.2534, "num_input_tokens_seen": 1482560, "step": 2450 }, { "epoch": 0.761712690040335, "grad_norm": 3.5450499057769775, "learning_rate": 3.8070121005274593e-06, "loss": 0.2219, "num_input_tokens_seen": 1485536, "step": 2455 }, { "epoch": 0.7632640397145517, "grad_norm": 8.025308609008789, "learning_rate": 3.814768848898542e-06, "loss": 0.2757, "num_input_tokens_seen": 1488224, "step": 2460 }, { "epoch": 0.7648153893887683, "grad_norm": 4.681526184082031, "learning_rate": 3.822525597269625e-06, "loss": 0.2054, "num_input_tokens_seen": 1491840, "step": 2465 }, { "epoch": 0.7663667390629848, "grad_norm": 1.0835164785385132, "learning_rate": 3.830282345640708e-06, "loss": 0.2631, "num_input_tokens_seen": 1494880, "step": 2470 }, { "epoch": 0.7679180887372014, "grad_norm": 0.7330272197723389, "learning_rate": 3.838039094011791e-06, "loss": 0.232, "num_input_tokens_seen": 1497472, "step": 2475 }, { "epoch": 0.7694694384114179, "grad_norm": 2.0429799556732178, "learning_rate": 3.845795842382874e-06, "loss": 0.2384, "num_input_tokens_seen": 1500736, "step": 2480 }, { "epoch": 0.7710207880856345, "grad_norm": 1.902640700340271, "learning_rate": 3.853552590753956e-06, "loss": 0.24, "num_input_tokens_seen": 1503072, "step": 2485 }, { "epoch": 0.7725721377598511, "grad_norm": 0.7691041231155396, "learning_rate": 3.861309339125039e-06, "loss": 0.231, "num_input_tokens_seen": 1505760, "step": 2490 }, { "epoch": 0.7741234874340677, "grad_norm": 1.843485951423645, "learning_rate": 3.869066087496122e-06, "loss": 0.2364, "num_input_tokens_seen": 1509600, "step": 2495 }, { "epoch": 0.7756748371082842, "grad_norm": 1.3200936317443848, "learning_rate": 3.876822835867205e-06, "loss": 0.2164, "num_input_tokens_seen": 1512608, "step": 2500 }, { "epoch": 0.7772261867825008, "grad_norm": 2.2157962322235107, "learning_rate": 3.884579584238288e-06, "loss": 0.2372, "num_input_tokens_seen": 1515584, "step": 2505 }, { "epoch": 0.7787775364567173, "grad_norm": 3.6436290740966797, "learning_rate": 3.892336332609371e-06, "loss": 0.2374, "num_input_tokens_seen": 1518016, "step": 2510 }, { "epoch": 0.7803288861309339, "grad_norm": 2.888575315475464, "learning_rate": 3.900093080980453e-06, "loss": 0.2591, "num_input_tokens_seen": 1521952, "step": 2515 }, { "epoch": 0.7818802358051505, "grad_norm": 2.035649299621582, "learning_rate": 3.907849829351536e-06, "loss": 0.2434, "num_input_tokens_seen": 1524768, "step": 2520 }, { "epoch": 0.7834315854793671, "grad_norm": 0.7538897395133972, "learning_rate": 3.915606577722619e-06, "loss": 0.2288, "num_input_tokens_seen": 1527648, "step": 2525 }, { "epoch": 0.7849829351535836, "grad_norm": 2.7067155838012695, "learning_rate": 3.923363326093702e-06, "loss": 0.2464, "num_input_tokens_seen": 1529984, "step": 2530 }, { "epoch": 0.7865342848278002, "grad_norm": 1.980299711227417, "learning_rate": 3.931120074464785e-06, "loss": 0.2385, "num_input_tokens_seen": 1533376, "step": 2535 }, { "epoch": 0.7880856345020167, "grad_norm": 5.217749118804932, "learning_rate": 3.938876822835868e-06, "loss": 0.2299, "num_input_tokens_seen": 1537088, "step": 2540 }, { "epoch": 0.7896369841762333, "grad_norm": 6.008622169494629, "learning_rate": 3.94663357120695e-06, "loss": 0.2485, "num_input_tokens_seen": 1539328, "step": 2545 }, { "epoch": 0.7911883338504498, "grad_norm": 3.360995292663574, "learning_rate": 3.954390319578033e-06, "loss": 0.2356, "num_input_tokens_seen": 1542656, "step": 2550 }, { "epoch": 0.7927396835246665, "grad_norm": 2.967195987701416, "learning_rate": 3.962147067949116e-06, "loss": 0.234, "num_input_tokens_seen": 1547136, "step": 2555 }, { "epoch": 0.7942910331988831, "grad_norm": 4.386361122131348, "learning_rate": 3.969903816320199e-06, "loss": 0.2303, "num_input_tokens_seen": 1549920, "step": 2560 }, { "epoch": 0.7958423828730996, "grad_norm": 2.4551544189453125, "learning_rate": 3.977660564691282e-06, "loss": 0.2364, "num_input_tokens_seen": 1552736, "step": 2565 }, { "epoch": 0.7973937325473162, "grad_norm": 3.8961191177368164, "learning_rate": 3.985417313062365e-06, "loss": 0.2197, "num_input_tokens_seen": 1555072, "step": 2570 }, { "epoch": 0.7989450822215327, "grad_norm": 6.327117919921875, "learning_rate": 3.993174061433447e-06, "loss": 0.2122, "num_input_tokens_seen": 1557504, "step": 2575 }, { "epoch": 0.8004964318957493, "grad_norm": 5.200240135192871, "learning_rate": 4.00093080980453e-06, "loss": 0.196, "num_input_tokens_seen": 1560096, "step": 2580 }, { "epoch": 0.8020477815699659, "grad_norm": 5.748575687408447, "learning_rate": 4.008687558175613e-06, "loss": 0.2891, "num_input_tokens_seen": 1562912, "step": 2585 }, { "epoch": 0.8035991312441825, "grad_norm": 9.533347129821777, "learning_rate": 4.016444306546696e-06, "loss": 0.2386, "num_input_tokens_seen": 1565664, "step": 2590 }, { "epoch": 0.805150480918399, "grad_norm": 4.248400688171387, "learning_rate": 4.024201054917779e-06, "loss": 0.2505, "num_input_tokens_seen": 1568384, "step": 2595 }, { "epoch": 0.8067018305926156, "grad_norm": 2.077392339706421, "learning_rate": 4.031957803288862e-06, "loss": 0.2258, "num_input_tokens_seen": 1570912, "step": 2600 }, { "epoch": 0.8082531802668321, "grad_norm": 1.8061816692352295, "learning_rate": 4.039714551659944e-06, "loss": 0.2217, "num_input_tokens_seen": 1574176, "step": 2605 }, { "epoch": 0.8098045299410487, "grad_norm": 2.9435133934020996, "learning_rate": 4.047471300031027e-06, "loss": 0.2287, "num_input_tokens_seen": 1577152, "step": 2610 }, { "epoch": 0.8113558796152652, "grad_norm": 3.142052173614502, "learning_rate": 4.05522804840211e-06, "loss": 0.2592, "num_input_tokens_seen": 1580192, "step": 2615 }, { "epoch": 0.8129072292894819, "grad_norm": 3.6281328201293945, "learning_rate": 4.062984796773193e-06, "loss": 0.2391, "num_input_tokens_seen": 1583072, "step": 2620 }, { "epoch": 0.8144585789636984, "grad_norm": 3.5389206409454346, "learning_rate": 4.070741545144276e-06, "loss": 0.2382, "num_input_tokens_seen": 1586048, "step": 2625 }, { "epoch": 0.816009928637915, "grad_norm": 1.586928367614746, "learning_rate": 4.078498293515359e-06, "loss": 0.239, "num_input_tokens_seen": 1588384, "step": 2630 }, { "epoch": 0.8175612783121315, "grad_norm": 1.142808437347412, "learning_rate": 4.086255041886442e-06, "loss": 0.2222, "num_input_tokens_seen": 1591712, "step": 2635 }, { "epoch": 0.8191126279863481, "grad_norm": 0.9435325264930725, "learning_rate": 4.094011790257524e-06, "loss": 0.2399, "num_input_tokens_seen": 1594464, "step": 2640 }, { "epoch": 0.8206639776605646, "grad_norm": 1.1571969985961914, "learning_rate": 4.101768538628607e-06, "loss": 0.2436, "num_input_tokens_seen": 1597920, "step": 2645 }, { "epoch": 0.8222153273347813, "grad_norm": 2.2843782901763916, "learning_rate": 4.10952528699969e-06, "loss": 0.2511, "num_input_tokens_seen": 1600320, "step": 2650 }, { "epoch": 0.8237666770089979, "grad_norm": 5.714015483856201, "learning_rate": 4.117282035370772e-06, "loss": 0.223, "num_input_tokens_seen": 1602592, "step": 2655 }, { "epoch": 0.8253180266832144, "grad_norm": 2.956899642944336, "learning_rate": 4.125038783741856e-06, "loss": 0.2346, "num_input_tokens_seen": 1606496, "step": 2660 }, { "epoch": 0.826869376357431, "grad_norm": 2.494033098220825, "learning_rate": 4.132795532112939e-06, "loss": 0.2314, "num_input_tokens_seen": 1609984, "step": 2665 }, { "epoch": 0.8284207260316475, "grad_norm": 3.311908483505249, "learning_rate": 4.140552280484021e-06, "loss": 0.237, "num_input_tokens_seen": 1612224, "step": 2670 }, { "epoch": 0.8299720757058641, "grad_norm": 0.8926174640655518, "learning_rate": 4.148309028855104e-06, "loss": 0.241, "num_input_tokens_seen": 1615264, "step": 2675 }, { "epoch": 0.8315234253800806, "grad_norm": 0.784396231174469, "learning_rate": 4.156065777226187e-06, "loss": 0.2244, "num_input_tokens_seen": 1618048, "step": 2680 }, { "epoch": 0.8330747750542973, "grad_norm": 2.1681406497955322, "learning_rate": 4.163822525597269e-06, "loss": 0.235, "num_input_tokens_seen": 1620416, "step": 2685 }, { "epoch": 0.8346261247285138, "grad_norm": 1.1861917972564697, "learning_rate": 4.171579273968353e-06, "loss": 0.2382, "num_input_tokens_seen": 1623744, "step": 2690 }, { "epoch": 0.8361774744027304, "grad_norm": 0.9076901078224182, "learning_rate": 4.179336022339436e-06, "loss": 0.2411, "num_input_tokens_seen": 1627168, "step": 2695 }, { "epoch": 0.8377288240769469, "grad_norm": 1.1843576431274414, "learning_rate": 4.187092770710518e-06, "loss": 0.2363, "num_input_tokens_seen": 1629888, "step": 2700 }, { "epoch": 0.8392801737511635, "grad_norm": 1.7766894102096558, "learning_rate": 4.194849519081601e-06, "loss": 0.2245, "num_input_tokens_seen": 1633152, "step": 2705 }, { "epoch": 0.84083152342538, "grad_norm": 0.5997194051742554, "learning_rate": 4.202606267452684e-06, "loss": 0.2358, "num_input_tokens_seen": 1635488, "step": 2710 }, { "epoch": 0.8423828730995967, "grad_norm": 0.792295515537262, "learning_rate": 4.210363015823767e-06, "loss": 0.2336, "num_input_tokens_seen": 1638272, "step": 2715 }, { "epoch": 0.8439342227738132, "grad_norm": 2.569981336593628, "learning_rate": 4.218119764194849e-06, "loss": 0.233, "num_input_tokens_seen": 1641632, "step": 2720 }, { "epoch": 0.8454855724480298, "grad_norm": 1.8949120044708252, "learning_rate": 4.225876512565933e-06, "loss": 0.238, "num_input_tokens_seen": 1644384, "step": 2725 }, { "epoch": 0.8470369221222463, "grad_norm": 2.4229915142059326, "learning_rate": 4.233633260937015e-06, "loss": 0.2265, "num_input_tokens_seen": 1646688, "step": 2730 }, { "epoch": 0.8485882717964629, "grad_norm": 1.6916037797927856, "learning_rate": 4.241390009308098e-06, "loss": 0.2338, "num_input_tokens_seen": 1650400, "step": 2735 }, { "epoch": 0.8501396214706795, "grad_norm": 1.2396372556686401, "learning_rate": 4.249146757679181e-06, "loss": 0.2265, "num_input_tokens_seen": 1653216, "step": 2740 }, { "epoch": 0.8516909711448961, "grad_norm": 2.713197708129883, "learning_rate": 4.256903506050264e-06, "loss": 0.245, "num_input_tokens_seen": 1655616, "step": 2745 }, { "epoch": 0.8532423208191127, "grad_norm": 0.5518041849136353, "learning_rate": 4.264660254421346e-06, "loss": 0.2276, "num_input_tokens_seen": 1661664, "step": 2750 }, { "epoch": 0.8547936704933292, "grad_norm": 0.9838311076164246, "learning_rate": 4.27241700279243e-06, "loss": 0.2382, "num_input_tokens_seen": 1664512, "step": 2755 }, { "epoch": 0.8563450201675458, "grad_norm": 1.030358910560608, "learning_rate": 4.280173751163512e-06, "loss": 0.2309, "num_input_tokens_seen": 1667328, "step": 2760 }, { "epoch": 0.8578963698417623, "grad_norm": 0.9078791737556458, "learning_rate": 4.287930499534595e-06, "loss": 0.2325, "num_input_tokens_seen": 1671136, "step": 2765 }, { "epoch": 0.8594477195159789, "grad_norm": 1.2656270265579224, "learning_rate": 4.295687247905678e-06, "loss": 0.2267, "num_input_tokens_seen": 1674112, "step": 2770 }, { "epoch": 0.8609990691901954, "grad_norm": 0.6243343353271484, "learning_rate": 4.303443996276761e-06, "loss": 0.2488, "num_input_tokens_seen": 1676992, "step": 2775 }, { "epoch": 0.8625504188644121, "grad_norm": 0.70853590965271, "learning_rate": 4.3112007446478435e-06, "loss": 0.2277, "num_input_tokens_seen": 1679648, "step": 2780 }, { "epoch": 0.8641017685386286, "grad_norm": 0.550470232963562, "learning_rate": 4.318957493018927e-06, "loss": 0.2394, "num_input_tokens_seen": 1682944, "step": 2785 }, { "epoch": 0.8656531182128452, "grad_norm": 0.5435770750045776, "learning_rate": 4.3267142413900094e-06, "loss": 0.2359, "num_input_tokens_seen": 1686176, "step": 2790 }, { "epoch": 0.8672044678870617, "grad_norm": 0.8407547473907471, "learning_rate": 4.3344709897610924e-06, "loss": 0.2344, "num_input_tokens_seen": 1689120, "step": 2795 }, { "epoch": 0.8687558175612783, "grad_norm": 1.0360605716705322, "learning_rate": 4.342227738132175e-06, "loss": 0.2319, "num_input_tokens_seen": 1691552, "step": 2800 }, { "epoch": 0.8703071672354948, "grad_norm": 0.6597394347190857, "learning_rate": 4.349984486503258e-06, "loss": 0.2315, "num_input_tokens_seen": 1695040, "step": 2805 }, { "epoch": 0.8718585169097115, "grad_norm": 0.7061944603919983, "learning_rate": 4.3577412348743405e-06, "loss": 0.2305, "num_input_tokens_seen": 1698304, "step": 2810 }, { "epoch": 0.873409866583928, "grad_norm": 0.4965258538722992, "learning_rate": 4.3654979832454235e-06, "loss": 0.2318, "num_input_tokens_seen": 1701216, "step": 2815 }, { "epoch": 0.8749612162581446, "grad_norm": 0.5037447214126587, "learning_rate": 4.373254731616507e-06, "loss": 0.2278, "num_input_tokens_seen": 1703360, "step": 2820 }, { "epoch": 0.8765125659323612, "grad_norm": 0.4434105455875397, "learning_rate": 4.3810114799875895e-06, "loss": 0.2326, "num_input_tokens_seen": 1706048, "step": 2825 }, { "epoch": 0.8780639156065777, "grad_norm": 0.8791712522506714, "learning_rate": 4.3887682283586725e-06, "loss": 0.2287, "num_input_tokens_seen": 1709248, "step": 2830 }, { "epoch": 0.8796152652807943, "grad_norm": 2.3169822692871094, "learning_rate": 4.3965249767297554e-06, "loss": 0.2346, "num_input_tokens_seen": 1712736, "step": 2835 }, { "epoch": 0.8811666149550108, "grad_norm": 1.5125583410263062, "learning_rate": 4.404281725100838e-06, "loss": 0.2365, "num_input_tokens_seen": 1715232, "step": 2840 }, { "epoch": 0.8827179646292275, "grad_norm": 0.9953866600990295, "learning_rate": 4.4120384734719206e-06, "loss": 0.2307, "num_input_tokens_seen": 1718656, "step": 2845 }, { "epoch": 0.884269314303444, "grad_norm": 0.5645773410797119, "learning_rate": 4.419795221843004e-06, "loss": 0.2229, "num_input_tokens_seen": 1721376, "step": 2850 }, { "epoch": 0.8858206639776606, "grad_norm": 0.7463214993476868, "learning_rate": 4.4275519702140865e-06, "loss": 0.2436, "num_input_tokens_seen": 1724288, "step": 2855 }, { "epoch": 0.8873720136518771, "grad_norm": 0.7502066493034363, "learning_rate": 4.4353087185851695e-06, "loss": 0.2441, "num_input_tokens_seen": 1727232, "step": 2860 }, { "epoch": 0.8889233633260937, "grad_norm": 1.2878220081329346, "learning_rate": 4.4430654669562525e-06, "loss": 0.2362, "num_input_tokens_seen": 1730304, "step": 2865 }, { "epoch": 0.8904747130003102, "grad_norm": 0.6410181522369385, "learning_rate": 4.450822215327335e-06, "loss": 0.2369, "num_input_tokens_seen": 1732800, "step": 2870 }, { "epoch": 0.8920260626745269, "grad_norm": 1.184542179107666, "learning_rate": 4.458578963698418e-06, "loss": 0.2305, "num_input_tokens_seen": 1736512, "step": 2875 }, { "epoch": 0.8935774123487434, "grad_norm": 1.642678141593933, "learning_rate": 4.466335712069501e-06, "loss": 0.2295, "num_input_tokens_seen": 1739904, "step": 2880 }, { "epoch": 0.89512876202296, "grad_norm": 0.566307008266449, "learning_rate": 4.474092460440584e-06, "loss": 0.2315, "num_input_tokens_seen": 1744768, "step": 2885 }, { "epoch": 0.8966801116971765, "grad_norm": 1.627235770225525, "learning_rate": 4.4818492088116666e-06, "loss": 0.2317, "num_input_tokens_seen": 1748224, "step": 2890 }, { "epoch": 0.8982314613713931, "grad_norm": 0.5742165446281433, "learning_rate": 4.4896059571827496e-06, "loss": 0.2361, "num_input_tokens_seen": 1751488, "step": 2895 }, { "epoch": 0.8997828110456096, "grad_norm": 0.5041505098342896, "learning_rate": 4.4973627055538325e-06, "loss": 0.2353, "num_input_tokens_seen": 1753856, "step": 2900 }, { "epoch": 0.9013341607198262, "grad_norm": 1.820811152458191, "learning_rate": 4.505119453924915e-06, "loss": 0.2317, "num_input_tokens_seen": 1756672, "step": 2905 }, { "epoch": 0.9028855103940429, "grad_norm": 0.3163919448852539, "learning_rate": 4.512876202295998e-06, "loss": 0.2347, "num_input_tokens_seen": 1759424, "step": 2910 }, { "epoch": 0.9044368600682594, "grad_norm": 0.8846561312675476, "learning_rate": 4.520632950667081e-06, "loss": 0.224, "num_input_tokens_seen": 1762336, "step": 2915 }, { "epoch": 0.905988209742476, "grad_norm": 0.880647599697113, "learning_rate": 4.528389699038164e-06, "loss": 0.237, "num_input_tokens_seen": 1765024, "step": 2920 }, { "epoch": 0.9075395594166925, "grad_norm": 0.7534177899360657, "learning_rate": 4.536146447409247e-06, "loss": 0.2309, "num_input_tokens_seen": 1768640, "step": 2925 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6912259459495544, "learning_rate": 4.54390319578033e-06, "loss": 0.2353, "num_input_tokens_seen": 1771936, "step": 2930 }, { "epoch": 0.9106422587651256, "grad_norm": 0.5821657180786133, "learning_rate": 4.551659944151412e-06, "loss": 0.2319, "num_input_tokens_seen": 1776416, "step": 2935 }, { "epoch": 0.9121936084393423, "grad_norm": 0.6865861415863037, "learning_rate": 4.559416692522495e-06, "loss": 0.2382, "num_input_tokens_seen": 1779840, "step": 2940 }, { "epoch": 0.9137449581135588, "grad_norm": 1.1282670497894287, "learning_rate": 4.567173440893578e-06, "loss": 0.2372, "num_input_tokens_seen": 1782816, "step": 2945 }, { "epoch": 0.9152963077877754, "grad_norm": 0.7609823942184448, "learning_rate": 4.574930189264661e-06, "loss": 0.2328, "num_input_tokens_seen": 1785248, "step": 2950 }, { "epoch": 0.9168476574619919, "grad_norm": 0.48636582493782043, "learning_rate": 4.582686937635744e-06, "loss": 0.2231, "num_input_tokens_seen": 1787680, "step": 2955 }, { "epoch": 0.9183990071362085, "grad_norm": 1.090908169746399, "learning_rate": 4.590443686006827e-06, "loss": 0.242, "num_input_tokens_seen": 1791584, "step": 2960 }, { "epoch": 0.919950356810425, "grad_norm": 0.8184488415718079, "learning_rate": 4.598200434377909e-06, "loss": 0.2352, "num_input_tokens_seen": 1794176, "step": 2965 }, { "epoch": 0.9215017064846417, "grad_norm": 1.1441287994384766, "learning_rate": 4.605957182748992e-06, "loss": 0.2294, "num_input_tokens_seen": 1797728, "step": 2970 }, { "epoch": 0.9230530561588582, "grad_norm": 1.814975619316101, "learning_rate": 4.613713931120075e-06, "loss": 0.2305, "num_input_tokens_seen": 1801952, "step": 2975 }, { "epoch": 0.9246044058330748, "grad_norm": 7.887892246246338, "learning_rate": 4.621470679491158e-06, "loss": 0.245, "num_input_tokens_seen": 1805632, "step": 2980 }, { "epoch": 0.9261557555072913, "grad_norm": 5.145443916320801, "learning_rate": 4.629227427862241e-06, "loss": 0.2351, "num_input_tokens_seen": 1808352, "step": 2985 }, { "epoch": 0.9277071051815079, "grad_norm": 1.9484988451004028, "learning_rate": 4.636984176233324e-06, "loss": 0.2308, "num_input_tokens_seen": 1810880, "step": 2990 }, { "epoch": 0.9292584548557244, "grad_norm": 1.3856889009475708, "learning_rate": 4.644740924604406e-06, "loss": 0.2329, "num_input_tokens_seen": 1813600, "step": 2995 }, { "epoch": 0.930809804529941, "grad_norm": 0.7255025506019592, "learning_rate": 4.652497672975489e-06, "loss": 0.2365, "num_input_tokens_seen": 1817024, "step": 3000 }, { "epoch": 0.9323611542041577, "grad_norm": 1.90272057056427, "learning_rate": 4.660254421346572e-06, "loss": 0.2391, "num_input_tokens_seen": 1819872, "step": 3005 }, { "epoch": 0.9339125038783742, "grad_norm": 0.7706185579299927, "learning_rate": 4.668011169717655e-06, "loss": 0.2274, "num_input_tokens_seen": 1822528, "step": 3010 }, { "epoch": 0.9354638535525908, "grad_norm": 1.4806544780731201, "learning_rate": 4.675767918088738e-06, "loss": 0.2188, "num_input_tokens_seen": 1825664, "step": 3015 }, { "epoch": 0.9370152032268073, "grad_norm": 2.0446817874908447, "learning_rate": 4.683524666459821e-06, "loss": 0.2554, "num_input_tokens_seen": 1828704, "step": 3020 }, { "epoch": 0.9385665529010239, "grad_norm": 1.1387221813201904, "learning_rate": 4.691281414830903e-06, "loss": 0.2398, "num_input_tokens_seen": 1833536, "step": 3025 }, { "epoch": 0.9401179025752404, "grad_norm": 1.562933087348938, "learning_rate": 4.699038163201986e-06, "loss": 0.2401, "num_input_tokens_seen": 1836064, "step": 3030 }, { "epoch": 0.9416692522494571, "grad_norm": 0.3180149495601654, "learning_rate": 4.706794911573069e-06, "loss": 0.2318, "num_input_tokens_seen": 1839776, "step": 3035 }, { "epoch": 0.9432206019236736, "grad_norm": 1.0414570569992065, "learning_rate": 4.714551659944152e-06, "loss": 0.2253, "num_input_tokens_seen": 1842336, "step": 3040 }, { "epoch": 0.9447719515978902, "grad_norm": 1.2602155208587646, "learning_rate": 4.722308408315235e-06, "loss": 0.2282, "num_input_tokens_seen": 1844640, "step": 3045 }, { "epoch": 0.9463233012721067, "grad_norm": 1.1223349571228027, "learning_rate": 4.730065156686318e-06, "loss": 0.2373, "num_input_tokens_seen": 1847200, "step": 3050 }, { "epoch": 0.9478746509463233, "grad_norm": 1.002503514289856, "learning_rate": 4.7378219050574e-06, "loss": 0.2294, "num_input_tokens_seen": 1849792, "step": 3055 }, { "epoch": 0.9494260006205398, "grad_norm": 0.7044450044631958, "learning_rate": 4.745578653428483e-06, "loss": 0.2236, "num_input_tokens_seen": 1852864, "step": 3060 }, { "epoch": 0.9509773502947564, "grad_norm": 0.5535057187080383, "learning_rate": 4.753335401799566e-06, "loss": 0.2256, "num_input_tokens_seen": 1856416, "step": 3065 }, { "epoch": 0.952528699968973, "grad_norm": 1.4629753828048706, "learning_rate": 4.761092150170649e-06, "loss": 0.2318, "num_input_tokens_seen": 1859872, "step": 3070 }, { "epoch": 0.9540800496431896, "grad_norm": 3.16107439994812, "learning_rate": 4.768848898541732e-06, "loss": 0.2337, "num_input_tokens_seen": 1862752, "step": 3075 }, { "epoch": 0.9556313993174061, "grad_norm": 4.071774005889893, "learning_rate": 4.776605646912815e-06, "loss": 0.2273, "num_input_tokens_seen": 1865952, "step": 3080 }, { "epoch": 0.9571827489916227, "grad_norm": 4.585273742675781, "learning_rate": 4.784362395283898e-06, "loss": 0.2317, "num_input_tokens_seen": 1869632, "step": 3085 }, { "epoch": 0.9587340986658393, "grad_norm": 0.5954428315162659, "learning_rate": 4.79211914365498e-06, "loss": 0.236, "num_input_tokens_seen": 1872256, "step": 3090 }, { "epoch": 0.9602854483400558, "grad_norm": 1.0393356084823608, "learning_rate": 4.799875892026063e-06, "loss": 0.2345, "num_input_tokens_seen": 1875008, "step": 3095 }, { "epoch": 0.9618367980142725, "grad_norm": 1.2603129148483276, "learning_rate": 4.807632640397146e-06, "loss": 0.2353, "num_input_tokens_seen": 1877888, "step": 3100 }, { "epoch": 0.963388147688489, "grad_norm": 0.4514506161212921, "learning_rate": 4.815389388768228e-06, "loss": 0.226, "num_input_tokens_seen": 1880928, "step": 3105 }, { "epoch": 0.9649394973627056, "grad_norm": 1.2725855112075806, "learning_rate": 4.823146137139312e-06, "loss": 0.2402, "num_input_tokens_seen": 1886688, "step": 3110 }, { "epoch": 0.9664908470369221, "grad_norm": 0.8339421153068542, "learning_rate": 4.830902885510395e-06, "loss": 0.2396, "num_input_tokens_seen": 1890080, "step": 3115 }, { "epoch": 0.9680421967111387, "grad_norm": 1.1779639720916748, "learning_rate": 4.838659633881477e-06, "loss": 0.2325, "num_input_tokens_seen": 1892544, "step": 3120 }, { "epoch": 0.9695935463853552, "grad_norm": 1.1303960084915161, "learning_rate": 4.84641638225256e-06, "loss": 0.2285, "num_input_tokens_seen": 1896288, "step": 3125 }, { "epoch": 0.9711448960595719, "grad_norm": 0.9172843098640442, "learning_rate": 4.854173130623643e-06, "loss": 0.2318, "num_input_tokens_seen": 1899808, "step": 3130 }, { "epoch": 0.9726962457337884, "grad_norm": 0.4381037950515747, "learning_rate": 4.861929878994725e-06, "loss": 0.2306, "num_input_tokens_seen": 1903392, "step": 3135 }, { "epoch": 0.974247595408005, "grad_norm": 0.3881298899650574, "learning_rate": 4.869686627365809e-06, "loss": 0.2382, "num_input_tokens_seen": 1906944, "step": 3140 }, { "epoch": 0.9757989450822215, "grad_norm": 0.9445329308509827, "learning_rate": 4.877443375736892e-06, "loss": 0.2415, "num_input_tokens_seen": 1910592, "step": 3145 }, { "epoch": 0.9773502947564381, "grad_norm": 0.644145667552948, "learning_rate": 4.885200124107974e-06, "loss": 0.2307, "num_input_tokens_seen": 1915872, "step": 3150 }, { "epoch": 0.9789016444306546, "grad_norm": 0.6388810276985168, "learning_rate": 4.892956872479057e-06, "loss": 0.2286, "num_input_tokens_seen": 1918400, "step": 3155 }, { "epoch": 0.9804529941048712, "grad_norm": 0.31470248103141785, "learning_rate": 4.90071362085014e-06, "loss": 0.2339, "num_input_tokens_seen": 1921088, "step": 3160 }, { "epoch": 0.9820043437790879, "grad_norm": 0.25780758261680603, "learning_rate": 4.908470369221223e-06, "loss": 0.2353, "num_input_tokens_seen": 1924576, "step": 3165 }, { "epoch": 0.9835556934533044, "grad_norm": 0.6659814715385437, "learning_rate": 4.916227117592306e-06, "loss": 0.2295, "num_input_tokens_seen": 1927808, "step": 3170 }, { "epoch": 0.985107043127521, "grad_norm": 0.3035561144351959, "learning_rate": 4.923983865963389e-06, "loss": 0.2263, "num_input_tokens_seen": 1930880, "step": 3175 }, { "epoch": 0.9866583928017375, "grad_norm": 1.3765619993209839, "learning_rate": 4.931740614334471e-06, "loss": 0.2252, "num_input_tokens_seen": 1934464, "step": 3180 }, { "epoch": 0.9882097424759541, "grad_norm": 1.0597573518753052, "learning_rate": 4.939497362705554e-06, "loss": 0.2214, "num_input_tokens_seen": 1937120, "step": 3185 }, { "epoch": 0.9897610921501706, "grad_norm": 1.0768457651138306, "learning_rate": 4.947254111076637e-06, "loss": 0.2221, "num_input_tokens_seen": 1939680, "step": 3190 }, { "epoch": 0.9913124418243873, "grad_norm": 2.471296548843384, "learning_rate": 4.95501085944772e-06, "loss": 0.2175, "num_input_tokens_seen": 1942656, "step": 3195 }, { "epoch": 0.9928637914986038, "grad_norm": 0.8448272347450256, "learning_rate": 4.962767607818802e-06, "loss": 0.2727, "num_input_tokens_seen": 1948256, "step": 3200 }, { "epoch": 0.9944151411728204, "grad_norm": 0.8499921560287476, "learning_rate": 4.970524356189886e-06, "loss": 0.2593, "num_input_tokens_seen": 1950496, "step": 3205 }, { "epoch": 0.9959664908470369, "grad_norm": 1.3863129615783691, "learning_rate": 4.978281104560968e-06, "loss": 0.2462, "num_input_tokens_seen": 1953376, "step": 3210 }, { "epoch": 0.9975178405212535, "grad_norm": 0.31457093358039856, "learning_rate": 4.986037852932051e-06, "loss": 0.2344, "num_input_tokens_seen": 1955392, "step": 3215 }, { "epoch": 0.99906919019547, "grad_norm": 0.8206101655960083, "learning_rate": 4.993794601303134e-06, "loss": 0.2433, "num_input_tokens_seen": 1958496, "step": 3220 }, { "epoch": 1.0006205398696866, "grad_norm": 0.49661609530448914, "learning_rate": 5.001551349674217e-06, "loss": 0.2308, "num_input_tokens_seen": 1961104, "step": 3225 }, { "epoch": 1.0021718895439031, "grad_norm": 0.8486115336418152, "learning_rate": 5.0093080980453e-06, "loss": 0.2357, "num_input_tokens_seen": 1963632, "step": 3230 }, { "epoch": 1.0037232392181197, "grad_norm": 0.3786095082759857, "learning_rate": 5.017064846416383e-06, "loss": 0.2287, "num_input_tokens_seen": 1966192, "step": 3235 }, { "epoch": 1.0052745888923362, "grad_norm": 0.5279488563537598, "learning_rate": 5.024821594787465e-06, "loss": 0.2322, "num_input_tokens_seen": 1968848, "step": 3240 }, { "epoch": 1.006825938566553, "grad_norm": 1.1088894605636597, "learning_rate": 5.032578343158548e-06, "loss": 0.2105, "num_input_tokens_seen": 1974256, "step": 3245 }, { "epoch": 1.0083772882407696, "grad_norm": 0.7890156507492065, "learning_rate": 5.040335091529631e-06, "loss": 0.2423, "num_input_tokens_seen": 1978384, "step": 3250 }, { "epoch": 1.009928637914986, "grad_norm": 1.830674171447754, "learning_rate": 5.048091839900714e-06, "loss": 0.232, "num_input_tokens_seen": 1981072, "step": 3255 }, { "epoch": 1.0114799875892027, "grad_norm": 2.191357374191284, "learning_rate": 5.055848588271796e-06, "loss": 0.2361, "num_input_tokens_seen": 1984368, "step": 3260 }, { "epoch": 1.0130313372634192, "grad_norm": 0.4153956472873688, "learning_rate": 5.063605336642879e-06, "loss": 0.2256, "num_input_tokens_seen": 1987088, "step": 3265 }, { "epoch": 1.0145826869376358, "grad_norm": 1.2064212560653687, "learning_rate": 5.071362085013962e-06, "loss": 0.2282, "num_input_tokens_seen": 1990960, "step": 3270 }, { "epoch": 1.0161340366118523, "grad_norm": 0.8845511078834534, "learning_rate": 5.079118833385045e-06, "loss": 0.2243, "num_input_tokens_seen": 1994512, "step": 3275 }, { "epoch": 1.0176853862860689, "grad_norm": 0.9558754563331604, "learning_rate": 5.0868755817561275e-06, "loss": 0.2375, "num_input_tokens_seen": 1997328, "step": 3280 }, { "epoch": 1.0192367359602854, "grad_norm": 0.46258389949798584, "learning_rate": 5.094632330127211e-06, "loss": 0.2275, "num_input_tokens_seen": 1999920, "step": 3285 }, { "epoch": 1.020788085634502, "grad_norm": 0.6777845621109009, "learning_rate": 5.102389078498294e-06, "loss": 0.2194, "num_input_tokens_seen": 2003120, "step": 3290 }, { "epoch": 1.0223394353087185, "grad_norm": 1.6136845350265503, "learning_rate": 5.110145826869377e-06, "loss": 0.216, "num_input_tokens_seen": 2005872, "step": 3295 }, { "epoch": 1.023890784982935, "grad_norm": 0.4981827437877655, "learning_rate": 5.11790257524046e-06, "loss": 0.2404, "num_input_tokens_seen": 2008912, "step": 3300 }, { "epoch": 1.0254421346571516, "grad_norm": 0.4221687614917755, "learning_rate": 5.125659323611542e-06, "loss": 0.2411, "num_input_tokens_seen": 2012816, "step": 3305 }, { "epoch": 1.0269934843313684, "grad_norm": 0.48579105734825134, "learning_rate": 5.133416071982625e-06, "loss": 0.2264, "num_input_tokens_seen": 2016240, "step": 3310 }, { "epoch": 1.028544834005585, "grad_norm": 0.4985552132129669, "learning_rate": 5.141172820353708e-06, "loss": 0.2331, "num_input_tokens_seen": 2019024, "step": 3315 }, { "epoch": 1.0300961836798015, "grad_norm": 0.5085026025772095, "learning_rate": 5.1489295687247905e-06, "loss": 0.2358, "num_input_tokens_seen": 2022352, "step": 3320 }, { "epoch": 1.031647533354018, "grad_norm": 0.616306722164154, "learning_rate": 5.1566863170958735e-06, "loss": 0.2268, "num_input_tokens_seen": 2026224, "step": 3325 }, { "epoch": 1.0331988830282346, "grad_norm": 0.9891427159309387, "learning_rate": 5.1644430654669564e-06, "loss": 0.2337, "num_input_tokens_seen": 2028816, "step": 3330 }, { "epoch": 1.0347502327024511, "grad_norm": 2.8967227935791016, "learning_rate": 5.1721998138380394e-06, "loss": 0.231, "num_input_tokens_seen": 2032336, "step": 3335 }, { "epoch": 1.0363015823766677, "grad_norm": 1.600041151046753, "learning_rate": 5.1799565622091216e-06, "loss": 0.2294, "num_input_tokens_seen": 2036016, "step": 3340 }, { "epoch": 1.0378529320508842, "grad_norm": 4.489060401916504, "learning_rate": 5.1877133105802046e-06, "loss": 0.2375, "num_input_tokens_seen": 2038576, "step": 3345 }, { "epoch": 1.0394042817251008, "grad_norm": 2.708071708679199, "learning_rate": 5.195470058951288e-06, "loss": 0.2394, "num_input_tokens_seen": 2042480, "step": 3350 }, { "epoch": 1.0409556313993173, "grad_norm": 1.0592000484466553, "learning_rate": 5.203226807322371e-06, "loss": 0.2376, "num_input_tokens_seen": 2045872, "step": 3355 }, { "epoch": 1.042506981073534, "grad_norm": 1.8577905893325806, "learning_rate": 5.210983555693454e-06, "loss": 0.219, "num_input_tokens_seen": 2048432, "step": 3360 }, { "epoch": 1.0440583307477505, "grad_norm": 0.8224554657936096, "learning_rate": 5.2187403040645365e-06, "loss": 0.2333, "num_input_tokens_seen": 2050800, "step": 3365 }, { "epoch": 1.045609680421967, "grad_norm": 0.9210195541381836, "learning_rate": 5.2264970524356195e-06, "loss": 0.2327, "num_input_tokens_seen": 2053296, "step": 3370 }, { "epoch": 1.0471610300961838, "grad_norm": 1.7773957252502441, "learning_rate": 5.2342538008067025e-06, "loss": 0.2318, "num_input_tokens_seen": 2056016, "step": 3375 }, { "epoch": 1.0487123797704003, "grad_norm": 1.1675617694854736, "learning_rate": 5.2420105491777854e-06, "loss": 0.2338, "num_input_tokens_seen": 2059440, "step": 3380 }, { "epoch": 1.0502637294446169, "grad_norm": 1.548389196395874, "learning_rate": 5.2497672975488676e-06, "loss": 0.2385, "num_input_tokens_seen": 2061840, "step": 3385 }, { "epoch": 1.0518150791188334, "grad_norm": 1.8630876541137695, "learning_rate": 5.2575240459199506e-06, "loss": 0.232, "num_input_tokens_seen": 2064720, "step": 3390 }, { "epoch": 1.05336642879305, "grad_norm": 5.128227710723877, "learning_rate": 5.2652807942910335e-06, "loss": 0.2398, "num_input_tokens_seen": 2067280, "step": 3395 }, { "epoch": 1.0549177784672665, "grad_norm": 67.0801010131836, "learning_rate": 5.273037542662116e-06, "loss": 0.2654, "num_input_tokens_seen": 2070608, "step": 3400 }, { "epoch": 1.056469128141483, "grad_norm": 0.7496700882911682, "learning_rate": 5.280794291033199e-06, "loss": 0.2411, "num_input_tokens_seen": 2073136, "step": 3405 }, { "epoch": 1.0580204778156996, "grad_norm": 0.7862813472747803, "learning_rate": 5.288551039404282e-06, "loss": 0.2294, "num_input_tokens_seen": 2077680, "step": 3410 }, { "epoch": 1.0595718274899162, "grad_norm": 1.1298843622207642, "learning_rate": 5.2963077877753655e-06, "loss": 0.2314, "num_input_tokens_seen": 2080208, "step": 3415 }, { "epoch": 1.0611231771641327, "grad_norm": 1.6752259731292725, "learning_rate": 5.3040645361464485e-06, "loss": 0.2357, "num_input_tokens_seen": 2084112, "step": 3420 }, { "epoch": 1.0626745268383493, "grad_norm": 0.7353588342666626, "learning_rate": 5.311821284517531e-06, "loss": 0.2279, "num_input_tokens_seen": 2086864, "step": 3425 }, { "epoch": 1.0642258765125658, "grad_norm": 0.31087493896484375, "learning_rate": 5.319578032888614e-06, "loss": 0.2248, "num_input_tokens_seen": 2090480, "step": 3430 }, { "epoch": 1.0657772261867824, "grad_norm": 1.5895074605941772, "learning_rate": 5.3273347812596966e-06, "loss": 0.218, "num_input_tokens_seen": 2094224, "step": 3435 }, { "epoch": 1.0673285758609992, "grad_norm": 1.5930413007736206, "learning_rate": 5.3350915296307796e-06, "loss": 0.2185, "num_input_tokens_seen": 2098864, "step": 3440 }, { "epoch": 1.0688799255352157, "grad_norm": 0.632409393787384, "learning_rate": 5.342848278001862e-06, "loss": 0.2318, "num_input_tokens_seen": 2102128, "step": 3445 }, { "epoch": 1.0704312752094323, "grad_norm": 1.5319722890853882, "learning_rate": 5.350605026372945e-06, "loss": 0.2398, "num_input_tokens_seen": 2104912, "step": 3450 }, { "epoch": 1.0719826248836488, "grad_norm": 0.5207808613777161, "learning_rate": 5.358361774744028e-06, "loss": 0.2233, "num_input_tokens_seen": 2107856, "step": 3455 }, { "epoch": 1.0735339745578654, "grad_norm": 0.3532901108264923, "learning_rate": 5.366118523115111e-06, "loss": 0.2346, "num_input_tokens_seen": 2110576, "step": 3460 }, { "epoch": 1.075085324232082, "grad_norm": 0.8190866708755493, "learning_rate": 5.373875271486193e-06, "loss": 0.2247, "num_input_tokens_seen": 2113392, "step": 3465 }, { "epoch": 1.0766366739062985, "grad_norm": 1.3716390132904053, "learning_rate": 5.381632019857276e-06, "loss": 0.2371, "num_input_tokens_seen": 2115984, "step": 3470 }, { "epoch": 1.078188023580515, "grad_norm": 1.1631823778152466, "learning_rate": 5.38938876822836e-06, "loss": 0.2327, "num_input_tokens_seen": 2119632, "step": 3475 }, { "epoch": 1.0797393732547316, "grad_norm": 1.527818202972412, "learning_rate": 5.3971455165994426e-06, "loss": 0.2403, "num_input_tokens_seen": 2122960, "step": 3480 }, { "epoch": 1.0812907229289481, "grad_norm": 1.5898364782333374, "learning_rate": 5.4049022649705256e-06, "loss": 0.2336, "num_input_tokens_seen": 2126800, "step": 3485 }, { "epoch": 1.0828420726031647, "grad_norm": 1.6105022430419922, "learning_rate": 5.412659013341608e-06, "loss": 0.2256, "num_input_tokens_seen": 2129072, "step": 3490 }, { "epoch": 1.0843934222773812, "grad_norm": 1.9800541400909424, "learning_rate": 5.420415761712691e-06, "loss": 0.2216, "num_input_tokens_seen": 2132144, "step": 3495 }, { "epoch": 1.0859447719515978, "grad_norm": 1.7483640909194946, "learning_rate": 5.428172510083774e-06, "loss": 0.2759, "num_input_tokens_seen": 2134832, "step": 3500 }, { "epoch": 1.0874961216258145, "grad_norm": 5.778538227081299, "learning_rate": 5.435929258454856e-06, "loss": 0.2813, "num_input_tokens_seen": 2138064, "step": 3505 }, { "epoch": 1.089047471300031, "grad_norm": 0.6768835783004761, "learning_rate": 5.443686006825939e-06, "loss": 0.2366, "num_input_tokens_seen": 2140656, "step": 3510 }, { "epoch": 1.0905988209742477, "grad_norm": 0.5752744078636169, "learning_rate": 5.451442755197022e-06, "loss": 0.2362, "num_input_tokens_seen": 2143760, "step": 3515 }, { "epoch": 1.0921501706484642, "grad_norm": 0.7696873545646667, "learning_rate": 5.459199503568105e-06, "loss": 0.2274, "num_input_tokens_seen": 2146128, "step": 3520 }, { "epoch": 1.0937015203226808, "grad_norm": 0.6913201808929443, "learning_rate": 5.466956251939187e-06, "loss": 0.2326, "num_input_tokens_seen": 2148624, "step": 3525 }, { "epoch": 1.0952528699968973, "grad_norm": 0.6904485821723938, "learning_rate": 5.47471300031027e-06, "loss": 0.2335, "num_input_tokens_seen": 2151344, "step": 3530 }, { "epoch": 1.0968042196711139, "grad_norm": 0.2080530971288681, "learning_rate": 5.482469748681353e-06, "loss": 0.2326, "num_input_tokens_seen": 2154352, "step": 3535 }, { "epoch": 1.0983555693453304, "grad_norm": 0.24303539097309113, "learning_rate": 5.490226497052437e-06, "loss": 0.234, "num_input_tokens_seen": 2156784, "step": 3540 }, { "epoch": 1.099906919019547, "grad_norm": 0.3818199038505554, "learning_rate": 5.49798324542352e-06, "loss": 0.2308, "num_input_tokens_seen": 2159856, "step": 3545 }, { "epoch": 1.1014582686937635, "grad_norm": 0.6768509149551392, "learning_rate": 5.505739993794602e-06, "loss": 0.2318, "num_input_tokens_seen": 2163216, "step": 3550 }, { "epoch": 1.10300961836798, "grad_norm": 0.36440610885620117, "learning_rate": 5.513496742165685e-06, "loss": 0.2308, "num_input_tokens_seen": 2166832, "step": 3555 }, { "epoch": 1.1045609680421966, "grad_norm": 0.43169060349464417, "learning_rate": 5.521253490536768e-06, "loss": 0.2349, "num_input_tokens_seen": 2170544, "step": 3560 }, { "epoch": 1.1061123177164132, "grad_norm": 0.6839707493782043, "learning_rate": 5.529010238907851e-06, "loss": 0.2284, "num_input_tokens_seen": 2173520, "step": 3565 }, { "epoch": 1.10766366739063, "grad_norm": 0.6513141393661499, "learning_rate": 5.536766987278933e-06, "loss": 0.2321, "num_input_tokens_seen": 2177616, "step": 3570 }, { "epoch": 1.1092150170648465, "grad_norm": 0.580251932144165, "learning_rate": 5.544523735650016e-06, "loss": 0.2338, "num_input_tokens_seen": 2180400, "step": 3575 }, { "epoch": 1.110766366739063, "grad_norm": 0.4925972819328308, "learning_rate": 5.552280484021099e-06, "loss": 0.229, "num_input_tokens_seen": 2183152, "step": 3580 }, { "epoch": 1.1123177164132796, "grad_norm": 1.0998328924179077, "learning_rate": 5.560037232392181e-06, "loss": 0.2236, "num_input_tokens_seen": 2186160, "step": 3585 }, { "epoch": 1.1138690660874961, "grad_norm": 1.4680665731430054, "learning_rate": 5.567793980763264e-06, "loss": 0.2257, "num_input_tokens_seen": 2188784, "step": 3590 }, { "epoch": 1.1154204157617127, "grad_norm": 0.5087776780128479, "learning_rate": 5.575550729134347e-06, "loss": 0.2388, "num_input_tokens_seen": 2192048, "step": 3595 }, { "epoch": 1.1169717654359292, "grad_norm": 0.47183725237846375, "learning_rate": 5.58330747750543e-06, "loss": 0.2545, "num_input_tokens_seen": 2195664, "step": 3600 }, { "epoch": 1.1185231151101458, "grad_norm": 1.2773641347885132, "learning_rate": 5.591064225876514e-06, "loss": 0.237, "num_input_tokens_seen": 2198320, "step": 3605 }, { "epoch": 1.1200744647843623, "grad_norm": 0.3627447783946991, "learning_rate": 5.598820974247596e-06, "loss": 0.2285, "num_input_tokens_seen": 2202608, "step": 3610 }, { "epoch": 1.121625814458579, "grad_norm": 0.6638182401657104, "learning_rate": 5.606577722618679e-06, "loss": 0.2304, "num_input_tokens_seen": 2205808, "step": 3615 }, { "epoch": 1.1231771641327954, "grad_norm": 0.9063841700553894, "learning_rate": 5.614334470989762e-06, "loss": 0.2391, "num_input_tokens_seen": 2209008, "step": 3620 }, { "epoch": 1.124728513807012, "grad_norm": 0.5674965977668762, "learning_rate": 5.622091219360845e-06, "loss": 0.2298, "num_input_tokens_seen": 2211280, "step": 3625 }, { "epoch": 1.1262798634812285, "grad_norm": 0.6522002220153809, "learning_rate": 5.629847967731927e-06, "loss": 0.2234, "num_input_tokens_seen": 2214064, "step": 3630 }, { "epoch": 1.1278312131554453, "grad_norm": 0.5252724885940552, "learning_rate": 5.63760471610301e-06, "loss": 0.2363, "num_input_tokens_seen": 2216592, "step": 3635 }, { "epoch": 1.1293825628296619, "grad_norm": 0.3040519952774048, "learning_rate": 5.645361464474093e-06, "loss": 0.2388, "num_input_tokens_seen": 2218704, "step": 3640 }, { "epoch": 1.1309339125038784, "grad_norm": 0.6702378392219543, "learning_rate": 5.653118212845176e-06, "loss": 0.2329, "num_input_tokens_seen": 2222768, "step": 3645 }, { "epoch": 1.132485262178095, "grad_norm": 0.832801103591919, "learning_rate": 5.660874961216258e-06, "loss": 0.2341, "num_input_tokens_seen": 2227056, "step": 3650 }, { "epoch": 1.1340366118523115, "grad_norm": 0.95384681224823, "learning_rate": 5.668631709587341e-06, "loss": 0.2378, "num_input_tokens_seen": 2230160, "step": 3655 }, { "epoch": 1.135587961526528, "grad_norm": 0.5880937576293945, "learning_rate": 5.676388457958424e-06, "loss": 0.2222, "num_input_tokens_seen": 2233360, "step": 3660 }, { "epoch": 1.1371393112007446, "grad_norm": 1.8017263412475586, "learning_rate": 5.684145206329506e-06, "loss": 0.2336, "num_input_tokens_seen": 2236880, "step": 3665 }, { "epoch": 1.1386906608749612, "grad_norm": 1.0445033311843872, "learning_rate": 5.691901954700591e-06, "loss": 0.2165, "num_input_tokens_seen": 2240592, "step": 3670 }, { "epoch": 1.1402420105491777, "grad_norm": 0.6570665836334229, "learning_rate": 5.699658703071673e-06, "loss": 0.239, "num_input_tokens_seen": 2243120, "step": 3675 }, { "epoch": 1.1417933602233943, "grad_norm": 0.4253065288066864, "learning_rate": 5.707415451442756e-06, "loss": 0.2306, "num_input_tokens_seen": 2247952, "step": 3680 }, { "epoch": 1.1433447098976108, "grad_norm": 0.4530254602432251, "learning_rate": 5.715172199813839e-06, "loss": 0.226, "num_input_tokens_seen": 2249904, "step": 3685 }, { "epoch": 1.1448960595718276, "grad_norm": 1.094305396080017, "learning_rate": 5.722928948184921e-06, "loss": 0.2432, "num_input_tokens_seen": 2253200, "step": 3690 }, { "epoch": 1.146447409246044, "grad_norm": 0.7086771726608276, "learning_rate": 5.730685696556004e-06, "loss": 0.23, "num_input_tokens_seen": 2256016, "step": 3695 }, { "epoch": 1.1479987589202607, "grad_norm": 0.6234970092773438, "learning_rate": 5.738442444927087e-06, "loss": 0.2304, "num_input_tokens_seen": 2258352, "step": 3700 }, { "epoch": 1.1495501085944773, "grad_norm": 0.24746009707450867, "learning_rate": 5.74619919329817e-06, "loss": 0.2373, "num_input_tokens_seen": 2260912, "step": 3705 }, { "epoch": 1.1511014582686938, "grad_norm": 1.230715036392212, "learning_rate": 5.753955941669252e-06, "loss": 0.2331, "num_input_tokens_seen": 2263664, "step": 3710 }, { "epoch": 1.1526528079429104, "grad_norm": 0.5843259692192078, "learning_rate": 5.761712690040335e-06, "loss": 0.2316, "num_input_tokens_seen": 2266320, "step": 3715 }, { "epoch": 1.154204157617127, "grad_norm": 0.4534372091293335, "learning_rate": 5.769469438411418e-06, "loss": 0.2268, "num_input_tokens_seen": 2269008, "step": 3720 }, { "epoch": 1.1557555072913435, "grad_norm": 0.5786911845207214, "learning_rate": 5.777226186782501e-06, "loss": 0.2418, "num_input_tokens_seen": 2272080, "step": 3725 }, { "epoch": 1.15730685696556, "grad_norm": 0.6195271015167236, "learning_rate": 5.784982935153583e-06, "loss": 0.2328, "num_input_tokens_seen": 2275312, "step": 3730 }, { "epoch": 1.1588582066397766, "grad_norm": 0.7049320936203003, "learning_rate": 5.792739683524667e-06, "loss": 0.2274, "num_input_tokens_seen": 2279184, "step": 3735 }, { "epoch": 1.1604095563139931, "grad_norm": 1.0159977674484253, "learning_rate": 5.80049643189575e-06, "loss": 0.2317, "num_input_tokens_seen": 2283408, "step": 3740 }, { "epoch": 1.1619609059882097, "grad_norm": 0.5420042872428894, "learning_rate": 5.808253180266833e-06, "loss": 0.2343, "num_input_tokens_seen": 2286672, "step": 3745 }, { "epoch": 1.1635122556624262, "grad_norm": 0.23388029634952545, "learning_rate": 5.816009928637916e-06, "loss": 0.2356, "num_input_tokens_seen": 2290576, "step": 3750 }, { "epoch": 1.165063605336643, "grad_norm": 0.47550228238105774, "learning_rate": 5.823766677008998e-06, "loss": 0.2357, "num_input_tokens_seen": 2293264, "step": 3755 }, { "epoch": 1.1666149550108595, "grad_norm": 1.2516272068023682, "learning_rate": 5.831523425380081e-06, "loss": 0.232, "num_input_tokens_seen": 2298992, "step": 3760 }, { "epoch": 1.168166304685076, "grad_norm": 0.6107848882675171, "learning_rate": 5.839280173751164e-06, "loss": 0.2359, "num_input_tokens_seen": 2302384, "step": 3765 }, { "epoch": 1.1697176543592926, "grad_norm": 0.637311577796936, "learning_rate": 5.847036922122246e-06, "loss": 0.2346, "num_input_tokens_seen": 2305424, "step": 3770 }, { "epoch": 1.1712690040335092, "grad_norm": 0.5769680738449097, "learning_rate": 5.854793670493329e-06, "loss": 0.2305, "num_input_tokens_seen": 2307952, "step": 3775 }, { "epoch": 1.1728203537077257, "grad_norm": 1.3534064292907715, "learning_rate": 5.862550418864412e-06, "loss": 0.2274, "num_input_tokens_seen": 2310928, "step": 3780 }, { "epoch": 1.1743717033819423, "grad_norm": 1.7395708560943604, "learning_rate": 5.870307167235495e-06, "loss": 0.2443, "num_input_tokens_seen": 2312912, "step": 3785 }, { "epoch": 1.1759230530561589, "grad_norm": 1.1408040523529053, "learning_rate": 5.878063915606577e-06, "loss": 0.2338, "num_input_tokens_seen": 2315824, "step": 3790 }, { "epoch": 1.1774744027303754, "grad_norm": 0.509151041507721, "learning_rate": 5.88582066397766e-06, "loss": 0.2394, "num_input_tokens_seen": 2318096, "step": 3795 }, { "epoch": 1.179025752404592, "grad_norm": 0.4794757068157196, "learning_rate": 5.893577412348744e-06, "loss": 0.2367, "num_input_tokens_seen": 2321936, "step": 3800 }, { "epoch": 1.1805771020788085, "grad_norm": 0.6467407941818237, "learning_rate": 5.901334160719827e-06, "loss": 0.2269, "num_input_tokens_seen": 2325360, "step": 3805 }, { "epoch": 1.182128451753025, "grad_norm": 0.2607957720756531, "learning_rate": 5.90909090909091e-06, "loss": 0.2258, "num_input_tokens_seen": 2327664, "step": 3810 }, { "epoch": 1.1836798014272416, "grad_norm": 0.976070761680603, "learning_rate": 5.916847657461992e-06, "loss": 0.2258, "num_input_tokens_seen": 2330896, "step": 3815 }, { "epoch": 1.1852311511014584, "grad_norm": 0.8779816627502441, "learning_rate": 5.924604405833075e-06, "loss": 0.238, "num_input_tokens_seen": 2334192, "step": 3820 }, { "epoch": 1.186782500775675, "grad_norm": 0.9388937950134277, "learning_rate": 5.932361154204158e-06, "loss": 0.2333, "num_input_tokens_seen": 2336560, "step": 3825 }, { "epoch": 1.1883338504498915, "grad_norm": 0.5648269653320312, "learning_rate": 5.940117902575241e-06, "loss": 0.2424, "num_input_tokens_seen": 2338768, "step": 3830 }, { "epoch": 1.189885200124108, "grad_norm": 0.6624540686607361, "learning_rate": 5.947874650946323e-06, "loss": 0.2328, "num_input_tokens_seen": 2341872, "step": 3835 }, { "epoch": 1.1914365497983246, "grad_norm": 0.3060755133628845, "learning_rate": 5.955631399317406e-06, "loss": 0.2316, "num_input_tokens_seen": 2345168, "step": 3840 }, { "epoch": 1.1929878994725411, "grad_norm": 0.24758280813694, "learning_rate": 5.963388147688489e-06, "loss": 0.2271, "num_input_tokens_seen": 2347600, "step": 3845 }, { "epoch": 1.1945392491467577, "grad_norm": 0.40205082297325134, "learning_rate": 5.9711448960595715e-06, "loss": 0.2196, "num_input_tokens_seen": 2350448, "step": 3850 }, { "epoch": 1.1960905988209742, "grad_norm": 0.26560643315315247, "learning_rate": 5.9789016444306545e-06, "loss": 0.2301, "num_input_tokens_seen": 2353072, "step": 3855 }, { "epoch": 1.1976419484951908, "grad_norm": 0.3998272120952606, "learning_rate": 5.986658392801738e-06, "loss": 0.2116, "num_input_tokens_seen": 2356176, "step": 3860 }, { "epoch": 1.1991932981694073, "grad_norm": 0.381879597902298, "learning_rate": 5.994415141172821e-06, "loss": 0.2399, "num_input_tokens_seen": 2358960, "step": 3865 }, { "epoch": 1.200744647843624, "grad_norm": 0.370696485042572, "learning_rate": 6.002171889543904e-06, "loss": 0.2393, "num_input_tokens_seen": 2362256, "step": 3870 }, { "epoch": 1.2022959975178404, "grad_norm": 0.42039528489112854, "learning_rate": 6.0099286379149864e-06, "loss": 0.2487, "num_input_tokens_seen": 2364720, "step": 3875 }, { "epoch": 1.203847347192057, "grad_norm": 0.4270108938217163, "learning_rate": 6.017685386286069e-06, "loss": 0.2256, "num_input_tokens_seen": 2367376, "step": 3880 }, { "epoch": 1.2053986968662738, "grad_norm": 1.0096319913864136, "learning_rate": 6.025442134657152e-06, "loss": 0.2267, "num_input_tokens_seen": 2369648, "step": 3885 }, { "epoch": 1.2069500465404903, "grad_norm": 0.21505463123321533, "learning_rate": 6.033198883028235e-06, "loss": 0.2243, "num_input_tokens_seen": 2373200, "step": 3890 }, { "epoch": 1.2085013962147069, "grad_norm": 0.2633768916130066, "learning_rate": 6.0409556313993175e-06, "loss": 0.2368, "num_input_tokens_seen": 2376400, "step": 3895 }, { "epoch": 1.2100527458889234, "grad_norm": 0.4339803457260132, "learning_rate": 6.0487123797704005e-06, "loss": 0.2426, "num_input_tokens_seen": 2379216, "step": 3900 }, { "epoch": 1.21160409556314, "grad_norm": 0.4363592565059662, "learning_rate": 6.0564691281414835e-06, "loss": 0.2282, "num_input_tokens_seen": 2382448, "step": 3905 }, { "epoch": 1.2131554452373565, "grad_norm": 0.42386236786842346, "learning_rate": 6.0642258765125665e-06, "loss": 0.2281, "num_input_tokens_seen": 2384976, "step": 3910 }, { "epoch": 1.214706794911573, "grad_norm": 0.23005974292755127, "learning_rate": 6.071982624883649e-06, "loss": 0.2274, "num_input_tokens_seen": 2388240, "step": 3915 }, { "epoch": 1.2162581445857896, "grad_norm": 0.41063717007637024, "learning_rate": 6.079739373254732e-06, "loss": 0.2235, "num_input_tokens_seen": 2390384, "step": 3920 }, { "epoch": 1.2178094942600062, "grad_norm": 0.3815286159515381, "learning_rate": 6.0874961216258154e-06, "loss": 0.2218, "num_input_tokens_seen": 2394352, "step": 3925 }, { "epoch": 1.2193608439342227, "grad_norm": 0.5150070190429688, "learning_rate": 6.095252869996898e-06, "loss": 0.2449, "num_input_tokens_seen": 2397040, "step": 3930 }, { "epoch": 1.2209121936084393, "grad_norm": 0.5930545926094055, "learning_rate": 6.103009618367981e-06, "loss": 0.2365, "num_input_tokens_seen": 2400080, "step": 3935 }, { "epoch": 1.2224635432826558, "grad_norm": 0.25459209084510803, "learning_rate": 6.1107663667390635e-06, "loss": 0.2339, "num_input_tokens_seen": 2402384, "step": 3940 }, { "epoch": 1.2240148929568724, "grad_norm": 0.6244063973426819, "learning_rate": 6.1185231151101465e-06, "loss": 0.2318, "num_input_tokens_seen": 2405712, "step": 3945 }, { "epoch": 1.2255662426310892, "grad_norm": 0.5302448272705078, "learning_rate": 6.1262798634812295e-06, "loss": 0.2304, "num_input_tokens_seen": 2409520, "step": 3950 }, { "epoch": 1.2271175923053057, "grad_norm": 0.274649441242218, "learning_rate": 6.134036611852312e-06, "loss": 0.2344, "num_input_tokens_seen": 2412944, "step": 3955 }, { "epoch": 1.2286689419795223, "grad_norm": 0.4647250175476074, "learning_rate": 6.141793360223395e-06, "loss": 0.2314, "num_input_tokens_seen": 2416240, "step": 3960 }, { "epoch": 1.2302202916537388, "grad_norm": 0.3146929442882538, "learning_rate": 6.149550108594478e-06, "loss": 0.2316, "num_input_tokens_seen": 2419824, "step": 3965 }, { "epoch": 1.2317716413279554, "grad_norm": 0.6808316111564636, "learning_rate": 6.157306856965561e-06, "loss": 0.2316, "num_input_tokens_seen": 2423408, "step": 3970 }, { "epoch": 1.233322991002172, "grad_norm": 1.0466681718826294, "learning_rate": 6.165063605336643e-06, "loss": 0.2341, "num_input_tokens_seen": 2426832, "step": 3975 }, { "epoch": 1.2348743406763885, "grad_norm": 0.1862410604953766, "learning_rate": 6.172820353707726e-06, "loss": 0.2321, "num_input_tokens_seen": 2430160, "step": 3980 }, { "epoch": 1.236425690350605, "grad_norm": 0.4188975393772125, "learning_rate": 6.180577102078809e-06, "loss": 0.2324, "num_input_tokens_seen": 2432400, "step": 3985 }, { "epoch": 1.2379770400248216, "grad_norm": 0.21231918036937714, "learning_rate": 6.1883338504498925e-06, "loss": 0.2188, "num_input_tokens_seen": 2434928, "step": 3990 }, { "epoch": 1.2395283896990381, "grad_norm": 0.37942612171173096, "learning_rate": 6.1960905988209755e-06, "loss": 0.2358, "num_input_tokens_seen": 2438160, "step": 3995 }, { "epoch": 1.2410797393732547, "grad_norm": 0.6991674304008484, "learning_rate": 6.203847347192058e-06, "loss": 0.2447, "num_input_tokens_seen": 2441648, "step": 4000 }, { "epoch": 1.2426310890474712, "grad_norm": 0.30517563223838806, "learning_rate": 6.211604095563141e-06, "loss": 0.2289, "num_input_tokens_seen": 2443984, "step": 4005 }, { "epoch": 1.2441824387216878, "grad_norm": 0.2656599283218384, "learning_rate": 6.219360843934224e-06, "loss": 0.2388, "num_input_tokens_seen": 2446384, "step": 4010 }, { "epoch": 1.2457337883959045, "grad_norm": 0.9696353077888489, "learning_rate": 6.227117592305307e-06, "loss": 0.2312, "num_input_tokens_seen": 2450032, "step": 4015 }, { "epoch": 1.247285138070121, "grad_norm": 0.3862801790237427, "learning_rate": 6.234874340676389e-06, "loss": 0.2304, "num_input_tokens_seen": 2452784, "step": 4020 }, { "epoch": 1.2488364877443376, "grad_norm": 0.4304838478565216, "learning_rate": 6.242631089047472e-06, "loss": 0.2426, "num_input_tokens_seen": 2454960, "step": 4025 }, { "epoch": 1.2503878374185542, "grad_norm": 0.9514424800872803, "learning_rate": 6.250387837418555e-06, "loss": 0.2342, "num_input_tokens_seen": 2457616, "step": 4030 }, { "epoch": 1.2519391870927707, "grad_norm": 0.9988929033279419, "learning_rate": 6.258144585789637e-06, "loss": 0.2317, "num_input_tokens_seen": 2461904, "step": 4035 }, { "epoch": 1.2534905367669873, "grad_norm": 1.4081430435180664, "learning_rate": 6.26590133416072e-06, "loss": 0.2495, "num_input_tokens_seen": 2465456, "step": 4040 }, { "epoch": 1.2550418864412038, "grad_norm": 1.5231823921203613, "learning_rate": 6.273658082531803e-06, "loss": 0.2192, "num_input_tokens_seen": 2470960, "step": 4045 }, { "epoch": 1.2565932361154204, "grad_norm": 0.632081925868988, "learning_rate": 6.281414830902886e-06, "loss": 0.2186, "num_input_tokens_seen": 2474160, "step": 4050 }, { "epoch": 1.258144585789637, "grad_norm": 1.428410291671753, "learning_rate": 6.28917157927397e-06, "loss": 0.2359, "num_input_tokens_seen": 2476688, "step": 4055 }, { "epoch": 1.2596959354638535, "grad_norm": 0.7303586602210999, "learning_rate": 6.296928327645052e-06, "loss": 0.2668, "num_input_tokens_seen": 2479376, "step": 4060 }, { "epoch": 1.26124728513807, "grad_norm": 1.385527491569519, "learning_rate": 6.304685076016135e-06, "loss": 0.2459, "num_input_tokens_seen": 2482128, "step": 4065 }, { "epoch": 1.2627986348122868, "grad_norm": 1.1090320348739624, "learning_rate": 6.312441824387218e-06, "loss": 0.2189, "num_input_tokens_seen": 2484656, "step": 4070 }, { "epoch": 1.2643499844865032, "grad_norm": 0.7143604755401611, "learning_rate": 6.320198572758301e-06, "loss": 0.2352, "num_input_tokens_seen": 2487376, "step": 4075 }, { "epoch": 1.26590133416072, "grad_norm": 2.5045690536499023, "learning_rate": 6.327955321129383e-06, "loss": 0.2387, "num_input_tokens_seen": 2489776, "step": 4080 }, { "epoch": 1.2674526838349365, "grad_norm": 4.538675785064697, "learning_rate": 6.335712069500466e-06, "loss": 0.2521, "num_input_tokens_seen": 2493488, "step": 4085 }, { "epoch": 1.269004033509153, "grad_norm": 1.5925955772399902, "learning_rate": 6.343468817871549e-06, "loss": 0.2136, "num_input_tokens_seen": 2496304, "step": 4090 }, { "epoch": 1.2705553831833696, "grad_norm": 2.241665840148926, "learning_rate": 6.351225566242632e-06, "loss": 0.2611, "num_input_tokens_seen": 2499632, "step": 4095 }, { "epoch": 1.2721067328575861, "grad_norm": 1.27857506275177, "learning_rate": 6.358982314613714e-06, "loss": 0.2354, "num_input_tokens_seen": 2503024, "step": 4100 }, { "epoch": 1.2736580825318027, "grad_norm": 1.5148100852966309, "learning_rate": 6.366739062984797e-06, "loss": 0.2219, "num_input_tokens_seen": 2505456, "step": 4105 }, { "epoch": 1.2752094322060192, "grad_norm": 1.4174981117248535, "learning_rate": 6.37449581135588e-06, "loss": 0.2514, "num_input_tokens_seen": 2508912, "step": 4110 }, { "epoch": 1.2767607818802358, "grad_norm": 2.0860886573791504, "learning_rate": 6.382252559726962e-06, "loss": 0.2335, "num_input_tokens_seen": 2511824, "step": 4115 }, { "epoch": 1.2783121315544523, "grad_norm": 2.575197458267212, "learning_rate": 6.390009308098047e-06, "loss": 0.2364, "num_input_tokens_seen": 2515024, "step": 4120 }, { "epoch": 1.2798634812286689, "grad_norm": 0.432499498128891, "learning_rate": 6.397766056469129e-06, "loss": 0.2234, "num_input_tokens_seen": 2517712, "step": 4125 }, { "epoch": 1.2814148309028854, "grad_norm": 2.775036334991455, "learning_rate": 6.405522804840212e-06, "loss": 0.2318, "num_input_tokens_seen": 2520400, "step": 4130 }, { "epoch": 1.2829661805771022, "grad_norm": 0.9488135576248169, "learning_rate": 6.413279553211295e-06, "loss": 0.263, "num_input_tokens_seen": 2523760, "step": 4135 }, { "epoch": 1.2845175302513185, "grad_norm": 0.28225913643836975, "learning_rate": 6.421036301582377e-06, "loss": 0.2255, "num_input_tokens_seen": 2527344, "step": 4140 }, { "epoch": 1.2860688799255353, "grad_norm": 0.5175378322601318, "learning_rate": 6.42879304995346e-06, "loss": 0.2443, "num_input_tokens_seen": 2530320, "step": 4145 }, { "epoch": 1.2876202295997519, "grad_norm": 0.5604712963104248, "learning_rate": 6.436549798324543e-06, "loss": 0.2313, "num_input_tokens_seen": 2532752, "step": 4150 }, { "epoch": 1.2891715792739684, "grad_norm": 1.0726629495620728, "learning_rate": 6.444306546695626e-06, "loss": 0.2325, "num_input_tokens_seen": 2535056, "step": 4155 }, { "epoch": 1.290722928948185, "grad_norm": 0.6438426971435547, "learning_rate": 6.452063295066708e-06, "loss": 0.2303, "num_input_tokens_seen": 2538800, "step": 4160 }, { "epoch": 1.2922742786224015, "grad_norm": 0.6247302889823914, "learning_rate": 6.459820043437791e-06, "loss": 0.2357, "num_input_tokens_seen": 2541872, "step": 4165 }, { "epoch": 1.293825628296618, "grad_norm": 0.3397570550441742, "learning_rate": 6.467576791808874e-06, "loss": 0.2316, "num_input_tokens_seen": 2544912, "step": 4170 }, { "epoch": 1.2953769779708346, "grad_norm": 0.5174832344055176, "learning_rate": 6.475333540179957e-06, "loss": 0.2345, "num_input_tokens_seen": 2547376, "step": 4175 }, { "epoch": 1.2969283276450512, "grad_norm": 0.9004796147346497, "learning_rate": 6.483090288551039e-06, "loss": 0.2284, "num_input_tokens_seen": 2550704, "step": 4180 }, { "epoch": 1.2984796773192677, "grad_norm": 1.124234676361084, "learning_rate": 6.490847036922123e-06, "loss": 0.2423, "num_input_tokens_seen": 2553392, "step": 4185 }, { "epoch": 1.3000310269934843, "grad_norm": 0.2248859703540802, "learning_rate": 6.498603785293206e-06, "loss": 0.2311, "num_input_tokens_seen": 2555888, "step": 4190 }, { "epoch": 1.3015823766677008, "grad_norm": 0.4768393635749817, "learning_rate": 6.506360533664289e-06, "loss": 0.2321, "num_input_tokens_seen": 2559440, "step": 4195 }, { "epoch": 1.3031337263419176, "grad_norm": 0.4341961443424225, "learning_rate": 6.514117282035372e-06, "loss": 0.2284, "num_input_tokens_seen": 2561840, "step": 4200 }, { "epoch": 1.304685076016134, "grad_norm": 0.5829176306724548, "learning_rate": 6.521874030406454e-06, "loss": 0.233, "num_input_tokens_seen": 2564976, "step": 4205 }, { "epoch": 1.3062364256903507, "grad_norm": 0.13607384264469147, "learning_rate": 6.529630778777537e-06, "loss": 0.23, "num_input_tokens_seen": 2568304, "step": 4210 }, { "epoch": 1.3077877753645673, "grad_norm": 0.20092058181762695, "learning_rate": 6.53738752714862e-06, "loss": 0.2393, "num_input_tokens_seen": 2571184, "step": 4215 }, { "epoch": 1.3093391250387838, "grad_norm": 0.41828587651252747, "learning_rate": 6.545144275519702e-06, "loss": 0.2346, "num_input_tokens_seen": 2573680, "step": 4220 }, { "epoch": 1.3108904747130004, "grad_norm": 0.19567590951919556, "learning_rate": 6.552901023890785e-06, "loss": 0.2346, "num_input_tokens_seen": 2576656, "step": 4225 }, { "epoch": 1.312441824387217, "grad_norm": 0.781905472278595, "learning_rate": 6.560657772261868e-06, "loss": 0.2346, "num_input_tokens_seen": 2578960, "step": 4230 }, { "epoch": 1.3139931740614335, "grad_norm": 0.834246039390564, "learning_rate": 6.568414520632951e-06, "loss": 0.2294, "num_input_tokens_seen": 2582288, "step": 4235 }, { "epoch": 1.31554452373565, "grad_norm": 0.17479842901229858, "learning_rate": 6.576171269004033e-06, "loss": 0.2327, "num_input_tokens_seen": 2584976, "step": 4240 }, { "epoch": 1.3170958734098666, "grad_norm": 0.197904571890831, "learning_rate": 6.583928017375117e-06, "loss": 0.2294, "num_input_tokens_seen": 2587760, "step": 4245 }, { "epoch": 1.318647223084083, "grad_norm": 0.3773338794708252, "learning_rate": 6.5916847657462e-06, "loss": 0.2307, "num_input_tokens_seen": 2590768, "step": 4250 }, { "epoch": 1.3201985727582997, "grad_norm": 0.18415164947509766, "learning_rate": 6.599441514117283e-06, "loss": 0.2302, "num_input_tokens_seen": 2594160, "step": 4255 }, { "epoch": 1.3217499224325162, "grad_norm": 0.6758473515510559, "learning_rate": 6.607198262488366e-06, "loss": 0.223, "num_input_tokens_seen": 2597264, "step": 4260 }, { "epoch": 1.323301272106733, "grad_norm": 0.33676278591156006, "learning_rate": 6.614955010859448e-06, "loss": 0.2429, "num_input_tokens_seen": 2602256, "step": 4265 }, { "epoch": 1.3248526217809493, "grad_norm": 0.3672167956829071, "learning_rate": 6.622711759230531e-06, "loss": 0.223, "num_input_tokens_seen": 2606384, "step": 4270 }, { "epoch": 1.326403971455166, "grad_norm": 0.1599324494600296, "learning_rate": 6.630468507601614e-06, "loss": 0.2363, "num_input_tokens_seen": 2608944, "step": 4275 }, { "epoch": 1.3279553211293826, "grad_norm": 0.21189025044441223, "learning_rate": 6.638225255972697e-06, "loss": 0.2437, "num_input_tokens_seen": 2611696, "step": 4280 }, { "epoch": 1.3295066708035992, "grad_norm": 0.2723950147628784, "learning_rate": 6.645982004343779e-06, "loss": 0.2335, "num_input_tokens_seen": 2614768, "step": 4285 }, { "epoch": 1.3310580204778157, "grad_norm": 0.38127195835113525, "learning_rate": 6.653738752714862e-06, "loss": 0.2237, "num_input_tokens_seen": 2617232, "step": 4290 }, { "epoch": 1.3326093701520323, "grad_norm": 0.5126985311508179, "learning_rate": 6.661495501085945e-06, "loss": 0.2328, "num_input_tokens_seen": 2620464, "step": 4295 }, { "epoch": 1.3341607198262488, "grad_norm": 0.15100950002670288, "learning_rate": 6.669252249457027e-06, "loss": 0.2317, "num_input_tokens_seen": 2623920, "step": 4300 }, { "epoch": 1.3357120695004654, "grad_norm": 0.19405634701251984, "learning_rate": 6.67700899782811e-06, "loss": 0.2264, "num_input_tokens_seen": 2626480, "step": 4305 }, { "epoch": 1.337263419174682, "grad_norm": 0.5331464409828186, "learning_rate": 6.684765746199194e-06, "loss": 0.2335, "num_input_tokens_seen": 2629264, "step": 4310 }, { "epoch": 1.3388147688488985, "grad_norm": 0.7006241083145142, "learning_rate": 6.692522494570277e-06, "loss": 0.2266, "num_input_tokens_seen": 2632272, "step": 4315 }, { "epoch": 1.340366118523115, "grad_norm": 0.31027379631996155, "learning_rate": 6.70027924294136e-06, "loss": 0.2348, "num_input_tokens_seen": 2634704, "step": 4320 }, { "epoch": 1.3419174681973316, "grad_norm": 0.3155101537704468, "learning_rate": 6.708035991312442e-06, "loss": 0.2276, "num_input_tokens_seen": 2637744, "step": 4325 }, { "epoch": 1.3434688178715484, "grad_norm": 1.506738543510437, "learning_rate": 6.715792739683525e-06, "loss": 0.2319, "num_input_tokens_seen": 2640528, "step": 4330 }, { "epoch": 1.3450201675457647, "grad_norm": 0.4613924026489258, "learning_rate": 6.723549488054608e-06, "loss": 0.2345, "num_input_tokens_seen": 2643376, "step": 4335 }, { "epoch": 1.3465715172199815, "grad_norm": 0.45486411452293396, "learning_rate": 6.731306236425691e-06, "loss": 0.2251, "num_input_tokens_seen": 2645936, "step": 4340 }, { "epoch": 1.348122866894198, "grad_norm": 0.7573488354682922, "learning_rate": 6.739062984796773e-06, "loss": 0.2473, "num_input_tokens_seen": 2649520, "step": 4345 }, { "epoch": 1.3496742165684146, "grad_norm": 1.0886660814285278, "learning_rate": 6.746819733167856e-06, "loss": 0.2098, "num_input_tokens_seen": 2652624, "step": 4350 }, { "epoch": 1.3512255662426311, "grad_norm": 0.43359649181365967, "learning_rate": 6.754576481538939e-06, "loss": 0.2245, "num_input_tokens_seen": 2655504, "step": 4355 }, { "epoch": 1.3527769159168477, "grad_norm": 0.4700445532798767, "learning_rate": 6.762333229910022e-06, "loss": 0.2247, "num_input_tokens_seen": 2658192, "step": 4360 }, { "epoch": 1.3543282655910642, "grad_norm": 2.1604197025299072, "learning_rate": 6.7700899782811045e-06, "loss": 0.2753, "num_input_tokens_seen": 2661104, "step": 4365 }, { "epoch": 1.3558796152652808, "grad_norm": 1.148078203201294, "learning_rate": 6.7778467266521874e-06, "loss": 0.2403, "num_input_tokens_seen": 2663952, "step": 4370 }, { "epoch": 1.3574309649394973, "grad_norm": 3.1284782886505127, "learning_rate": 6.785603475023271e-06, "loss": 0.2397, "num_input_tokens_seen": 2669168, "step": 4375 }, { "epoch": 1.3589823146137139, "grad_norm": 0.20151762664318085, "learning_rate": 6.793360223394354e-06, "loss": 0.2268, "num_input_tokens_seen": 2672464, "step": 4380 }, { "epoch": 1.3605336642879304, "grad_norm": 0.7326391339302063, "learning_rate": 6.801116971765437e-06, "loss": 0.2273, "num_input_tokens_seen": 2674864, "step": 4385 }, { "epoch": 1.362085013962147, "grad_norm": 0.6030550003051758, "learning_rate": 6.808873720136519e-06, "loss": 0.2363, "num_input_tokens_seen": 2678448, "step": 4390 }, { "epoch": 1.3636363636363638, "grad_norm": 0.24326904118061066, "learning_rate": 6.816630468507602e-06, "loss": 0.2319, "num_input_tokens_seen": 2682640, "step": 4395 }, { "epoch": 1.36518771331058, "grad_norm": 0.1594698131084442, "learning_rate": 6.824387216878685e-06, "loss": 0.2243, "num_input_tokens_seen": 2686992, "step": 4400 }, { "epoch": 1.3667390629847969, "grad_norm": 0.45843738317489624, "learning_rate": 6.8321439652497675e-06, "loss": 0.2341, "num_input_tokens_seen": 2689616, "step": 4405 }, { "epoch": 1.3682904126590134, "grad_norm": 0.42849037051200867, "learning_rate": 6.8399007136208505e-06, "loss": 0.2351, "num_input_tokens_seen": 2692560, "step": 4410 }, { "epoch": 1.36984176233323, "grad_norm": 1.0264612436294556, "learning_rate": 6.8476574619919334e-06, "loss": 0.2269, "num_input_tokens_seen": 2695440, "step": 4415 }, { "epoch": 1.3713931120074465, "grad_norm": 0.41323593258857727, "learning_rate": 6.8554142103630164e-06, "loss": 0.2302, "num_input_tokens_seen": 2698064, "step": 4420 }, { "epoch": 1.372944461681663, "grad_norm": 0.2983914613723755, "learning_rate": 6.8631709587340986e-06, "loss": 0.2363, "num_input_tokens_seen": 2702320, "step": 4425 }, { "epoch": 1.3744958113558796, "grad_norm": 0.5320796370506287, "learning_rate": 6.8709277071051816e-06, "loss": 0.2288, "num_input_tokens_seen": 2706096, "step": 4430 }, { "epoch": 1.3760471610300962, "grad_norm": 0.37159448862075806, "learning_rate": 6.8786844554762645e-06, "loss": 0.2239, "num_input_tokens_seen": 2709840, "step": 4435 }, { "epoch": 1.3775985107043127, "grad_norm": 0.9287410974502563, "learning_rate": 6.886441203847348e-06, "loss": 0.2436, "num_input_tokens_seen": 2714256, "step": 4440 }, { "epoch": 1.3791498603785293, "grad_norm": 0.21264639496803284, "learning_rate": 6.894197952218431e-06, "loss": 0.2386, "num_input_tokens_seen": 2717136, "step": 4445 }, { "epoch": 1.3807012100527458, "grad_norm": 0.15728998184204102, "learning_rate": 6.9019547005895135e-06, "loss": 0.237, "num_input_tokens_seen": 2719920, "step": 4450 }, { "epoch": 1.3822525597269624, "grad_norm": 0.40200677514076233, "learning_rate": 6.9097114489605965e-06, "loss": 0.231, "num_input_tokens_seen": 2723408, "step": 4455 }, { "epoch": 1.3838039094011791, "grad_norm": 0.41130852699279785, "learning_rate": 6.9174681973316795e-06, "loss": 0.2352, "num_input_tokens_seen": 2725584, "step": 4460 }, { "epoch": 1.3853552590753955, "grad_norm": 0.706247091293335, "learning_rate": 6.9252249457027624e-06, "loss": 0.2358, "num_input_tokens_seen": 2728144, "step": 4465 }, { "epoch": 1.3869066087496122, "grad_norm": 0.2987375855445862, "learning_rate": 6.9329816940738446e-06, "loss": 0.2346, "num_input_tokens_seen": 2731472, "step": 4470 }, { "epoch": 1.3884579584238288, "grad_norm": 0.5413241982460022, "learning_rate": 6.9407384424449276e-06, "loss": 0.2379, "num_input_tokens_seen": 2735664, "step": 4475 }, { "epoch": 1.3900093080980453, "grad_norm": 0.7749696373939514, "learning_rate": 6.9484951908160105e-06, "loss": 0.2293, "num_input_tokens_seen": 2738064, "step": 4480 }, { "epoch": 1.391560657772262, "grad_norm": 0.13151559233665466, "learning_rate": 6.956251939187093e-06, "loss": 0.2304, "num_input_tokens_seen": 2742096, "step": 4485 }, { "epoch": 1.3931120074464785, "grad_norm": 0.13803614675998688, "learning_rate": 6.964008687558176e-06, "loss": 0.2325, "num_input_tokens_seen": 2744560, "step": 4490 }, { "epoch": 1.394663357120695, "grad_norm": 0.12905186414718628, "learning_rate": 6.971765435929259e-06, "loss": 0.2337, "num_input_tokens_seen": 2747568, "step": 4495 }, { "epoch": 1.3962147067949116, "grad_norm": 0.3995530605316162, "learning_rate": 6.979522184300342e-06, "loss": 0.2295, "num_input_tokens_seen": 2750256, "step": 4500 }, { "epoch": 1.397766056469128, "grad_norm": 0.3869001567363739, "learning_rate": 6.9872789326714255e-06, "loss": 0.2298, "num_input_tokens_seen": 2753328, "step": 4505 }, { "epoch": 1.3993174061433447, "grad_norm": 0.16786432266235352, "learning_rate": 6.995035681042508e-06, "loss": 0.2288, "num_input_tokens_seen": 2757072, "step": 4510 }, { "epoch": 1.4008687558175612, "grad_norm": 0.6158885955810547, "learning_rate": 7.002792429413591e-06, "loss": 0.2354, "num_input_tokens_seen": 2760784, "step": 4515 }, { "epoch": 1.4024201054917778, "grad_norm": 0.42925024032592773, "learning_rate": 7.0105491777846736e-06, "loss": 0.2363, "num_input_tokens_seen": 2763472, "step": 4520 }, { "epoch": 1.4039714551659945, "grad_norm": 0.551906168460846, "learning_rate": 7.0183059261557565e-06, "loss": 0.2264, "num_input_tokens_seen": 2766448, "step": 4525 }, { "epoch": 1.4055228048402109, "grad_norm": 0.23822228610515594, "learning_rate": 7.026062674526839e-06, "loss": 0.23, "num_input_tokens_seen": 2769168, "step": 4530 }, { "epoch": 1.4070741545144276, "grad_norm": 0.4488354027271271, "learning_rate": 7.033819422897922e-06, "loss": 0.2112, "num_input_tokens_seen": 2771952, "step": 4535 }, { "epoch": 1.4086255041886442, "grad_norm": 0.7270663380622864, "learning_rate": 7.041576171269005e-06, "loss": 0.2381, "num_input_tokens_seen": 2774320, "step": 4540 }, { "epoch": 1.4101768538628607, "grad_norm": 0.5195521116256714, "learning_rate": 7.049332919640088e-06, "loss": 0.2182, "num_input_tokens_seen": 2777264, "step": 4545 }, { "epoch": 1.4117282035370773, "grad_norm": 1.5072447061538696, "learning_rate": 7.05708966801117e-06, "loss": 0.2355, "num_input_tokens_seen": 2780208, "step": 4550 }, { "epoch": 1.4132795532112938, "grad_norm": 0.3248875141143799, "learning_rate": 7.064846416382253e-06, "loss": 0.2578, "num_input_tokens_seen": 2782704, "step": 4555 }, { "epoch": 1.4148309028855104, "grad_norm": 0.16817766427993774, "learning_rate": 7.072603164753336e-06, "loss": 0.2372, "num_input_tokens_seen": 2785456, "step": 4560 }, { "epoch": 1.416382252559727, "grad_norm": 0.4237426519393921, "learning_rate": 7.080359913124418e-06, "loss": 0.2305, "num_input_tokens_seen": 2787952, "step": 4565 }, { "epoch": 1.4179336022339435, "grad_norm": 0.41476407647132874, "learning_rate": 7.0881166614955026e-06, "loss": 0.2252, "num_input_tokens_seen": 2791248, "step": 4570 }, { "epoch": 1.41948495190816, "grad_norm": 0.47294148802757263, "learning_rate": 7.095873409866585e-06, "loss": 0.2307, "num_input_tokens_seen": 2794320, "step": 4575 }, { "epoch": 1.4210363015823766, "grad_norm": 0.20474885404109955, "learning_rate": 7.103630158237668e-06, "loss": 0.2317, "num_input_tokens_seen": 2797008, "step": 4580 }, { "epoch": 1.4225876512565931, "grad_norm": 0.49610111117362976, "learning_rate": 7.111386906608751e-06, "loss": 0.229, "num_input_tokens_seen": 2800208, "step": 4585 }, { "epoch": 1.42413900093081, "grad_norm": 0.3640812039375305, "learning_rate": 7.119143654979833e-06, "loss": 0.2248, "num_input_tokens_seen": 2803056, "step": 4590 }, { "epoch": 1.4256903506050262, "grad_norm": 0.25473982095718384, "learning_rate": 7.126900403350916e-06, "loss": 0.2393, "num_input_tokens_seen": 2806288, "step": 4595 }, { "epoch": 1.427241700279243, "grad_norm": 0.2033909410238266, "learning_rate": 7.134657151721999e-06, "loss": 0.2327, "num_input_tokens_seen": 2809200, "step": 4600 }, { "epoch": 1.4287930499534596, "grad_norm": 0.4788903295993805, "learning_rate": 7.142413900093082e-06, "loss": 0.2325, "num_input_tokens_seen": 2813680, "step": 4605 }, { "epoch": 1.4303443996276761, "grad_norm": 0.4131218194961548, "learning_rate": 7.150170648464164e-06, "loss": 0.2282, "num_input_tokens_seen": 2816272, "step": 4610 }, { "epoch": 1.4318957493018927, "grad_norm": 0.18753011524677277, "learning_rate": 7.157927396835247e-06, "loss": 0.2304, "num_input_tokens_seen": 2818416, "step": 4615 }, { "epoch": 1.4334470989761092, "grad_norm": 0.7016630172729492, "learning_rate": 7.16568414520633e-06, "loss": 0.2372, "num_input_tokens_seen": 2822352, "step": 4620 }, { "epoch": 1.4349984486503258, "grad_norm": 0.8418395519256592, "learning_rate": 7.173440893577413e-06, "loss": 0.2333, "num_input_tokens_seen": 2825744, "step": 4625 }, { "epoch": 1.4365497983245423, "grad_norm": 0.36569520831108093, "learning_rate": 7.181197641948495e-06, "loss": 0.2298, "num_input_tokens_seen": 2828208, "step": 4630 }, { "epoch": 1.4381011479987589, "grad_norm": 0.23667170107364655, "learning_rate": 7.188954390319579e-06, "loss": 0.2339, "num_input_tokens_seen": 2830544, "step": 4635 }, { "epoch": 1.4396524976729754, "grad_norm": 0.47730663418769836, "learning_rate": 7.196711138690662e-06, "loss": 0.2325, "num_input_tokens_seen": 2833904, "step": 4640 }, { "epoch": 1.441203847347192, "grad_norm": 0.418529212474823, "learning_rate": 7.204467887061745e-06, "loss": 0.2294, "num_input_tokens_seen": 2837168, "step": 4645 }, { "epoch": 1.4427551970214085, "grad_norm": 0.7553504705429077, "learning_rate": 7.212224635432828e-06, "loss": 0.2314, "num_input_tokens_seen": 2839920, "step": 4650 }, { "epoch": 1.4443065466956253, "grad_norm": 0.16168054938316345, "learning_rate": 7.21998138380391e-06, "loss": 0.2295, "num_input_tokens_seen": 2843568, "step": 4655 }, { "epoch": 1.4458578963698416, "grad_norm": 0.38713914155960083, "learning_rate": 7.227738132174993e-06, "loss": 0.241, "num_input_tokens_seen": 2846480, "step": 4660 }, { "epoch": 1.4474092460440584, "grad_norm": 0.35348397493362427, "learning_rate": 7.235494880546076e-06, "loss": 0.2335, "num_input_tokens_seen": 2848912, "step": 4665 }, { "epoch": 1.448960595718275, "grad_norm": 0.2450769990682602, "learning_rate": 7.243251628917158e-06, "loss": 0.2295, "num_input_tokens_seen": 2852272, "step": 4670 }, { "epoch": 1.4505119453924915, "grad_norm": 0.7124223113059998, "learning_rate": 7.251008377288241e-06, "loss": 0.2368, "num_input_tokens_seen": 2855792, "step": 4675 }, { "epoch": 1.452063295066708, "grad_norm": 0.6316937804222107, "learning_rate": 7.258765125659324e-06, "loss": 0.2308, "num_input_tokens_seen": 2858448, "step": 4680 }, { "epoch": 1.4536146447409246, "grad_norm": 0.1395554095506668, "learning_rate": 7.266521874030407e-06, "loss": 0.2332, "num_input_tokens_seen": 2860944, "step": 4685 }, { "epoch": 1.4551659944151412, "grad_norm": 0.6054633259773254, "learning_rate": 7.274278622401489e-06, "loss": 0.2321, "num_input_tokens_seen": 2863664, "step": 4690 }, { "epoch": 1.4567173440893577, "grad_norm": 0.32760605216026306, "learning_rate": 7.282035370772573e-06, "loss": 0.2273, "num_input_tokens_seen": 2866672, "step": 4695 }, { "epoch": 1.4582686937635743, "grad_norm": 0.32610583305358887, "learning_rate": 7.289792119143656e-06, "loss": 0.2282, "num_input_tokens_seen": 2868848, "step": 4700 }, { "epoch": 1.4598200434377908, "grad_norm": 0.423457533121109, "learning_rate": 7.297548867514739e-06, "loss": 0.2343, "num_input_tokens_seen": 2872368, "step": 4705 }, { "epoch": 1.4613713931120074, "grad_norm": 0.7191725969314575, "learning_rate": 7.305305615885822e-06, "loss": 0.2354, "num_input_tokens_seen": 2875664, "step": 4710 }, { "epoch": 1.462922742786224, "grad_norm": 0.15443404018878937, "learning_rate": 7.313062364256904e-06, "loss": 0.2315, "num_input_tokens_seen": 2879856, "step": 4715 }, { "epoch": 1.4644740924604407, "grad_norm": 0.6498408317565918, "learning_rate": 7.320819112627987e-06, "loss": 0.2187, "num_input_tokens_seen": 2883024, "step": 4720 }, { "epoch": 1.466025442134657, "grad_norm": 0.20635470747947693, "learning_rate": 7.32857586099907e-06, "loss": 0.2383, "num_input_tokens_seen": 2886704, "step": 4725 }, { "epoch": 1.4675767918088738, "grad_norm": 0.667162299156189, "learning_rate": 7.336332609370153e-06, "loss": 0.2354, "num_input_tokens_seen": 2889168, "step": 4730 }, { "epoch": 1.4691281414830903, "grad_norm": 0.299736350774765, "learning_rate": 7.344089357741235e-06, "loss": 0.2317, "num_input_tokens_seen": 2891952, "step": 4735 }, { "epoch": 1.470679491157307, "grad_norm": 0.1659272313117981, "learning_rate": 7.351846106112318e-06, "loss": 0.2328, "num_input_tokens_seen": 2894864, "step": 4740 }, { "epoch": 1.4722308408315234, "grad_norm": 0.15106447041034698, "learning_rate": 7.359602854483401e-06, "loss": 0.2337, "num_input_tokens_seen": 2898224, "step": 4745 }, { "epoch": 1.47378219050574, "grad_norm": 0.43621379137039185, "learning_rate": 7.367359602854483e-06, "loss": 0.234, "num_input_tokens_seen": 2901808, "step": 4750 }, { "epoch": 1.4753335401799565, "grad_norm": 0.1424867957830429, "learning_rate": 7.375116351225566e-06, "loss": 0.2317, "num_input_tokens_seen": 2907088, "step": 4755 }, { "epoch": 1.476884889854173, "grad_norm": 0.40557244420051575, "learning_rate": 7.38287309959665e-06, "loss": 0.2324, "num_input_tokens_seen": 2910160, "step": 4760 }, { "epoch": 1.4784362395283897, "grad_norm": 0.7151642441749573, "learning_rate": 7.390629847967733e-06, "loss": 0.2311, "num_input_tokens_seen": 2913168, "step": 4765 }, { "epoch": 1.4799875892026062, "grad_norm": 0.519478440284729, "learning_rate": 7.398386596338816e-06, "loss": 0.2344, "num_input_tokens_seen": 2915760, "step": 4770 }, { "epoch": 1.4815389388768228, "grad_norm": 0.35987064242362976, "learning_rate": 7.406143344709898e-06, "loss": 0.2369, "num_input_tokens_seen": 2918192, "step": 4775 }, { "epoch": 1.4830902885510393, "grad_norm": 0.48882830142974854, "learning_rate": 7.413900093080981e-06, "loss": 0.229, "num_input_tokens_seen": 2921200, "step": 4780 }, { "epoch": 1.484641638225256, "grad_norm": 0.20393019914627075, "learning_rate": 7.421656841452064e-06, "loss": 0.2202, "num_input_tokens_seen": 2923984, "step": 4785 }, { "epoch": 1.4861929878994724, "grad_norm": 0.33530089259147644, "learning_rate": 7.429413589823147e-06, "loss": 0.2277, "num_input_tokens_seen": 2926704, "step": 4790 }, { "epoch": 1.4877443375736892, "grad_norm": 0.2355961799621582, "learning_rate": 7.437170338194229e-06, "loss": 0.2227, "num_input_tokens_seen": 2929136, "step": 4795 }, { "epoch": 1.4892956872479057, "grad_norm": 0.7162790894508362, "learning_rate": 7.444927086565312e-06, "loss": 0.2358, "num_input_tokens_seen": 2931120, "step": 4800 }, { "epoch": 1.4908470369221223, "grad_norm": 0.29540908336639404, "learning_rate": 7.452683834936395e-06, "loss": 0.2439, "num_input_tokens_seen": 2933520, "step": 4805 }, { "epoch": 1.4923983865963388, "grad_norm": 0.511987566947937, "learning_rate": 7.460440583307478e-06, "loss": 0.2386, "num_input_tokens_seen": 2936432, "step": 4810 }, { "epoch": 1.4939497362705554, "grad_norm": 0.1416841447353363, "learning_rate": 7.46819733167856e-06, "loss": 0.2289, "num_input_tokens_seen": 2939344, "step": 4815 }, { "epoch": 1.495501085944772, "grad_norm": 0.2125581055879593, "learning_rate": 7.475954080049643e-06, "loss": 0.2325, "num_input_tokens_seen": 2942832, "step": 4820 }, { "epoch": 1.4970524356189885, "grad_norm": 0.5678417086601257, "learning_rate": 7.483710828420727e-06, "loss": 0.2327, "num_input_tokens_seen": 2945424, "step": 4825 }, { "epoch": 1.498603785293205, "grad_norm": 0.1705409586429596, "learning_rate": 7.49146757679181e-06, "loss": 0.2339, "num_input_tokens_seen": 2948304, "step": 4830 }, { "epoch": 1.5001551349674216, "grad_norm": 0.46207156777381897, "learning_rate": 7.499224325162893e-06, "loss": 0.2338, "num_input_tokens_seen": 2951184, "step": 4835 }, { "epoch": 1.5017064846416384, "grad_norm": 0.42144304513931274, "learning_rate": 7.506981073533975e-06, "loss": 0.2327, "num_input_tokens_seen": 2954384, "step": 4840 }, { "epoch": 1.5032578343158547, "grad_norm": 0.15124157071113586, "learning_rate": 7.514737821905058e-06, "loss": 0.2346, "num_input_tokens_seen": 2958032, "step": 4845 }, { "epoch": 1.5048091839900715, "grad_norm": 0.3762567341327667, "learning_rate": 7.522494570276141e-06, "loss": 0.2304, "num_input_tokens_seen": 2961584, "step": 4850 }, { "epoch": 1.5063605336642878, "grad_norm": 0.6353211998939514, "learning_rate": 7.530251318647223e-06, "loss": 0.2283, "num_input_tokens_seen": 2965520, "step": 4855 }, { "epoch": 1.5079118833385046, "grad_norm": 0.21230386197566986, "learning_rate": 7.538008067018306e-06, "loss": 0.2275, "num_input_tokens_seen": 2968656, "step": 4860 }, { "epoch": 1.5094632330127211, "grad_norm": 0.12278501689434052, "learning_rate": 7.545764815389389e-06, "loss": 0.2231, "num_input_tokens_seen": 2971472, "step": 4865 }, { "epoch": 1.5110145826869377, "grad_norm": 0.17764505743980408, "learning_rate": 7.553521563760472e-06, "loss": 0.2318, "num_input_tokens_seen": 2973808, "step": 4870 }, { "epoch": 1.5125659323611542, "grad_norm": 0.22516219317913055, "learning_rate": 7.561278312131554e-06, "loss": 0.2208, "num_input_tokens_seen": 2976784, "step": 4875 }, { "epoch": 1.5141172820353708, "grad_norm": 0.16046176850795746, "learning_rate": 7.569035060502637e-06, "loss": 0.2405, "num_input_tokens_seen": 2979600, "step": 4880 }, { "epoch": 1.5156686317095873, "grad_norm": 0.635882556438446, "learning_rate": 7.57679180887372e-06, "loss": 0.2211, "num_input_tokens_seen": 2982992, "step": 4885 }, { "epoch": 1.5172199813838039, "grad_norm": 0.28559714555740356, "learning_rate": 7.584548557244804e-06, "loss": 0.2348, "num_input_tokens_seen": 2985808, "step": 4890 }, { "epoch": 1.5187713310580204, "grad_norm": 0.21132998168468475, "learning_rate": 7.592305305615887e-06, "loss": 0.2421, "num_input_tokens_seen": 2989392, "step": 4895 }, { "epoch": 1.520322680732237, "grad_norm": 0.23415355384349823, "learning_rate": 7.600062053986969e-06, "loss": 0.2238, "num_input_tokens_seen": 2991888, "step": 4900 }, { "epoch": 1.5218740304064537, "grad_norm": 0.4336777925491333, "learning_rate": 7.607818802358052e-06, "loss": 0.2383, "num_input_tokens_seen": 2994576, "step": 4905 }, { "epoch": 1.52342538008067, "grad_norm": 0.7797531485557556, "learning_rate": 7.615575550729135e-06, "loss": 0.2366, "num_input_tokens_seen": 2997744, "step": 4910 }, { "epoch": 1.5249767297548869, "grad_norm": 0.2171352207660675, "learning_rate": 7.623332299100218e-06, "loss": 0.2274, "num_input_tokens_seen": 3001648, "step": 4915 }, { "epoch": 1.5265280794291032, "grad_norm": 0.39131027460098267, "learning_rate": 7.631089047471301e-06, "loss": 0.233, "num_input_tokens_seen": 3003856, "step": 4920 }, { "epoch": 1.52807942910332, "grad_norm": 0.22539809346199036, "learning_rate": 7.638845795842383e-06, "loss": 0.2347, "num_input_tokens_seen": 3007696, "step": 4925 }, { "epoch": 1.5296307787775365, "grad_norm": 0.42868882417678833, "learning_rate": 7.646602544213466e-06, "loss": 0.2336, "num_input_tokens_seen": 3011856, "step": 4930 }, { "epoch": 1.531182128451753, "grad_norm": 0.5989904403686523, "learning_rate": 7.654359292584549e-06, "loss": 0.2326, "num_input_tokens_seen": 3015728, "step": 4935 }, { "epoch": 1.5327334781259696, "grad_norm": 0.5849951505661011, "learning_rate": 7.662116040955632e-06, "loss": 0.2236, "num_input_tokens_seen": 3018640, "step": 4940 }, { "epoch": 1.5342848278001862, "grad_norm": 0.557937741279602, "learning_rate": 7.669872789326714e-06, "loss": 0.2339, "num_input_tokens_seen": 3022000, "step": 4945 }, { "epoch": 1.5358361774744027, "grad_norm": 0.16051223874092102, "learning_rate": 7.677629537697797e-06, "loss": 0.2351, "num_input_tokens_seen": 3024592, "step": 4950 }, { "epoch": 1.5373875271486193, "grad_norm": 0.4702833890914917, "learning_rate": 7.68538628606888e-06, "loss": 0.2285, "num_input_tokens_seen": 3027504, "step": 4955 }, { "epoch": 1.538938876822836, "grad_norm": 0.2398497760295868, "learning_rate": 7.693143034439963e-06, "loss": 0.2338, "num_input_tokens_seen": 3030960, "step": 4960 }, { "epoch": 1.5404902264970524, "grad_norm": 0.2506011128425598, "learning_rate": 7.700899782811046e-06, "loss": 0.2265, "num_input_tokens_seen": 3034096, "step": 4965 }, { "epoch": 1.5420415761712691, "grad_norm": 1.1644349098205566, "learning_rate": 7.70865653118213e-06, "loss": 0.2366, "num_input_tokens_seen": 3038192, "step": 4970 }, { "epoch": 1.5435929258454855, "grad_norm": 0.7848861217498779, "learning_rate": 7.716413279553212e-06, "loss": 0.2375, "num_input_tokens_seen": 3040848, "step": 4975 }, { "epoch": 1.5451442755197022, "grad_norm": 0.17047567665576935, "learning_rate": 7.724170027924295e-06, "loss": 0.2351, "num_input_tokens_seen": 3043248, "step": 4980 }, { "epoch": 1.5466956251939186, "grad_norm": 0.1519746035337448, "learning_rate": 7.731926776295378e-06, "loss": 0.2346, "num_input_tokens_seen": 3045776, "step": 4985 }, { "epoch": 1.5482469748681353, "grad_norm": 0.4010748267173767, "learning_rate": 7.73968352466646e-06, "loss": 0.2328, "num_input_tokens_seen": 3050352, "step": 4990 }, { "epoch": 1.549798324542352, "grad_norm": 0.36751627922058105, "learning_rate": 7.747440273037543e-06, "loss": 0.2276, "num_input_tokens_seen": 3052912, "step": 4995 }, { "epoch": 1.5513496742165684, "grad_norm": 0.3339470624923706, "learning_rate": 7.755197021408626e-06, "loss": 0.2342, "num_input_tokens_seen": 3059440, "step": 5000 }, { "epoch": 1.552901023890785, "grad_norm": 0.4483391046524048, "learning_rate": 7.762953769779709e-06, "loss": 0.2376, "num_input_tokens_seen": 3061840, "step": 5005 }, { "epoch": 1.5544523735650015, "grad_norm": 0.3588384985923767, "learning_rate": 7.770710518150792e-06, "loss": 0.2317, "num_input_tokens_seen": 3064304, "step": 5010 }, { "epoch": 1.556003723239218, "grad_norm": 0.4200015664100647, "learning_rate": 7.778467266521875e-06, "loss": 0.2231, "num_input_tokens_seen": 3067312, "step": 5015 }, { "epoch": 1.5575550729134346, "grad_norm": 0.28934067487716675, "learning_rate": 7.786224014892958e-06, "loss": 0.2363, "num_input_tokens_seen": 3069776, "step": 5020 }, { "epoch": 1.5591064225876514, "grad_norm": 0.6704522967338562, "learning_rate": 7.79398076326404e-06, "loss": 0.2355, "num_input_tokens_seen": 3072240, "step": 5025 }, { "epoch": 1.5606577722618677, "grad_norm": 0.1157415583729744, "learning_rate": 7.801737511635124e-06, "loss": 0.2308, "num_input_tokens_seen": 3074576, "step": 5030 }, { "epoch": 1.5622091219360845, "grad_norm": 0.13718335330486298, "learning_rate": 7.809494260006207e-06, "loss": 0.2286, "num_input_tokens_seen": 3077072, "step": 5035 }, { "epoch": 1.5637604716103009, "grad_norm": 0.2971937954425812, "learning_rate": 7.81725100837729e-06, "loss": 0.2268, "num_input_tokens_seen": 3080080, "step": 5040 }, { "epoch": 1.5653118212845176, "grad_norm": 0.1545197069644928, "learning_rate": 7.825007756748372e-06, "loss": 0.2279, "num_input_tokens_seen": 3083216, "step": 5045 }, { "epoch": 1.566863170958734, "grad_norm": 0.6552954316139221, "learning_rate": 7.832764505119454e-06, "loss": 0.2186, "num_input_tokens_seen": 3085872, "step": 5050 }, { "epoch": 1.5684145206329507, "grad_norm": 0.5529700517654419, "learning_rate": 7.840521253490537e-06, "loss": 0.247, "num_input_tokens_seen": 3088688, "step": 5055 }, { "epoch": 1.5699658703071673, "grad_norm": 0.41611167788505554, "learning_rate": 7.84827800186162e-06, "loss": 0.2446, "num_input_tokens_seen": 3091696, "step": 5060 }, { "epoch": 1.5715172199813838, "grad_norm": 0.29680004715919495, "learning_rate": 7.856034750232703e-06, "loss": 0.2378, "num_input_tokens_seen": 3094288, "step": 5065 }, { "epoch": 1.5730685696556004, "grad_norm": 0.1494438201189041, "learning_rate": 7.863791498603786e-06, "loss": 0.2328, "num_input_tokens_seen": 3096624, "step": 5070 }, { "epoch": 1.574619919329817, "grad_norm": 0.15119126439094543, "learning_rate": 7.871548246974869e-06, "loss": 0.2314, "num_input_tokens_seen": 3099664, "step": 5075 }, { "epoch": 1.5761712690040335, "grad_norm": 0.7622805237770081, "learning_rate": 7.879304995345952e-06, "loss": 0.2325, "num_input_tokens_seen": 3102128, "step": 5080 }, { "epoch": 1.57772261867825, "grad_norm": 0.3402960002422333, "learning_rate": 7.887061743717035e-06, "loss": 0.2337, "num_input_tokens_seen": 3104880, "step": 5085 }, { "epoch": 1.5792739683524668, "grad_norm": 0.3568340837955475, "learning_rate": 7.894818492088118e-06, "loss": 0.2304, "num_input_tokens_seen": 3107216, "step": 5090 }, { "epoch": 1.5808253180266831, "grad_norm": 0.1954752802848816, "learning_rate": 7.9025752404592e-06, "loss": 0.2326, "num_input_tokens_seen": 3110352, "step": 5095 }, { "epoch": 1.5823766677009, "grad_norm": 0.33474260568618774, "learning_rate": 7.910331988830284e-06, "loss": 0.2316, "num_input_tokens_seen": 3112752, "step": 5100 }, { "epoch": 1.5839280173751162, "grad_norm": 0.41180840134620667, "learning_rate": 7.918088737201367e-06, "loss": 0.2328, "num_input_tokens_seen": 3116752, "step": 5105 }, { "epoch": 1.585479367049333, "grad_norm": 0.7017702460289001, "learning_rate": 7.925845485572448e-06, "loss": 0.2306, "num_input_tokens_seen": 3119312, "step": 5110 }, { "epoch": 1.5870307167235493, "grad_norm": 0.1902618408203125, "learning_rate": 7.933602233943531e-06, "loss": 0.2301, "num_input_tokens_seen": 3122896, "step": 5115 }, { "epoch": 1.5885820663977661, "grad_norm": 0.42266711592674255, "learning_rate": 7.941358982314614e-06, "loss": 0.2252, "num_input_tokens_seen": 3126384, "step": 5120 }, { "epoch": 1.5901334160719827, "grad_norm": 0.45858773589134216, "learning_rate": 7.949115730685697e-06, "loss": 0.231, "num_input_tokens_seen": 3130128, "step": 5125 }, { "epoch": 1.5916847657461992, "grad_norm": 0.1909772902727127, "learning_rate": 7.95687247905678e-06, "loss": 0.2372, "num_input_tokens_seen": 3133584, "step": 5130 }, { "epoch": 1.5932361154204158, "grad_norm": 0.31052157282829285, "learning_rate": 7.964629227427863e-06, "loss": 0.23, "num_input_tokens_seen": 3137456, "step": 5135 }, { "epoch": 1.5947874650946323, "grad_norm": 0.3241427540779114, "learning_rate": 7.972385975798946e-06, "loss": 0.231, "num_input_tokens_seen": 3140880, "step": 5140 }, { "epoch": 1.5963388147688489, "grad_norm": 0.309627503156662, "learning_rate": 7.980142724170029e-06, "loss": 0.2343, "num_input_tokens_seen": 3143312, "step": 5145 }, { "epoch": 1.5978901644430654, "grad_norm": 0.2881668508052826, "learning_rate": 7.987899472541112e-06, "loss": 0.2317, "num_input_tokens_seen": 3146608, "step": 5150 }, { "epoch": 1.5994415141172822, "grad_norm": 0.3844715356826782, "learning_rate": 7.995656220912195e-06, "loss": 0.237, "num_input_tokens_seen": 3149552, "step": 5155 }, { "epoch": 1.6009928637914985, "grad_norm": 0.1209980919957161, "learning_rate": 8.003412969283278e-06, "loss": 0.2265, "num_input_tokens_seen": 3153552, "step": 5160 }, { "epoch": 1.6025442134657153, "grad_norm": 0.3276832103729248, "learning_rate": 8.01116971765436e-06, "loss": 0.2359, "num_input_tokens_seen": 3157232, "step": 5165 }, { "epoch": 1.6040955631399316, "grad_norm": 0.2976282238960266, "learning_rate": 8.018926466025444e-06, "loss": 0.2327, "num_input_tokens_seen": 3159856, "step": 5170 }, { "epoch": 1.6056469128141484, "grad_norm": 0.3128800094127655, "learning_rate": 8.026683214396525e-06, "loss": 0.2327, "num_input_tokens_seen": 3161936, "step": 5175 }, { "epoch": 1.6071982624883647, "grad_norm": 0.3254602551460266, "learning_rate": 8.034439962767608e-06, "loss": 0.2342, "num_input_tokens_seen": 3165456, "step": 5180 }, { "epoch": 1.6087496121625815, "grad_norm": 0.08054110407829285, "learning_rate": 8.042196711138691e-06, "loss": 0.2326, "num_input_tokens_seen": 3168016, "step": 5185 }, { "epoch": 1.610300961836798, "grad_norm": 0.28291741013526917, "learning_rate": 8.049953459509774e-06, "loss": 0.2296, "num_input_tokens_seen": 3171600, "step": 5190 }, { "epoch": 1.6118523115110146, "grad_norm": 0.306076318025589, "learning_rate": 8.057710207880857e-06, "loss": 0.2321, "num_input_tokens_seen": 3174352, "step": 5195 }, { "epoch": 1.6134036611852312, "grad_norm": 0.5526375770568848, "learning_rate": 8.06546695625194e-06, "loss": 0.228, "num_input_tokens_seen": 3177456, "step": 5200 }, { "epoch": 1.6149550108594477, "grad_norm": 0.2804635465145111, "learning_rate": 8.073223704623023e-06, "loss": 0.2274, "num_input_tokens_seen": 3179696, "step": 5205 }, { "epoch": 1.6165063605336643, "grad_norm": 0.25749069452285767, "learning_rate": 8.080980452994106e-06, "loss": 0.2246, "num_input_tokens_seen": 3182992, "step": 5210 }, { "epoch": 1.6180577102078808, "grad_norm": 0.5111542344093323, "learning_rate": 8.088737201365189e-06, "loss": 0.2322, "num_input_tokens_seen": 3185872, "step": 5215 }, { "epoch": 1.6196090598820976, "grad_norm": 0.23035264015197754, "learning_rate": 8.096493949736272e-06, "loss": 0.2391, "num_input_tokens_seen": 3190032, "step": 5220 }, { "epoch": 1.621160409556314, "grad_norm": 0.11523294448852539, "learning_rate": 8.104250698107355e-06, "loss": 0.2311, "num_input_tokens_seen": 3192784, "step": 5225 }, { "epoch": 1.6227117592305307, "grad_norm": 0.36785241961479187, "learning_rate": 8.112007446478438e-06, "loss": 0.2393, "num_input_tokens_seen": 3195312, "step": 5230 }, { "epoch": 1.624263108904747, "grad_norm": 0.5804638862609863, "learning_rate": 8.119764194849519e-06, "loss": 0.2411, "num_input_tokens_seen": 3199728, "step": 5235 }, { "epoch": 1.6258144585789638, "grad_norm": 0.4986519515514374, "learning_rate": 8.127520943220602e-06, "loss": 0.2281, "num_input_tokens_seen": 3202928, "step": 5240 }, { "epoch": 1.6273658082531801, "grad_norm": 0.2782237231731415, "learning_rate": 8.135277691591685e-06, "loss": 0.233, "num_input_tokens_seen": 3205456, "step": 5245 }, { "epoch": 1.6289171579273969, "grad_norm": 0.2909472584724426, "learning_rate": 8.143034439962768e-06, "loss": 0.2316, "num_input_tokens_seen": 3208272, "step": 5250 }, { "epoch": 1.6304685076016134, "grad_norm": 0.24460923671722412, "learning_rate": 8.150791188333851e-06, "loss": 0.2296, "num_input_tokens_seen": 3212368, "step": 5255 }, { "epoch": 1.63201985727583, "grad_norm": 0.2912180721759796, "learning_rate": 8.158547936704934e-06, "loss": 0.2271, "num_input_tokens_seen": 3216464, "step": 5260 }, { "epoch": 1.6335712069500465, "grad_norm": 0.09113282710313797, "learning_rate": 8.166304685076017e-06, "loss": 0.2432, "num_input_tokens_seen": 3219408, "step": 5265 }, { "epoch": 1.635122556624263, "grad_norm": 0.2958107590675354, "learning_rate": 8.1740614334471e-06, "loss": 0.2273, "num_input_tokens_seen": 3222416, "step": 5270 }, { "epoch": 1.6366739062984796, "grad_norm": 0.1480788141489029, "learning_rate": 8.181818181818183e-06, "loss": 0.2342, "num_input_tokens_seen": 3225648, "step": 5275 }, { "epoch": 1.6382252559726962, "grad_norm": 0.28759849071502686, "learning_rate": 8.189574930189266e-06, "loss": 0.2325, "num_input_tokens_seen": 3228464, "step": 5280 }, { "epoch": 1.639776605646913, "grad_norm": 0.5491487979888916, "learning_rate": 8.197331678560349e-06, "loss": 0.2358, "num_input_tokens_seen": 3232368, "step": 5285 }, { "epoch": 1.6413279553211293, "grad_norm": 0.5411143898963928, "learning_rate": 8.205088426931432e-06, "loss": 0.231, "num_input_tokens_seen": 3234640, "step": 5290 }, { "epoch": 1.642879304995346, "grad_norm": 0.2949540913105011, "learning_rate": 8.212845175302513e-06, "loss": 0.2315, "num_input_tokens_seen": 3238256, "step": 5295 }, { "epoch": 1.6444306546695624, "grad_norm": 0.5248398780822754, "learning_rate": 8.220601923673596e-06, "loss": 0.23, "num_input_tokens_seen": 3241264, "step": 5300 }, { "epoch": 1.6459820043437792, "grad_norm": 0.12646083533763885, "learning_rate": 8.22835867204468e-06, "loss": 0.2294, "num_input_tokens_seen": 3244528, "step": 5305 }, { "epoch": 1.6475333540179955, "grad_norm": 0.28358981013298035, "learning_rate": 8.236115420415762e-06, "loss": 0.2359, "num_input_tokens_seen": 3247152, "step": 5310 }, { "epoch": 1.6490847036922123, "grad_norm": 0.26522818207740784, "learning_rate": 8.243872168786845e-06, "loss": 0.2332, "num_input_tokens_seen": 3250576, "step": 5315 }, { "epoch": 1.6506360533664288, "grad_norm": 0.30820801854133606, "learning_rate": 8.251628917157928e-06, "loss": 0.2336, "num_input_tokens_seen": 3253552, "step": 5320 }, { "epoch": 1.6521874030406454, "grad_norm": 0.2873440086841583, "learning_rate": 8.259385665529011e-06, "loss": 0.2326, "num_input_tokens_seen": 3256656, "step": 5325 }, { "epoch": 1.653738752714862, "grad_norm": 0.12347102910280228, "learning_rate": 8.267142413900094e-06, "loss": 0.2343, "num_input_tokens_seen": 3259216, "step": 5330 }, { "epoch": 1.6552901023890785, "grad_norm": 0.568271279335022, "learning_rate": 8.274899162271175e-06, "loss": 0.2302, "num_input_tokens_seen": 3262256, "step": 5335 }, { "epoch": 1.656841452063295, "grad_norm": 0.10968872904777527, "learning_rate": 8.28265591064226e-06, "loss": 0.2346, "num_input_tokens_seen": 3265424, "step": 5340 }, { "epoch": 1.6583928017375116, "grad_norm": 0.4956483244895935, "learning_rate": 8.290412659013343e-06, "loss": 0.228, "num_input_tokens_seen": 3269104, "step": 5345 }, { "epoch": 1.6599441514117284, "grad_norm": 0.33427953720092773, "learning_rate": 8.298169407384426e-06, "loss": 0.2301, "num_input_tokens_seen": 3271824, "step": 5350 }, { "epoch": 1.6614955010859447, "grad_norm": 0.11507105082273483, "learning_rate": 8.305926155755509e-06, "loss": 0.2275, "num_input_tokens_seen": 3275248, "step": 5355 }, { "epoch": 1.6630468507601615, "grad_norm": 0.3280748426914215, "learning_rate": 8.31368290412659e-06, "loss": 0.2283, "num_input_tokens_seen": 3277936, "step": 5360 }, { "epoch": 1.6645982004343778, "grad_norm": 0.24242065846920013, "learning_rate": 8.321439652497673e-06, "loss": 0.2223, "num_input_tokens_seen": 3281072, "step": 5365 }, { "epoch": 1.6661495501085946, "grad_norm": 0.34615689516067505, "learning_rate": 8.329196400868756e-06, "loss": 0.2409, "num_input_tokens_seen": 3284176, "step": 5370 }, { "epoch": 1.6677008997828109, "grad_norm": 0.2266174703836441, "learning_rate": 8.33695314923984e-06, "loss": 0.2326, "num_input_tokens_seen": 3287600, "step": 5375 }, { "epoch": 1.6692522494570277, "grad_norm": 0.15050342679023743, "learning_rate": 8.344709897610922e-06, "loss": 0.2284, "num_input_tokens_seen": 3290960, "step": 5380 }, { "epoch": 1.6708035991312442, "grad_norm": 0.10138463973999023, "learning_rate": 8.352466645982005e-06, "loss": 0.2255, "num_input_tokens_seen": 3293712, "step": 5385 }, { "epoch": 1.6723549488054608, "grad_norm": 0.6161874532699585, "learning_rate": 8.360223394353088e-06, "loss": 0.2433, "num_input_tokens_seen": 3296464, "step": 5390 }, { "epoch": 1.6739062984796773, "grad_norm": 0.3437924087047577, "learning_rate": 8.36798014272417e-06, "loss": 0.2507, "num_input_tokens_seen": 3298960, "step": 5395 }, { "epoch": 1.6754576481538939, "grad_norm": 0.5873526930809021, "learning_rate": 8.375736891095252e-06, "loss": 0.2287, "num_input_tokens_seen": 3301616, "step": 5400 }, { "epoch": 1.6770089978281104, "grad_norm": 0.11490794271230698, "learning_rate": 8.383493639466337e-06, "loss": 0.2356, "num_input_tokens_seen": 3304144, "step": 5405 }, { "epoch": 1.678560347502327, "grad_norm": 0.38942500948905945, "learning_rate": 8.39125038783742e-06, "loss": 0.2185, "num_input_tokens_seen": 3307696, "step": 5410 }, { "epoch": 1.6801116971765437, "grad_norm": 0.6616910696029663, "learning_rate": 8.399007136208503e-06, "loss": 0.2463, "num_input_tokens_seen": 3310352, "step": 5415 }, { "epoch": 1.68166304685076, "grad_norm": 0.27917367219924927, "learning_rate": 8.406763884579584e-06, "loss": 0.2425, "num_input_tokens_seen": 3313584, "step": 5420 }, { "epoch": 1.6832143965249768, "grad_norm": 0.11066268384456635, "learning_rate": 8.414520632950667e-06, "loss": 0.2291, "num_input_tokens_seen": 3316432, "step": 5425 }, { "epoch": 1.6847657461991932, "grad_norm": 0.24603299796581268, "learning_rate": 8.42227738132175e-06, "loss": 0.2331, "num_input_tokens_seen": 3318960, "step": 5430 }, { "epoch": 1.68631709587341, "grad_norm": 0.0956786647439003, "learning_rate": 8.430034129692833e-06, "loss": 0.2303, "num_input_tokens_seen": 3322608, "step": 5435 }, { "epoch": 1.6878684455476263, "grad_norm": 0.5125373601913452, "learning_rate": 8.437790878063916e-06, "loss": 0.2302, "num_input_tokens_seen": 3325488, "step": 5440 }, { "epoch": 1.689419795221843, "grad_norm": 0.5142573118209839, "learning_rate": 8.445547626435e-06, "loss": 0.2272, "num_input_tokens_seen": 3327792, "step": 5445 }, { "epoch": 1.6909711448960596, "grad_norm": 0.3528628945350647, "learning_rate": 8.453304374806082e-06, "loss": 0.2331, "num_input_tokens_seen": 3331312, "step": 5450 }, { "epoch": 1.6925224945702761, "grad_norm": 0.09921309351921082, "learning_rate": 8.461061123177164e-06, "loss": 0.2337, "num_input_tokens_seen": 3334928, "step": 5455 }, { "epoch": 1.6940738442444927, "grad_norm": 0.23141387104988098, "learning_rate": 8.468817871548247e-06, "loss": 0.2254, "num_input_tokens_seen": 3338928, "step": 5460 }, { "epoch": 1.6956251939187093, "grad_norm": 0.11051265895366669, "learning_rate": 8.476574619919331e-06, "loss": 0.2167, "num_input_tokens_seen": 3341904, "step": 5465 }, { "epoch": 1.6971765435929258, "grad_norm": 0.1868167221546173, "learning_rate": 8.484331368290414e-06, "loss": 0.2294, "num_input_tokens_seen": 3345104, "step": 5470 }, { "epoch": 1.6987278932671424, "grad_norm": 0.34588515758514404, "learning_rate": 8.492088116661497e-06, "loss": 0.2332, "num_input_tokens_seen": 3348624, "step": 5475 }, { "epoch": 1.7002792429413591, "grad_norm": 0.49241724610328674, "learning_rate": 8.499844865032579e-06, "loss": 0.2437, "num_input_tokens_seen": 3351376, "step": 5480 }, { "epoch": 1.7018305926155755, "grad_norm": 0.1405097246170044, "learning_rate": 8.507601613403661e-06, "loss": 0.2253, "num_input_tokens_seen": 3353776, "step": 5485 }, { "epoch": 1.7033819422897922, "grad_norm": 0.5361838936805725, "learning_rate": 8.515358361774744e-06, "loss": 0.2329, "num_input_tokens_seen": 3356880, "step": 5490 }, { "epoch": 1.7049332919640086, "grad_norm": 0.32198864221572876, "learning_rate": 8.523115110145827e-06, "loss": 0.2296, "num_input_tokens_seen": 3359472, "step": 5495 }, { "epoch": 1.7064846416382253, "grad_norm": 0.30604860186576843, "learning_rate": 8.53087185851691e-06, "loss": 0.236, "num_input_tokens_seen": 3362288, "step": 5500 }, { "epoch": 1.7080359913124417, "grad_norm": 0.12623810768127441, "learning_rate": 8.538628606887993e-06, "loss": 0.2295, "num_input_tokens_seen": 3364976, "step": 5505 }, { "epoch": 1.7095873409866584, "grad_norm": 0.07062545418739319, "learning_rate": 8.546385355259076e-06, "loss": 0.2293, "num_input_tokens_seen": 3367024, "step": 5510 }, { "epoch": 1.711138690660875, "grad_norm": 0.13387992978096008, "learning_rate": 8.55414210363016e-06, "loss": 0.2325, "num_input_tokens_seen": 3369520, "step": 5515 }, { "epoch": 1.7126900403350915, "grad_norm": 0.12012158334255219, "learning_rate": 8.56189885200124e-06, "loss": 0.2336, "num_input_tokens_seen": 3372080, "step": 5520 }, { "epoch": 1.714241390009308, "grad_norm": 0.5559934377670288, "learning_rate": 8.569655600372324e-06, "loss": 0.2379, "num_input_tokens_seen": 3374960, "step": 5525 }, { "epoch": 1.7157927396835246, "grad_norm": 0.2929873764514923, "learning_rate": 8.577412348743408e-06, "loss": 0.2289, "num_input_tokens_seen": 3378384, "step": 5530 }, { "epoch": 1.7173440893577412, "grad_norm": 0.2933463156223297, "learning_rate": 8.585169097114491e-06, "loss": 0.2353, "num_input_tokens_seen": 3381520, "step": 5535 }, { "epoch": 1.7188954390319577, "grad_norm": 0.088116355240345, "learning_rate": 8.592925845485574e-06, "loss": 0.2294, "num_input_tokens_seen": 3383824, "step": 5540 }, { "epoch": 1.7204467887061745, "grad_norm": 0.27879980206489563, "learning_rate": 8.600682593856656e-06, "loss": 0.2304, "num_input_tokens_seen": 3386352, "step": 5545 }, { "epoch": 1.7219981383803908, "grad_norm": 0.29909640550613403, "learning_rate": 8.608439342227739e-06, "loss": 0.2305, "num_input_tokens_seen": 3389296, "step": 5550 }, { "epoch": 1.7235494880546076, "grad_norm": 0.11563847213983536, "learning_rate": 8.616196090598822e-06, "loss": 0.2336, "num_input_tokens_seen": 3391760, "step": 5555 }, { "epoch": 1.725100837728824, "grad_norm": 0.27470964193344116, "learning_rate": 8.623952838969905e-06, "loss": 0.2369, "num_input_tokens_seen": 3395920, "step": 5560 }, { "epoch": 1.7266521874030407, "grad_norm": 0.30876806378364563, "learning_rate": 8.631709587340988e-06, "loss": 0.2298, "num_input_tokens_seen": 3398800, "step": 5565 }, { "epoch": 1.7282035370772573, "grad_norm": 0.32936498522758484, "learning_rate": 8.63946633571207e-06, "loss": 0.2347, "num_input_tokens_seen": 3402224, "step": 5570 }, { "epoch": 1.7297548867514738, "grad_norm": 0.30980539321899414, "learning_rate": 8.647223084083154e-06, "loss": 0.2313, "num_input_tokens_seen": 3405744, "step": 5575 }, { "epoch": 1.7313062364256904, "grad_norm": 0.07965142279863358, "learning_rate": 8.654979832454235e-06, "loss": 0.2369, "num_input_tokens_seen": 3408496, "step": 5580 }, { "epoch": 1.732857586099907, "grad_norm": 0.26356664299964905, "learning_rate": 8.662736580825318e-06, "loss": 0.2324, "num_input_tokens_seen": 3411280, "step": 5585 }, { "epoch": 1.7344089357741235, "grad_norm": 0.10858437418937683, "learning_rate": 8.6704933291964e-06, "loss": 0.2299, "num_input_tokens_seen": 3414768, "step": 5590 }, { "epoch": 1.73596028544834, "grad_norm": 0.2802604138851166, "learning_rate": 8.678250077567485e-06, "loss": 0.2304, "num_input_tokens_seen": 3417456, "step": 5595 }, { "epoch": 1.7375116351225566, "grad_norm": 0.2965069115161896, "learning_rate": 8.686006825938568e-06, "loss": 0.2346, "num_input_tokens_seen": 3421264, "step": 5600 }, { "epoch": 1.7390629847967731, "grad_norm": 0.2607303857803345, "learning_rate": 8.69376357430965e-06, "loss": 0.2289, "num_input_tokens_seen": 3423952, "step": 5605 }, { "epoch": 1.74061433447099, "grad_norm": 0.2696792483329773, "learning_rate": 8.701520322680733e-06, "loss": 0.2305, "num_input_tokens_seen": 3426512, "step": 5610 }, { "epoch": 1.7421656841452062, "grad_norm": 0.28439047932624817, "learning_rate": 8.709277071051816e-06, "loss": 0.2305, "num_input_tokens_seen": 3429968, "step": 5615 }, { "epoch": 1.743717033819423, "grad_norm": 0.4841957986354828, "learning_rate": 8.717033819422899e-06, "loss": 0.2326, "num_input_tokens_seen": 3432784, "step": 5620 }, { "epoch": 1.7452683834936393, "grad_norm": 0.12902876734733582, "learning_rate": 8.724790567793982e-06, "loss": 0.2243, "num_input_tokens_seen": 3436656, "step": 5625 }, { "epoch": 1.746819733167856, "grad_norm": 0.3015059232711792, "learning_rate": 8.732547316165065e-06, "loss": 0.24, "num_input_tokens_seen": 3441456, "step": 5630 }, { "epoch": 1.7483710828420727, "grad_norm": 0.23649121820926666, "learning_rate": 8.740304064536148e-06, "loss": 0.2364, "num_input_tokens_seen": 3444144, "step": 5635 }, { "epoch": 1.7499224325162892, "grad_norm": 0.2717781364917755, "learning_rate": 8.748060812907229e-06, "loss": 0.2324, "num_input_tokens_seen": 3447504, "step": 5640 }, { "epoch": 1.7514737821905058, "grad_norm": 0.13322533667087555, "learning_rate": 8.755817561278312e-06, "loss": 0.2289, "num_input_tokens_seen": 3450352, "step": 5645 }, { "epoch": 1.7530251318647223, "grad_norm": 0.08801095187664032, "learning_rate": 8.763574309649395e-06, "loss": 0.2332, "num_input_tokens_seen": 3452496, "step": 5650 }, { "epoch": 1.7545764815389389, "grad_norm": 0.2648777663707733, "learning_rate": 8.771331058020478e-06, "loss": 0.2362, "num_input_tokens_seen": 3455152, "step": 5655 }, { "epoch": 1.7561278312131554, "grad_norm": 0.5058441758155823, "learning_rate": 8.779087806391563e-06, "loss": 0.2332, "num_input_tokens_seen": 3458480, "step": 5660 }, { "epoch": 1.757679180887372, "grad_norm": 0.25909844040870667, "learning_rate": 8.786844554762644e-06, "loss": 0.2354, "num_input_tokens_seen": 3460976, "step": 5665 }, { "epoch": 1.7592305305615885, "grad_norm": 0.25665047764778137, "learning_rate": 8.794601303133727e-06, "loss": 0.231, "num_input_tokens_seen": 3463632, "step": 5670 }, { "epoch": 1.7607818802358053, "grad_norm": 0.10968980938196182, "learning_rate": 8.80235805150481e-06, "loss": 0.231, "num_input_tokens_seen": 3466096, "step": 5675 }, { "epoch": 1.7623332299100216, "grad_norm": 0.11563225835561752, "learning_rate": 8.810114799875893e-06, "loss": 0.2231, "num_input_tokens_seen": 3469008, "step": 5680 }, { "epoch": 1.7638845795842384, "grad_norm": 0.11227554827928543, "learning_rate": 8.817871548246976e-06, "loss": 0.23, "num_input_tokens_seen": 3472240, "step": 5685 }, { "epoch": 1.7654359292584547, "grad_norm": 0.18608056008815765, "learning_rate": 8.825628296618059e-06, "loss": 0.2245, "num_input_tokens_seen": 3475248, "step": 5690 }, { "epoch": 1.7669872789326715, "grad_norm": 0.14289502799510956, "learning_rate": 8.833385044989142e-06, "loss": 0.2402, "num_input_tokens_seen": 3477456, "step": 5695 }, { "epoch": 1.768538628606888, "grad_norm": 0.4697457253932953, "learning_rate": 8.841141793360225e-06, "loss": 0.2321, "num_input_tokens_seen": 3480752, "step": 5700 }, { "epoch": 1.7700899782811046, "grad_norm": 0.21795666217803955, "learning_rate": 8.848898541731306e-06, "loss": 0.2215, "num_input_tokens_seen": 3484304, "step": 5705 }, { "epoch": 1.7716413279553211, "grad_norm": 0.1282912641763687, "learning_rate": 8.856655290102389e-06, "loss": 0.2247, "num_input_tokens_seen": 3486736, "step": 5710 }, { "epoch": 1.7731926776295377, "grad_norm": 0.4142894744873047, "learning_rate": 8.864412038473472e-06, "loss": 0.2336, "num_input_tokens_seen": 3489392, "step": 5715 }, { "epoch": 1.7747440273037542, "grad_norm": 0.12638051807880402, "learning_rate": 8.872168786844555e-06, "loss": 0.2273, "num_input_tokens_seen": 3492720, "step": 5720 }, { "epoch": 1.7762953769779708, "grad_norm": 0.1374681293964386, "learning_rate": 8.87992553521564e-06, "loss": 0.2344, "num_input_tokens_seen": 3497008, "step": 5725 }, { "epoch": 1.7778467266521873, "grad_norm": 0.15212228894233704, "learning_rate": 8.887682283586721e-06, "loss": 0.2435, "num_input_tokens_seen": 3499920, "step": 5730 }, { "epoch": 1.779398076326404, "grad_norm": 0.21967092156410217, "learning_rate": 8.895439031957804e-06, "loss": 0.2247, "num_input_tokens_seen": 3502672, "step": 5735 }, { "epoch": 1.7809494260006207, "grad_norm": 0.3105068802833557, "learning_rate": 8.903195780328887e-06, "loss": 0.2294, "num_input_tokens_seen": 3505456, "step": 5740 }, { "epoch": 1.782500775674837, "grad_norm": 0.3275130093097687, "learning_rate": 8.91095252869997e-06, "loss": 0.244, "num_input_tokens_seen": 3508976, "step": 5745 }, { "epoch": 1.7840521253490538, "grad_norm": 0.10696697980165482, "learning_rate": 8.918709277071053e-06, "loss": 0.232, "num_input_tokens_seen": 3511280, "step": 5750 }, { "epoch": 1.78560347502327, "grad_norm": 0.2997499704360962, "learning_rate": 8.926466025442136e-06, "loss": 0.2324, "num_input_tokens_seen": 3514512, "step": 5755 }, { "epoch": 1.7871548246974869, "grad_norm": 0.2427181452512741, "learning_rate": 8.934222773813219e-06, "loss": 0.2362, "num_input_tokens_seen": 3518064, "step": 5760 }, { "epoch": 1.7887061743717034, "grad_norm": 0.570178210735321, "learning_rate": 8.9419795221843e-06, "loss": 0.2313, "num_input_tokens_seen": 3520688, "step": 5765 }, { "epoch": 1.79025752404592, "grad_norm": 0.16359758377075195, "learning_rate": 8.949736270555383e-06, "loss": 0.2293, "num_input_tokens_seen": 3524240, "step": 5770 }, { "epoch": 1.7918088737201365, "grad_norm": 0.12708236277103424, "learning_rate": 8.957493018926466e-06, "loss": 0.2271, "num_input_tokens_seen": 3527056, "step": 5775 }, { "epoch": 1.793360223394353, "grad_norm": 0.10564175248146057, "learning_rate": 8.965249767297549e-06, "loss": 0.2296, "num_input_tokens_seen": 3530224, "step": 5780 }, { "epoch": 1.7949115730685696, "grad_norm": 0.10045979171991348, "learning_rate": 8.973006515668632e-06, "loss": 0.2279, "num_input_tokens_seen": 3533072, "step": 5785 }, { "epoch": 1.7964629227427862, "grad_norm": 0.4386967718601227, "learning_rate": 8.980763264039715e-06, "loss": 0.2262, "num_input_tokens_seen": 3535984, "step": 5790 }, { "epoch": 1.798014272417003, "grad_norm": 0.22440841794013977, "learning_rate": 8.988520012410798e-06, "loss": 0.2161, "num_input_tokens_seen": 3538576, "step": 5795 }, { "epoch": 1.7995656220912193, "grad_norm": 0.12691688537597656, "learning_rate": 8.996276760781881e-06, "loss": 0.2533, "num_input_tokens_seen": 3541936, "step": 5800 }, { "epoch": 1.801116971765436, "grad_norm": 0.12467510998249054, "learning_rate": 9.004033509152964e-06, "loss": 0.2391, "num_input_tokens_seen": 3545680, "step": 5805 }, { "epoch": 1.8026683214396524, "grad_norm": 0.10847236216068268, "learning_rate": 9.011790257524047e-06, "loss": 0.2315, "num_input_tokens_seen": 3548048, "step": 5810 }, { "epoch": 1.8042196711138692, "grad_norm": 0.29533153772354126, "learning_rate": 9.01954700589513e-06, "loss": 0.232, "num_input_tokens_seen": 3550832, "step": 5815 }, { "epoch": 1.8057710207880855, "grad_norm": 0.2634905278682709, "learning_rate": 9.027303754266213e-06, "loss": 0.2317, "num_input_tokens_seen": 3553200, "step": 5820 }, { "epoch": 1.8073223704623023, "grad_norm": 0.10288938879966736, "learning_rate": 9.035060502637294e-06, "loss": 0.2366, "num_input_tokens_seen": 3555408, "step": 5825 }, { "epoch": 1.8088737201365188, "grad_norm": 0.306115061044693, "learning_rate": 9.042817251008377e-06, "loss": 0.2336, "num_input_tokens_seen": 3558640, "step": 5830 }, { "epoch": 1.8104250698107354, "grad_norm": 0.2986530363559723, "learning_rate": 9.05057399937946e-06, "loss": 0.2346, "num_input_tokens_seen": 3561200, "step": 5835 }, { "epoch": 1.811976419484952, "grad_norm": 0.1023850068449974, "learning_rate": 9.058330747750543e-06, "loss": 0.2368, "num_input_tokens_seen": 3565552, "step": 5840 }, { "epoch": 1.8135277691591685, "grad_norm": 0.241317018866539, "learning_rate": 9.066087496121626e-06, "loss": 0.2294, "num_input_tokens_seen": 3568016, "step": 5845 }, { "epoch": 1.815079118833385, "grad_norm": 0.232805073261261, "learning_rate": 9.073844244492709e-06, "loss": 0.2338, "num_input_tokens_seen": 3572368, "step": 5850 }, { "epoch": 1.8166304685076016, "grad_norm": 0.45053353905677795, "learning_rate": 9.081600992863792e-06, "loss": 0.2178, "num_input_tokens_seen": 3576240, "step": 5855 }, { "epoch": 1.8181818181818183, "grad_norm": 0.38880282640457153, "learning_rate": 9.089357741234875e-06, "loss": 0.2423, "num_input_tokens_seen": 3578672, "step": 5860 }, { "epoch": 1.8197331678560347, "grad_norm": 0.3285351097583771, "learning_rate": 9.097114489605958e-06, "loss": 0.2365, "num_input_tokens_seen": 3581040, "step": 5865 }, { "epoch": 1.8212845175302514, "grad_norm": 0.13944955170154572, "learning_rate": 9.104871237977041e-06, "loss": 0.2422, "num_input_tokens_seen": 3583728, "step": 5870 }, { "epoch": 1.8228358672044678, "grad_norm": 0.49372634291648865, "learning_rate": 9.112627986348124e-06, "loss": 0.2215, "num_input_tokens_seen": 3585872, "step": 5875 }, { "epoch": 1.8243872168786845, "grad_norm": 0.22009924054145813, "learning_rate": 9.120384734719207e-06, "loss": 0.2353, "num_input_tokens_seen": 3588432, "step": 5880 }, { "epoch": 1.8259385665529009, "grad_norm": 0.11725206673145294, "learning_rate": 9.12814148309029e-06, "loss": 0.2328, "num_input_tokens_seen": 3591696, "step": 5885 }, { "epoch": 1.8274899162271177, "grad_norm": 0.26784366369247437, "learning_rate": 9.135898231461371e-06, "loss": 0.2316, "num_input_tokens_seen": 3597392, "step": 5890 }, { "epoch": 1.8290412659013342, "grad_norm": 0.102384552359581, "learning_rate": 9.143654979832454e-06, "loss": 0.2347, "num_input_tokens_seen": 3599760, "step": 5895 }, { "epoch": 1.8305926155755508, "grad_norm": 0.12877897918224335, "learning_rate": 9.151411728203537e-06, "loss": 0.2263, "num_input_tokens_seen": 3602608, "step": 5900 }, { "epoch": 1.8321439652497673, "grad_norm": 0.09470420330762863, "learning_rate": 9.15916847657462e-06, "loss": 0.2315, "num_input_tokens_seen": 3606160, "step": 5905 }, { "epoch": 1.8336953149239839, "grad_norm": 0.2739367187023163, "learning_rate": 9.166925224945703e-06, "loss": 0.2379, "num_input_tokens_seen": 3609104, "step": 5910 }, { "epoch": 1.8352466645982004, "grad_norm": 0.283808171749115, "learning_rate": 9.174681973316786e-06, "loss": 0.2274, "num_input_tokens_seen": 3611824, "step": 5915 }, { "epoch": 1.836798014272417, "grad_norm": 0.11812000721693039, "learning_rate": 9.18243872168787e-06, "loss": 0.2264, "num_input_tokens_seen": 3614800, "step": 5920 }, { "epoch": 1.8383493639466337, "grad_norm": 0.2886594831943512, "learning_rate": 9.190195470058952e-06, "loss": 0.233, "num_input_tokens_seen": 3617392, "step": 5925 }, { "epoch": 1.83990071362085, "grad_norm": 0.07824813574552536, "learning_rate": 9.197952218430035e-06, "loss": 0.2277, "num_input_tokens_seen": 3620368, "step": 5930 }, { "epoch": 1.8414520632950668, "grad_norm": 0.17588844895362854, "learning_rate": 9.205708966801118e-06, "loss": 0.2338, "num_input_tokens_seen": 3623984, "step": 5935 }, { "epoch": 1.8430034129692832, "grad_norm": 0.17571040987968445, "learning_rate": 9.213465715172201e-06, "loss": 0.224, "num_input_tokens_seen": 3626896, "step": 5940 }, { "epoch": 1.8445547626435, "grad_norm": 0.14285734295845032, "learning_rate": 9.221222463543284e-06, "loss": 0.236, "num_input_tokens_seen": 3629904, "step": 5945 }, { "epoch": 1.8461061123177163, "grad_norm": 0.37722235918045044, "learning_rate": 9.228979211914365e-06, "loss": 0.2343, "num_input_tokens_seen": 3632592, "step": 5950 }, { "epoch": 1.847657461991933, "grad_norm": 0.3328477144241333, "learning_rate": 9.236735960285448e-06, "loss": 0.2296, "num_input_tokens_seen": 3635056, "step": 5955 }, { "epoch": 1.8492088116661496, "grad_norm": 0.12538890540599823, "learning_rate": 9.244492708656531e-06, "loss": 0.2294, "num_input_tokens_seen": 3637296, "step": 5960 }, { "epoch": 1.8507601613403661, "grad_norm": 0.28859448432922363, "learning_rate": 9.252249457027614e-06, "loss": 0.2393, "num_input_tokens_seen": 3639728, "step": 5965 }, { "epoch": 1.8523115110145827, "grad_norm": 0.4756113886833191, "learning_rate": 9.260006205398697e-06, "loss": 0.2244, "num_input_tokens_seen": 3643760, "step": 5970 }, { "epoch": 1.8538628606887992, "grad_norm": 0.2396615743637085, "learning_rate": 9.26776295376978e-06, "loss": 0.2318, "num_input_tokens_seen": 3647376, "step": 5975 }, { "epoch": 1.8554142103630158, "grad_norm": 0.1297803372144699, "learning_rate": 9.275519702140863e-06, "loss": 0.232, "num_input_tokens_seen": 3650448, "step": 5980 }, { "epoch": 1.8569655600372323, "grad_norm": 0.47204405069351196, "learning_rate": 9.283276450511946e-06, "loss": 0.2299, "num_input_tokens_seen": 3653744, "step": 5985 }, { "epoch": 1.8585169097114491, "grad_norm": 0.10929049551486969, "learning_rate": 9.29103319888303e-06, "loss": 0.24, "num_input_tokens_seen": 3657296, "step": 5990 }, { "epoch": 1.8600682593856654, "grad_norm": 0.08076424151659012, "learning_rate": 9.298789947254112e-06, "loss": 0.2303, "num_input_tokens_seen": 3659984, "step": 5995 }, { "epoch": 1.8616196090598822, "grad_norm": 0.3018551766872406, "learning_rate": 9.306546695625195e-06, "loss": 0.23, "num_input_tokens_seen": 3662576, "step": 6000 }, { "epoch": 1.8631709587340985, "grad_norm": 0.28416746854782104, "learning_rate": 9.314303443996278e-06, "loss": 0.2346, "num_input_tokens_seen": 3665264, "step": 6005 }, { "epoch": 1.8647223084083153, "grad_norm": 0.23405641317367554, "learning_rate": 9.32206019236736e-06, "loss": 0.2296, "num_input_tokens_seen": 3667600, "step": 6010 }, { "epoch": 1.8662736580825317, "grad_norm": 0.10178587585687637, "learning_rate": 9.329816940738443e-06, "loss": 0.2275, "num_input_tokens_seen": 3670512, "step": 6015 }, { "epoch": 1.8678250077567484, "grad_norm": 0.09556322544813156, "learning_rate": 9.337573689109526e-06, "loss": 0.2313, "num_input_tokens_seen": 3673264, "step": 6020 }, { "epoch": 1.869376357430965, "grad_norm": 0.26322439312934875, "learning_rate": 9.345330437480609e-06, "loss": 0.233, "num_input_tokens_seen": 3675856, "step": 6025 }, { "epoch": 1.8709277071051815, "grad_norm": 0.31424570083618164, "learning_rate": 9.353087185851691e-06, "loss": 0.2272, "num_input_tokens_seen": 3678256, "step": 6030 }, { "epoch": 1.872479056779398, "grad_norm": 0.10386031121015549, "learning_rate": 9.360843934222774e-06, "loss": 0.2386, "num_input_tokens_seen": 3681552, "step": 6035 }, { "epoch": 1.8740304064536146, "grad_norm": 0.2674192786216736, "learning_rate": 9.368600682593857e-06, "loss": 0.2321, "num_input_tokens_seen": 3684624, "step": 6040 }, { "epoch": 1.8755817561278312, "grad_norm": 0.25944438576698303, "learning_rate": 9.37635743096494e-06, "loss": 0.2263, "num_input_tokens_seen": 3688048, "step": 6045 }, { "epoch": 1.8771331058020477, "grad_norm": 0.2482394427061081, "learning_rate": 9.384114179336023e-06, "loss": 0.232, "num_input_tokens_seen": 3690832, "step": 6050 }, { "epoch": 1.8786844554762645, "grad_norm": 0.10068822652101517, "learning_rate": 9.391870927707106e-06, "loss": 0.2326, "num_input_tokens_seen": 3693744, "step": 6055 }, { "epoch": 1.8802358051504808, "grad_norm": 0.09797403961420059, "learning_rate": 9.39962767607819e-06, "loss": 0.2304, "num_input_tokens_seen": 3697712, "step": 6060 }, { "epoch": 1.8817871548246976, "grad_norm": 0.4938843250274658, "learning_rate": 9.407384424449272e-06, "loss": 0.2254, "num_input_tokens_seen": 3700560, "step": 6065 }, { "epoch": 1.883338504498914, "grad_norm": 0.15995332598686218, "learning_rate": 9.415141172820355e-06, "loss": 0.2272, "num_input_tokens_seen": 3703120, "step": 6070 }, { "epoch": 1.8848898541731307, "grad_norm": 0.46688762307167053, "learning_rate": 9.422897921191437e-06, "loss": 0.2427, "num_input_tokens_seen": 3706416, "step": 6075 }, { "epoch": 1.886441203847347, "grad_norm": 0.1415228545665741, "learning_rate": 9.43065466956252e-06, "loss": 0.2146, "num_input_tokens_seen": 3709776, "step": 6080 }, { "epoch": 1.8879925535215638, "grad_norm": 0.17173394560813904, "learning_rate": 9.438411417933603e-06, "loss": 0.2539, "num_input_tokens_seen": 3712784, "step": 6085 }, { "epoch": 1.8895439031957804, "grad_norm": 0.23213490843772888, "learning_rate": 9.446168166304686e-06, "loss": 0.2242, "num_input_tokens_seen": 3716880, "step": 6090 }, { "epoch": 1.891095252869997, "grad_norm": 0.14107683300971985, "learning_rate": 9.453924914675769e-06, "loss": 0.2257, "num_input_tokens_seen": 3719024, "step": 6095 }, { "epoch": 1.8926466025442135, "grad_norm": 0.22043292224407196, "learning_rate": 9.461681663046852e-06, "loss": 0.2265, "num_input_tokens_seen": 3722256, "step": 6100 }, { "epoch": 1.89419795221843, "grad_norm": 0.16030879318714142, "learning_rate": 9.469438411417935e-06, "loss": 0.2345, "num_input_tokens_seen": 3724432, "step": 6105 }, { "epoch": 1.8957493018926466, "grad_norm": 0.16408438980579376, "learning_rate": 9.477195159789018e-06, "loss": 0.2291, "num_input_tokens_seen": 3728080, "step": 6110 }, { "epoch": 1.8973006515668631, "grad_norm": 0.3123638927936554, "learning_rate": 9.4849519081601e-06, "loss": 0.2285, "num_input_tokens_seen": 3730800, "step": 6115 }, { "epoch": 1.89885200124108, "grad_norm": 0.15275070071220398, "learning_rate": 9.492708656531184e-06, "loss": 0.2331, "num_input_tokens_seen": 3734928, "step": 6120 }, { "epoch": 1.9004033509152962, "grad_norm": 0.15696217119693756, "learning_rate": 9.500465404902266e-06, "loss": 0.2235, "num_input_tokens_seen": 3737808, "step": 6125 }, { "epoch": 1.901954700589513, "grad_norm": 0.26792243123054504, "learning_rate": 9.50822215327335e-06, "loss": 0.2173, "num_input_tokens_seen": 3740976, "step": 6130 }, { "epoch": 1.9035060502637293, "grad_norm": 0.24645449221134186, "learning_rate": 9.51597890164443e-06, "loss": 0.2373, "num_input_tokens_seen": 3744752, "step": 6135 }, { "epoch": 1.905057399937946, "grad_norm": 0.7831502556800842, "learning_rate": 9.523735650015514e-06, "loss": 0.2354, "num_input_tokens_seen": 3749264, "step": 6140 }, { "epoch": 1.9066087496121624, "grad_norm": 0.1433064192533493, "learning_rate": 9.531492398386597e-06, "loss": 0.2347, "num_input_tokens_seen": 3752240, "step": 6145 }, { "epoch": 1.9081600992863792, "grad_norm": 0.18184693157672882, "learning_rate": 9.53924914675768e-06, "loss": 0.2403, "num_input_tokens_seen": 3755792, "step": 6150 }, { "epoch": 1.9097114489605957, "grad_norm": 0.5603284239768982, "learning_rate": 9.547005895128763e-06, "loss": 0.2326, "num_input_tokens_seen": 3758960, "step": 6155 }, { "epoch": 1.9112627986348123, "grad_norm": 0.3611885905265808, "learning_rate": 9.554762643499846e-06, "loss": 0.2294, "num_input_tokens_seen": 3762928, "step": 6160 }, { "epoch": 1.9128141483090288, "grad_norm": 0.3733344078063965, "learning_rate": 9.562519391870929e-06, "loss": 0.2346, "num_input_tokens_seen": 3765680, "step": 6165 }, { "epoch": 1.9143654979832454, "grad_norm": 0.6817658543586731, "learning_rate": 9.57027614024201e-06, "loss": 0.2335, "num_input_tokens_seen": 3769296, "step": 6170 }, { "epoch": 1.915916847657462, "grad_norm": 0.6213210821151733, "learning_rate": 9.578032888613095e-06, "loss": 0.2391, "num_input_tokens_seen": 3772528, "step": 6175 }, { "epoch": 1.9174681973316785, "grad_norm": 0.875577449798584, "learning_rate": 9.585789636984178e-06, "loss": 0.2333, "num_input_tokens_seen": 3775408, "step": 6180 }, { "epoch": 1.9190195470058953, "grad_norm": 0.1365695744752884, "learning_rate": 9.59354638535526e-06, "loss": 0.2338, "num_input_tokens_seen": 3778960, "step": 6185 }, { "epoch": 1.9205708966801116, "grad_norm": 0.6364971399307251, "learning_rate": 9.601303133726344e-06, "loss": 0.2277, "num_input_tokens_seen": 3782256, "step": 6190 }, { "epoch": 1.9221222463543284, "grad_norm": 0.35687100887298584, "learning_rate": 9.609059882097425e-06, "loss": 0.2218, "num_input_tokens_seen": 3784976, "step": 6195 }, { "epoch": 1.9236735960285447, "grad_norm": 0.5273014903068542, "learning_rate": 9.616816630468508e-06, "loss": 0.2584, "num_input_tokens_seen": 3787600, "step": 6200 }, { "epoch": 1.9252249457027615, "grad_norm": 0.6002390384674072, "learning_rate": 9.62457337883959e-06, "loss": 0.2382, "num_input_tokens_seen": 3790384, "step": 6205 }, { "epoch": 1.9267762953769778, "grad_norm": 0.1410730928182602, "learning_rate": 9.632330127210674e-06, "loss": 0.2256, "num_input_tokens_seen": 3793712, "step": 6210 }, { "epoch": 1.9283276450511946, "grad_norm": 0.4842439591884613, "learning_rate": 9.640086875581757e-06, "loss": 0.2337, "num_input_tokens_seen": 3797232, "step": 6215 }, { "epoch": 1.9298789947254111, "grad_norm": 0.24613922834396362, "learning_rate": 9.64784362395284e-06, "loss": 0.2315, "num_input_tokens_seen": 3799600, "step": 6220 }, { "epoch": 1.9314303443996277, "grad_norm": 0.31065237522125244, "learning_rate": 9.655600372323923e-06, "loss": 0.2312, "num_input_tokens_seen": 3802608, "step": 6225 }, { "epoch": 1.9329816940738442, "grad_norm": 0.1122695803642273, "learning_rate": 9.663357120695006e-06, "loss": 0.2316, "num_input_tokens_seen": 3805136, "step": 6230 }, { "epoch": 1.9345330437480608, "grad_norm": 0.23410440981388092, "learning_rate": 9.671113869066087e-06, "loss": 0.2277, "num_input_tokens_seen": 3807760, "step": 6235 }, { "epoch": 1.9360843934222773, "grad_norm": 0.1300814002752304, "learning_rate": 9.678870617437172e-06, "loss": 0.2372, "num_input_tokens_seen": 3810416, "step": 6240 }, { "epoch": 1.937635743096494, "grad_norm": 0.23366977274417877, "learning_rate": 9.686627365808255e-06, "loss": 0.2236, "num_input_tokens_seen": 3813360, "step": 6245 }, { "epoch": 1.9391870927707107, "grad_norm": 0.13636714220046997, "learning_rate": 9.694384114179338e-06, "loss": 0.2202, "num_input_tokens_seen": 3815632, "step": 6250 }, { "epoch": 1.940738442444927, "grad_norm": 0.3753505349159241, "learning_rate": 9.70214086255042e-06, "loss": 0.2366, "num_input_tokens_seen": 3818192, "step": 6255 }, { "epoch": 1.9422897921191438, "grad_norm": 0.5762627720832825, "learning_rate": 9.709897610921502e-06, "loss": 0.2516, "num_input_tokens_seen": 3820912, "step": 6260 }, { "epoch": 1.94384114179336, "grad_norm": 0.30963513255119324, "learning_rate": 9.717654359292585e-06, "loss": 0.2477, "num_input_tokens_seen": 3823824, "step": 6265 }, { "epoch": 1.9453924914675769, "grad_norm": 0.10560697317123413, "learning_rate": 9.725411107663668e-06, "loss": 0.2303, "num_input_tokens_seen": 3826192, "step": 6270 }, { "epoch": 1.9469438411417932, "grad_norm": 0.24438290297985077, "learning_rate": 9.733167856034751e-06, "loss": 0.2326, "num_input_tokens_seen": 3829744, "step": 6275 }, { "epoch": 1.94849519081601, "grad_norm": 0.2867261469364166, "learning_rate": 9.740924604405834e-06, "loss": 0.2356, "num_input_tokens_seen": 3832688, "step": 6280 }, { "epoch": 1.9500465404902265, "grad_norm": 0.10253652185201645, "learning_rate": 9.748681352776917e-06, "loss": 0.2305, "num_input_tokens_seen": 3836496, "step": 6285 }, { "epoch": 1.951597890164443, "grad_norm": 0.2619127333164215, "learning_rate": 9.756438101148e-06, "loss": 0.2342, "num_input_tokens_seen": 3839344, "step": 6290 }, { "epoch": 1.9531492398386596, "grad_norm": 0.22706089913845062, "learning_rate": 9.764194849519081e-06, "loss": 0.2294, "num_input_tokens_seen": 3843600, "step": 6295 }, { "epoch": 1.9547005895128762, "grad_norm": 0.26160869002342224, "learning_rate": 9.771951597890166e-06, "loss": 0.2283, "num_input_tokens_seen": 3846736, "step": 6300 }, { "epoch": 1.9562519391870927, "grad_norm": 0.25861576199531555, "learning_rate": 9.779708346261249e-06, "loss": 0.2315, "num_input_tokens_seen": 3850832, "step": 6305 }, { "epoch": 1.9578032888613093, "grad_norm": 0.0928642749786377, "learning_rate": 9.787465094632332e-06, "loss": 0.2346, "num_input_tokens_seen": 3855216, "step": 6310 }, { "epoch": 1.959354638535526, "grad_norm": 0.2868429720401764, "learning_rate": 9.795221843003415e-06, "loss": 0.2316, "num_input_tokens_seen": 3857520, "step": 6315 }, { "epoch": 1.9609059882097424, "grad_norm": 0.4981765151023865, "learning_rate": 9.802978591374496e-06, "loss": 0.2275, "num_input_tokens_seen": 3860720, "step": 6320 }, { "epoch": 1.9624573378839592, "grad_norm": 0.30926239490509033, "learning_rate": 9.810735339745579e-06, "loss": 0.2255, "num_input_tokens_seen": 3863536, "step": 6325 }, { "epoch": 1.9640086875581755, "grad_norm": 0.17053207755088806, "learning_rate": 9.818492088116662e-06, "loss": 0.2475, "num_input_tokens_seen": 3866384, "step": 6330 }, { "epoch": 1.9655600372323923, "grad_norm": 0.31261104345321655, "learning_rate": 9.826248836487745e-06, "loss": 0.2363, "num_input_tokens_seen": 3869168, "step": 6335 }, { "epoch": 1.9671113869066086, "grad_norm": 0.4642057716846466, "learning_rate": 9.834005584858828e-06, "loss": 0.2307, "num_input_tokens_seen": 3873008, "step": 6340 }, { "epoch": 1.9686627365808254, "grad_norm": 0.200853168964386, "learning_rate": 9.841762333229911e-06, "loss": 0.2253, "num_input_tokens_seen": 3875824, "step": 6345 }, { "epoch": 1.970214086255042, "grad_norm": 0.2647722363471985, "learning_rate": 9.849519081600994e-06, "loss": 0.2313, "num_input_tokens_seen": 3879728, "step": 6350 }, { "epoch": 1.9717654359292585, "grad_norm": 0.20708732306957245, "learning_rate": 9.857275829972075e-06, "loss": 0.2276, "num_input_tokens_seen": 3882448, "step": 6355 }, { "epoch": 1.973316785603475, "grad_norm": 0.08365536481142044, "learning_rate": 9.865032578343158e-06, "loss": 0.2294, "num_input_tokens_seen": 3884848, "step": 6360 }, { "epoch": 1.9748681352776916, "grad_norm": 0.10498952865600586, "learning_rate": 9.872789326714243e-06, "loss": 0.2252, "num_input_tokens_seen": 3888304, "step": 6365 }, { "epoch": 1.9764194849519081, "grad_norm": 0.2836953401565552, "learning_rate": 9.880546075085326e-06, "loss": 0.243, "num_input_tokens_seen": 3891568, "step": 6370 }, { "epoch": 1.9779708346261247, "grad_norm": 0.2478978931903839, "learning_rate": 9.888302823456409e-06, "loss": 0.2341, "num_input_tokens_seen": 3894352, "step": 6375 }, { "epoch": 1.9795221843003414, "grad_norm": 0.34242817759513855, "learning_rate": 9.89605957182749e-06, "loss": 0.2369, "num_input_tokens_seen": 3896496, "step": 6380 }, { "epoch": 1.9810735339745578, "grad_norm": 2.0797770023345947, "learning_rate": 9.903816320198573e-06, "loss": 0.2061, "num_input_tokens_seen": 3899440, "step": 6385 }, { "epoch": 1.9826248836487745, "grad_norm": 1.1698240041732788, "learning_rate": 9.911573068569656e-06, "loss": 0.2567, "num_input_tokens_seen": 3902256, "step": 6390 }, { "epoch": 1.9841762333229909, "grad_norm": 2.4942595958709717, "learning_rate": 9.919329816940739e-06, "loss": 0.3153, "num_input_tokens_seen": 3906640, "step": 6395 }, { "epoch": 1.9857275829972076, "grad_norm": 0.30240726470947266, "learning_rate": 9.927086565311822e-06, "loss": 0.2427, "num_input_tokens_seen": 3909168, "step": 6400 }, { "epoch": 1.9872789326714242, "grad_norm": 0.3854968547821045, "learning_rate": 9.934843313682905e-06, "loss": 0.2202, "num_input_tokens_seen": 3911632, "step": 6405 }, { "epoch": 1.9888302823456407, "grad_norm": 0.5188968777656555, "learning_rate": 9.942600062053988e-06, "loss": 0.2458, "num_input_tokens_seen": 3913936, "step": 6410 }, { "epoch": 1.9903816320198573, "grad_norm": 1.6793104410171509, "learning_rate": 9.950356810425071e-06, "loss": 0.2326, "num_input_tokens_seen": 3917072, "step": 6415 }, { "epoch": 1.9919329816940738, "grad_norm": 1.8814971446990967, "learning_rate": 9.958113558796152e-06, "loss": 0.2861, "num_input_tokens_seen": 3919216, "step": 6420 }, { "epoch": 1.9934843313682904, "grad_norm": 0.5001675486564636, "learning_rate": 9.965870307167235e-06, "loss": 0.2326, "num_input_tokens_seen": 3921872, "step": 6425 }, { "epoch": 1.995035681042507, "grad_norm": 0.3454176187515259, "learning_rate": 9.97362705553832e-06, "loss": 0.2366, "num_input_tokens_seen": 3924656, "step": 6430 }, { "epoch": 1.9965870307167235, "grad_norm": 0.4206741750240326, "learning_rate": 9.981383803909403e-06, "loss": 0.2325, "num_input_tokens_seen": 3927728, "step": 6435 }, { "epoch": 1.99813838039094, "grad_norm": 0.7896965146064758, "learning_rate": 9.989140552280486e-06, "loss": 0.2406, "num_input_tokens_seen": 3931152, "step": 6440 }, { "epoch": 1.9996897300651568, "grad_norm": 0.4909515380859375, "learning_rate": 9.996897300651567e-06, "loss": 0.2264, "num_input_tokens_seen": 3933808, "step": 6445 }, { "epoch": 2.0, "eval_loss": 0.23272447288036346, "eval_runtime": 34.4155, "eval_samples_per_second": 93.65, "eval_steps_per_second": 23.42, "num_input_tokens_seen": 3934032, "step": 6446 }, { "epoch": 2.001241079739373, "grad_norm": 0.20458869636058807, "learning_rate": 9.999999934019343e-06, "loss": 0.2295, "num_input_tokens_seen": 3936528, "step": 6450 }, { "epoch": 2.00279242941359, "grad_norm": 0.5877746939659119, "learning_rate": 9.99999953080422e-06, "loss": 0.2348, "num_input_tokens_seen": 3938768, "step": 6455 }, { "epoch": 2.0043437790878063, "grad_norm": 0.3268698751926422, "learning_rate": 9.999998761029918e-06, "loss": 0.2323, "num_input_tokens_seen": 3941904, "step": 6460 }, { "epoch": 2.005895128762023, "grad_norm": 0.18339617550373077, "learning_rate": 9.9999976246965e-06, "loss": 0.2237, "num_input_tokens_seen": 3944336, "step": 6465 }, { "epoch": 2.0074464784362394, "grad_norm": 0.965881884098053, "learning_rate": 9.99999612180405e-06, "loss": 0.2353, "num_input_tokens_seen": 3947088, "step": 6470 }, { "epoch": 2.008997828110456, "grad_norm": 1.3993685245513916, "learning_rate": 9.999994252352671e-06, "loss": 0.2361, "num_input_tokens_seen": 3950704, "step": 6475 }, { "epoch": 2.0105491777846725, "grad_norm": 1.0193300247192383, "learning_rate": 9.999992016342509e-06, "loss": 0.2315, "num_input_tokens_seen": 3953296, "step": 6480 }, { "epoch": 2.0121005274588892, "grad_norm": 0.4575270116329193, "learning_rate": 9.99998941377372e-06, "loss": 0.2357, "num_input_tokens_seen": 3956176, "step": 6485 }, { "epoch": 2.013651877133106, "grad_norm": 0.3547806739807129, "learning_rate": 9.999986444646499e-06, "loss": 0.2365, "num_input_tokens_seen": 3963376, "step": 6490 }, { "epoch": 2.0152032268073223, "grad_norm": 0.17411251366138458, "learning_rate": 9.999983108961064e-06, "loss": 0.2336, "num_input_tokens_seen": 3966256, "step": 6495 }, { "epoch": 2.016754576481539, "grad_norm": 0.25537019968032837, "learning_rate": 9.999979406717657e-06, "loss": 0.234, "num_input_tokens_seen": 3968848, "step": 6500 }, { "epoch": 2.0183059261557554, "grad_norm": 0.13866567611694336, "learning_rate": 9.999975337916551e-06, "loss": 0.238, "num_input_tokens_seen": 3971408, "step": 6505 }, { "epoch": 2.019857275829972, "grad_norm": 0.30794715881347656, "learning_rate": 9.999970902558046e-06, "loss": 0.2316, "num_input_tokens_seen": 3974448, "step": 6510 }, { "epoch": 2.0214086255041885, "grad_norm": 0.3135371506214142, "learning_rate": 9.999966100642464e-06, "loss": 0.2327, "num_input_tokens_seen": 3977328, "step": 6515 }, { "epoch": 2.0229599751784053, "grad_norm": 0.12142431735992432, "learning_rate": 9.999960932170158e-06, "loss": 0.2369, "num_input_tokens_seen": 3979696, "step": 6520 }, { "epoch": 2.0245113248526216, "grad_norm": 0.5448927879333496, "learning_rate": 9.999955397141509e-06, "loss": 0.2336, "num_input_tokens_seen": 3983440, "step": 6525 }, { "epoch": 2.0260626745268384, "grad_norm": 0.29124388098716736, "learning_rate": 9.99994949555692e-06, "loss": 0.2317, "num_input_tokens_seen": 3986704, "step": 6530 }, { "epoch": 2.0276140242010547, "grad_norm": 0.08049836754798889, "learning_rate": 9.999943227416823e-06, "loss": 0.2296, "num_input_tokens_seen": 3990352, "step": 6535 }, { "epoch": 2.0291653738752715, "grad_norm": 0.27110499143600464, "learning_rate": 9.999936592721682e-06, "loss": 0.227, "num_input_tokens_seen": 3992752, "step": 6540 }, { "epoch": 2.030716723549488, "grad_norm": 0.5376902222633362, "learning_rate": 9.99992959147198e-06, "loss": 0.2369, "num_input_tokens_seen": 3995568, "step": 6545 }, { "epoch": 2.0322680732237046, "grad_norm": 0.2924182415008545, "learning_rate": 9.99992222366823e-06, "loss": 0.2294, "num_input_tokens_seen": 3999408, "step": 6550 }, { "epoch": 2.0338194228979214, "grad_norm": 0.5349140167236328, "learning_rate": 9.999914489310973e-06, "loss": 0.2315, "num_input_tokens_seen": 4001680, "step": 6555 }, { "epoch": 2.0353707725721377, "grad_norm": 0.3491228520870209, "learning_rate": 9.999906388400777e-06, "loss": 0.2296, "num_input_tokens_seen": 4005552, "step": 6560 }, { "epoch": 2.0369221222463545, "grad_norm": 0.15546450018882751, "learning_rate": 9.999897920938235e-06, "loss": 0.2351, "num_input_tokens_seen": 4008336, "step": 6565 }, { "epoch": 2.038473471920571, "grad_norm": 0.2548375427722931, "learning_rate": 9.999889086923967e-06, "loss": 0.2316, "num_input_tokens_seen": 4010608, "step": 6570 }, { "epoch": 2.0400248215947876, "grad_norm": 0.5612536668777466, "learning_rate": 9.999879886358622e-06, "loss": 0.2258, "num_input_tokens_seen": 4015216, "step": 6575 }, { "epoch": 2.041576171269004, "grad_norm": 0.21269291639328003, "learning_rate": 9.999870319242872e-06, "loss": 0.2309, "num_input_tokens_seen": 4018800, "step": 6580 }, { "epoch": 2.0431275209432207, "grad_norm": 0.39947059750556946, "learning_rate": 9.999860385577422e-06, "loss": 0.2405, "num_input_tokens_seen": 4022416, "step": 6585 }, { "epoch": 2.044678870617437, "grad_norm": 0.267206609249115, "learning_rate": 9.999850085362998e-06, "loss": 0.2265, "num_input_tokens_seen": 4026640, "step": 6590 }, { "epoch": 2.046230220291654, "grad_norm": 0.5667111277580261, "learning_rate": 9.999839418600357e-06, "loss": 0.2315, "num_input_tokens_seen": 4029552, "step": 6595 }, { "epoch": 2.04778156996587, "grad_norm": 0.2860625088214874, "learning_rate": 9.999828385290279e-06, "loss": 0.2293, "num_input_tokens_seen": 4032912, "step": 6600 }, { "epoch": 2.049332919640087, "grad_norm": 0.18995019793510437, "learning_rate": 9.999816985433573e-06, "loss": 0.2253, "num_input_tokens_seen": 4035792, "step": 6605 }, { "epoch": 2.0508842693143032, "grad_norm": 0.17794524133205414, "learning_rate": 9.999805219031076e-06, "loss": 0.2277, "num_input_tokens_seen": 4038160, "step": 6610 }, { "epoch": 2.05243561898852, "grad_norm": 0.6433579325675964, "learning_rate": 9.99979308608365e-06, "loss": 0.2303, "num_input_tokens_seen": 4041872, "step": 6615 }, { "epoch": 2.053986968662737, "grad_norm": 1.2379636764526367, "learning_rate": 9.999780586592182e-06, "loss": 0.2409, "num_input_tokens_seen": 4046224, "step": 6620 }, { "epoch": 2.055538318336953, "grad_norm": 0.49489179253578186, "learning_rate": 9.999767720557593e-06, "loss": 0.2536, "num_input_tokens_seen": 4049584, "step": 6625 }, { "epoch": 2.05708966801117, "grad_norm": 0.21050357818603516, "learning_rate": 9.999754487980824e-06, "loss": 0.2225, "num_input_tokens_seen": 4052304, "step": 6630 }, { "epoch": 2.058641017685386, "grad_norm": 0.12704649567604065, "learning_rate": 9.999740888862844e-06, "loss": 0.234, "num_input_tokens_seen": 4055152, "step": 6635 }, { "epoch": 2.060192367359603, "grad_norm": 0.16934575140476227, "learning_rate": 9.999726923204651e-06, "loss": 0.2337, "num_input_tokens_seen": 4058512, "step": 6640 }, { "epoch": 2.0617437170338193, "grad_norm": 0.07832903414964676, "learning_rate": 9.99971259100727e-06, "loss": 0.2335, "num_input_tokens_seen": 4060656, "step": 6645 }, { "epoch": 2.063295066708036, "grad_norm": 0.5137988328933716, "learning_rate": 9.99969789227175e-06, "loss": 0.2296, "num_input_tokens_seen": 4065232, "step": 6650 }, { "epoch": 2.0648464163822524, "grad_norm": 0.25855085253715515, "learning_rate": 9.999682826999169e-06, "loss": 0.2254, "num_input_tokens_seen": 4068048, "step": 6655 }, { "epoch": 2.066397766056469, "grad_norm": 0.2773350179195404, "learning_rate": 9.999667395190633e-06, "loss": 0.2349, "num_input_tokens_seen": 4070544, "step": 6660 }, { "epoch": 2.0679491157306855, "grad_norm": 0.24235455691814423, "learning_rate": 9.999651596847271e-06, "loss": 0.2327, "num_input_tokens_seen": 4073296, "step": 6665 }, { "epoch": 2.0695004654049023, "grad_norm": 0.2753422260284424, "learning_rate": 9.999635431970243e-06, "loss": 0.2284, "num_input_tokens_seen": 4076368, "step": 6670 }, { "epoch": 2.0710518150791186, "grad_norm": 0.23971563577651978, "learning_rate": 9.999618900560731e-06, "loss": 0.2336, "num_input_tokens_seen": 4078992, "step": 6675 }, { "epoch": 2.0726031647533354, "grad_norm": 0.21871398389339447, "learning_rate": 9.999602002619951e-06, "loss": 0.2295, "num_input_tokens_seen": 4082256, "step": 6680 }, { "epoch": 2.074154514427552, "grad_norm": 0.20693105459213257, "learning_rate": 9.99958473814914e-06, "loss": 0.2317, "num_input_tokens_seen": 4084912, "step": 6685 }, { "epoch": 2.0757058641017685, "grad_norm": 0.22474397718906403, "learning_rate": 9.999567107149564e-06, "loss": 0.2342, "num_input_tokens_seen": 4088656, "step": 6690 }, { "epoch": 2.0772572137759853, "grad_norm": 0.23763518035411835, "learning_rate": 9.999549109622515e-06, "loss": 0.2346, "num_input_tokens_seen": 4091536, "step": 6695 }, { "epoch": 2.0788085634502016, "grad_norm": 0.07935638725757599, "learning_rate": 9.999530745569312e-06, "loss": 0.2369, "num_input_tokens_seen": 4095504, "step": 6700 }, { "epoch": 2.0803599131244184, "grad_norm": 0.04801783338189125, "learning_rate": 9.999512014991303e-06, "loss": 0.2213, "num_input_tokens_seen": 4098448, "step": 6705 }, { "epoch": 2.0819112627986347, "grad_norm": 0.09782169759273529, "learning_rate": 9.99949291788986e-06, "loss": 0.2376, "num_input_tokens_seen": 4101488, "step": 6710 }, { "epoch": 2.0834626124728515, "grad_norm": 0.4285088777542114, "learning_rate": 9.999473454266384e-06, "loss": 0.2184, "num_input_tokens_seen": 4104656, "step": 6715 }, { "epoch": 2.085013962147068, "grad_norm": 0.09460990875959396, "learning_rate": 9.9994536241223e-06, "loss": 0.2369, "num_input_tokens_seen": 4106992, "step": 6720 }, { "epoch": 2.0865653118212846, "grad_norm": 0.10151991993188858, "learning_rate": 9.999433427459063e-06, "loss": 0.2388, "num_input_tokens_seen": 4109712, "step": 6725 }, { "epoch": 2.088116661495501, "grad_norm": 0.5277295708656311, "learning_rate": 9.999412864278154e-06, "loss": 0.2329, "num_input_tokens_seen": 4113168, "step": 6730 }, { "epoch": 2.0896680111697177, "grad_norm": 0.07480164617300034, "learning_rate": 9.99939193458108e-06, "loss": 0.2386, "num_input_tokens_seen": 4117904, "step": 6735 }, { "epoch": 2.091219360843934, "grad_norm": 0.07137972116470337, "learning_rate": 9.999370638369377e-06, "loss": 0.2283, "num_input_tokens_seen": 4122320, "step": 6740 }, { "epoch": 2.092770710518151, "grad_norm": 0.22048187255859375, "learning_rate": 9.999348975644603e-06, "loss": 0.2293, "num_input_tokens_seen": 4125008, "step": 6745 }, { "epoch": 2.0943220601923676, "grad_norm": 0.061252281069755554, "learning_rate": 9.999326946408347e-06, "loss": 0.2337, "num_input_tokens_seen": 4127408, "step": 6750 }, { "epoch": 2.095873409866584, "grad_norm": 0.06537853181362152, "learning_rate": 9.999304550662228e-06, "loss": 0.2314, "num_input_tokens_seen": 4129936, "step": 6755 }, { "epoch": 2.0974247595408007, "grad_norm": 0.21936601400375366, "learning_rate": 9.999281788407882e-06, "loss": 0.2289, "num_input_tokens_seen": 4132816, "step": 6760 }, { "epoch": 2.098976109215017, "grad_norm": 0.10837429761886597, "learning_rate": 9.999258659646982e-06, "loss": 0.2295, "num_input_tokens_seen": 4135728, "step": 6765 }, { "epoch": 2.1005274588892338, "grad_norm": 0.0905844047665596, "learning_rate": 9.999235164381222e-06, "loss": 0.2298, "num_input_tokens_seen": 4140496, "step": 6770 }, { "epoch": 2.10207880856345, "grad_norm": 0.25469034910202026, "learning_rate": 9.999211302612323e-06, "loss": 0.2303, "num_input_tokens_seen": 4143984, "step": 6775 }, { "epoch": 2.103630158237667, "grad_norm": 0.07194621115922928, "learning_rate": 9.999187074342039e-06, "loss": 0.235, "num_input_tokens_seen": 4147248, "step": 6780 }, { "epoch": 2.105181507911883, "grad_norm": 0.223112553358078, "learning_rate": 9.999162479572142e-06, "loss": 0.2288, "num_input_tokens_seen": 4150928, "step": 6785 }, { "epoch": 2.1067328575861, "grad_norm": 0.11458857357501984, "learning_rate": 9.999137518304436e-06, "loss": 0.2249, "num_input_tokens_seen": 4153648, "step": 6790 }, { "epoch": 2.1082842072603163, "grad_norm": 0.36709722876548767, "learning_rate": 9.999112190540751e-06, "loss": 0.242, "num_input_tokens_seen": 4158032, "step": 6795 }, { "epoch": 2.109835556934533, "grad_norm": 0.1609109491109848, "learning_rate": 9.999086496282945e-06, "loss": 0.231, "num_input_tokens_seen": 4160976, "step": 6800 }, { "epoch": 2.1113869066087494, "grad_norm": 0.47167646884918213, "learning_rate": 9.9990604355329e-06, "loss": 0.2283, "num_input_tokens_seen": 4164016, "step": 6805 }, { "epoch": 2.112938256282966, "grad_norm": 0.31676483154296875, "learning_rate": 9.999034008292527e-06, "loss": 0.2332, "num_input_tokens_seen": 4166704, "step": 6810 }, { "epoch": 2.114489605957183, "grad_norm": 0.2274055778980255, "learning_rate": 9.999007214563765e-06, "loss": 0.2176, "num_input_tokens_seen": 4170640, "step": 6815 }, { "epoch": 2.1160409556313993, "grad_norm": 0.3418246805667877, "learning_rate": 9.998980054348575e-06, "loss": 0.2306, "num_input_tokens_seen": 4173104, "step": 6820 }, { "epoch": 2.117592305305616, "grad_norm": 0.468584805727005, "learning_rate": 9.998952527648953e-06, "loss": 0.2326, "num_input_tokens_seen": 4176304, "step": 6825 }, { "epoch": 2.1191436549798324, "grad_norm": 0.21367274224758148, "learning_rate": 9.998924634466913e-06, "loss": 0.2214, "num_input_tokens_seen": 4179824, "step": 6830 }, { "epoch": 2.120695004654049, "grad_norm": 0.4440143406391144, "learning_rate": 9.9988963748045e-06, "loss": 0.227, "num_input_tokens_seen": 4183440, "step": 6835 }, { "epoch": 2.1222463543282655, "grad_norm": 0.19212616980075836, "learning_rate": 9.998867748663789e-06, "loss": 0.2365, "num_input_tokens_seen": 4185808, "step": 6840 }, { "epoch": 2.1237977040024822, "grad_norm": 0.12014450132846832, "learning_rate": 9.998838756046876e-06, "loss": 0.2616, "num_input_tokens_seen": 4190096, "step": 6845 }, { "epoch": 2.1253490536766986, "grad_norm": 0.21316012740135193, "learning_rate": 9.998809396955887e-06, "loss": 0.2393, "num_input_tokens_seen": 4192592, "step": 6850 }, { "epoch": 2.1269004033509153, "grad_norm": 0.08344614505767822, "learning_rate": 9.998779671392974e-06, "loss": 0.2301, "num_input_tokens_seen": 4195472, "step": 6855 }, { "epoch": 2.1284517530251317, "grad_norm": 0.20700670778751373, "learning_rate": 9.998749579360316e-06, "loss": 0.2281, "num_input_tokens_seen": 4198224, "step": 6860 }, { "epoch": 2.1300031026993484, "grad_norm": 0.10290107131004333, "learning_rate": 9.998719120860121e-06, "loss": 0.235, "num_input_tokens_seen": 4200848, "step": 6865 }, { "epoch": 2.131554452373565, "grad_norm": 0.43049800395965576, "learning_rate": 9.99868829589462e-06, "loss": 0.2343, "num_input_tokens_seen": 4203184, "step": 6870 }, { "epoch": 2.1331058020477816, "grad_norm": 0.20070688426494598, "learning_rate": 9.998657104466075e-06, "loss": 0.2256, "num_input_tokens_seen": 4206448, "step": 6875 }, { "epoch": 2.1346571517219983, "grad_norm": 0.22103449702262878, "learning_rate": 9.998625546576769e-06, "loss": 0.2261, "num_input_tokens_seen": 4209264, "step": 6880 }, { "epoch": 2.1362085013962147, "grad_norm": 0.17441394925117493, "learning_rate": 9.998593622229018e-06, "loss": 0.2255, "num_input_tokens_seen": 4211888, "step": 6885 }, { "epoch": 2.1377598510704314, "grad_norm": 0.3045758903026581, "learning_rate": 9.998561331425164e-06, "loss": 0.2455, "num_input_tokens_seen": 4215216, "step": 6890 }, { "epoch": 2.1393112007446478, "grad_norm": 0.1700555980205536, "learning_rate": 9.99852867416757e-06, "loss": 0.2258, "num_input_tokens_seen": 4218320, "step": 6895 }, { "epoch": 2.1408625504188645, "grad_norm": 0.09912616014480591, "learning_rate": 9.998495650458637e-06, "loss": 0.233, "num_input_tokens_seen": 4221008, "step": 6900 }, { "epoch": 2.142413900093081, "grad_norm": 0.1918186992406845, "learning_rate": 9.998462260300779e-06, "loss": 0.2277, "num_input_tokens_seen": 4227728, "step": 6905 }, { "epoch": 2.1439652497672976, "grad_norm": 0.20523278415203094, "learning_rate": 9.998428503696447e-06, "loss": 0.2185, "num_input_tokens_seen": 4231056, "step": 6910 }, { "epoch": 2.145516599441514, "grad_norm": 0.2731011211872101, "learning_rate": 9.998394380648115e-06, "loss": 0.239, "num_input_tokens_seen": 4233744, "step": 6915 }, { "epoch": 2.1470679491157307, "grad_norm": 0.11236749589443207, "learning_rate": 9.998359891158287e-06, "loss": 0.2432, "num_input_tokens_seen": 4237776, "step": 6920 }, { "epoch": 2.148619298789947, "grad_norm": 0.20166252553462982, "learning_rate": 9.998325035229488e-06, "loss": 0.2293, "num_input_tokens_seen": 4241456, "step": 6925 }, { "epoch": 2.150170648464164, "grad_norm": 0.20530365407466888, "learning_rate": 9.998289812864276e-06, "loss": 0.2307, "num_input_tokens_seen": 4244336, "step": 6930 }, { "epoch": 2.15172199813838, "grad_norm": 0.4238809049129486, "learning_rate": 9.998254224065229e-06, "loss": 0.2304, "num_input_tokens_seen": 4246736, "step": 6935 }, { "epoch": 2.153273347812597, "grad_norm": 0.08943674713373184, "learning_rate": 9.998218268834962e-06, "loss": 0.2292, "num_input_tokens_seen": 4250320, "step": 6940 }, { "epoch": 2.1548246974868137, "grad_norm": 0.23035824298858643, "learning_rate": 9.998181947176106e-06, "loss": 0.2272, "num_input_tokens_seen": 4252560, "step": 6945 }, { "epoch": 2.15637604716103, "grad_norm": 0.10183678567409515, "learning_rate": 9.998145259091329e-06, "loss": 0.2333, "num_input_tokens_seen": 4255344, "step": 6950 }, { "epoch": 2.157927396835247, "grad_norm": 0.33374473452568054, "learning_rate": 9.998108204583316e-06, "loss": 0.2321, "num_input_tokens_seen": 4257680, "step": 6955 }, { "epoch": 2.159478746509463, "grad_norm": 0.21798577904701233, "learning_rate": 9.998070783654786e-06, "loss": 0.2299, "num_input_tokens_seen": 4261232, "step": 6960 }, { "epoch": 2.16103009618368, "grad_norm": 0.11307048797607422, "learning_rate": 9.99803299630848e-06, "loss": 0.2372, "num_input_tokens_seen": 4263472, "step": 6965 }, { "epoch": 2.1625814458578962, "grad_norm": 0.08435528725385666, "learning_rate": 9.99799484254717e-06, "loss": 0.2394, "num_input_tokens_seen": 4265936, "step": 6970 }, { "epoch": 2.164132795532113, "grad_norm": 0.068455271422863, "learning_rate": 9.997956322373655e-06, "loss": 0.2284, "num_input_tokens_seen": 4268496, "step": 6975 }, { "epoch": 2.1656841452063293, "grad_norm": 0.1922585666179657, "learning_rate": 9.997917435790753e-06, "loss": 0.2306, "num_input_tokens_seen": 4271056, "step": 6980 }, { "epoch": 2.167235494880546, "grad_norm": 0.20736968517303467, "learning_rate": 9.997878182801322e-06, "loss": 0.2335, "num_input_tokens_seen": 4273616, "step": 6985 }, { "epoch": 2.1687868445547624, "grad_norm": 0.08494336903095245, "learning_rate": 9.997838563408236e-06, "loss": 0.2294, "num_input_tokens_seen": 4276624, "step": 6990 }, { "epoch": 2.1703381942289792, "grad_norm": 0.19409394264221191, "learning_rate": 9.9977985776144e-06, "loss": 0.2374, "num_input_tokens_seen": 4279664, "step": 6995 }, { "epoch": 2.1718895439031956, "grad_norm": 0.2035490870475769, "learning_rate": 9.997758225422745e-06, "loss": 0.2329, "num_input_tokens_seen": 4282320, "step": 7000 }, { "epoch": 2.1734408935774123, "grad_norm": 0.21729612350463867, "learning_rate": 9.997717506836229e-06, "loss": 0.2269, "num_input_tokens_seen": 4284976, "step": 7005 }, { "epoch": 2.174992243251629, "grad_norm": 0.06116090714931488, "learning_rate": 9.99767642185784e-06, "loss": 0.2295, "num_input_tokens_seen": 4287728, "step": 7010 }, { "epoch": 2.1765435929258454, "grad_norm": 0.18443039059638977, "learning_rate": 9.997634970490586e-06, "loss": 0.2211, "num_input_tokens_seen": 4290096, "step": 7015 }, { "epoch": 2.178094942600062, "grad_norm": 0.23846682906150818, "learning_rate": 9.997593152737508e-06, "loss": 0.2352, "num_input_tokens_seen": 4294864, "step": 7020 }, { "epoch": 2.1796462922742785, "grad_norm": 0.2620568871498108, "learning_rate": 9.997550968601673e-06, "loss": 0.2393, "num_input_tokens_seen": 4297008, "step": 7025 }, { "epoch": 2.1811976419484953, "grad_norm": 0.23652765154838562, "learning_rate": 9.99750841808617e-06, "loss": 0.2365, "num_input_tokens_seen": 4299728, "step": 7030 }, { "epoch": 2.1827489916227116, "grad_norm": 0.38888856768608093, "learning_rate": 9.997465501194123e-06, "loss": 0.2348, "num_input_tokens_seen": 4302672, "step": 7035 }, { "epoch": 2.1843003412969284, "grad_norm": 0.20425385236740112, "learning_rate": 9.997422217928674e-06, "loss": 0.2304, "num_input_tokens_seen": 4305104, "step": 7040 }, { "epoch": 2.1858516909711447, "grad_norm": 0.37569740414619446, "learning_rate": 9.997378568292998e-06, "loss": 0.2303, "num_input_tokens_seen": 4307728, "step": 7045 }, { "epoch": 2.1874030406453615, "grad_norm": 0.23351891338825226, "learning_rate": 9.997334552290296e-06, "loss": 0.2357, "num_input_tokens_seen": 4311152, "step": 7050 }, { "epoch": 2.188954390319578, "grad_norm": 0.21318751573562622, "learning_rate": 9.997290169923794e-06, "loss": 0.232, "num_input_tokens_seen": 4313488, "step": 7055 }, { "epoch": 2.1905057399937946, "grad_norm": 0.20734275877475739, "learning_rate": 9.997245421196746e-06, "loss": 0.2305, "num_input_tokens_seen": 4316592, "step": 7060 }, { "epoch": 2.192057089668011, "grad_norm": 0.20531345903873444, "learning_rate": 9.997200306112433e-06, "loss": 0.2368, "num_input_tokens_seen": 4319536, "step": 7065 }, { "epoch": 2.1936084393422277, "grad_norm": 0.06349547207355499, "learning_rate": 9.99715482467416e-06, "loss": 0.2321, "num_input_tokens_seen": 4322928, "step": 7070 }, { "epoch": 2.1951597890164445, "grad_norm": 0.3746649920940399, "learning_rate": 9.997108976885266e-06, "loss": 0.2264, "num_input_tokens_seen": 4326384, "step": 7075 }, { "epoch": 2.196711138690661, "grad_norm": 0.2018752098083496, "learning_rate": 9.997062762749107e-06, "loss": 0.2342, "num_input_tokens_seen": 4329040, "step": 7080 }, { "epoch": 2.1982624883648776, "grad_norm": 0.37831777334213257, "learning_rate": 9.997016182269074e-06, "loss": 0.228, "num_input_tokens_seen": 4332720, "step": 7085 }, { "epoch": 2.199813838039094, "grad_norm": 0.20283988118171692, "learning_rate": 9.99696923544858e-06, "loss": 0.2318, "num_input_tokens_seen": 4335632, "step": 7090 }, { "epoch": 2.2013651877133107, "grad_norm": 0.19828149676322937, "learning_rate": 9.996921922291069e-06, "loss": 0.2307, "num_input_tokens_seen": 4338192, "step": 7095 }, { "epoch": 2.202916537387527, "grad_norm": 0.21503408253192902, "learning_rate": 9.99687424280001e-06, "loss": 0.2282, "num_input_tokens_seen": 4340976, "step": 7100 }, { "epoch": 2.204467887061744, "grad_norm": 0.23001046478748322, "learning_rate": 9.996826196978898e-06, "loss": 0.2377, "num_input_tokens_seen": 4343344, "step": 7105 }, { "epoch": 2.20601923673596, "grad_norm": 0.06869626045227051, "learning_rate": 9.996777784831251e-06, "loss": 0.2293, "num_input_tokens_seen": 4346736, "step": 7110 }, { "epoch": 2.207570586410177, "grad_norm": 0.20662161707878113, "learning_rate": 9.996729006360624e-06, "loss": 0.2295, "num_input_tokens_seen": 4350160, "step": 7115 }, { "epoch": 2.2091219360843932, "grad_norm": 0.06350136548280716, "learning_rate": 9.99667986157059e-06, "loss": 0.2321, "num_input_tokens_seen": 4352560, "step": 7120 }, { "epoch": 2.21067328575861, "grad_norm": 0.24658142030239105, "learning_rate": 9.996630350464753e-06, "loss": 0.2315, "num_input_tokens_seen": 4355952, "step": 7125 }, { "epoch": 2.2122246354328263, "grad_norm": 0.08294808119535446, "learning_rate": 9.996580473046743e-06, "loss": 0.2305, "num_input_tokens_seen": 4358768, "step": 7130 }, { "epoch": 2.213775985107043, "grad_norm": 0.4420052170753479, "learning_rate": 9.996530229320213e-06, "loss": 0.2305, "num_input_tokens_seen": 4361360, "step": 7135 }, { "epoch": 2.21532733478126, "grad_norm": 0.08929982781410217, "learning_rate": 9.996479619288853e-06, "loss": 0.2238, "num_input_tokens_seen": 4363792, "step": 7140 }, { "epoch": 2.216878684455476, "grad_norm": 0.2608608603477478, "learning_rate": 9.996428642956365e-06, "loss": 0.234, "num_input_tokens_seen": 4367312, "step": 7145 }, { "epoch": 2.218430034129693, "grad_norm": 0.26118820905685425, "learning_rate": 9.996377300326496e-06, "loss": 0.2356, "num_input_tokens_seen": 4370704, "step": 7150 }, { "epoch": 2.2199813838039093, "grad_norm": 0.06048394367098808, "learning_rate": 9.996325591403003e-06, "loss": 0.2354, "num_input_tokens_seen": 4373744, "step": 7155 }, { "epoch": 2.221532733478126, "grad_norm": 0.21328593790531158, "learning_rate": 9.996273516189678e-06, "loss": 0.2326, "num_input_tokens_seen": 4377328, "step": 7160 }, { "epoch": 2.2230840831523424, "grad_norm": 0.21334585547447205, "learning_rate": 9.99622107469034e-06, "loss": 0.2326, "num_input_tokens_seen": 4381072, "step": 7165 }, { "epoch": 2.224635432826559, "grad_norm": 0.19079245626926422, "learning_rate": 9.996168266908835e-06, "loss": 0.2275, "num_input_tokens_seen": 4384368, "step": 7170 }, { "epoch": 2.2261867825007755, "grad_norm": 0.060872167348861694, "learning_rate": 9.996115092849031e-06, "loss": 0.2279, "num_input_tokens_seen": 4387184, "step": 7175 }, { "epoch": 2.2277381321749923, "grad_norm": 0.3255065381526947, "learning_rate": 9.996061552514827e-06, "loss": 0.2333, "num_input_tokens_seen": 4389872, "step": 7180 }, { "epoch": 2.2292894818492086, "grad_norm": 0.280845582485199, "learning_rate": 9.99600764591015e-06, "loss": 0.2149, "num_input_tokens_seen": 4392944, "step": 7185 }, { "epoch": 2.2308408315234254, "grad_norm": 0.14994674921035767, "learning_rate": 9.995953373038951e-06, "loss": 0.2225, "num_input_tokens_seen": 4396112, "step": 7190 }, { "epoch": 2.232392181197642, "grad_norm": 0.17635005712509155, "learning_rate": 9.995898733905209e-06, "loss": 0.2203, "num_input_tokens_seen": 4399184, "step": 7195 }, { "epoch": 2.2339435308718585, "grad_norm": 0.472881555557251, "learning_rate": 9.995843728512929e-06, "loss": 0.229, "num_input_tokens_seen": 4401552, "step": 7200 }, { "epoch": 2.2354948805460753, "grad_norm": 0.24788261950016022, "learning_rate": 9.995788356866143e-06, "loss": 0.2148, "num_input_tokens_seen": 4404592, "step": 7205 }, { "epoch": 2.2370462302202916, "grad_norm": 0.5467087030410767, "learning_rate": 9.995732618968914e-06, "loss": 0.2515, "num_input_tokens_seen": 4408496, "step": 7210 }, { "epoch": 2.2385975798945084, "grad_norm": 0.5715152621269226, "learning_rate": 9.995676514825323e-06, "loss": 0.2467, "num_input_tokens_seen": 4410736, "step": 7215 }, { "epoch": 2.2401489295687247, "grad_norm": 0.27835536003112793, "learning_rate": 9.995620044439488e-06, "loss": 0.2273, "num_input_tokens_seen": 4413584, "step": 7220 }, { "epoch": 2.2417002792429415, "grad_norm": 0.1674724668264389, "learning_rate": 9.995563207815546e-06, "loss": 0.2282, "num_input_tokens_seen": 4416208, "step": 7225 }, { "epoch": 2.243251628917158, "grad_norm": 0.4214390814304352, "learning_rate": 9.995506004957664e-06, "loss": 0.2382, "num_input_tokens_seen": 4418864, "step": 7230 }, { "epoch": 2.2448029785913746, "grad_norm": 0.05248064920306206, "learning_rate": 9.995448435870036e-06, "loss": 0.2209, "num_input_tokens_seen": 4421200, "step": 7235 }, { "epoch": 2.246354328265591, "grad_norm": 0.36504170298576355, "learning_rate": 9.995390500556883e-06, "loss": 0.2283, "num_input_tokens_seen": 4424048, "step": 7240 }, { "epoch": 2.2479056779398077, "grad_norm": 0.23336976766586304, "learning_rate": 9.995332199022454e-06, "loss": 0.233, "num_input_tokens_seen": 4427472, "step": 7245 }, { "epoch": 2.249457027614024, "grad_norm": 0.05455336719751358, "learning_rate": 9.99527353127102e-06, "loss": 0.2313, "num_input_tokens_seen": 4429552, "step": 7250 }, { "epoch": 2.2510083772882408, "grad_norm": 0.21777942776679993, "learning_rate": 9.995214497306883e-06, "loss": 0.2338, "num_input_tokens_seen": 4432592, "step": 7255 }, { "epoch": 2.252559726962457, "grad_norm": 0.07673737406730652, "learning_rate": 9.99515509713437e-06, "loss": 0.2337, "num_input_tokens_seen": 4435472, "step": 7260 }, { "epoch": 2.254111076636674, "grad_norm": 0.19804058969020844, "learning_rate": 9.995095330757838e-06, "loss": 0.2283, "num_input_tokens_seen": 4437808, "step": 7265 }, { "epoch": 2.2556624263108906, "grad_norm": 0.22395597398281097, "learning_rate": 9.995035198181667e-06, "loss": 0.2347, "num_input_tokens_seen": 4440752, "step": 7270 }, { "epoch": 2.257213775985107, "grad_norm": 0.08144346624612808, "learning_rate": 9.994974699410267e-06, "loss": 0.2281, "num_input_tokens_seen": 4444432, "step": 7275 }, { "epoch": 2.2587651256593237, "grad_norm": 0.22111405432224274, "learning_rate": 9.994913834448071e-06, "loss": 0.237, "num_input_tokens_seen": 4447024, "step": 7280 }, { "epoch": 2.26031647533354, "grad_norm": 0.18257057666778564, "learning_rate": 9.994852603299544e-06, "loss": 0.2303, "num_input_tokens_seen": 4450096, "step": 7285 }, { "epoch": 2.261867825007757, "grad_norm": 0.38741496205329895, "learning_rate": 9.994791005969172e-06, "loss": 0.2236, "num_input_tokens_seen": 4453392, "step": 7290 }, { "epoch": 2.263419174681973, "grad_norm": 0.1755397766828537, "learning_rate": 9.99472904246147e-06, "loss": 0.2306, "num_input_tokens_seen": 4456944, "step": 7295 }, { "epoch": 2.26497052435619, "grad_norm": 0.19408002495765686, "learning_rate": 9.994666712780986e-06, "loss": 0.2319, "num_input_tokens_seen": 4460112, "step": 7300 }, { "epoch": 2.2665218740304063, "grad_norm": 0.2738293409347534, "learning_rate": 9.994604016932285e-06, "loss": 0.2311, "num_input_tokens_seen": 4462896, "step": 7305 }, { "epoch": 2.268073223704623, "grad_norm": 0.2517591416835785, "learning_rate": 9.994540954919964e-06, "loss": 0.2559, "num_input_tokens_seen": 4465520, "step": 7310 }, { "epoch": 2.26962457337884, "grad_norm": 0.0955648198723793, "learning_rate": 9.994477526748649e-06, "loss": 0.2219, "num_input_tokens_seen": 4467696, "step": 7315 }, { "epoch": 2.271175923053056, "grad_norm": 0.4414996802806854, "learning_rate": 9.994413732422985e-06, "loss": 0.2378, "num_input_tokens_seen": 4470288, "step": 7320 }, { "epoch": 2.2727272727272725, "grad_norm": 0.08522412925958633, "learning_rate": 9.994349571947655e-06, "loss": 0.2326, "num_input_tokens_seen": 4473392, "step": 7325 }, { "epoch": 2.2742786224014893, "grad_norm": 0.2520065903663635, "learning_rate": 9.994285045327356e-06, "loss": 0.2298, "num_input_tokens_seen": 4476048, "step": 7330 }, { "epoch": 2.275829972075706, "grad_norm": 0.4895300269126892, "learning_rate": 9.994220152566825e-06, "loss": 0.232, "num_input_tokens_seen": 4479376, "step": 7335 }, { "epoch": 2.2773813217499224, "grad_norm": 0.33744311332702637, "learning_rate": 9.994154893670813e-06, "loss": 0.2328, "num_input_tokens_seen": 4482352, "step": 7340 }, { "epoch": 2.278932671424139, "grad_norm": 0.1369164139032364, "learning_rate": 9.99408926864411e-06, "loss": 0.2254, "num_input_tokens_seen": 4485584, "step": 7345 }, { "epoch": 2.2804840210983555, "grad_norm": 0.2591361403465271, "learning_rate": 9.994023277491525e-06, "loss": 0.228, "num_input_tokens_seen": 4488432, "step": 7350 }, { "epoch": 2.2820353707725722, "grad_norm": 0.5846945643424988, "learning_rate": 9.993956920217896e-06, "loss": 0.2417, "num_input_tokens_seen": 4491056, "step": 7355 }, { "epoch": 2.2835867204467886, "grad_norm": 0.11919702589511871, "learning_rate": 9.993890196828084e-06, "loss": 0.2273, "num_input_tokens_seen": 4494160, "step": 7360 }, { "epoch": 2.2851380701210053, "grad_norm": 0.11506456136703491, "learning_rate": 9.993823107326988e-06, "loss": 0.2467, "num_input_tokens_seen": 4497680, "step": 7365 }, { "epoch": 2.2866894197952217, "grad_norm": 0.1156449019908905, "learning_rate": 9.99375565171952e-06, "loss": 0.2306, "num_input_tokens_seen": 4501136, "step": 7370 }, { "epoch": 2.2882407694694384, "grad_norm": 0.08398327976465225, "learning_rate": 9.99368783001063e-06, "loss": 0.2315, "num_input_tokens_seen": 4503312, "step": 7375 }, { "epoch": 2.289792119143655, "grad_norm": 0.07496263086795807, "learning_rate": 9.993619642205285e-06, "loss": 0.2263, "num_input_tokens_seen": 4505616, "step": 7380 }, { "epoch": 2.2913434688178715, "grad_norm": 0.2066762000322342, "learning_rate": 9.99355108830849e-06, "loss": 0.2353, "num_input_tokens_seen": 4508336, "step": 7385 }, { "epoch": 2.292894818492088, "grad_norm": 0.1769709438085556, "learning_rate": 9.993482168325266e-06, "loss": 0.235, "num_input_tokens_seen": 4511248, "step": 7390 }, { "epoch": 2.2944461681663046, "grad_norm": 0.07225868105888367, "learning_rate": 9.993412882260667e-06, "loss": 0.2341, "num_input_tokens_seen": 4514800, "step": 7395 }, { "epoch": 2.2959975178405214, "grad_norm": 0.18595916032791138, "learning_rate": 9.993343230119774e-06, "loss": 0.2315, "num_input_tokens_seen": 4517360, "step": 7400 }, { "epoch": 2.2975488675147377, "grad_norm": 0.35849422216415405, "learning_rate": 9.993273211907693e-06, "loss": 0.2278, "num_input_tokens_seen": 4520432, "step": 7405 }, { "epoch": 2.2991002171889545, "grad_norm": 0.23347117006778717, "learning_rate": 9.993202827629555e-06, "loss": 0.237, "num_input_tokens_seen": 4522640, "step": 7410 }, { "epoch": 2.300651566863171, "grad_norm": 0.2408764660358429, "learning_rate": 9.993132077290522e-06, "loss": 0.2248, "num_input_tokens_seen": 4525104, "step": 7415 }, { "epoch": 2.3022029165373876, "grad_norm": 0.25177231431007385, "learning_rate": 9.99306096089578e-06, "loss": 0.2322, "num_input_tokens_seen": 4527536, "step": 7420 }, { "epoch": 2.303754266211604, "grad_norm": 0.0608593225479126, "learning_rate": 9.992989478450542e-06, "loss": 0.2361, "num_input_tokens_seen": 4530352, "step": 7425 }, { "epoch": 2.3053056158858207, "grad_norm": 0.08254372328519821, "learning_rate": 9.99291762996005e-06, "loss": 0.2191, "num_input_tokens_seen": 4533328, "step": 7430 }, { "epoch": 2.306856965560037, "grad_norm": 0.1729871928691864, "learning_rate": 9.99284541542957e-06, "loss": 0.237, "num_input_tokens_seen": 4536816, "step": 7435 }, { "epoch": 2.308408315234254, "grad_norm": 0.22252391278743744, "learning_rate": 9.9927728348644e-06, "loss": 0.2361, "num_input_tokens_seen": 4539344, "step": 7440 }, { "epoch": 2.3099596649084706, "grad_norm": 0.35389524698257446, "learning_rate": 9.992699888269854e-06, "loss": 0.2275, "num_input_tokens_seen": 4542416, "step": 7445 }, { "epoch": 2.311511014582687, "grad_norm": 0.3744885325431824, "learning_rate": 9.992626575651285e-06, "loss": 0.2414, "num_input_tokens_seen": 4544944, "step": 7450 }, { "epoch": 2.3130623642569033, "grad_norm": 0.21424145996570587, "learning_rate": 9.992552897014068e-06, "loss": 0.2318, "num_input_tokens_seen": 4548208, "step": 7455 }, { "epoch": 2.31461371393112, "grad_norm": 0.056794848293066025, "learning_rate": 9.992478852363601e-06, "loss": 0.2323, "num_input_tokens_seen": 4551056, "step": 7460 }, { "epoch": 2.316165063605337, "grad_norm": 0.4443068504333496, "learning_rate": 9.992404441705317e-06, "loss": 0.231, "num_input_tokens_seen": 4554352, "step": 7465 }, { "epoch": 2.317716413279553, "grad_norm": 0.19748906791210175, "learning_rate": 9.992329665044666e-06, "loss": 0.2304, "num_input_tokens_seen": 4556848, "step": 7470 }, { "epoch": 2.31926776295377, "grad_norm": 0.11047220230102539, "learning_rate": 9.992254522387133e-06, "loss": 0.2289, "num_input_tokens_seen": 4560624, "step": 7475 }, { "epoch": 2.3208191126279862, "grad_norm": 0.3570931553840637, "learning_rate": 9.992179013738227e-06, "loss": 0.2277, "num_input_tokens_seen": 4563568, "step": 7480 }, { "epoch": 2.322370462302203, "grad_norm": 0.45603644847869873, "learning_rate": 9.992103139103481e-06, "loss": 0.2447, "num_input_tokens_seen": 4566608, "step": 7485 }, { "epoch": 2.3239218119764193, "grad_norm": 0.09925100952386856, "learning_rate": 9.992026898488462e-06, "loss": 0.2353, "num_input_tokens_seen": 4572720, "step": 7490 }, { "epoch": 2.325473161650636, "grad_norm": 0.07520949840545654, "learning_rate": 9.991950291898757e-06, "loss": 0.2273, "num_input_tokens_seen": 4575024, "step": 7495 }, { "epoch": 2.3270245113248524, "grad_norm": 0.16689667105674744, "learning_rate": 9.99187331933998e-06, "loss": 0.2245, "num_input_tokens_seen": 4577936, "step": 7500 }, { "epoch": 2.328575860999069, "grad_norm": 0.07794839143753052, "learning_rate": 9.991795980817777e-06, "loss": 0.2324, "num_input_tokens_seen": 4581072, "step": 7505 }, { "epoch": 2.330127210673286, "grad_norm": 0.17081663012504578, "learning_rate": 9.991718276337818e-06, "loss": 0.2312, "num_input_tokens_seen": 4583664, "step": 7510 }, { "epoch": 2.3316785603475023, "grad_norm": 0.08901513367891312, "learning_rate": 9.991640205905799e-06, "loss": 0.2249, "num_input_tokens_seen": 4586288, "step": 7515 }, { "epoch": 2.333229910021719, "grad_norm": 0.2089061290025711, "learning_rate": 9.99156176952744e-06, "loss": 0.2385, "num_input_tokens_seen": 4588880, "step": 7520 }, { "epoch": 2.3347812596959354, "grad_norm": 0.16585999727249146, "learning_rate": 9.991482967208496e-06, "loss": 0.2325, "num_input_tokens_seen": 4593488, "step": 7525 }, { "epoch": 2.336332609370152, "grad_norm": 0.20207075774669647, "learning_rate": 9.991403798954744e-06, "loss": 0.2291, "num_input_tokens_seen": 4596816, "step": 7530 }, { "epoch": 2.3378839590443685, "grad_norm": 0.2110612839460373, "learning_rate": 9.991324264771984e-06, "loss": 0.2312, "num_input_tokens_seen": 4600048, "step": 7535 }, { "epoch": 2.3394353087185853, "grad_norm": 0.06133747473359108, "learning_rate": 9.991244364666051e-06, "loss": 0.2342, "num_input_tokens_seen": 4603184, "step": 7540 }, { "epoch": 2.3409866583928016, "grad_norm": 0.1899401843547821, "learning_rate": 9.9911640986428e-06, "loss": 0.232, "num_input_tokens_seen": 4605872, "step": 7545 }, { "epoch": 2.3425380080670184, "grad_norm": 0.04914203658699989, "learning_rate": 9.991083466708117e-06, "loss": 0.2305, "num_input_tokens_seen": 4608304, "step": 7550 }, { "epoch": 2.3440893577412347, "grad_norm": 0.18255144357681274, "learning_rate": 9.991002468867912e-06, "loss": 0.2317, "num_input_tokens_seen": 4612560, "step": 7555 }, { "epoch": 2.3456407074154515, "grad_norm": 0.3428482413291931, "learning_rate": 9.990921105128124e-06, "loss": 0.2256, "num_input_tokens_seen": 4615504, "step": 7560 }, { "epoch": 2.347192057089668, "grad_norm": 0.052129145711660385, "learning_rate": 9.99083937549472e-06, "loss": 0.2361, "num_input_tokens_seen": 4617808, "step": 7565 }, { "epoch": 2.3487434067638846, "grad_norm": 0.2363419085741043, "learning_rate": 9.990757279973685e-06, "loss": 0.2357, "num_input_tokens_seen": 4621392, "step": 7570 }, { "epoch": 2.3502947564381014, "grad_norm": 0.21204249560832977, "learning_rate": 9.990674818571044e-06, "loss": 0.2222, "num_input_tokens_seen": 4624528, "step": 7575 }, { "epoch": 2.3518461061123177, "grad_norm": 0.3570127487182617, "learning_rate": 9.990591991292839e-06, "loss": 0.2232, "num_input_tokens_seen": 4628688, "step": 7580 }, { "epoch": 2.3533974557865345, "grad_norm": 0.1541350781917572, "learning_rate": 9.990508798145147e-06, "loss": 0.2268, "num_input_tokens_seen": 4630928, "step": 7585 }, { "epoch": 2.354948805460751, "grad_norm": 0.227981299161911, "learning_rate": 9.99042523913406e-06, "loss": 0.2419, "num_input_tokens_seen": 4633488, "step": 7590 }, { "epoch": 2.3565001551349676, "grad_norm": 0.07248766720294952, "learning_rate": 9.99034131426571e-06, "loss": 0.2356, "num_input_tokens_seen": 4636880, "step": 7595 }, { "epoch": 2.358051504809184, "grad_norm": 0.20696760714054108, "learning_rate": 9.990257023546246e-06, "loss": 0.2408, "num_input_tokens_seen": 4640752, "step": 7600 }, { "epoch": 2.3596028544834007, "grad_norm": 0.09428663551807404, "learning_rate": 9.990172366981849e-06, "loss": 0.2342, "num_input_tokens_seen": 4644240, "step": 7605 }, { "epoch": 2.361154204157617, "grad_norm": 0.18981461226940155, "learning_rate": 9.990087344578724e-06, "loss": 0.2346, "num_input_tokens_seen": 4647088, "step": 7610 }, { "epoch": 2.362705553831834, "grad_norm": 0.18935972452163696, "learning_rate": 9.990001956343107e-06, "loss": 0.2312, "num_input_tokens_seen": 4649360, "step": 7615 }, { "epoch": 2.36425690350605, "grad_norm": 0.1822275072336197, "learning_rate": 9.989916202281256e-06, "loss": 0.225, "num_input_tokens_seen": 4653136, "step": 7620 }, { "epoch": 2.365808253180267, "grad_norm": 0.23505239188671112, "learning_rate": 9.989830082399456e-06, "loss": 0.2334, "num_input_tokens_seen": 4656176, "step": 7625 }, { "epoch": 2.367359602854483, "grad_norm": 0.18590359389781952, "learning_rate": 9.989743596704025e-06, "loss": 0.227, "num_input_tokens_seen": 4659024, "step": 7630 }, { "epoch": 2.3689109525287, "grad_norm": 0.17928484082221985, "learning_rate": 9.9896567452013e-06, "loss": 0.2292, "num_input_tokens_seen": 4661488, "step": 7635 }, { "epoch": 2.3704623022029168, "grad_norm": 0.0857577919960022, "learning_rate": 9.989569527897647e-06, "loss": 0.2305, "num_input_tokens_seen": 4664752, "step": 7640 }, { "epoch": 2.372013651877133, "grad_norm": 0.07931803911924362, "learning_rate": 9.989481944799465e-06, "loss": 0.238, "num_input_tokens_seen": 4667632, "step": 7645 }, { "epoch": 2.37356500155135, "grad_norm": 0.16799113154411316, "learning_rate": 9.989393995913171e-06, "loss": 0.2302, "num_input_tokens_seen": 4670288, "step": 7650 }, { "epoch": 2.375116351225566, "grad_norm": 0.07333444803953171, "learning_rate": 9.989305681245214e-06, "loss": 0.2224, "num_input_tokens_seen": 4672560, "step": 7655 }, { "epoch": 2.376667700899783, "grad_norm": 0.1742897480726242, "learning_rate": 9.989217000802068e-06, "loss": 0.2372, "num_input_tokens_seen": 4676240, "step": 7660 }, { "epoch": 2.3782190505739993, "grad_norm": 0.3897570073604584, "learning_rate": 9.989127954590236e-06, "loss": 0.2298, "num_input_tokens_seen": 4679344, "step": 7665 }, { "epoch": 2.379770400248216, "grad_norm": 0.06346157193183899, "learning_rate": 9.989038542616243e-06, "loss": 0.2295, "num_input_tokens_seen": 4681392, "step": 7670 }, { "epoch": 2.3813217499224324, "grad_norm": 0.09361934661865234, "learning_rate": 9.988948764886648e-06, "loss": 0.2265, "num_input_tokens_seen": 4684144, "step": 7675 }, { "epoch": 2.382873099596649, "grad_norm": 0.06948346644639969, "learning_rate": 9.98885862140803e-06, "loss": 0.2246, "num_input_tokens_seen": 4686864, "step": 7680 }, { "epoch": 2.3844244492708655, "grad_norm": 0.06327605992555618, "learning_rate": 9.988768112186996e-06, "loss": 0.2387, "num_input_tokens_seen": 4689840, "step": 7685 }, { "epoch": 2.3859757989450823, "grad_norm": 0.4362589418888092, "learning_rate": 9.988677237230185e-06, "loss": 0.2483, "num_input_tokens_seen": 4693232, "step": 7690 }, { "epoch": 2.3875271486192986, "grad_norm": 0.20759548246860504, "learning_rate": 9.988585996544258e-06, "loss": 0.2348, "num_input_tokens_seen": 4696176, "step": 7695 }, { "epoch": 2.3890784982935154, "grad_norm": 0.2165006548166275, "learning_rate": 9.988494390135903e-06, "loss": 0.2313, "num_input_tokens_seen": 4700784, "step": 7700 }, { "epoch": 2.390629847967732, "grad_norm": 0.1073833778500557, "learning_rate": 9.988402418011836e-06, "loss": 0.2315, "num_input_tokens_seen": 4703824, "step": 7705 }, { "epoch": 2.3921811976419485, "grad_norm": 0.2241765260696411, "learning_rate": 9.988310080178802e-06, "loss": 0.2335, "num_input_tokens_seen": 4706896, "step": 7710 }, { "epoch": 2.3937325473161652, "grad_norm": 0.19614410400390625, "learning_rate": 9.988217376643568e-06, "loss": 0.2293, "num_input_tokens_seen": 4710064, "step": 7715 }, { "epoch": 2.3952838969903816, "grad_norm": 0.21659614145755768, "learning_rate": 9.988124307412931e-06, "loss": 0.2298, "num_input_tokens_seen": 4712752, "step": 7720 }, { "epoch": 2.3968352466645984, "grad_norm": 0.05413021147251129, "learning_rate": 9.988030872493714e-06, "loss": 0.2294, "num_input_tokens_seen": 4715152, "step": 7725 }, { "epoch": 2.3983865963388147, "grad_norm": 0.23509015142917633, "learning_rate": 9.987937071892766e-06, "loss": 0.2343, "num_input_tokens_seen": 4718032, "step": 7730 }, { "epoch": 2.3999379460130315, "grad_norm": 0.06681518256664276, "learning_rate": 9.987842905616966e-06, "loss": 0.2251, "num_input_tokens_seen": 4720656, "step": 7735 }, { "epoch": 2.401489295687248, "grad_norm": 0.11340319365262985, "learning_rate": 9.987748373673217e-06, "loss": 0.2467, "num_input_tokens_seen": 4723120, "step": 7740 }, { "epoch": 2.4030406453614646, "grad_norm": 0.39046287536621094, "learning_rate": 9.987653476068447e-06, "loss": 0.2409, "num_input_tokens_seen": 4725616, "step": 7745 }, { "epoch": 2.404591995035681, "grad_norm": 0.19694775342941284, "learning_rate": 9.987558212809614e-06, "loss": 0.2389, "num_input_tokens_seen": 4728816, "step": 7750 }, { "epoch": 2.4061433447098977, "grad_norm": 0.16549544036388397, "learning_rate": 9.987462583903702e-06, "loss": 0.2273, "num_input_tokens_seen": 4731632, "step": 7755 }, { "epoch": 2.407694694384114, "grad_norm": 0.1972668170928955, "learning_rate": 9.987366589357722e-06, "loss": 0.2318, "num_input_tokens_seen": 4734288, "step": 7760 }, { "epoch": 2.4092460440583308, "grad_norm": 0.15446974337100983, "learning_rate": 9.987270229178714e-06, "loss": 0.2292, "num_input_tokens_seen": 4736976, "step": 7765 }, { "epoch": 2.4107973937325475, "grad_norm": 0.08757370710372925, "learning_rate": 9.98717350337374e-06, "loss": 0.2373, "num_input_tokens_seen": 4740240, "step": 7770 }, { "epoch": 2.412348743406764, "grad_norm": 0.07005951553583145, "learning_rate": 9.98707641194989e-06, "loss": 0.2359, "num_input_tokens_seen": 4743376, "step": 7775 }, { "epoch": 2.4139000930809806, "grad_norm": 0.07466260343790054, "learning_rate": 9.986978954914283e-06, "loss": 0.2384, "num_input_tokens_seen": 4747152, "step": 7780 }, { "epoch": 2.415451442755197, "grad_norm": 0.07788048684597015, "learning_rate": 9.986881132274065e-06, "loss": 0.2326, "num_input_tokens_seen": 4750000, "step": 7785 }, { "epoch": 2.4170027924294137, "grad_norm": 0.06987310945987701, "learning_rate": 9.986782944036407e-06, "loss": 0.2329, "num_input_tokens_seen": 4752976, "step": 7790 }, { "epoch": 2.41855414210363, "grad_norm": 0.17664232850074768, "learning_rate": 9.986684390208504e-06, "loss": 0.2333, "num_input_tokens_seen": 4755856, "step": 7795 }, { "epoch": 2.420105491777847, "grad_norm": 0.07070792466402054, "learning_rate": 9.986585470797586e-06, "loss": 0.2297, "num_input_tokens_seen": 4758864, "step": 7800 }, { "epoch": 2.421656841452063, "grad_norm": 0.08709368109703064, "learning_rate": 9.986486185810904e-06, "loss": 0.2341, "num_input_tokens_seen": 4761264, "step": 7805 }, { "epoch": 2.42320819112628, "grad_norm": 0.17283208668231964, "learning_rate": 9.986386535255735e-06, "loss": 0.2368, "num_input_tokens_seen": 4763632, "step": 7810 }, { "epoch": 2.4247595408004963, "grad_norm": 0.08717796951532364, "learning_rate": 9.986286519139383e-06, "loss": 0.2321, "num_input_tokens_seen": 4766384, "step": 7815 }, { "epoch": 2.426310890474713, "grad_norm": 0.0679045096039772, "learning_rate": 9.986186137469184e-06, "loss": 0.2288, "num_input_tokens_seen": 4769456, "step": 7820 }, { "epoch": 2.4278622401489294, "grad_norm": 0.17281973361968994, "learning_rate": 9.986085390252499e-06, "loss": 0.239, "num_input_tokens_seen": 4772880, "step": 7825 }, { "epoch": 2.429413589823146, "grad_norm": 0.05595343932509422, "learning_rate": 9.985984277496706e-06, "loss": 0.23, "num_input_tokens_seen": 4775408, "step": 7830 }, { "epoch": 2.430964939497363, "grad_norm": 0.1707221120595932, "learning_rate": 9.985882799209226e-06, "loss": 0.2303, "num_input_tokens_seen": 4779184, "step": 7835 }, { "epoch": 2.4325162891715792, "grad_norm": 0.17216871678829193, "learning_rate": 9.985780955397493e-06, "loss": 0.2288, "num_input_tokens_seen": 4782640, "step": 7840 }, { "epoch": 2.434067638845796, "grad_norm": 0.06625223159790039, "learning_rate": 9.985678746068976e-06, "loss": 0.2327, "num_input_tokens_seen": 4785840, "step": 7845 }, { "epoch": 2.4356189885200124, "grad_norm": 0.18651162087917328, "learning_rate": 9.985576171231169e-06, "loss": 0.2382, "num_input_tokens_seen": 4789424, "step": 7850 }, { "epoch": 2.437170338194229, "grad_norm": 0.05172032490372658, "learning_rate": 9.985473230891589e-06, "loss": 0.2302, "num_input_tokens_seen": 4792720, "step": 7855 }, { "epoch": 2.4387216878684455, "grad_norm": 0.1844634860754013, "learning_rate": 9.985369925057785e-06, "loss": 0.2347, "num_input_tokens_seen": 4795440, "step": 7860 }, { "epoch": 2.4402730375426622, "grad_norm": 0.18932057917118073, "learning_rate": 9.98526625373733e-06, "loss": 0.2285, "num_input_tokens_seen": 4798192, "step": 7865 }, { "epoch": 2.4418243872168786, "grad_norm": 0.1891254484653473, "learning_rate": 9.985162216937825e-06, "loss": 0.2275, "num_input_tokens_seen": 4800624, "step": 7870 }, { "epoch": 2.4433757368910953, "grad_norm": 0.1584160327911377, "learning_rate": 9.985057814666894e-06, "loss": 0.2275, "num_input_tokens_seen": 4804176, "step": 7875 }, { "epoch": 2.4449270865653117, "grad_norm": 0.15141548216342926, "learning_rate": 9.984953046932195e-06, "loss": 0.2355, "num_input_tokens_seen": 4807088, "step": 7880 }, { "epoch": 2.4464784362395284, "grad_norm": 0.15944448113441467, "learning_rate": 9.984847913741406e-06, "loss": 0.2306, "num_input_tokens_seen": 4809424, "step": 7885 }, { "epoch": 2.4480297859137448, "grad_norm": 0.06913533061742783, "learning_rate": 9.984742415102237e-06, "loss": 0.2326, "num_input_tokens_seen": 4814608, "step": 7890 }, { "epoch": 2.4495811355879615, "grad_norm": 0.05963076278567314, "learning_rate": 9.98463655102242e-06, "loss": 0.2326, "num_input_tokens_seen": 4817584, "step": 7895 }, { "epoch": 2.4511324852621783, "grad_norm": 0.07732284069061279, "learning_rate": 9.984530321509717e-06, "loss": 0.2311, "num_input_tokens_seen": 4820784, "step": 7900 }, { "epoch": 2.4526838349363946, "grad_norm": 0.15221035480499268, "learning_rate": 9.984423726571914e-06, "loss": 0.2267, "num_input_tokens_seen": 4823568, "step": 7905 }, { "epoch": 2.4542351846106114, "grad_norm": 0.05211856588721275, "learning_rate": 9.98431676621683e-06, "loss": 0.2319, "num_input_tokens_seen": 4826704, "step": 7910 }, { "epoch": 2.4557865342848277, "grad_norm": 0.04940318688750267, "learning_rate": 9.984209440452304e-06, "loss": 0.2292, "num_input_tokens_seen": 4829616, "step": 7915 }, { "epoch": 2.4573378839590445, "grad_norm": 0.14669448137283325, "learning_rate": 9.984101749286203e-06, "loss": 0.2279, "num_input_tokens_seen": 4832592, "step": 7920 }, { "epoch": 2.458889233633261, "grad_norm": 0.217315673828125, "learning_rate": 9.983993692726423e-06, "loss": 0.2308, "num_input_tokens_seen": 4837328, "step": 7925 }, { "epoch": 2.4604405833074776, "grad_norm": 0.14054210484027863, "learning_rate": 9.983885270780887e-06, "loss": 0.2314, "num_input_tokens_seen": 4840208, "step": 7930 }, { "epoch": 2.461991932981694, "grad_norm": 0.40640753507614136, "learning_rate": 9.983776483457541e-06, "loss": 0.2374, "num_input_tokens_seen": 4843248, "step": 7935 }, { "epoch": 2.4635432826559107, "grad_norm": 0.13854040205478668, "learning_rate": 9.983667330764362e-06, "loss": 0.2206, "num_input_tokens_seen": 4846640, "step": 7940 }, { "epoch": 2.465094632330127, "grad_norm": 0.07710961997509003, "learning_rate": 9.983557812709354e-06, "loss": 0.2414, "num_input_tokens_seen": 4849456, "step": 7945 }, { "epoch": 2.466645982004344, "grad_norm": 0.20002801716327667, "learning_rate": 9.983447929300543e-06, "loss": 0.252, "num_input_tokens_seen": 4852304, "step": 7950 }, { "epoch": 2.46819733167856, "grad_norm": 0.06533147394657135, "learning_rate": 9.983337680545987e-06, "loss": 0.2339, "num_input_tokens_seen": 4854992, "step": 7955 }, { "epoch": 2.469748681352777, "grad_norm": 0.15947134792804718, "learning_rate": 9.983227066453767e-06, "loss": 0.2327, "num_input_tokens_seen": 4857488, "step": 7960 }, { "epoch": 2.4713000310269937, "grad_norm": 0.1784011721611023, "learning_rate": 9.983116087031991e-06, "loss": 0.2345, "num_input_tokens_seen": 4860688, "step": 7965 }, { "epoch": 2.47285138070121, "grad_norm": 0.17127209901809692, "learning_rate": 9.983004742288799e-06, "loss": 0.2329, "num_input_tokens_seen": 4863280, "step": 7970 }, { "epoch": 2.474402730375427, "grad_norm": 0.16251040995121002, "learning_rate": 9.98289303223235e-06, "loss": 0.2287, "num_input_tokens_seen": 4866672, "step": 7975 }, { "epoch": 2.475954080049643, "grad_norm": 0.16341270506381989, "learning_rate": 9.982780956870837e-06, "loss": 0.234, "num_input_tokens_seen": 4869776, "step": 7980 }, { "epoch": 2.47750542972386, "grad_norm": 0.06849604099988937, "learning_rate": 9.982668516212476e-06, "loss": 0.2346, "num_input_tokens_seen": 4872656, "step": 7985 }, { "epoch": 2.4790567793980762, "grad_norm": 0.043109290301799774, "learning_rate": 9.982555710265507e-06, "loss": 0.2308, "num_input_tokens_seen": 4875792, "step": 7990 }, { "epoch": 2.480608129072293, "grad_norm": 0.1918221414089203, "learning_rate": 9.982442539038203e-06, "loss": 0.2284, "num_input_tokens_seen": 4878576, "step": 7995 }, { "epoch": 2.4821594787465093, "grad_norm": 0.1553920954465866, "learning_rate": 9.98232900253886e-06, "loss": 0.2275, "num_input_tokens_seen": 4880912, "step": 8000 }, { "epoch": 2.483710828420726, "grad_norm": 0.08266396075487137, "learning_rate": 9.982215100775801e-06, "loss": 0.2278, "num_input_tokens_seen": 4886960, "step": 8005 }, { "epoch": 2.4852621780949424, "grad_norm": 0.15766264498233795, "learning_rate": 9.982100833757377e-06, "loss": 0.2256, "num_input_tokens_seen": 4889744, "step": 8010 }, { "epoch": 2.486813527769159, "grad_norm": 0.08374206721782684, "learning_rate": 9.981986201491964e-06, "loss": 0.2354, "num_input_tokens_seen": 4892688, "step": 8015 }, { "epoch": 2.4883648774433755, "grad_norm": 0.16454839706420898, "learning_rate": 9.981871203987969e-06, "loss": 0.2234, "num_input_tokens_seen": 4896976, "step": 8020 }, { "epoch": 2.4899162271175923, "grad_norm": 0.2553686201572418, "learning_rate": 9.98175584125382e-06, "loss": 0.238, "num_input_tokens_seen": 4900336, "step": 8025 }, { "epoch": 2.491467576791809, "grad_norm": 0.07523205131292343, "learning_rate": 9.981640113297974e-06, "loss": 0.2401, "num_input_tokens_seen": 4902896, "step": 8030 }, { "epoch": 2.4930189264660254, "grad_norm": 0.08952376991510391, "learning_rate": 9.981524020128916e-06, "loss": 0.2315, "num_input_tokens_seen": 4906928, "step": 8035 }, { "epoch": 2.494570276140242, "grad_norm": 0.19362682104110718, "learning_rate": 9.98140756175516e-06, "loss": 0.2282, "num_input_tokens_seen": 4910416, "step": 8040 }, { "epoch": 2.4961216258144585, "grad_norm": 0.05740516260266304, "learning_rate": 9.981290738185239e-06, "loss": 0.2343, "num_input_tokens_seen": 4912848, "step": 8045 }, { "epoch": 2.4976729754886753, "grad_norm": 0.3805423676967621, "learning_rate": 9.981173549427718e-06, "loss": 0.2329, "num_input_tokens_seen": 4916048, "step": 8050 }, { "epoch": 2.4992243251628916, "grad_norm": 0.33975911140441895, "learning_rate": 9.981055995491192e-06, "loss": 0.2378, "num_input_tokens_seen": 4920848, "step": 8055 }, { "epoch": 2.5007756748371084, "grad_norm": 0.19310134649276733, "learning_rate": 9.980938076384276e-06, "loss": 0.2293, "num_input_tokens_seen": 4924080, "step": 8060 }, { "epoch": 2.5023270245113247, "grad_norm": 0.06390194594860077, "learning_rate": 9.980819792115617e-06, "loss": 0.2293, "num_input_tokens_seen": 4927376, "step": 8065 }, { "epoch": 2.5038783741855415, "grad_norm": 0.18398697674274445, "learning_rate": 9.980701142693884e-06, "loss": 0.2295, "num_input_tokens_seen": 4930000, "step": 8070 }, { "epoch": 2.5054297238597583, "grad_norm": 0.18163445591926575, "learning_rate": 9.980582128127778e-06, "loss": 0.2228, "num_input_tokens_seen": 4932560, "step": 8075 }, { "epoch": 2.5069810735339746, "grad_norm": 0.08577017486095428, "learning_rate": 9.980462748426025e-06, "loss": 0.2385, "num_input_tokens_seen": 4935696, "step": 8080 }, { "epoch": 2.508532423208191, "grad_norm": 0.27516648173332214, "learning_rate": 9.980343003597372e-06, "loss": 0.2364, "num_input_tokens_seen": 4938672, "step": 8085 }, { "epoch": 2.5100837728824077, "grad_norm": 0.09511707723140717, "learning_rate": 9.980222893650601e-06, "loss": 0.2316, "num_input_tokens_seen": 4940752, "step": 8090 }, { "epoch": 2.5116351225566245, "grad_norm": 0.2492208480834961, "learning_rate": 9.98010241859452e-06, "loss": 0.2298, "num_input_tokens_seen": 4944528, "step": 8095 }, { "epoch": 2.513186472230841, "grad_norm": 0.10155549645423889, "learning_rate": 9.979981578437957e-06, "loss": 0.2272, "num_input_tokens_seen": 4948176, "step": 8100 }, { "epoch": 2.514737821905057, "grad_norm": 0.0804467499256134, "learning_rate": 9.979860373189772e-06, "loss": 0.2331, "num_input_tokens_seen": 4951440, "step": 8105 }, { "epoch": 2.516289171579274, "grad_norm": 0.08265294879674911, "learning_rate": 9.979738802858848e-06, "loss": 0.2421, "num_input_tokens_seen": 4954384, "step": 8110 }, { "epoch": 2.5178405212534907, "grad_norm": 0.04425749555230141, "learning_rate": 9.979616867454106e-06, "loss": 0.2369, "num_input_tokens_seen": 4956976, "step": 8115 }, { "epoch": 2.519391870927707, "grad_norm": 0.045893408358097076, "learning_rate": 9.979494566984478e-06, "loss": 0.2347, "num_input_tokens_seen": 4959088, "step": 8120 }, { "epoch": 2.5209432206019238, "grad_norm": 0.17569954693317413, "learning_rate": 9.979371901458931e-06, "loss": 0.2267, "num_input_tokens_seen": 4961776, "step": 8125 }, { "epoch": 2.52249457027614, "grad_norm": 0.08093016594648361, "learning_rate": 9.979248870886463e-06, "loss": 0.2285, "num_input_tokens_seen": 4964976, "step": 8130 }, { "epoch": 2.524045919950357, "grad_norm": 0.17137019336223602, "learning_rate": 9.979125475276086e-06, "loss": 0.2205, "num_input_tokens_seen": 4968688, "step": 8135 }, { "epoch": 2.5255972696245736, "grad_norm": 0.07270855456590652, "learning_rate": 9.97900171463685e-06, "loss": 0.2344, "num_input_tokens_seen": 4972176, "step": 8140 }, { "epoch": 2.52714861929879, "grad_norm": 0.06990109384059906, "learning_rate": 9.978877588977832e-06, "loss": 0.2477, "num_input_tokens_seen": 4975120, "step": 8145 }, { "epoch": 2.5286999689730063, "grad_norm": 0.14112484455108643, "learning_rate": 9.978753098308124e-06, "loss": 0.2265, "num_input_tokens_seen": 4977712, "step": 8150 }, { "epoch": 2.530251318647223, "grad_norm": 0.194210022687912, "learning_rate": 9.978628242636858e-06, "loss": 0.2394, "num_input_tokens_seen": 4980400, "step": 8155 }, { "epoch": 2.53180266832144, "grad_norm": 0.05331039056181908, "learning_rate": 9.978503021973184e-06, "loss": 0.2319, "num_input_tokens_seen": 4983056, "step": 8160 }, { "epoch": 2.533354017995656, "grad_norm": 0.07067394256591797, "learning_rate": 9.978377436326288e-06, "loss": 0.2349, "num_input_tokens_seen": 4986832, "step": 8165 }, { "epoch": 2.534905367669873, "grad_norm": 0.17287446558475494, "learning_rate": 9.97825148570537e-06, "loss": 0.2367, "num_input_tokens_seen": 4989520, "step": 8170 }, { "epoch": 2.5364567173440893, "grad_norm": 0.05723997950553894, "learning_rate": 9.978125170119668e-06, "loss": 0.2263, "num_input_tokens_seen": 4992336, "step": 8175 }, { "epoch": 2.538008067018306, "grad_norm": 0.05881204456090927, "learning_rate": 9.977998489578441e-06, "loss": 0.2346, "num_input_tokens_seen": 4995152, "step": 8180 }, { "epoch": 2.5395594166925224, "grad_norm": 0.04560442641377449, "learning_rate": 9.977871444090977e-06, "loss": 0.231, "num_input_tokens_seen": 4997968, "step": 8185 }, { "epoch": 2.541110766366739, "grad_norm": 0.04959028959274292, "learning_rate": 9.97774403366659e-06, "loss": 0.232, "num_input_tokens_seen": 4999696, "step": 8190 }, { "epoch": 2.5426621160409555, "grad_norm": 0.06574824452400208, "learning_rate": 9.977616258314618e-06, "loss": 0.2314, "num_input_tokens_seen": 5002224, "step": 8195 }, { "epoch": 2.5442134657151723, "grad_norm": 0.16587281227111816, "learning_rate": 9.977488118044432e-06, "loss": 0.2304, "num_input_tokens_seen": 5005392, "step": 8200 }, { "epoch": 2.545764815389389, "grad_norm": 0.19769287109375, "learning_rate": 9.977359612865424e-06, "loss": 0.2336, "num_input_tokens_seen": 5009264, "step": 8205 }, { "epoch": 2.5473161650636054, "grad_norm": 0.03912147134542465, "learning_rate": 9.977230742787017e-06, "loss": 0.2299, "num_input_tokens_seen": 5013520, "step": 8210 }, { "epoch": 2.5488675147378217, "grad_norm": 0.17065519094467163, "learning_rate": 9.977101507818653e-06, "loss": 0.2336, "num_input_tokens_seen": 5015792, "step": 8215 }, { "epoch": 2.5504188644120385, "grad_norm": 0.31138110160827637, "learning_rate": 9.976971907969814e-06, "loss": 0.2319, "num_input_tokens_seen": 5018768, "step": 8220 }, { "epoch": 2.5519702140862552, "grad_norm": 0.05678314343094826, "learning_rate": 9.97684194325e-06, "loss": 0.2314, "num_input_tokens_seen": 5021424, "step": 8225 }, { "epoch": 2.5535215637604716, "grad_norm": 0.3035171627998352, "learning_rate": 9.976711613668734e-06, "loss": 0.2314, "num_input_tokens_seen": 5024432, "step": 8230 }, { "epoch": 2.5550729134346883, "grad_norm": 0.1552392989397049, "learning_rate": 9.976580919235575e-06, "loss": 0.2315, "num_input_tokens_seen": 5026672, "step": 8235 }, { "epoch": 2.5566242631089047, "grad_norm": 0.06321746855974197, "learning_rate": 9.976449859960102e-06, "loss": 0.2325, "num_input_tokens_seen": 5029264, "step": 8240 }, { "epoch": 2.5581756127831214, "grad_norm": 0.2779842019081116, "learning_rate": 9.976318435851923e-06, "loss": 0.2263, "num_input_tokens_seen": 5032208, "step": 8245 }, { "epoch": 2.5597269624573378, "grad_norm": 0.07077520340681076, "learning_rate": 9.976186646920678e-06, "loss": 0.2323, "num_input_tokens_seen": 5035024, "step": 8250 }, { "epoch": 2.5612783121315545, "grad_norm": 0.1379299908876419, "learning_rate": 9.976054493176021e-06, "loss": 0.2314, "num_input_tokens_seen": 5037648, "step": 8255 }, { "epoch": 2.562829661805771, "grad_norm": 0.1399134248495102, "learning_rate": 9.975921974627647e-06, "loss": 0.2276, "num_input_tokens_seen": 5040752, "step": 8260 }, { "epoch": 2.5643810114799876, "grad_norm": 0.05497416481375694, "learning_rate": 9.975789091285268e-06, "loss": 0.2269, "num_input_tokens_seen": 5045136, "step": 8265 }, { "epoch": 2.5659323611542044, "grad_norm": 0.09115294367074966, "learning_rate": 9.975655843158627e-06, "loss": 0.2338, "num_input_tokens_seen": 5047408, "step": 8270 }, { "epoch": 2.5674837108284208, "grad_norm": 0.04998823627829552, "learning_rate": 9.97552223025749e-06, "loss": 0.23, "num_input_tokens_seen": 5050928, "step": 8275 }, { "epoch": 2.569035060502637, "grad_norm": 0.17680060863494873, "learning_rate": 9.975388252591655e-06, "loss": 0.2354, "num_input_tokens_seen": 5054224, "step": 8280 }, { "epoch": 2.570586410176854, "grad_norm": 0.17580397427082062, "learning_rate": 9.975253910170946e-06, "loss": 0.2355, "num_input_tokens_seen": 5057136, "step": 8285 }, { "epoch": 2.5721377598510706, "grad_norm": 0.0777345523238182, "learning_rate": 9.975119203005207e-06, "loss": 0.231, "num_input_tokens_seen": 5060912, "step": 8290 }, { "epoch": 2.573689109525287, "grad_norm": 0.08161424845457077, "learning_rate": 9.974984131104318e-06, "loss": 0.2274, "num_input_tokens_seen": 5063440, "step": 8295 }, { "epoch": 2.5752404591995037, "grad_norm": 0.057442743331193924, "learning_rate": 9.97484869447818e-06, "loss": 0.2341, "num_input_tokens_seen": 5066416, "step": 8300 }, { "epoch": 2.57679180887372, "grad_norm": 0.3099828064441681, "learning_rate": 9.97471289313672e-06, "loss": 0.2299, "num_input_tokens_seen": 5070704, "step": 8305 }, { "epoch": 2.578343158547937, "grad_norm": 0.16443951427936554, "learning_rate": 9.974576727089896e-06, "loss": 0.2278, "num_input_tokens_seen": 5073872, "step": 8310 }, { "epoch": 2.579894508222153, "grad_norm": 0.15902099013328552, "learning_rate": 9.97444019634769e-06, "loss": 0.232, "num_input_tokens_seen": 5076848, "step": 8315 }, { "epoch": 2.58144585789637, "grad_norm": 0.3387672007083893, "learning_rate": 9.974303300920112e-06, "loss": 0.2363, "num_input_tokens_seen": 5079696, "step": 8320 }, { "epoch": 2.5829972075705863, "grad_norm": 0.32746273279190063, "learning_rate": 9.974166040817195e-06, "loss": 0.2362, "num_input_tokens_seen": 5082768, "step": 8325 }, { "epoch": 2.584548557244803, "grad_norm": 0.048153385519981384, "learning_rate": 9.974028416049007e-06, "loss": 0.2335, "num_input_tokens_seen": 5085424, "step": 8330 }, { "epoch": 2.58609990691902, "grad_norm": 0.20501118898391724, "learning_rate": 9.973890426625634e-06, "loss": 0.2299, "num_input_tokens_seen": 5088848, "step": 8335 }, { "epoch": 2.587651256593236, "grad_norm": 0.0702727735042572, "learning_rate": 9.973752072557194e-06, "loss": 0.2285, "num_input_tokens_seen": 5091280, "step": 8340 }, { "epoch": 2.5892026062674525, "grad_norm": 0.08046534657478333, "learning_rate": 9.973613353853829e-06, "loss": 0.2286, "num_input_tokens_seen": 5093712, "step": 8345 }, { "epoch": 2.5907539559416692, "grad_norm": 0.23193436861038208, "learning_rate": 9.973474270525707e-06, "loss": 0.2365, "num_input_tokens_seen": 5096432, "step": 8350 }, { "epoch": 2.592305305615886, "grad_norm": 0.2147388607263565, "learning_rate": 9.973334822583027e-06, "loss": 0.2399, "num_input_tokens_seen": 5099504, "step": 8355 }, { "epoch": 2.5938566552901023, "grad_norm": 0.20633578300476074, "learning_rate": 9.973195010036012e-06, "loss": 0.2399, "num_input_tokens_seen": 5101968, "step": 8360 }, { "epoch": 2.595408004964319, "grad_norm": 0.07168496400117874, "learning_rate": 9.97305483289491e-06, "loss": 0.232, "num_input_tokens_seen": 5104464, "step": 8365 }, { "epoch": 2.5969593546385354, "grad_norm": 0.21450179815292358, "learning_rate": 9.97291429117e-06, "loss": 0.2319, "num_input_tokens_seen": 5107696, "step": 8370 }, { "epoch": 2.598510704312752, "grad_norm": 0.18376654386520386, "learning_rate": 9.972773384871585e-06, "loss": 0.2308, "num_input_tokens_seen": 5111088, "step": 8375 }, { "epoch": 2.6000620539869685, "grad_norm": 0.1815609186887741, "learning_rate": 9.972632114009992e-06, "loss": 0.2293, "num_input_tokens_seen": 5113552, "step": 8380 }, { "epoch": 2.6016134036611853, "grad_norm": 0.33876290917396545, "learning_rate": 9.972490478595583e-06, "loss": 0.2325, "num_input_tokens_seen": 5116560, "step": 8385 }, { "epoch": 2.6031647533354016, "grad_norm": 0.15704312920570374, "learning_rate": 9.972348478638737e-06, "loss": 0.232, "num_input_tokens_seen": 5119856, "step": 8390 }, { "epoch": 2.6047161030096184, "grad_norm": 0.1541057676076889, "learning_rate": 9.972206114149867e-06, "loss": 0.2289, "num_input_tokens_seen": 5123408, "step": 8395 }, { "epoch": 2.606267452683835, "grad_norm": 0.05862480774521828, "learning_rate": 9.97206338513941e-06, "loss": 0.2321, "num_input_tokens_seen": 5126416, "step": 8400 }, { "epoch": 2.6078188023580515, "grad_norm": 0.061013441532850266, "learning_rate": 9.971920291617828e-06, "loss": 0.2318, "num_input_tokens_seen": 5128816, "step": 8405 }, { "epoch": 2.609370152032268, "grad_norm": 0.31834372878074646, "learning_rate": 9.971776833595612e-06, "loss": 0.2416, "num_input_tokens_seen": 5132816, "step": 8410 }, { "epoch": 2.6109215017064846, "grad_norm": 0.15545077621936798, "learning_rate": 9.971633011083279e-06, "loss": 0.2341, "num_input_tokens_seen": 5135088, "step": 8415 }, { "epoch": 2.6124728513807014, "grad_norm": 0.3279177248477936, "learning_rate": 9.971488824091373e-06, "loss": 0.231, "num_input_tokens_seen": 5140816, "step": 8420 }, { "epoch": 2.6140242010549177, "grad_norm": 0.041106902062892914, "learning_rate": 9.971344272630467e-06, "loss": 0.2255, "num_input_tokens_seen": 5143376, "step": 8425 }, { "epoch": 2.6155755507291345, "grad_norm": 0.20668452978134155, "learning_rate": 9.971199356711154e-06, "loss": 0.2348, "num_input_tokens_seen": 5145936, "step": 8430 }, { "epoch": 2.617126900403351, "grad_norm": 0.15886685252189636, "learning_rate": 9.971054076344061e-06, "loss": 0.2286, "num_input_tokens_seen": 5149200, "step": 8435 }, { "epoch": 2.6186782500775676, "grad_norm": 0.16587235033512115, "learning_rate": 9.970908431539839e-06, "loss": 0.2248, "num_input_tokens_seen": 5152112, "step": 8440 }, { "epoch": 2.620229599751784, "grad_norm": 0.24565866589546204, "learning_rate": 9.970762422309164e-06, "loss": 0.2396, "num_input_tokens_seen": 5155920, "step": 8445 }, { "epoch": 2.6217809494260007, "grad_norm": 0.15309974551200867, "learning_rate": 9.970616048662742e-06, "loss": 0.242, "num_input_tokens_seen": 5158544, "step": 8450 }, { "epoch": 2.623332299100217, "grad_norm": 0.0683981254696846, "learning_rate": 9.970469310611302e-06, "loss": 0.2329, "num_input_tokens_seen": 5161520, "step": 8455 }, { "epoch": 2.624883648774434, "grad_norm": 0.16651281714439392, "learning_rate": 9.970322208165601e-06, "loss": 0.2214, "num_input_tokens_seen": 5164368, "step": 8460 }, { "epoch": 2.6264349984486506, "grad_norm": 0.19807283580303192, "learning_rate": 9.970174741336428e-06, "loss": 0.2306, "num_input_tokens_seen": 5168720, "step": 8465 }, { "epoch": 2.627986348122867, "grad_norm": 0.16821187734603882, "learning_rate": 9.970026910134587e-06, "loss": 0.2376, "num_input_tokens_seen": 5172816, "step": 8470 }, { "epoch": 2.6295376977970832, "grad_norm": 0.16409264504909515, "learning_rate": 9.969878714570922e-06, "loss": 0.2317, "num_input_tokens_seen": 5175504, "step": 8475 }, { "epoch": 2.6310890474713, "grad_norm": 0.16110225021839142, "learning_rate": 9.969730154656294e-06, "loss": 0.2309, "num_input_tokens_seen": 5178128, "step": 8480 }, { "epoch": 2.632640397145517, "grad_norm": 0.18649719655513763, "learning_rate": 9.969581230401595e-06, "loss": 0.23, "num_input_tokens_seen": 5180976, "step": 8485 }, { "epoch": 2.634191746819733, "grad_norm": 0.16971318423748016, "learning_rate": 9.969431941817745e-06, "loss": 0.2318, "num_input_tokens_seen": 5184208, "step": 8490 }, { "epoch": 2.63574309649395, "grad_norm": 0.16711148619651794, "learning_rate": 9.969282288915685e-06, "loss": 0.2277, "num_input_tokens_seen": 5186896, "step": 8495 }, { "epoch": 2.637294446168166, "grad_norm": 0.0627061054110527, "learning_rate": 9.969132271706387e-06, "loss": 0.2349, "num_input_tokens_seen": 5189552, "step": 8500 }, { "epoch": 2.638845795842383, "grad_norm": 0.21155506372451782, "learning_rate": 9.968981890200853e-06, "loss": 0.2336, "num_input_tokens_seen": 5192848, "step": 8505 }, { "epoch": 2.6403971455165993, "grad_norm": 0.0714915320277214, "learning_rate": 9.968831144410103e-06, "loss": 0.2264, "num_input_tokens_seen": 5195856, "step": 8510 }, { "epoch": 2.641948495190816, "grad_norm": 0.17882947623729706, "learning_rate": 9.96868003434519e-06, "loss": 0.2357, "num_input_tokens_seen": 5197776, "step": 8515 }, { "epoch": 2.6434998448650324, "grad_norm": 0.19944524765014648, "learning_rate": 9.968528560017195e-06, "loss": 0.2283, "num_input_tokens_seen": 5200464, "step": 8520 }, { "epoch": 2.645051194539249, "grad_norm": 0.19840070605278015, "learning_rate": 9.968376721437218e-06, "loss": 0.2336, "num_input_tokens_seen": 5202864, "step": 8525 }, { "epoch": 2.646602544213466, "grad_norm": 0.1762526035308838, "learning_rate": 9.968224518616394e-06, "loss": 0.2337, "num_input_tokens_seen": 5206320, "step": 8530 }, { "epoch": 2.6481538938876823, "grad_norm": 0.054719556123018265, "learning_rate": 9.96807195156588e-06, "loss": 0.232, "num_input_tokens_seen": 5209168, "step": 8535 }, { "epoch": 2.6497052435618986, "grad_norm": 0.1553027480840683, "learning_rate": 9.967919020296861e-06, "loss": 0.2319, "num_input_tokens_seen": 5212368, "step": 8540 }, { "epoch": 2.6512565932361154, "grad_norm": 0.1682613044977188, "learning_rate": 9.967765724820549e-06, "loss": 0.2324, "num_input_tokens_seen": 5215184, "step": 8545 }, { "epoch": 2.652807942910332, "grad_norm": 0.07037699222564697, "learning_rate": 9.967612065148183e-06, "loss": 0.2329, "num_input_tokens_seen": 5218480, "step": 8550 }, { "epoch": 2.6543592925845485, "grad_norm": 0.09521666169166565, "learning_rate": 9.967458041291026e-06, "loss": 0.2324, "num_input_tokens_seen": 5221552, "step": 8555 }, { "epoch": 2.6559106422587653, "grad_norm": 0.08129265904426575, "learning_rate": 9.967303653260371e-06, "loss": 0.2335, "num_input_tokens_seen": 5224528, "step": 8560 }, { "epoch": 2.6574619919329816, "grad_norm": 0.17201389372348785, "learning_rate": 9.967148901067538e-06, "loss": 0.2293, "num_input_tokens_seen": 5227248, "step": 8565 }, { "epoch": 2.6590133416071984, "grad_norm": 0.08978857845067978, "learning_rate": 9.96699378472387e-06, "loss": 0.2308, "num_input_tokens_seen": 5230832, "step": 8570 }, { "epoch": 2.6605646912814147, "grad_norm": 0.3657302260398865, "learning_rate": 9.96683830424074e-06, "loss": 0.2238, "num_input_tokens_seen": 5234064, "step": 8575 }, { "epoch": 2.6621160409556315, "grad_norm": 0.27324020862579346, "learning_rate": 9.966682459629543e-06, "loss": 0.2449, "num_input_tokens_seen": 5237904, "step": 8580 }, { "epoch": 2.663667390629848, "grad_norm": 0.20006603002548218, "learning_rate": 9.966526250901711e-06, "loss": 0.2332, "num_input_tokens_seen": 5241136, "step": 8585 }, { "epoch": 2.6652187403040646, "grad_norm": 0.30465543270111084, "learning_rate": 9.966369678068692e-06, "loss": 0.2301, "num_input_tokens_seen": 5244048, "step": 8590 }, { "epoch": 2.6667700899782814, "grad_norm": 0.15966810286045074, "learning_rate": 9.966212741141964e-06, "loss": 0.2294, "num_input_tokens_seen": 5246928, "step": 8595 }, { "epoch": 2.6683214396524977, "grad_norm": 0.16661345958709717, "learning_rate": 9.966055440133034e-06, "loss": 0.2335, "num_input_tokens_seen": 5249392, "step": 8600 }, { "epoch": 2.669872789326714, "grad_norm": 0.1868530511856079, "learning_rate": 9.965897775053433e-06, "loss": 0.2345, "num_input_tokens_seen": 5251760, "step": 8605 }, { "epoch": 2.671424139000931, "grad_norm": 0.08541522175073624, "learning_rate": 9.96573974591472e-06, "loss": 0.2295, "num_input_tokens_seen": 5254864, "step": 8610 }, { "epoch": 2.6729754886751476, "grad_norm": 0.06255742162466049, "learning_rate": 9.96558135272848e-06, "loss": 0.2293, "num_input_tokens_seen": 5257712, "step": 8615 }, { "epoch": 2.674526838349364, "grad_norm": 0.06759762018918991, "learning_rate": 9.965422595506327e-06, "loss": 0.2325, "num_input_tokens_seen": 5260624, "step": 8620 }, { "epoch": 2.6760781880235807, "grad_norm": 0.16350622475147247, "learning_rate": 9.965263474259896e-06, "loss": 0.2273, "num_input_tokens_seen": 5263184, "step": 8625 }, { "epoch": 2.677629537697797, "grad_norm": 0.0760425329208374, "learning_rate": 9.965103989000857e-06, "loss": 0.2285, "num_input_tokens_seen": 5266448, "step": 8630 }, { "epoch": 2.6791808873720138, "grad_norm": 0.05655372515320778, "learning_rate": 9.964944139740899e-06, "loss": 0.2357, "num_input_tokens_seen": 5269168, "step": 8635 }, { "epoch": 2.68073223704623, "grad_norm": 0.06600259989500046, "learning_rate": 9.964783926491741e-06, "loss": 0.2294, "num_input_tokens_seen": 5271984, "step": 8640 }, { "epoch": 2.682283586720447, "grad_norm": 0.15440703928470612, "learning_rate": 9.96462334926513e-06, "loss": 0.2313, "num_input_tokens_seen": 5275152, "step": 8645 }, { "epoch": 2.683834936394663, "grad_norm": 0.14330284297466278, "learning_rate": 9.964462408072839e-06, "loss": 0.2298, "num_input_tokens_seen": 5277360, "step": 8650 }, { "epoch": 2.68538628606888, "grad_norm": 0.1484537422657013, "learning_rate": 9.964301102926663e-06, "loss": 0.2309, "num_input_tokens_seen": 5280528, "step": 8655 }, { "epoch": 2.6869376357430967, "grad_norm": 0.07426512241363525, "learning_rate": 9.964139433838434e-06, "loss": 0.2314, "num_input_tokens_seen": 5282928, "step": 8660 }, { "epoch": 2.688488985417313, "grad_norm": 0.05284794792532921, "learning_rate": 9.963977400819996e-06, "loss": 0.2277, "num_input_tokens_seen": 5286064, "step": 8665 }, { "epoch": 2.6900403350915294, "grad_norm": 0.33400365710258484, "learning_rate": 9.963815003883235e-06, "loss": 0.2367, "num_input_tokens_seen": 5289040, "step": 8670 }, { "epoch": 2.691591684765746, "grad_norm": 0.15931804478168488, "learning_rate": 9.963652243040052e-06, "loss": 0.233, "num_input_tokens_seen": 5291792, "step": 8675 }, { "epoch": 2.693143034439963, "grad_norm": 0.1710643172264099, "learning_rate": 9.963489118302382e-06, "loss": 0.2292, "num_input_tokens_seen": 5295312, "step": 8680 }, { "epoch": 2.6946943841141793, "grad_norm": 0.23183804750442505, "learning_rate": 9.963325629682184e-06, "loss": 0.2321, "num_input_tokens_seen": 5298448, "step": 8685 }, { "epoch": 2.696245733788396, "grad_norm": 0.59891676902771, "learning_rate": 9.963161777191442e-06, "loss": 0.2268, "num_input_tokens_seen": 5302288, "step": 8690 }, { "epoch": 2.6977970834626124, "grad_norm": 0.08906122297048569, "learning_rate": 9.962997560842167e-06, "loss": 0.2358, "num_input_tokens_seen": 5304784, "step": 8695 }, { "epoch": 2.699348433136829, "grad_norm": 0.2126716673374176, "learning_rate": 9.962832980646403e-06, "loss": 0.2256, "num_input_tokens_seen": 5307504, "step": 8700 }, { "epoch": 2.7008997828110455, "grad_norm": 0.16693268716335297, "learning_rate": 9.962668036616212e-06, "loss": 0.229, "num_input_tokens_seen": 5311056, "step": 8705 }, { "epoch": 2.7024511324852623, "grad_norm": 0.19717831909656525, "learning_rate": 9.962502728763687e-06, "loss": 0.2402, "num_input_tokens_seen": 5313520, "step": 8710 }, { "epoch": 2.7040024821594786, "grad_norm": 0.041943036019802094, "learning_rate": 9.962337057100946e-06, "loss": 0.2338, "num_input_tokens_seen": 5316560, "step": 8715 }, { "epoch": 2.7055538318336954, "grad_norm": 0.15626081824302673, "learning_rate": 9.962171021640137e-06, "loss": 0.2325, "num_input_tokens_seen": 5319312, "step": 8720 }, { "epoch": 2.707105181507912, "grad_norm": 0.06549353152513504, "learning_rate": 9.962004622393431e-06, "loss": 0.233, "num_input_tokens_seen": 5321744, "step": 8725 }, { "epoch": 2.7086565311821285, "grad_norm": 0.15201587975025177, "learning_rate": 9.961837859373028e-06, "loss": 0.23, "num_input_tokens_seen": 5325296, "step": 8730 }, { "epoch": 2.710207880856345, "grad_norm": 0.17567849159240723, "learning_rate": 9.961670732591154e-06, "loss": 0.2312, "num_input_tokens_seen": 5328304, "step": 8735 }, { "epoch": 2.7117592305305616, "grad_norm": 0.05659716948866844, "learning_rate": 9.961503242060057e-06, "loss": 0.2263, "num_input_tokens_seen": 5331376, "step": 8740 }, { "epoch": 2.7133105802047783, "grad_norm": 0.16427940130233765, "learning_rate": 9.961335387792022e-06, "loss": 0.2317, "num_input_tokens_seen": 5334480, "step": 8745 }, { "epoch": 2.7148619298789947, "grad_norm": 0.21941900253295898, "learning_rate": 9.96116716979935e-06, "loss": 0.2408, "num_input_tokens_seen": 5338288, "step": 8750 }, { "epoch": 2.7164132795532114, "grad_norm": 0.0627385601401329, "learning_rate": 9.960998588094377e-06, "loss": 0.2214, "num_input_tokens_seen": 5341040, "step": 8755 }, { "epoch": 2.7179646292274278, "grad_norm": 0.2054312378168106, "learning_rate": 9.96082964268946e-06, "loss": 0.2341, "num_input_tokens_seen": 5343920, "step": 8760 }, { "epoch": 2.7195159789016445, "grad_norm": 0.20854231715202332, "learning_rate": 9.960660333596986e-06, "loss": 0.2272, "num_input_tokens_seen": 5347504, "step": 8765 }, { "epoch": 2.721067328575861, "grad_norm": 0.22459854185581207, "learning_rate": 9.960490660829366e-06, "loss": 0.243, "num_input_tokens_seen": 5350832, "step": 8770 }, { "epoch": 2.7226186782500776, "grad_norm": 0.10380084067583084, "learning_rate": 9.960320624399038e-06, "loss": 0.2332, "num_input_tokens_seen": 5354224, "step": 8775 }, { "epoch": 2.724170027924294, "grad_norm": 0.06307604908943176, "learning_rate": 9.960150224318471e-06, "loss": 0.2315, "num_input_tokens_seen": 5357264, "step": 8780 }, { "epoch": 2.7257213775985107, "grad_norm": 0.06998893618583679, "learning_rate": 9.959979460600156e-06, "loss": 0.2244, "num_input_tokens_seen": 5360240, "step": 8785 }, { "epoch": 2.7272727272727275, "grad_norm": 0.18230225145816803, "learning_rate": 9.959808333256612e-06, "loss": 0.2332, "num_input_tokens_seen": 5362896, "step": 8790 }, { "epoch": 2.728824076946944, "grad_norm": 0.18994706869125366, "learning_rate": 9.959636842300382e-06, "loss": 0.2353, "num_input_tokens_seen": 5368912, "step": 8795 }, { "epoch": 2.73037542662116, "grad_norm": 0.3851320147514343, "learning_rate": 9.95946498774404e-06, "loss": 0.225, "num_input_tokens_seen": 5371856, "step": 8800 }, { "epoch": 2.731926776295377, "grad_norm": 0.36509206891059875, "learning_rate": 9.959292769600188e-06, "loss": 0.2412, "num_input_tokens_seen": 5375056, "step": 8805 }, { "epoch": 2.7334781259695937, "grad_norm": 0.3202287256717682, "learning_rate": 9.959120187881448e-06, "loss": 0.2275, "num_input_tokens_seen": 5377648, "step": 8810 }, { "epoch": 2.73502947564381, "grad_norm": 0.17930158972740173, "learning_rate": 9.958947242600474e-06, "loss": 0.2357, "num_input_tokens_seen": 5380176, "step": 8815 }, { "epoch": 2.736580825318027, "grad_norm": 0.16034536063671112, "learning_rate": 9.958773933769946e-06, "loss": 0.2345, "num_input_tokens_seen": 5383248, "step": 8820 }, { "epoch": 2.738132174992243, "grad_norm": 0.061477817595005035, "learning_rate": 9.958600261402565e-06, "loss": 0.2366, "num_input_tokens_seen": 5385584, "step": 8825 }, { "epoch": 2.73968352466646, "grad_norm": 0.11236628890037537, "learning_rate": 9.958426225511066e-06, "loss": 0.2308, "num_input_tokens_seen": 5390672, "step": 8830 }, { "epoch": 2.7412348743406763, "grad_norm": 0.056635674089193344, "learning_rate": 9.958251826108211e-06, "loss": 0.2328, "num_input_tokens_seen": 5393424, "step": 8835 }, { "epoch": 2.742786224014893, "grad_norm": 0.2080785632133484, "learning_rate": 9.958077063206782e-06, "loss": 0.2341, "num_input_tokens_seen": 5395824, "step": 8840 }, { "epoch": 2.7443375736891094, "grad_norm": 0.0727589875459671, "learning_rate": 9.95790193681959e-06, "loss": 0.2319, "num_input_tokens_seen": 5398992, "step": 8845 }, { "epoch": 2.745888923363326, "grad_norm": 0.17084506154060364, "learning_rate": 9.957726446959477e-06, "loss": 0.2306, "num_input_tokens_seen": 5402032, "step": 8850 }, { "epoch": 2.747440273037543, "grad_norm": 0.16538141667842865, "learning_rate": 9.957550593639307e-06, "loss": 0.2327, "num_input_tokens_seen": 5406512, "step": 8855 }, { "epoch": 2.7489916227117592, "grad_norm": 0.15362000465393066, "learning_rate": 9.957374376871973e-06, "loss": 0.2379, "num_input_tokens_seen": 5409360, "step": 8860 }, { "epoch": 2.7505429723859756, "grad_norm": 0.07199180871248245, "learning_rate": 9.957197796670391e-06, "loss": 0.2309, "num_input_tokens_seen": 5412336, "step": 8865 }, { "epoch": 2.7520943220601923, "grad_norm": 0.15641209483146667, "learning_rate": 9.957020853047509e-06, "loss": 0.232, "num_input_tokens_seen": 5414832, "step": 8870 }, { "epoch": 2.753645671734409, "grad_norm": 0.14978958666324615, "learning_rate": 9.9568435460163e-06, "loss": 0.2296, "num_input_tokens_seen": 5417328, "step": 8875 }, { "epoch": 2.7551970214086254, "grad_norm": 0.1925896406173706, "learning_rate": 9.956665875589762e-06, "loss": 0.2316, "num_input_tokens_seen": 5420880, "step": 8880 }, { "epoch": 2.756748371082842, "grad_norm": 0.18900710344314575, "learning_rate": 9.956487841780918e-06, "loss": 0.2352, "num_input_tokens_seen": 5423920, "step": 8885 }, { "epoch": 2.7582997207570585, "grad_norm": 0.09804318845272064, "learning_rate": 9.956309444602821e-06, "loss": 0.2368, "num_input_tokens_seen": 5426896, "step": 8890 }, { "epoch": 2.7598510704312753, "grad_norm": 0.18546487390995026, "learning_rate": 9.956130684068551e-06, "loss": 0.2373, "num_input_tokens_seen": 5430000, "step": 8895 }, { "epoch": 2.7614024201054916, "grad_norm": 0.32451701164245605, "learning_rate": 9.955951560191213e-06, "loss": 0.2322, "num_input_tokens_seen": 5432464, "step": 8900 }, { "epoch": 2.7629537697797084, "grad_norm": 0.0368921160697937, "learning_rate": 9.955772072983939e-06, "loss": 0.2316, "num_input_tokens_seen": 5435664, "step": 8905 }, { "epoch": 2.7645051194539247, "grad_norm": 0.15324656665325165, "learning_rate": 9.955592222459885e-06, "loss": 0.23, "num_input_tokens_seen": 5439312, "step": 8910 }, { "epoch": 2.7660564691281415, "grad_norm": 0.15979234874248505, "learning_rate": 9.955412008632239e-06, "loss": 0.2305, "num_input_tokens_seen": 5442576, "step": 8915 }, { "epoch": 2.7676078188023583, "grad_norm": 0.05758916214108467, "learning_rate": 9.955231431514212e-06, "loss": 0.2337, "num_input_tokens_seen": 5445872, "step": 8920 }, { "epoch": 2.7691591684765746, "grad_norm": 0.03473782166838646, "learning_rate": 9.955050491119041e-06, "loss": 0.2305, "num_input_tokens_seen": 5448432, "step": 8925 }, { "epoch": 2.770710518150791, "grad_norm": 0.3133313059806824, "learning_rate": 9.954869187459994e-06, "loss": 0.2335, "num_input_tokens_seen": 5451280, "step": 8930 }, { "epoch": 2.7722618678250077, "grad_norm": 0.06784053146839142, "learning_rate": 9.954687520550362e-06, "loss": 0.2309, "num_input_tokens_seen": 5454000, "step": 8935 }, { "epoch": 2.7738132174992245, "grad_norm": 0.05827932804822922, "learning_rate": 9.954505490403462e-06, "loss": 0.2346, "num_input_tokens_seen": 5456816, "step": 8940 }, { "epoch": 2.775364567173441, "grad_norm": 0.07387223094701767, "learning_rate": 9.954323097032637e-06, "loss": 0.2321, "num_input_tokens_seen": 5459344, "step": 8945 }, { "epoch": 2.7769159168476576, "grad_norm": 0.18909834325313568, "learning_rate": 9.954140340451262e-06, "loss": 0.2276, "num_input_tokens_seen": 5461776, "step": 8950 }, { "epoch": 2.778467266521874, "grad_norm": 0.04374921694397926, "learning_rate": 9.953957220672736e-06, "loss": 0.2385, "num_input_tokens_seen": 5465200, "step": 8955 }, { "epoch": 2.7800186161960907, "grad_norm": 0.05838081240653992, "learning_rate": 9.953773737710481e-06, "loss": 0.2389, "num_input_tokens_seen": 5467664, "step": 8960 }, { "epoch": 2.781569965870307, "grad_norm": 0.06035482883453369, "learning_rate": 9.95358989157795e-06, "loss": 0.2322, "num_input_tokens_seen": 5471024, "step": 8965 }, { "epoch": 2.783121315544524, "grad_norm": 0.17348575592041016, "learning_rate": 9.95340568228862e-06, "loss": 0.2305, "num_input_tokens_seen": 5474160, "step": 8970 }, { "epoch": 2.78467266521874, "grad_norm": 0.09633345156908035, "learning_rate": 9.953221109855998e-06, "loss": 0.2322, "num_input_tokens_seen": 5477616, "step": 8975 }, { "epoch": 2.786224014892957, "grad_norm": 0.04508298635482788, "learning_rate": 9.953036174293612e-06, "loss": 0.2326, "num_input_tokens_seen": 5481808, "step": 8980 }, { "epoch": 2.7877753645671737, "grad_norm": 0.17204920947551727, "learning_rate": 9.952850875615021e-06, "loss": 0.2314, "num_input_tokens_seen": 5483984, "step": 8985 }, { "epoch": 2.78932671424139, "grad_norm": 0.20495201647281647, "learning_rate": 9.952665213833813e-06, "loss": 0.2335, "num_input_tokens_seen": 5486192, "step": 8990 }, { "epoch": 2.7908780639156063, "grad_norm": 0.05512304604053497, "learning_rate": 9.952479188963593e-06, "loss": 0.2315, "num_input_tokens_seen": 5488976, "step": 8995 }, { "epoch": 2.792429413589823, "grad_norm": 0.18235373497009277, "learning_rate": 9.952292801018004e-06, "loss": 0.2284, "num_input_tokens_seen": 5492432, "step": 9000 }, { "epoch": 2.79398076326404, "grad_norm": 0.16556653380393982, "learning_rate": 9.95210605001071e-06, "loss": 0.2279, "num_input_tokens_seen": 5495312, "step": 9005 }, { "epoch": 2.795532112938256, "grad_norm": 0.3228512406349182, "learning_rate": 9.9519189359554e-06, "loss": 0.2224, "num_input_tokens_seen": 5497744, "step": 9010 }, { "epoch": 2.797083462612473, "grad_norm": 0.352143257856369, "learning_rate": 9.951731458865793e-06, "loss": 0.2387, "num_input_tokens_seen": 5499984, "step": 9015 }, { "epoch": 2.7986348122866893, "grad_norm": 0.062037862837314606, "learning_rate": 9.95154361875563e-06, "loss": 0.2283, "num_input_tokens_seen": 5502704, "step": 9020 }, { "epoch": 2.800186161960906, "grad_norm": 0.10054629296064377, "learning_rate": 9.951355415638689e-06, "loss": 0.2288, "num_input_tokens_seen": 5505008, "step": 9025 }, { "epoch": 2.8017375116351224, "grad_norm": 0.19302572309970856, "learning_rate": 9.95116684952876e-06, "loss": 0.2179, "num_input_tokens_seen": 5507728, "step": 9030 }, { "epoch": 2.803288861309339, "grad_norm": 0.2591543197631836, "learning_rate": 9.95097792043967e-06, "loss": 0.2223, "num_input_tokens_seen": 5509776, "step": 9035 }, { "epoch": 2.8048402109835555, "grad_norm": 0.3624601662158966, "learning_rate": 9.950788628385273e-06, "loss": 0.2411, "num_input_tokens_seen": 5512752, "step": 9040 }, { "epoch": 2.8063915606577723, "grad_norm": 0.2503329813480377, "learning_rate": 9.950598973379442e-06, "loss": 0.2321, "num_input_tokens_seen": 5516112, "step": 9045 }, { "epoch": 2.807942910331989, "grad_norm": 0.06423736363649368, "learning_rate": 9.950408955436082e-06, "loss": 0.2279, "num_input_tokens_seen": 5518448, "step": 9050 }, { "epoch": 2.8094942600062054, "grad_norm": 0.3551366627216339, "learning_rate": 9.950218574569124e-06, "loss": 0.2079, "num_input_tokens_seen": 5521520, "step": 9055 }, { "epoch": 2.8110456096804217, "grad_norm": 0.293277382850647, "learning_rate": 9.950027830792526e-06, "loss": 0.2492, "num_input_tokens_seen": 5525520, "step": 9060 }, { "epoch": 2.8125969593546385, "grad_norm": 0.06281451880931854, "learning_rate": 9.94983672412027e-06, "loss": 0.2366, "num_input_tokens_seen": 5527696, "step": 9065 }, { "epoch": 2.8141483090288553, "grad_norm": 0.09038183093070984, "learning_rate": 9.949645254566367e-06, "loss": 0.2458, "num_input_tokens_seen": 5531952, "step": 9070 }, { "epoch": 2.8156996587030716, "grad_norm": 0.2699136734008789, "learning_rate": 9.949453422144855e-06, "loss": 0.2343, "num_input_tokens_seen": 5535120, "step": 9075 }, { "epoch": 2.8172510083772884, "grad_norm": 0.49496152997016907, "learning_rate": 9.949261226869797e-06, "loss": 0.2249, "num_input_tokens_seen": 5537264, "step": 9080 }, { "epoch": 2.8188023580515047, "grad_norm": 1.218514323234558, "learning_rate": 9.949068668755282e-06, "loss": 0.2313, "num_input_tokens_seen": 5539568, "step": 9085 }, { "epoch": 2.8203537077257215, "grad_norm": 0.6320375204086304, "learning_rate": 9.94887574781543e-06, "loss": 0.2514, "num_input_tokens_seen": 5542288, "step": 9090 }, { "epoch": 2.821905057399938, "grad_norm": 0.8159906268119812, "learning_rate": 9.94868246406438e-06, "loss": 0.2321, "num_input_tokens_seen": 5545040, "step": 9095 }, { "epoch": 2.8234564070741546, "grad_norm": 0.2789185643196106, "learning_rate": 9.948488817516304e-06, "loss": 0.2274, "num_input_tokens_seen": 5549072, "step": 9100 }, { "epoch": 2.825007756748371, "grad_norm": 0.24016991257667542, "learning_rate": 9.9482948081854e-06, "loss": 0.2579, "num_input_tokens_seen": 5551792, "step": 9105 }, { "epoch": 2.8265591064225877, "grad_norm": 0.3413006663322449, "learning_rate": 9.94810043608589e-06, "loss": 0.2313, "num_input_tokens_seen": 5555152, "step": 9110 }, { "epoch": 2.8281104560968044, "grad_norm": 0.3941466510295868, "learning_rate": 9.947905701232023e-06, "loss": 0.2393, "num_input_tokens_seen": 5557936, "step": 9115 }, { "epoch": 2.8296618057710208, "grad_norm": 0.24930790066719055, "learning_rate": 9.947710603638078e-06, "loss": 0.2288, "num_input_tokens_seen": 5559984, "step": 9120 }, { "epoch": 2.831213155445237, "grad_norm": 0.11159325391054153, "learning_rate": 9.947515143318354e-06, "loss": 0.232, "num_input_tokens_seen": 5562448, "step": 9125 }, { "epoch": 2.832764505119454, "grad_norm": 0.2529500722885132, "learning_rate": 9.947319320287183e-06, "loss": 0.2354, "num_input_tokens_seen": 5565200, "step": 9130 }, { "epoch": 2.8343158547936707, "grad_norm": 0.23241838812828064, "learning_rate": 9.947123134558922e-06, "loss": 0.2285, "num_input_tokens_seen": 5567792, "step": 9135 }, { "epoch": 2.835867204467887, "grad_norm": 0.16843612492084503, "learning_rate": 9.946926586147952e-06, "loss": 0.227, "num_input_tokens_seen": 5571184, "step": 9140 }, { "epoch": 2.8374185541421038, "grad_norm": 0.27790141105651855, "learning_rate": 9.946729675068683e-06, "loss": 0.215, "num_input_tokens_seen": 5573328, "step": 9145 }, { "epoch": 2.83896990381632, "grad_norm": 0.7140970826148987, "learning_rate": 9.946532401335548e-06, "loss": 0.2354, "num_input_tokens_seen": 5576208, "step": 9150 }, { "epoch": 2.840521253490537, "grad_norm": 0.5325753092765808, "learning_rate": 9.946334764963017e-06, "loss": 0.2363, "num_input_tokens_seen": 5579056, "step": 9155 }, { "epoch": 2.842072603164753, "grad_norm": 0.30541419982910156, "learning_rate": 9.946136765965572e-06, "loss": 0.2413, "num_input_tokens_seen": 5581648, "step": 9160 }, { "epoch": 2.84362395283897, "grad_norm": 0.20827648043632507, "learning_rate": 9.945938404357732e-06, "loss": 0.228, "num_input_tokens_seen": 5585360, "step": 9165 }, { "epoch": 2.8451753025131863, "grad_norm": 0.21639637649059296, "learning_rate": 9.945739680154037e-06, "loss": 0.2348, "num_input_tokens_seen": 5587920, "step": 9170 }, { "epoch": 2.846726652187403, "grad_norm": 0.2890357971191406, "learning_rate": 9.945540593369056e-06, "loss": 0.2339, "num_input_tokens_seen": 5591344, "step": 9175 }, { "epoch": 2.84827800186162, "grad_norm": 0.12016374617815018, "learning_rate": 9.945341144017387e-06, "loss": 0.2337, "num_input_tokens_seen": 5594224, "step": 9180 }, { "epoch": 2.849829351535836, "grad_norm": 0.2662612497806549, "learning_rate": 9.945141332113651e-06, "loss": 0.2265, "num_input_tokens_seen": 5597072, "step": 9185 }, { "epoch": 2.8513807012100525, "grad_norm": 0.5092495083808899, "learning_rate": 9.944941157672497e-06, "loss": 0.2326, "num_input_tokens_seen": 5599856, "step": 9190 }, { "epoch": 2.8529320508842693, "grad_norm": 0.08930698782205582, "learning_rate": 9.944740620708597e-06, "loss": 0.2332, "num_input_tokens_seen": 5601680, "step": 9195 }, { "epoch": 2.854483400558486, "grad_norm": 0.37209051847457886, "learning_rate": 9.944539721236655e-06, "loss": 0.2275, "num_input_tokens_seen": 5604944, "step": 9200 }, { "epoch": 2.8560347502327024, "grad_norm": 0.37438878417015076, "learning_rate": 9.944338459271401e-06, "loss": 0.2316, "num_input_tokens_seen": 5609328, "step": 9205 }, { "epoch": 2.857586099906919, "grad_norm": 0.216513991355896, "learning_rate": 9.944136834827587e-06, "loss": 0.2367, "num_input_tokens_seen": 5612528, "step": 9210 }, { "epoch": 2.8591374495811355, "grad_norm": 0.1921684890985489, "learning_rate": 9.943934847919995e-06, "loss": 0.2314, "num_input_tokens_seen": 5615120, "step": 9215 }, { "epoch": 2.8606887992553522, "grad_norm": 0.05723228305578232, "learning_rate": 9.943732498563437e-06, "loss": 0.2347, "num_input_tokens_seen": 5617776, "step": 9220 }, { "epoch": 2.8622401489295686, "grad_norm": 0.06625781208276749, "learning_rate": 9.943529786772741e-06, "loss": 0.2304, "num_input_tokens_seen": 5620848, "step": 9225 }, { "epoch": 2.8637914986037853, "grad_norm": 0.04929954558610916, "learning_rate": 9.943326712562773e-06, "loss": 0.232, "num_input_tokens_seen": 5623824, "step": 9230 }, { "epoch": 2.8653428482780017, "grad_norm": 0.17867618799209595, "learning_rate": 9.943123275948419e-06, "loss": 0.232, "num_input_tokens_seen": 5626256, "step": 9235 }, { "epoch": 2.8668941979522184, "grad_norm": 0.16990607976913452, "learning_rate": 9.942919476944594e-06, "loss": 0.2315, "num_input_tokens_seen": 5629552, "step": 9240 }, { "epoch": 2.868445547626435, "grad_norm": 0.17161694169044495, "learning_rate": 9.942715315566238e-06, "loss": 0.2298, "num_input_tokens_seen": 5632880, "step": 9245 }, { "epoch": 2.8699968973006516, "grad_norm": 0.13988207280635834, "learning_rate": 9.942510791828318e-06, "loss": 0.2296, "num_input_tokens_seen": 5635376, "step": 9250 }, { "epoch": 2.871548246974868, "grad_norm": 0.19259318709373474, "learning_rate": 9.94230590574583e-06, "loss": 0.2323, "num_input_tokens_seen": 5638288, "step": 9255 }, { "epoch": 2.8730995966490847, "grad_norm": 0.33490824699401855, "learning_rate": 9.942100657333792e-06, "loss": 0.2338, "num_input_tokens_seen": 5641616, "step": 9260 }, { "epoch": 2.8746509463233014, "grad_norm": 0.14303584396839142, "learning_rate": 9.941895046607255e-06, "loss": 0.2303, "num_input_tokens_seen": 5644432, "step": 9265 }, { "epoch": 2.8762022959975178, "grad_norm": 0.06889587640762329, "learning_rate": 9.94168907358129e-06, "loss": 0.2292, "num_input_tokens_seen": 5647344, "step": 9270 }, { "epoch": 2.8777536456717345, "grad_norm": 0.20031018555164337, "learning_rate": 9.941482738270995e-06, "loss": 0.2347, "num_input_tokens_seen": 5650576, "step": 9275 }, { "epoch": 2.879304995345951, "grad_norm": 0.19425347447395325, "learning_rate": 9.9412760406915e-06, "loss": 0.2418, "num_input_tokens_seen": 5654192, "step": 9280 }, { "epoch": 2.8808563450201676, "grad_norm": 0.33978158235549927, "learning_rate": 9.941068980857959e-06, "loss": 0.2291, "num_input_tokens_seen": 5658416, "step": 9285 }, { "epoch": 2.882407694694384, "grad_norm": 0.17344075441360474, "learning_rate": 9.94086155878555e-06, "loss": 0.231, "num_input_tokens_seen": 5660816, "step": 9290 }, { "epoch": 2.8839590443686007, "grad_norm": 0.30374041199684143, "learning_rate": 9.940653774489482e-06, "loss": 0.2273, "num_input_tokens_seen": 5664304, "step": 9295 }, { "epoch": 2.885510394042817, "grad_norm": 0.2939967215061188, "learning_rate": 9.940445627984984e-06, "loss": 0.2306, "num_input_tokens_seen": 5666800, "step": 9300 }, { "epoch": 2.887061743717034, "grad_norm": 0.05529893934726715, "learning_rate": 9.940237119287318e-06, "loss": 0.2343, "num_input_tokens_seen": 5669296, "step": 9305 }, { "epoch": 2.8886130933912506, "grad_norm": 0.16426606476306915, "learning_rate": 9.940028248411771e-06, "loss": 0.229, "num_input_tokens_seen": 5673296, "step": 9310 }, { "epoch": 2.890164443065467, "grad_norm": 0.17684225738048553, "learning_rate": 9.939819015373654e-06, "loss": 0.2338, "num_input_tokens_seen": 5676016, "step": 9315 }, { "epoch": 2.8917157927396833, "grad_norm": 0.14596232771873474, "learning_rate": 9.939609420188307e-06, "loss": 0.2358, "num_input_tokens_seen": 5678448, "step": 9320 }, { "epoch": 2.8932671424139, "grad_norm": 0.058749981224536896, "learning_rate": 9.939399462871095e-06, "loss": 0.2327, "num_input_tokens_seen": 5680944, "step": 9325 }, { "epoch": 2.894818492088117, "grad_norm": 0.05780545994639397, "learning_rate": 9.939189143437413e-06, "loss": 0.2315, "num_input_tokens_seen": 5684240, "step": 9330 }, { "epoch": 2.896369841762333, "grad_norm": 0.11295720189809799, "learning_rate": 9.938978461902678e-06, "loss": 0.2345, "num_input_tokens_seen": 5687472, "step": 9335 }, { "epoch": 2.89792119143655, "grad_norm": 0.04873109981417656, "learning_rate": 9.938767418282334e-06, "loss": 0.2309, "num_input_tokens_seen": 5689872, "step": 9340 }, { "epoch": 2.8994725411107662, "grad_norm": 0.21315588057041168, "learning_rate": 9.938556012591856e-06, "loss": 0.2318, "num_input_tokens_seen": 5695088, "step": 9345 }, { "epoch": 2.901023890784983, "grad_norm": 0.06446060538291931, "learning_rate": 9.93834424484674e-06, "loss": 0.2374, "num_input_tokens_seen": 5698000, "step": 9350 }, { "epoch": 2.9025752404591993, "grad_norm": 0.07045470923185349, "learning_rate": 9.93813211506251e-06, "loss": 0.2325, "num_input_tokens_seen": 5700528, "step": 9355 }, { "epoch": 2.904126590133416, "grad_norm": 0.07663619518280029, "learning_rate": 9.937919623254724e-06, "loss": 0.232, "num_input_tokens_seen": 5702992, "step": 9360 }, { "epoch": 2.9056779398076324, "grad_norm": 0.1568555235862732, "learning_rate": 9.937706769438954e-06, "loss": 0.2299, "num_input_tokens_seen": 5706512, "step": 9365 }, { "epoch": 2.907229289481849, "grad_norm": 0.15619376301765442, "learning_rate": 9.937493553630806e-06, "loss": 0.233, "num_input_tokens_seen": 5709840, "step": 9370 }, { "epoch": 2.908780639156066, "grad_norm": 0.1562846451997757, "learning_rate": 9.937279975845913e-06, "loss": 0.2283, "num_input_tokens_seen": 5712432, "step": 9375 }, { "epoch": 2.9103319888302823, "grad_norm": 0.14413286745548248, "learning_rate": 9.93706603609993e-06, "loss": 0.2279, "num_input_tokens_seen": 5714928, "step": 9380 }, { "epoch": 2.9118833385044987, "grad_norm": 0.14180290699005127, "learning_rate": 9.936851734408544e-06, "loss": 0.2224, "num_input_tokens_seen": 5717808, "step": 9385 }, { "epoch": 2.9134346881787154, "grad_norm": 0.18914218246936798, "learning_rate": 9.936637070787465e-06, "loss": 0.245, "num_input_tokens_seen": 5720944, "step": 9390 }, { "epoch": 2.914986037852932, "grad_norm": 0.14057859778404236, "learning_rate": 9.936422045252429e-06, "loss": 0.2275, "num_input_tokens_seen": 5723696, "step": 9395 }, { "epoch": 2.9165373875271485, "grad_norm": 0.056107886135578156, "learning_rate": 9.936206657819203e-06, "loss": 0.2307, "num_input_tokens_seen": 5725872, "step": 9400 }, { "epoch": 2.9180887372013653, "grad_norm": 0.061347950249910355, "learning_rate": 9.935990908503574e-06, "loss": 0.2336, "num_input_tokens_seen": 5729712, "step": 9405 }, { "epoch": 2.9196400868755816, "grad_norm": 0.17865192890167236, "learning_rate": 9.93577479732136e-06, "loss": 0.2382, "num_input_tokens_seen": 5732208, "step": 9410 }, { "epoch": 2.9211914365497984, "grad_norm": 0.05006169527769089, "learning_rate": 9.935558324288407e-06, "loss": 0.226, "num_input_tokens_seen": 5734416, "step": 9415 }, { "epoch": 2.9227427862240147, "grad_norm": 0.14129672944545746, "learning_rate": 9.935341489420582e-06, "loss": 0.2273, "num_input_tokens_seen": 5739088, "step": 9420 }, { "epoch": 2.9242941358982315, "grad_norm": 0.06589894741773605, "learning_rate": 9.935124292733785e-06, "loss": 0.2422, "num_input_tokens_seen": 5741968, "step": 9425 }, { "epoch": 2.925845485572448, "grad_norm": 0.17951670289039612, "learning_rate": 9.934906734243932e-06, "loss": 0.2314, "num_input_tokens_seen": 5744880, "step": 9430 }, { "epoch": 2.9273968352466646, "grad_norm": 0.05267830938100815, "learning_rate": 9.934688813966982e-06, "loss": 0.2326, "num_input_tokens_seen": 5749392, "step": 9435 }, { "epoch": 2.9289481849208814, "grad_norm": 0.05238335579633713, "learning_rate": 9.934470531918902e-06, "loss": 0.2257, "num_input_tokens_seen": 5752688, "step": 9440 }, { "epoch": 2.9304995345950977, "grad_norm": 0.06967399269342422, "learning_rate": 9.934251888115702e-06, "loss": 0.2328, "num_input_tokens_seen": 5755984, "step": 9445 }, { "epoch": 2.932050884269314, "grad_norm": 0.17581643164157867, "learning_rate": 9.934032882573409e-06, "loss": 0.2242, "num_input_tokens_seen": 5758928, "step": 9450 }, { "epoch": 2.933602233943531, "grad_norm": 0.1955627053976059, "learning_rate": 9.933813515308077e-06, "loss": 0.2251, "num_input_tokens_seen": 5761936, "step": 9455 }, { "epoch": 2.9351535836177476, "grad_norm": 0.06627807766199112, "learning_rate": 9.93359378633579e-06, "loss": 0.2315, "num_input_tokens_seen": 5764720, "step": 9460 }, { "epoch": 2.936704933291964, "grad_norm": 0.06840529292821884, "learning_rate": 9.933373695672654e-06, "loss": 0.2424, "num_input_tokens_seen": 5767344, "step": 9465 }, { "epoch": 2.9382562829661807, "grad_norm": 0.18810680508613586, "learning_rate": 9.933153243334808e-06, "loss": 0.2292, "num_input_tokens_seen": 5770608, "step": 9470 }, { "epoch": 2.939807632640397, "grad_norm": 0.2839502990245819, "learning_rate": 9.932932429338411e-06, "loss": 0.2331, "num_input_tokens_seen": 5773200, "step": 9475 }, { "epoch": 2.941358982314614, "grad_norm": 0.06767286360263824, "learning_rate": 9.932711253699652e-06, "loss": 0.2321, "num_input_tokens_seen": 5776720, "step": 9480 }, { "epoch": 2.94291033198883, "grad_norm": 0.16771407425403595, "learning_rate": 9.932489716434746e-06, "loss": 0.2309, "num_input_tokens_seen": 5779856, "step": 9485 }, { "epoch": 2.944461681663047, "grad_norm": 0.06415042281150818, "learning_rate": 9.932267817559936e-06, "loss": 0.2287, "num_input_tokens_seen": 5782544, "step": 9490 }, { "epoch": 2.946013031337263, "grad_norm": 0.1509496420621872, "learning_rate": 9.932045557091488e-06, "loss": 0.2348, "num_input_tokens_seen": 5785040, "step": 9495 }, { "epoch": 2.94756438101148, "grad_norm": 0.06265241652727127, "learning_rate": 9.931822935045693e-06, "loss": 0.2285, "num_input_tokens_seen": 5788496, "step": 9500 }, { "epoch": 2.9491157306856968, "grad_norm": 0.28373730182647705, "learning_rate": 9.93159995143888e-06, "loss": 0.2336, "num_input_tokens_seen": 5791504, "step": 9505 }, { "epoch": 2.950667080359913, "grad_norm": 0.2790910005569458, "learning_rate": 9.931376606287388e-06, "loss": 0.2289, "num_input_tokens_seen": 5795120, "step": 9510 }, { "epoch": 2.9522184300341294, "grad_norm": 0.16257192194461823, "learning_rate": 9.931152899607597e-06, "loss": 0.2273, "num_input_tokens_seen": 5797968, "step": 9515 }, { "epoch": 2.953769779708346, "grad_norm": 0.1736905723810196, "learning_rate": 9.930928831415904e-06, "loss": 0.2321, "num_input_tokens_seen": 5800336, "step": 9520 }, { "epoch": 2.955321129382563, "grad_norm": 0.16603916883468628, "learning_rate": 9.930704401728737e-06, "loss": 0.2331, "num_input_tokens_seen": 5803888, "step": 9525 }, { "epoch": 2.9568724790567793, "grad_norm": 0.2981356382369995, "learning_rate": 9.93047961056255e-06, "loss": 0.2336, "num_input_tokens_seen": 5806800, "step": 9530 }, { "epoch": 2.958423828730996, "grad_norm": 0.2749839425086975, "learning_rate": 9.93025445793382e-06, "loss": 0.2346, "num_input_tokens_seen": 5809456, "step": 9535 }, { "epoch": 2.9599751784052124, "grad_norm": 0.03969133272767067, "learning_rate": 9.930028943859055e-06, "loss": 0.2319, "num_input_tokens_seen": 5812784, "step": 9540 }, { "epoch": 2.961526528079429, "grad_norm": 0.1502736657857895, "learning_rate": 9.92980306835479e-06, "loss": 0.2335, "num_input_tokens_seen": 5815504, "step": 9545 }, { "epoch": 2.9630778777536455, "grad_norm": 0.07160253822803497, "learning_rate": 9.929576831437582e-06, "loss": 0.2304, "num_input_tokens_seen": 5818928, "step": 9550 }, { "epoch": 2.9646292274278623, "grad_norm": 0.18273749947547913, "learning_rate": 9.929350233124018e-06, "loss": 0.2287, "num_input_tokens_seen": 5822064, "step": 9555 }, { "epoch": 2.9661805771020786, "grad_norm": 0.31937500834465027, "learning_rate": 9.929123273430708e-06, "loss": 0.2281, "num_input_tokens_seen": 5825712, "step": 9560 }, { "epoch": 2.9677319267762954, "grad_norm": 0.3306894898414612, "learning_rate": 9.928895952374293e-06, "loss": 0.2441, "num_input_tokens_seen": 5828176, "step": 9565 }, { "epoch": 2.969283276450512, "grad_norm": 0.05764073133468628, "learning_rate": 9.928668269971438e-06, "loss": 0.2325, "num_input_tokens_seen": 5830480, "step": 9570 }, { "epoch": 2.9708346261247285, "grad_norm": 0.05579277127981186, "learning_rate": 9.928440226238835e-06, "loss": 0.2348, "num_input_tokens_seen": 5832784, "step": 9575 }, { "epoch": 2.972385975798945, "grad_norm": 0.15384460985660553, "learning_rate": 9.9282118211932e-06, "loss": 0.2321, "num_input_tokens_seen": 5835472, "step": 9580 }, { "epoch": 2.9739373254731616, "grad_norm": 0.18774865567684174, "learning_rate": 9.927983054851283e-06, "loss": 0.2311, "num_input_tokens_seen": 5838096, "step": 9585 }, { "epoch": 2.9754886751473784, "grad_norm": 0.15765878558158875, "learning_rate": 9.92775392722985e-06, "loss": 0.2222, "num_input_tokens_seen": 5841936, "step": 9590 }, { "epoch": 2.9770400248215947, "grad_norm": 0.10344846546649933, "learning_rate": 9.927524438345701e-06, "loss": 0.2263, "num_input_tokens_seen": 5844624, "step": 9595 }, { "epoch": 2.9785913744958115, "grad_norm": 0.17742280662059784, "learning_rate": 9.927294588215661e-06, "loss": 0.229, "num_input_tokens_seen": 5846992, "step": 9600 }, { "epoch": 2.980142724170028, "grad_norm": 0.3356454074382782, "learning_rate": 9.92706437685658e-06, "loss": 0.2351, "num_input_tokens_seen": 5850000, "step": 9605 }, { "epoch": 2.9816940738442446, "grad_norm": 0.2653602361679077, "learning_rate": 9.926833804285332e-06, "loss": 0.2553, "num_input_tokens_seen": 5853968, "step": 9610 }, { "epoch": 2.9832454235184613, "grad_norm": 0.1941944658756256, "learning_rate": 9.926602870518826e-06, "loss": 0.2304, "num_input_tokens_seen": 5856656, "step": 9615 }, { "epoch": 2.9847967731926777, "grad_norm": 0.18078047037124634, "learning_rate": 9.92637157557399e-06, "loss": 0.2266, "num_input_tokens_seen": 5860496, "step": 9620 }, { "epoch": 2.986348122866894, "grad_norm": 0.21421115100383759, "learning_rate": 9.926139919467781e-06, "loss": 0.2358, "num_input_tokens_seen": 5863312, "step": 9625 }, { "epoch": 2.9878994725411108, "grad_norm": 0.38425779342651367, "learning_rate": 9.92590790221718e-06, "loss": 0.2326, "num_input_tokens_seen": 5866288, "step": 9630 }, { "epoch": 2.9894508222153275, "grad_norm": 0.18547943234443665, "learning_rate": 9.9256755238392e-06, "loss": 0.2302, "num_input_tokens_seen": 5869200, "step": 9635 }, { "epoch": 2.991002171889544, "grad_norm": 0.15370668470859528, "learning_rate": 9.925442784350874e-06, "loss": 0.2316, "num_input_tokens_seen": 5871472, "step": 9640 }, { "epoch": 2.99255352156376, "grad_norm": 0.14895054697990417, "learning_rate": 9.925209683769267e-06, "loss": 0.2284, "num_input_tokens_seen": 5874800, "step": 9645 }, { "epoch": 2.994104871237977, "grad_norm": 0.17646875977516174, "learning_rate": 9.924976222111466e-06, "loss": 0.2301, "num_input_tokens_seen": 5878736, "step": 9650 }, { "epoch": 2.9956562209121937, "grad_norm": 0.0709516704082489, "learning_rate": 9.92474239939459e-06, "loss": 0.2237, "num_input_tokens_seen": 5882608, "step": 9655 }, { "epoch": 2.99720757058641, "grad_norm": 0.0662672221660614, "learning_rate": 9.924508215635776e-06, "loss": 0.2303, "num_input_tokens_seen": 5885616, "step": 9660 }, { "epoch": 2.998758920260627, "grad_norm": 0.1619495302438736, "learning_rate": 9.924273670852193e-06, "loss": 0.2196, "num_input_tokens_seen": 5888944, "step": 9665 }, { "epoch": 3.000310269934843, "grad_norm": 0.13436578214168549, "learning_rate": 9.924038765061042e-06, "loss": 0.2325, "num_input_tokens_seen": 5891664, "step": 9670 }, { "epoch": 3.00186161960906, "grad_norm": 0.08961589634418488, "learning_rate": 9.923803498279536e-06, "loss": 0.2326, "num_input_tokens_seen": 5894544, "step": 9675 }, { "epoch": 3.0034129692832763, "grad_norm": 0.1626533567905426, "learning_rate": 9.92356787052493e-06, "loss": 0.2271, "num_input_tokens_seen": 5897136, "step": 9680 }, { "epoch": 3.004964318957493, "grad_norm": 0.2606083154678345, "learning_rate": 9.923331881814495e-06, "loss": 0.2226, "num_input_tokens_seen": 5899664, "step": 9685 }, { "epoch": 3.0065156686317094, "grad_norm": 0.09039187431335449, "learning_rate": 9.923095532165532e-06, "loss": 0.2467, "num_input_tokens_seen": 5902512, "step": 9690 }, { "epoch": 3.008067018305926, "grad_norm": 0.14661575853824615, "learning_rate": 9.922858821595368e-06, "loss": 0.2283, "num_input_tokens_seen": 5905584, "step": 9695 }, { "epoch": 3.009618367980143, "grad_norm": 0.08327730745077133, "learning_rate": 9.922621750121358e-06, "loss": 0.2268, "num_input_tokens_seen": 5910064, "step": 9700 }, { "epoch": 3.0111697176543593, "grad_norm": 0.18175241351127625, "learning_rate": 9.92238431776088e-06, "loss": 0.233, "num_input_tokens_seen": 5913328, "step": 9705 }, { "epoch": 3.012721067328576, "grad_norm": 0.0750880315899849, "learning_rate": 9.922146524531341e-06, "loss": 0.2254, "num_input_tokens_seen": 5915856, "step": 9710 }, { "epoch": 3.0142724170027924, "grad_norm": 0.1938856691122055, "learning_rate": 9.921908370450175e-06, "loss": 0.2338, "num_input_tokens_seen": 5918480, "step": 9715 }, { "epoch": 3.015823766677009, "grad_norm": 0.18068324029445648, "learning_rate": 9.921669855534843e-06, "loss": 0.2285, "num_input_tokens_seen": 5922736, "step": 9720 }, { "epoch": 3.0173751163512255, "grad_norm": 0.12906494736671448, "learning_rate": 9.921430979802829e-06, "loss": 0.2367, "num_input_tokens_seen": 5925392, "step": 9725 }, { "epoch": 3.0189264660254422, "grad_norm": 0.16851690411567688, "learning_rate": 9.921191743271645e-06, "loss": 0.2294, "num_input_tokens_seen": 5929936, "step": 9730 }, { "epoch": 3.0204778156996586, "grad_norm": 0.182194322347641, "learning_rate": 9.920952145958831e-06, "loss": 0.2284, "num_input_tokens_seen": 5932848, "step": 9735 }, { "epoch": 3.0220291653738753, "grad_norm": 0.4121567904949188, "learning_rate": 9.920712187881951e-06, "loss": 0.2347, "num_input_tokens_seen": 5936624, "step": 9740 }, { "epoch": 3.0235805150480917, "grad_norm": 0.05589817091822624, "learning_rate": 9.920471869058599e-06, "loss": 0.2313, "num_input_tokens_seen": 5938960, "step": 9745 }, { "epoch": 3.0251318647223084, "grad_norm": 0.05529923737049103, "learning_rate": 9.92023118950639e-06, "loss": 0.2304, "num_input_tokens_seen": 5942960, "step": 9750 }, { "epoch": 3.0266832143965248, "grad_norm": 0.15993273258209229, "learning_rate": 9.919990149242973e-06, "loss": 0.2335, "num_input_tokens_seen": 5946128, "step": 9755 }, { "epoch": 3.0282345640707415, "grad_norm": 0.17405910789966583, "learning_rate": 9.919748748286015e-06, "loss": 0.2283, "num_input_tokens_seen": 5949872, "step": 9760 }, { "epoch": 3.0297859137449583, "grad_norm": 0.11816762387752533, "learning_rate": 9.919506986653215e-06, "loss": 0.2285, "num_input_tokens_seen": 5954416, "step": 9765 }, { "epoch": 3.0313372634191746, "grad_norm": 0.1743844449520111, "learning_rate": 9.919264864362298e-06, "loss": 0.2286, "num_input_tokens_seen": 5956816, "step": 9770 }, { "epoch": 3.0328886130933914, "grad_norm": 0.36898118257522583, "learning_rate": 9.919022381431014e-06, "loss": 0.2231, "num_input_tokens_seen": 5959856, "step": 9775 }, { "epoch": 3.0344399627676077, "grad_norm": 0.2080017328262329, "learning_rate": 9.91877953787714e-06, "loss": 0.2351, "num_input_tokens_seen": 5962832, "step": 9780 }, { "epoch": 3.0359913124418245, "grad_norm": 0.26781967282295227, "learning_rate": 9.918536333718476e-06, "loss": 0.2369, "num_input_tokens_seen": 5965840, "step": 9785 }, { "epoch": 3.037542662116041, "grad_norm": 0.19867733120918274, "learning_rate": 9.918292768972857e-06, "loss": 0.2315, "num_input_tokens_seen": 5968176, "step": 9790 }, { "epoch": 3.0390940117902576, "grad_norm": 0.0883481502532959, "learning_rate": 9.918048843658136e-06, "loss": 0.2337, "num_input_tokens_seen": 5970928, "step": 9795 }, { "epoch": 3.040645361464474, "grad_norm": 0.0891474038362503, "learning_rate": 9.917804557792197e-06, "loss": 0.2388, "num_input_tokens_seen": 5973808, "step": 9800 }, { "epoch": 3.0421967111386907, "grad_norm": 0.07929158955812454, "learning_rate": 9.917559911392946e-06, "loss": 0.2268, "num_input_tokens_seen": 5977008, "step": 9805 }, { "epoch": 3.043748060812907, "grad_norm": 0.16498731076717377, "learning_rate": 9.917314904478323e-06, "loss": 0.2362, "num_input_tokens_seen": 5980272, "step": 9810 }, { "epoch": 3.045299410487124, "grad_norm": 0.14629904925823212, "learning_rate": 9.917069537066289e-06, "loss": 0.2294, "num_input_tokens_seen": 5983344, "step": 9815 }, { "epoch": 3.04685076016134, "grad_norm": 0.16153109073638916, "learning_rate": 9.916823809174828e-06, "loss": 0.2348, "num_input_tokens_seen": 5985872, "step": 9820 }, { "epoch": 3.048402109835557, "grad_norm": 0.1723511517047882, "learning_rate": 9.91657772082196e-06, "loss": 0.2355, "num_input_tokens_seen": 5988752, "step": 9825 }, { "epoch": 3.0499534595097737, "grad_norm": 0.0634525865316391, "learning_rate": 9.916331272025724e-06, "loss": 0.2276, "num_input_tokens_seen": 5991728, "step": 9830 }, { "epoch": 3.05150480918399, "grad_norm": 0.1429944783449173, "learning_rate": 9.916084462804187e-06, "loss": 0.2294, "num_input_tokens_seen": 5995952, "step": 9835 }, { "epoch": 3.053056158858207, "grad_norm": 0.164847269654274, "learning_rate": 9.915837293175444e-06, "loss": 0.233, "num_input_tokens_seen": 5998448, "step": 9840 }, { "epoch": 3.054607508532423, "grad_norm": 0.1795000433921814, "learning_rate": 9.915589763157614e-06, "loss": 0.2305, "num_input_tokens_seen": 6001072, "step": 9845 }, { "epoch": 3.05615885820664, "grad_norm": 0.1588113009929657, "learning_rate": 9.915341872768846e-06, "loss": 0.2337, "num_input_tokens_seen": 6004240, "step": 9850 }, { "epoch": 3.0577102078808562, "grad_norm": 0.09256376326084137, "learning_rate": 9.915093622027314e-06, "loss": 0.2285, "num_input_tokens_seen": 6006832, "step": 9855 }, { "epoch": 3.059261557555073, "grad_norm": 0.15941983461380005, "learning_rate": 9.914845010951215e-06, "loss": 0.2194, "num_input_tokens_seen": 6009872, "step": 9860 }, { "epoch": 3.0608129072292893, "grad_norm": 0.0876823365688324, "learning_rate": 9.914596039558775e-06, "loss": 0.2289, "num_input_tokens_seen": 6012624, "step": 9865 }, { "epoch": 3.062364256903506, "grad_norm": 0.06971988826990128, "learning_rate": 9.914346707868248e-06, "loss": 0.2385, "num_input_tokens_seen": 6015536, "step": 9870 }, { "epoch": 3.0639156065777224, "grad_norm": 0.35964423418045044, "learning_rate": 9.914097015897915e-06, "loss": 0.2363, "num_input_tokens_seen": 6018608, "step": 9875 }, { "epoch": 3.065466956251939, "grad_norm": 0.07916180044412613, "learning_rate": 9.913846963666076e-06, "loss": 0.2381, "num_input_tokens_seen": 6021136, "step": 9880 }, { "epoch": 3.0670183059261555, "grad_norm": 0.054468125104904175, "learning_rate": 9.913596551191068e-06, "loss": 0.23, "num_input_tokens_seen": 6023664, "step": 9885 }, { "epoch": 3.0685696556003723, "grad_norm": 0.3007294237613678, "learning_rate": 9.913345778491246e-06, "loss": 0.2296, "num_input_tokens_seen": 6025904, "step": 9890 }, { "epoch": 3.070121005274589, "grad_norm": 0.1664058268070221, "learning_rate": 9.913094645584997e-06, "loss": 0.2304, "num_input_tokens_seen": 6028624, "step": 9895 }, { "epoch": 3.0716723549488054, "grad_norm": 0.2733299732208252, "learning_rate": 9.91284315249073e-06, "loss": 0.2312, "num_input_tokens_seen": 6031696, "step": 9900 }, { "epoch": 3.073223704623022, "grad_norm": 0.14555777609348297, "learning_rate": 9.912591299226883e-06, "loss": 0.2271, "num_input_tokens_seen": 6034384, "step": 9905 }, { "epoch": 3.0747750542972385, "grad_norm": 0.3299349844455719, "learning_rate": 9.91233908581192e-06, "loss": 0.2289, "num_input_tokens_seen": 6037296, "step": 9910 }, { "epoch": 3.0763264039714553, "grad_norm": 0.08836525678634644, "learning_rate": 9.912086512264332e-06, "loss": 0.2375, "num_input_tokens_seen": 6040976, "step": 9915 }, { "epoch": 3.0778777536456716, "grad_norm": 0.15994830429553986, "learning_rate": 9.911833578602634e-06, "loss": 0.2319, "num_input_tokens_seen": 6043824, "step": 9920 }, { "epoch": 3.0794291033198884, "grad_norm": 0.13096004724502563, "learning_rate": 9.91158028484537e-06, "loss": 0.2281, "num_input_tokens_seen": 6046896, "step": 9925 }, { "epoch": 3.0809804529941047, "grad_norm": 0.3085070252418518, "learning_rate": 9.91132663101111e-06, "loss": 0.2315, "num_input_tokens_seen": 6050640, "step": 9930 }, { "epoch": 3.0825318026683215, "grad_norm": 0.0648176297545433, "learning_rate": 9.911072617118446e-06, "loss": 0.2303, "num_input_tokens_seen": 6054000, "step": 9935 }, { "epoch": 3.084083152342538, "grad_norm": 0.09286758303642273, "learning_rate": 9.910818243186006e-06, "loss": 0.2284, "num_input_tokens_seen": 6057712, "step": 9940 }, { "epoch": 3.0856345020167546, "grad_norm": 0.16874024271965027, "learning_rate": 9.910563509232437e-06, "loss": 0.2317, "num_input_tokens_seen": 6060368, "step": 9945 }, { "epoch": 3.087185851690971, "grad_norm": 0.16671420633792877, "learning_rate": 9.910308415276413e-06, "loss": 0.238, "num_input_tokens_seen": 6063376, "step": 9950 }, { "epoch": 3.0887372013651877, "grad_norm": 0.09361261874437332, "learning_rate": 9.910052961336634e-06, "loss": 0.2325, "num_input_tokens_seen": 6065936, "step": 9955 }, { "epoch": 3.0902885510394045, "grad_norm": 0.19707278907299042, "learning_rate": 9.909797147431829e-06, "loss": 0.2328, "num_input_tokens_seen": 6070640, "step": 9960 }, { "epoch": 3.091839900713621, "grad_norm": 0.13221903145313263, "learning_rate": 9.909540973580752e-06, "loss": 0.2294, "num_input_tokens_seen": 6073680, "step": 9965 }, { "epoch": 3.0933912503878376, "grad_norm": 0.12034308165311813, "learning_rate": 9.909284439802184e-06, "loss": 0.2287, "num_input_tokens_seen": 6076432, "step": 9970 }, { "epoch": 3.094942600062054, "grad_norm": 0.2552601099014282, "learning_rate": 9.909027546114935e-06, "loss": 0.2269, "num_input_tokens_seen": 6079408, "step": 9975 }, { "epoch": 3.0964939497362707, "grad_norm": 0.2104310691356659, "learning_rate": 9.908770292537833e-06, "loss": 0.2333, "num_input_tokens_seen": 6081872, "step": 9980 }, { "epoch": 3.098045299410487, "grad_norm": 0.10656052827835083, "learning_rate": 9.908512679089739e-06, "loss": 0.2302, "num_input_tokens_seen": 6084464, "step": 9985 }, { "epoch": 3.099596649084704, "grad_norm": 0.36648353934288025, "learning_rate": 9.908254705789541e-06, "loss": 0.2274, "num_input_tokens_seen": 6087664, "step": 9990 }, { "epoch": 3.10114799875892, "grad_norm": 0.09201276302337646, "learning_rate": 9.907996372656153e-06, "loss": 0.2335, "num_input_tokens_seen": 6089872, "step": 9995 }, { "epoch": 3.102699348433137, "grad_norm": 0.35430359840393066, "learning_rate": 9.907737679708508e-06, "loss": 0.2274, "num_input_tokens_seen": 6092688, "step": 10000 }, { "epoch": 3.104250698107353, "grad_norm": 0.23345522582530975, "learning_rate": 9.907478626965576e-06, "loss": 0.2412, "num_input_tokens_seen": 6095696, "step": 10005 }, { "epoch": 3.10580204778157, "grad_norm": 0.23712162673473358, "learning_rate": 9.907219214446348e-06, "loss": 0.2252, "num_input_tokens_seen": 6098768, "step": 10010 }, { "epoch": 3.1073533974557863, "grad_norm": 0.1725388616323471, "learning_rate": 9.906959442169841e-06, "loss": 0.2348, "num_input_tokens_seen": 6101456, "step": 10015 }, { "epoch": 3.108904747130003, "grad_norm": 0.2640952467918396, "learning_rate": 9.906699310155098e-06, "loss": 0.2256, "num_input_tokens_seen": 6104336, "step": 10020 }, { "epoch": 3.11045609680422, "grad_norm": 0.20579509437084198, "learning_rate": 9.906438818421195e-06, "loss": 0.2296, "num_input_tokens_seen": 6107568, "step": 10025 }, { "epoch": 3.112007446478436, "grad_norm": 0.21740411221981049, "learning_rate": 9.906177966987225e-06, "loss": 0.2239, "num_input_tokens_seen": 6110608, "step": 10030 }, { "epoch": 3.113558796152653, "grad_norm": 0.24857154488563538, "learning_rate": 9.905916755872309e-06, "loss": 0.2223, "num_input_tokens_seen": 6112848, "step": 10035 }, { "epoch": 3.1151101458268693, "grad_norm": 0.455390989780426, "learning_rate": 9.905655185095602e-06, "loss": 0.2454, "num_input_tokens_seen": 6116176, "step": 10040 }, { "epoch": 3.116661495501086, "grad_norm": 0.37488922476768494, "learning_rate": 9.905393254676279e-06, "loss": 0.2339, "num_input_tokens_seen": 6120208, "step": 10045 }, { "epoch": 3.1182128451753024, "grad_norm": 0.3141174912452698, "learning_rate": 9.905130964633543e-06, "loss": 0.2295, "num_input_tokens_seen": 6122640, "step": 10050 }, { "epoch": 3.119764194849519, "grad_norm": 0.18758375942707062, "learning_rate": 9.90486831498662e-06, "loss": 0.2336, "num_input_tokens_seen": 6125360, "step": 10055 }, { "epoch": 3.1213155445237355, "grad_norm": 0.09889324009418488, "learning_rate": 9.904605305754766e-06, "loss": 0.2241, "num_input_tokens_seen": 6129104, "step": 10060 }, { "epoch": 3.1228668941979523, "grad_norm": 0.09754214435815811, "learning_rate": 9.904341936957267e-06, "loss": 0.2315, "num_input_tokens_seen": 6131824, "step": 10065 }, { "epoch": 3.1244182438721686, "grad_norm": 0.17083629965782166, "learning_rate": 9.904078208613426e-06, "loss": 0.2337, "num_input_tokens_seen": 6135952, "step": 10070 }, { "epoch": 3.1259695935463854, "grad_norm": 0.31851926445961, "learning_rate": 9.903814120742578e-06, "loss": 0.2305, "num_input_tokens_seen": 6138768, "step": 10075 }, { "epoch": 3.1275209432206017, "grad_norm": 0.10758588463068008, "learning_rate": 9.903549673364088e-06, "loss": 0.2352, "num_input_tokens_seen": 6142480, "step": 10080 }, { "epoch": 3.1290722928948185, "grad_norm": 0.19458721578121185, "learning_rate": 9.903284866497338e-06, "loss": 0.2309, "num_input_tokens_seen": 6145008, "step": 10085 }, { "epoch": 3.1306236425690352, "grad_norm": 0.17875437438488007, "learning_rate": 9.903019700161744e-06, "loss": 0.2345, "num_input_tokens_seen": 6148368, "step": 10090 }, { "epoch": 3.1321749922432516, "grad_norm": 0.08451156318187714, "learning_rate": 9.902754174376747e-06, "loss": 0.2335, "num_input_tokens_seen": 6153328, "step": 10095 }, { "epoch": 3.1337263419174683, "grad_norm": 0.16137444972991943, "learning_rate": 9.902488289161811e-06, "loss": 0.2309, "num_input_tokens_seen": 6156048, "step": 10100 }, { "epoch": 3.1352776915916847, "grad_norm": 0.07281500846147537, "learning_rate": 9.90222204453643e-06, "loss": 0.2299, "num_input_tokens_seen": 6159024, "step": 10105 }, { "epoch": 3.1368290412659015, "grad_norm": 0.22030511498451233, "learning_rate": 9.901955440520121e-06, "loss": 0.2311, "num_input_tokens_seen": 6161680, "step": 10110 }, { "epoch": 3.138380390940118, "grad_norm": 0.15472613275051117, "learning_rate": 9.90168847713243e-06, "loss": 0.2362, "num_input_tokens_seen": 6164816, "step": 10115 }, { "epoch": 3.1399317406143346, "grad_norm": 0.2749292552471161, "learning_rate": 9.90142115439293e-06, "loss": 0.2301, "num_input_tokens_seen": 6167440, "step": 10120 }, { "epoch": 3.141483090288551, "grad_norm": 0.16760294139385223, "learning_rate": 9.901153472321217e-06, "loss": 0.2384, "num_input_tokens_seen": 6170256, "step": 10125 }, { "epoch": 3.1430344399627677, "grad_norm": 0.2890659272670746, "learning_rate": 9.900885430936917e-06, "loss": 0.2279, "num_input_tokens_seen": 6172976, "step": 10130 }, { "epoch": 3.144585789636984, "grad_norm": 0.16877098381519318, "learning_rate": 9.900617030259679e-06, "loss": 0.2339, "num_input_tokens_seen": 6175600, "step": 10135 }, { "epoch": 3.1461371393112008, "grad_norm": 0.1559201329946518, "learning_rate": 9.900348270309181e-06, "loss": 0.2399, "num_input_tokens_seen": 6178832, "step": 10140 }, { "epoch": 3.147688488985417, "grad_norm": 0.10378996282815933, "learning_rate": 9.900079151105126e-06, "loss": 0.2351, "num_input_tokens_seen": 6181392, "step": 10145 }, { "epoch": 3.149239838659634, "grad_norm": 0.16391754150390625, "learning_rate": 9.899809672667243e-06, "loss": 0.2334, "num_input_tokens_seen": 6183696, "step": 10150 }, { "epoch": 3.1507911883338506, "grad_norm": 0.16966161131858826, "learning_rate": 9.899539835015289e-06, "loss": 0.2352, "num_input_tokens_seen": 6187120, "step": 10155 }, { "epoch": 3.152342538008067, "grad_norm": 0.14389359951019287, "learning_rate": 9.899269638169046e-06, "loss": 0.2383, "num_input_tokens_seen": 6189648, "step": 10160 }, { "epoch": 3.1538938876822837, "grad_norm": 0.12894293665885925, "learning_rate": 9.898999082148323e-06, "loss": 0.2305, "num_input_tokens_seen": 6192848, "step": 10165 }, { "epoch": 3.1554452373565, "grad_norm": 0.13572975993156433, "learning_rate": 9.898728166972954e-06, "loss": 0.2304, "num_input_tokens_seen": 6196048, "step": 10170 }, { "epoch": 3.156996587030717, "grad_norm": 0.14313800632953644, "learning_rate": 9.898456892662802e-06, "loss": 0.2289, "num_input_tokens_seen": 6199120, "step": 10175 }, { "epoch": 3.158547936704933, "grad_norm": 0.141348198056221, "learning_rate": 9.89818525923775e-06, "loss": 0.2341, "num_input_tokens_seen": 6201680, "step": 10180 }, { "epoch": 3.16009928637915, "grad_norm": 0.14230918884277344, "learning_rate": 9.897913266717718e-06, "loss": 0.2352, "num_input_tokens_seen": 6204784, "step": 10185 }, { "epoch": 3.1616506360533663, "grad_norm": 0.15271158516407013, "learning_rate": 9.897640915122644e-06, "loss": 0.2331, "num_input_tokens_seen": 6207856, "step": 10190 }, { "epoch": 3.163201985727583, "grad_norm": 0.05289698392152786, "learning_rate": 9.897368204472493e-06, "loss": 0.2363, "num_input_tokens_seen": 6210864, "step": 10195 }, { "epoch": 3.1647533354017994, "grad_norm": 0.13959002494812012, "learning_rate": 9.897095134787259e-06, "loss": 0.2325, "num_input_tokens_seen": 6213648, "step": 10200 }, { "epoch": 3.166304685076016, "grad_norm": 0.05203654617071152, "learning_rate": 9.89682170608696e-06, "loss": 0.2342, "num_input_tokens_seen": 6216016, "step": 10205 }, { "epoch": 3.1678560347502325, "grad_norm": 0.0468403585255146, "learning_rate": 9.896547918391646e-06, "loss": 0.2315, "num_input_tokens_seen": 6219280, "step": 10210 }, { "epoch": 3.1694073844244492, "grad_norm": 0.14648517966270447, "learning_rate": 9.896273771721384e-06, "loss": 0.2305, "num_input_tokens_seen": 6222000, "step": 10215 }, { "epoch": 3.170958734098666, "grad_norm": 0.14178495109081268, "learning_rate": 9.895999266096275e-06, "loss": 0.2316, "num_input_tokens_seen": 6225168, "step": 10220 }, { "epoch": 3.1725100837728823, "grad_norm": 0.2819196581840515, "learning_rate": 9.895724401536441e-06, "loss": 0.2306, "num_input_tokens_seen": 6227792, "step": 10225 }, { "epoch": 3.174061433447099, "grad_norm": 0.061895810067653656, "learning_rate": 9.895449178062035e-06, "loss": 0.2326, "num_input_tokens_seen": 6231056, "step": 10230 }, { "epoch": 3.1756127831213155, "grad_norm": 0.13540199398994446, "learning_rate": 9.895173595693234e-06, "loss": 0.2346, "num_input_tokens_seen": 6233552, "step": 10235 }, { "epoch": 3.1771641327955322, "grad_norm": 0.13266105949878693, "learning_rate": 9.89489765445024e-06, "loss": 0.2298, "num_input_tokens_seen": 6235952, "step": 10240 }, { "epoch": 3.1787154824697486, "grad_norm": 0.1469624638557434, "learning_rate": 9.894621354353285e-06, "loss": 0.2336, "num_input_tokens_seen": 6239152, "step": 10245 }, { "epoch": 3.1802668321439653, "grad_norm": 0.23988638818264008, "learning_rate": 9.894344695422621e-06, "loss": 0.2263, "num_input_tokens_seen": 6241872, "step": 10250 }, { "epoch": 3.1818181818181817, "grad_norm": 0.14376062154769897, "learning_rate": 9.894067677678535e-06, "loss": 0.2325, "num_input_tokens_seen": 6245072, "step": 10255 }, { "epoch": 3.1833695314923984, "grad_norm": 0.1465548276901245, "learning_rate": 9.893790301141335e-06, "loss": 0.2328, "num_input_tokens_seen": 6248592, "step": 10260 }, { "epoch": 3.1849208811666148, "grad_norm": 0.04848930984735489, "learning_rate": 9.893512565831353e-06, "loss": 0.2228, "num_input_tokens_seen": 6251984, "step": 10265 }, { "epoch": 3.1864722308408315, "grad_norm": 0.1560327261686325, "learning_rate": 9.893234471768954e-06, "loss": 0.2367, "num_input_tokens_seen": 6254704, "step": 10270 }, { "epoch": 3.188023580515048, "grad_norm": 0.04314358904957771, "learning_rate": 9.892956018974522e-06, "loss": 0.2326, "num_input_tokens_seen": 6257424, "step": 10275 }, { "epoch": 3.1895749301892646, "grad_norm": 0.05673319101333618, "learning_rate": 9.892677207468472e-06, "loss": 0.2267, "num_input_tokens_seen": 6260144, "step": 10280 }, { "epoch": 3.1911262798634814, "grad_norm": 0.15795502066612244, "learning_rate": 9.892398037271245e-06, "loss": 0.2319, "num_input_tokens_seen": 6263376, "step": 10285 }, { "epoch": 3.1926776295376977, "grad_norm": 0.08259036391973495, "learning_rate": 9.892118508403307e-06, "loss": 0.2366, "num_input_tokens_seen": 6266096, "step": 10290 }, { "epoch": 3.1942289792119145, "grad_norm": 0.14451591670513153, "learning_rate": 9.891838620885152e-06, "loss": 0.2311, "num_input_tokens_seen": 6270416, "step": 10295 }, { "epoch": 3.195780328886131, "grad_norm": 0.06756636500358582, "learning_rate": 9.891558374737298e-06, "loss": 0.2331, "num_input_tokens_seen": 6273776, "step": 10300 }, { "epoch": 3.1973316785603476, "grad_norm": 0.14596889913082123, "learning_rate": 9.89127776998029e-06, "loss": 0.2324, "num_input_tokens_seen": 6276400, "step": 10305 }, { "epoch": 3.198883028234564, "grad_norm": 0.14291168749332428, "learning_rate": 9.8909968066347e-06, "loss": 0.2319, "num_input_tokens_seen": 6279568, "step": 10310 }, { "epoch": 3.2004343779087807, "grad_norm": 0.26270419359207153, "learning_rate": 9.890715484721127e-06, "loss": 0.2366, "num_input_tokens_seen": 6282128, "step": 10315 }, { "epoch": 3.201985727582997, "grad_norm": 0.05211792141199112, "learning_rate": 9.890433804260194e-06, "loss": 0.234, "num_input_tokens_seen": 6286448, "step": 10320 }, { "epoch": 3.203537077257214, "grad_norm": 0.13198885321617126, "learning_rate": 9.890151765272552e-06, "loss": 0.2299, "num_input_tokens_seen": 6290096, "step": 10325 }, { "epoch": 3.20508842693143, "grad_norm": 0.1305892914533615, "learning_rate": 9.889869367778875e-06, "loss": 0.2304, "num_input_tokens_seen": 6292432, "step": 10330 }, { "epoch": 3.206639776605647, "grad_norm": 0.0676041841506958, "learning_rate": 9.889586611799871e-06, "loss": 0.2331, "num_input_tokens_seen": 6295600, "step": 10335 }, { "epoch": 3.2081911262798632, "grad_norm": 0.12406980991363525, "learning_rate": 9.889303497356266e-06, "loss": 0.2317, "num_input_tokens_seen": 6299408, "step": 10340 }, { "epoch": 3.20974247595408, "grad_norm": 0.14593841135501862, "learning_rate": 9.889020024468818e-06, "loss": 0.229, "num_input_tokens_seen": 6303312, "step": 10345 }, { "epoch": 3.211293825628297, "grad_norm": 0.07993239909410477, "learning_rate": 9.888736193158306e-06, "loss": 0.2316, "num_input_tokens_seen": 6306288, "step": 10350 }, { "epoch": 3.212845175302513, "grad_norm": 0.14694800972938538, "learning_rate": 9.88845200344554e-06, "loss": 0.2326, "num_input_tokens_seen": 6308720, "step": 10355 }, { "epoch": 3.21439652497673, "grad_norm": 0.13780993223190308, "learning_rate": 9.888167455351354e-06, "loss": 0.231, "num_input_tokens_seen": 6311120, "step": 10360 }, { "epoch": 3.2159478746509462, "grad_norm": 0.13468262553215027, "learning_rate": 9.88788254889661e-06, "loss": 0.234, "num_input_tokens_seen": 6313648, "step": 10365 }, { "epoch": 3.217499224325163, "grad_norm": 0.06919676065444946, "learning_rate": 9.887597284102193e-06, "loss": 0.2278, "num_input_tokens_seen": 6315952, "step": 10370 }, { "epoch": 3.2190505739993793, "grad_norm": 0.25557151436805725, "learning_rate": 9.887311660989019e-06, "loss": 0.2283, "num_input_tokens_seen": 6319184, "step": 10375 }, { "epoch": 3.220601923673596, "grad_norm": 0.14469337463378906, "learning_rate": 9.887025679578024e-06, "loss": 0.2299, "num_input_tokens_seen": 6322256, "step": 10380 }, { "epoch": 3.2221532733478124, "grad_norm": 0.26075854897499084, "learning_rate": 9.886739339890176e-06, "loss": 0.2269, "num_input_tokens_seen": 6324784, "step": 10385 }, { "epoch": 3.223704623022029, "grad_norm": 0.07171787321567535, "learning_rate": 9.886452641946467e-06, "loss": 0.231, "num_input_tokens_seen": 6329104, "step": 10390 }, { "epoch": 3.2252559726962455, "grad_norm": 0.16798663139343262, "learning_rate": 9.886165585767914e-06, "loss": 0.2333, "num_input_tokens_seen": 6331856, "step": 10395 }, { "epoch": 3.2268073223704623, "grad_norm": 0.16609135270118713, "learning_rate": 9.885878171375564e-06, "loss": 0.2302, "num_input_tokens_seen": 6335184, "step": 10400 }, { "epoch": 3.228358672044679, "grad_norm": 0.18447257578372955, "learning_rate": 9.885590398790486e-06, "loss": 0.2317, "num_input_tokens_seen": 6338416, "step": 10405 }, { "epoch": 3.2299100217188954, "grad_norm": 0.0970485582947731, "learning_rate": 9.885302268033778e-06, "loss": 0.2316, "num_input_tokens_seen": 6341264, "step": 10410 }, { "epoch": 3.231461371393112, "grad_norm": 0.3267863094806671, "learning_rate": 9.885013779126565e-06, "loss": 0.2258, "num_input_tokens_seen": 6343760, "step": 10415 }, { "epoch": 3.2330127210673285, "grad_norm": 0.19909848272800446, "learning_rate": 9.884724932089991e-06, "loss": 0.2349, "num_input_tokens_seen": 6346064, "step": 10420 }, { "epoch": 3.2345640707415453, "grad_norm": 0.35914769768714905, "learning_rate": 9.884435726945238e-06, "loss": 0.2379, "num_input_tokens_seen": 6349744, "step": 10425 }, { "epoch": 3.2361154204157616, "grad_norm": 0.30911985039711, "learning_rate": 9.884146163713506e-06, "loss": 0.2291, "num_input_tokens_seen": 6353104, "step": 10430 }, { "epoch": 3.2376667700899784, "grad_norm": 0.10763494670391083, "learning_rate": 9.883856242416022e-06, "loss": 0.2311, "num_input_tokens_seen": 6355760, "step": 10435 }, { "epoch": 3.2392181197641947, "grad_norm": 0.18917705118656158, "learning_rate": 9.883565963074043e-06, "loss": 0.2329, "num_input_tokens_seen": 6359056, "step": 10440 }, { "epoch": 3.2407694694384115, "grad_norm": 0.26950663328170776, "learning_rate": 9.883275325708848e-06, "loss": 0.2303, "num_input_tokens_seen": 6364720, "step": 10445 }, { "epoch": 3.242320819112628, "grad_norm": 0.1585819274187088, "learning_rate": 9.882984330341745e-06, "loss": 0.2329, "num_input_tokens_seen": 6367248, "step": 10450 }, { "epoch": 3.2438721687868446, "grad_norm": 0.17888730764389038, "learning_rate": 9.882692976994069e-06, "loss": 0.2335, "num_input_tokens_seen": 6369648, "step": 10455 }, { "epoch": 3.245423518461061, "grad_norm": 0.18590496480464935, "learning_rate": 9.882401265687176e-06, "loss": 0.2332, "num_input_tokens_seen": 6373008, "step": 10460 }, { "epoch": 3.2469748681352777, "grad_norm": 0.3110610544681549, "learning_rate": 9.882109196442456e-06, "loss": 0.2253, "num_input_tokens_seen": 6376496, "step": 10465 }, { "epoch": 3.2485262178094945, "grad_norm": 0.17547903954982758, "learning_rate": 9.881816769281318e-06, "loss": 0.2245, "num_input_tokens_seen": 6383216, "step": 10470 }, { "epoch": 3.250077567483711, "grad_norm": 0.18914352357387543, "learning_rate": 9.881523984225201e-06, "loss": 0.2267, "num_input_tokens_seen": 6386928, "step": 10475 }, { "epoch": 3.2516289171579276, "grad_norm": 0.2019301950931549, "learning_rate": 9.881230841295571e-06, "loss": 0.2285, "num_input_tokens_seen": 6389488, "step": 10480 }, { "epoch": 3.253180266832144, "grad_norm": 0.23803704977035522, "learning_rate": 9.880937340513916e-06, "loss": 0.245, "num_input_tokens_seen": 6391696, "step": 10485 }, { "epoch": 3.2547316165063607, "grad_norm": 0.16222621500492096, "learning_rate": 9.880643481901758e-06, "loss": 0.2402, "num_input_tokens_seen": 6394832, "step": 10490 }, { "epoch": 3.256282966180577, "grad_norm": 0.15250252187252045, "learning_rate": 9.880349265480634e-06, "loss": 0.2295, "num_input_tokens_seen": 6396944, "step": 10495 }, { "epoch": 3.2578343158547938, "grad_norm": 0.16378146409988403, "learning_rate": 9.88005469127212e-06, "loss": 0.2325, "num_input_tokens_seen": 6399600, "step": 10500 }, { "epoch": 3.25938566552901, "grad_norm": 0.3249582052230835, "learning_rate": 9.879759759297808e-06, "loss": 0.233, "num_input_tokens_seen": 6402736, "step": 10505 }, { "epoch": 3.260937015203227, "grad_norm": 0.1524244099855423, "learning_rate": 9.879464469579319e-06, "loss": 0.232, "num_input_tokens_seen": 6405296, "step": 10510 }, { "epoch": 3.262488364877443, "grad_norm": 0.1688760370016098, "learning_rate": 9.879168822138303e-06, "loss": 0.2357, "num_input_tokens_seen": 6408432, "step": 10515 }, { "epoch": 3.26403971455166, "grad_norm": 0.06652607768774033, "learning_rate": 9.878872816996434e-06, "loss": 0.233, "num_input_tokens_seen": 6411696, "step": 10520 }, { "epoch": 3.2655910642258767, "grad_norm": 0.163303405046463, "learning_rate": 9.878576454175416e-06, "loss": 0.2299, "num_input_tokens_seen": 6414224, "step": 10525 }, { "epoch": 3.267142413900093, "grad_norm": 0.09516515582799911, "learning_rate": 9.87827973369697e-06, "loss": 0.2273, "num_input_tokens_seen": 6416240, "step": 10530 }, { "epoch": 3.2686937635743094, "grad_norm": 0.11184172332286835, "learning_rate": 9.877982655582852e-06, "loss": 0.236, "num_input_tokens_seen": 6420752, "step": 10535 }, { "epoch": 3.270245113248526, "grad_norm": 0.25551584362983704, "learning_rate": 9.877685219854843e-06, "loss": 0.239, "num_input_tokens_seen": 6423888, "step": 10540 }, { "epoch": 3.271796462922743, "grad_norm": 0.1764717549085617, "learning_rate": 9.877387426534746e-06, "loss": 0.2325, "num_input_tokens_seen": 6426320, "step": 10545 }, { "epoch": 3.2733478125969593, "grad_norm": 0.17669931054115295, "learning_rate": 9.877089275644393e-06, "loss": 0.2296, "num_input_tokens_seen": 6429744, "step": 10550 }, { "epoch": 3.274899162271176, "grad_norm": 0.1484987586736679, "learning_rate": 9.876790767205644e-06, "loss": 0.2253, "num_input_tokens_seen": 6432432, "step": 10555 }, { "epoch": 3.2764505119453924, "grad_norm": 0.10865769535303116, "learning_rate": 9.876491901240382e-06, "loss": 0.2255, "num_input_tokens_seen": 6435536, "step": 10560 }, { "epoch": 3.278001861619609, "grad_norm": 0.16649794578552246, "learning_rate": 9.876192677770518e-06, "loss": 0.2433, "num_input_tokens_seen": 6438608, "step": 10565 }, { "epoch": 3.2795532112938255, "grad_norm": 0.30345281958580017, "learning_rate": 9.875893096817987e-06, "loss": 0.2335, "num_input_tokens_seen": 6441584, "step": 10570 }, { "epoch": 3.2811045609680423, "grad_norm": 0.16655416786670685, "learning_rate": 9.875593158404753e-06, "loss": 0.233, "num_input_tokens_seen": 6444400, "step": 10575 }, { "epoch": 3.2826559106422586, "grad_norm": 0.20291535556316376, "learning_rate": 9.875292862552803e-06, "loss": 0.2325, "num_input_tokens_seen": 6447696, "step": 10580 }, { "epoch": 3.2842072603164754, "grad_norm": 0.05655999109148979, "learning_rate": 9.874992209284156e-06, "loss": 0.2336, "num_input_tokens_seen": 6450032, "step": 10585 }, { "epoch": 3.285758609990692, "grad_norm": 0.04709651693701744, "learning_rate": 9.874691198620852e-06, "loss": 0.2328, "num_input_tokens_seen": 6452560, "step": 10590 }, { "epoch": 3.2873099596649085, "grad_norm": 0.14150631427764893, "learning_rate": 9.874389830584958e-06, "loss": 0.2292, "num_input_tokens_seen": 6455120, "step": 10595 }, { "epoch": 3.288861309339125, "grad_norm": 0.1563429832458496, "learning_rate": 9.874088105198568e-06, "loss": 0.2287, "num_input_tokens_seen": 6459312, "step": 10600 }, { "epoch": 3.2904126590133416, "grad_norm": 0.043839599937200546, "learning_rate": 9.8737860224838e-06, "loss": 0.2288, "num_input_tokens_seen": 6461392, "step": 10605 }, { "epoch": 3.2919640086875583, "grad_norm": 0.13234665989875793, "learning_rate": 9.873483582462804e-06, "loss": 0.2325, "num_input_tokens_seen": 6463824, "step": 10610 }, { "epoch": 3.2935153583617747, "grad_norm": 0.1712481528520584, "learning_rate": 9.87318078515775e-06, "loss": 0.2288, "num_input_tokens_seen": 6466640, "step": 10615 }, { "epoch": 3.2950667080359914, "grad_norm": 0.1823289841413498, "learning_rate": 9.872877630590839e-06, "loss": 0.2345, "num_input_tokens_seen": 6469264, "step": 10620 }, { "epoch": 3.2966180577102078, "grad_norm": 0.2829687297344208, "learning_rate": 9.872574118784292e-06, "loss": 0.2257, "num_input_tokens_seen": 6474576, "step": 10625 }, { "epoch": 3.2981694073844245, "grad_norm": 0.1661050170660019, "learning_rate": 9.872270249760363e-06, "loss": 0.2329, "num_input_tokens_seen": 6477104, "step": 10630 }, { "epoch": 3.299720757058641, "grad_norm": 0.1808203160762787, "learning_rate": 9.87196602354133e-06, "loss": 0.2324, "num_input_tokens_seen": 6479376, "step": 10635 }, { "epoch": 3.3012721067328576, "grad_norm": 0.09793820232152939, "learning_rate": 9.871661440149491e-06, "loss": 0.2291, "num_input_tokens_seen": 6482064, "step": 10640 }, { "epoch": 3.302823456407074, "grad_norm": 0.057997506111860275, "learning_rate": 9.871356499607183e-06, "loss": 0.2335, "num_input_tokens_seen": 6485840, "step": 10645 }, { "epoch": 3.3043748060812907, "grad_norm": 0.16079583764076233, "learning_rate": 9.871051201936756e-06, "loss": 0.2314, "num_input_tokens_seen": 6489168, "step": 10650 }, { "epoch": 3.3059261557555075, "grad_norm": 0.12972836196422577, "learning_rate": 9.870745547160594e-06, "loss": 0.2292, "num_input_tokens_seen": 6492496, "step": 10655 }, { "epoch": 3.307477505429724, "grad_norm": 0.04848996922373772, "learning_rate": 9.870439535301107e-06, "loss": 0.2413, "num_input_tokens_seen": 6495696, "step": 10660 }, { "epoch": 3.30902885510394, "grad_norm": 0.1384078860282898, "learning_rate": 9.870133166380726e-06, "loss": 0.2336, "num_input_tokens_seen": 6498000, "step": 10665 }, { "epoch": 3.310580204778157, "grad_norm": 0.051223140209913254, "learning_rate": 9.869826440421912e-06, "loss": 0.2298, "num_input_tokens_seen": 6502032, "step": 10670 }, { "epoch": 3.3121315544523737, "grad_norm": 0.13731388747692108, "learning_rate": 9.869519357447155e-06, "loss": 0.232, "num_input_tokens_seen": 6504688, "step": 10675 }, { "epoch": 3.31368290412659, "grad_norm": 0.14118680357933044, "learning_rate": 9.869211917478963e-06, "loss": 0.232, "num_input_tokens_seen": 6507248, "step": 10680 }, { "epoch": 3.315234253800807, "grad_norm": 0.2623809278011322, "learning_rate": 9.868904120539879e-06, "loss": 0.2394, "num_input_tokens_seen": 6509968, "step": 10685 }, { "epoch": 3.316785603475023, "grad_norm": 0.052869368344545364, "learning_rate": 9.868595966652465e-06, "loss": 0.2289, "num_input_tokens_seen": 6512560, "step": 10690 }, { "epoch": 3.31833695314924, "grad_norm": 0.13002978265285492, "learning_rate": 9.868287455839316e-06, "loss": 0.2325, "num_input_tokens_seen": 6515184, "step": 10695 }, { "epoch": 3.3198883028234563, "grad_norm": 0.12783659994602203, "learning_rate": 9.867978588123047e-06, "loss": 0.2299, "num_input_tokens_seen": 6517808, "step": 10700 }, { "epoch": 3.321439652497673, "grad_norm": 0.1340750902891159, "learning_rate": 9.8676693635263e-06, "loss": 0.2299, "num_input_tokens_seen": 6520752, "step": 10705 }, { "epoch": 3.3229910021718894, "grad_norm": 0.06206240504980087, "learning_rate": 9.867359782071749e-06, "loss": 0.2314, "num_input_tokens_seen": 6523376, "step": 10710 }, { "epoch": 3.324542351846106, "grad_norm": 0.251715749502182, "learning_rate": 9.867049843782087e-06, "loss": 0.2346, "num_input_tokens_seen": 6525744, "step": 10715 }, { "epoch": 3.326093701520323, "grad_norm": 0.14206163585186005, "learning_rate": 9.866739548680035e-06, "loss": 0.2289, "num_input_tokens_seen": 6528976, "step": 10720 }, { "epoch": 3.3276450511945392, "grad_norm": 0.15943418443202972, "learning_rate": 9.866428896788346e-06, "loss": 0.2342, "num_input_tokens_seen": 6531920, "step": 10725 }, { "epoch": 3.329196400868756, "grad_norm": 0.1272725760936737, "learning_rate": 9.86611788812979e-06, "loss": 0.2284, "num_input_tokens_seen": 6534448, "step": 10730 }, { "epoch": 3.3307477505429723, "grad_norm": 0.12328299134969711, "learning_rate": 9.86580652272717e-06, "loss": 0.2321, "num_input_tokens_seen": 6537840, "step": 10735 }, { "epoch": 3.332299100217189, "grad_norm": 0.15213072299957275, "learning_rate": 9.865494800603313e-06, "loss": 0.2367, "num_input_tokens_seen": 6540432, "step": 10740 }, { "epoch": 3.3338504498914054, "grad_norm": 0.2681483328342438, "learning_rate": 9.86518272178107e-06, "loss": 0.233, "num_input_tokens_seen": 6543760, "step": 10745 }, { "epoch": 3.335401799565622, "grad_norm": 0.04919906333088875, "learning_rate": 9.864870286283322e-06, "loss": 0.2351, "num_input_tokens_seen": 6546800, "step": 10750 }, { "epoch": 3.3369531492398385, "grad_norm": 0.2721627354621887, "learning_rate": 9.864557494132971e-06, "loss": 0.2326, "num_input_tokens_seen": 6549648, "step": 10755 }, { "epoch": 3.3385044989140553, "grad_norm": 0.1277204304933548, "learning_rate": 9.864244345352954e-06, "loss": 0.228, "num_input_tokens_seen": 6552560, "step": 10760 }, { "epoch": 3.3400558485882716, "grad_norm": 0.2521480619907379, "learning_rate": 9.863930839966222e-06, "loss": 0.2265, "num_input_tokens_seen": 6555024, "step": 10765 }, { "epoch": 3.3416071982624884, "grad_norm": 0.15617519617080688, "learning_rate": 9.863616977995764e-06, "loss": 0.2317, "num_input_tokens_seen": 6557840, "step": 10770 }, { "epoch": 3.3431585479367047, "grad_norm": 0.07581235468387604, "learning_rate": 9.863302759464589e-06, "loss": 0.2348, "num_input_tokens_seen": 6561488, "step": 10775 }, { "epoch": 3.3447098976109215, "grad_norm": 0.1400804966688156, "learning_rate": 9.862988184395728e-06, "loss": 0.2321, "num_input_tokens_seen": 6565104, "step": 10780 }, { "epoch": 3.3462612472851383, "grad_norm": 0.14035366475582123, "learning_rate": 9.862673252812249e-06, "loss": 0.2309, "num_input_tokens_seen": 6568944, "step": 10785 }, { "epoch": 3.3478125969593546, "grad_norm": 0.04424106329679489, "learning_rate": 9.862357964737237e-06, "loss": 0.2268, "num_input_tokens_seen": 6572080, "step": 10790 }, { "epoch": 3.3493639466335714, "grad_norm": 0.24950642883777618, "learning_rate": 9.862042320193807e-06, "loss": 0.227, "num_input_tokens_seen": 6575120, "step": 10795 }, { "epoch": 3.3509152963077877, "grad_norm": 0.1305488497018814, "learning_rate": 9.8617263192051e-06, "loss": 0.2274, "num_input_tokens_seen": 6578320, "step": 10800 }, { "epoch": 3.3524666459820045, "grad_norm": 0.05554705113172531, "learning_rate": 9.861409961794283e-06, "loss": 0.2301, "num_input_tokens_seen": 6582096, "step": 10805 }, { "epoch": 3.354017995656221, "grad_norm": 0.046994518488645554, "learning_rate": 9.861093247984547e-06, "loss": 0.2261, "num_input_tokens_seen": 6584656, "step": 10810 }, { "epoch": 3.3555693453304376, "grad_norm": 0.17235849797725677, "learning_rate": 9.860776177799114e-06, "loss": 0.2402, "num_input_tokens_seen": 6587376, "step": 10815 }, { "epoch": 3.357120695004654, "grad_norm": 0.13067401945590973, "learning_rate": 9.860458751261226e-06, "loss": 0.2282, "num_input_tokens_seen": 6589808, "step": 10820 }, { "epoch": 3.3586720446788707, "grad_norm": 0.30332696437835693, "learning_rate": 9.860140968394154e-06, "loss": 0.2421, "num_input_tokens_seen": 6593520, "step": 10825 }, { "epoch": 3.360223394353087, "grad_norm": 0.13609568774700165, "learning_rate": 9.859822829221196e-06, "loss": 0.2341, "num_input_tokens_seen": 6596016, "step": 10830 }, { "epoch": 3.361774744027304, "grad_norm": 0.13071955740451813, "learning_rate": 9.859504333765678e-06, "loss": 0.23, "num_input_tokens_seen": 6599312, "step": 10835 }, { "epoch": 3.36332609370152, "grad_norm": 0.06044677272439003, "learning_rate": 9.859185482050946e-06, "loss": 0.2294, "num_input_tokens_seen": 6603504, "step": 10840 }, { "epoch": 3.364877443375737, "grad_norm": 0.045239221304655075, "learning_rate": 9.858866274100377e-06, "loss": 0.2319, "num_input_tokens_seen": 6606192, "step": 10845 }, { "epoch": 3.3664287930499537, "grad_norm": 0.09863264113664627, "learning_rate": 9.858546709937373e-06, "loss": 0.2273, "num_input_tokens_seen": 6608688, "step": 10850 }, { "epoch": 3.36798014272417, "grad_norm": 0.16061964631080627, "learning_rate": 9.85822678958536e-06, "loss": 0.2336, "num_input_tokens_seen": 6611600, "step": 10855 }, { "epoch": 3.369531492398387, "grad_norm": 0.17349395155906677, "learning_rate": 9.857906513067794e-06, "loss": 0.2324, "num_input_tokens_seen": 6615536, "step": 10860 }, { "epoch": 3.371082842072603, "grad_norm": 0.14952987432479858, "learning_rate": 9.857585880408155e-06, "loss": 0.2269, "num_input_tokens_seen": 6618352, "step": 10865 }, { "epoch": 3.37263419174682, "grad_norm": 0.061723560094833374, "learning_rate": 9.857264891629948e-06, "loss": 0.2309, "num_input_tokens_seen": 6621392, "step": 10870 }, { "epoch": 3.374185541421036, "grad_norm": 0.09507241100072861, "learning_rate": 9.856943546756706e-06, "loss": 0.2245, "num_input_tokens_seen": 6624048, "step": 10875 }, { "epoch": 3.375736891095253, "grad_norm": 0.1337556093931198, "learning_rate": 9.856621845811987e-06, "loss": 0.2393, "num_input_tokens_seen": 6626480, "step": 10880 }, { "epoch": 3.3772882407694693, "grad_norm": 0.19305621087551117, "learning_rate": 9.856299788819374e-06, "loss": 0.2447, "num_input_tokens_seen": 6629040, "step": 10885 }, { "epoch": 3.378839590443686, "grad_norm": 0.13046923279762268, "learning_rate": 9.855977375802481e-06, "loss": 0.2323, "num_input_tokens_seen": 6631920, "step": 10890 }, { "epoch": 3.3803909401179024, "grad_norm": 0.0675591453909874, "learning_rate": 9.855654606784944e-06, "loss": 0.2371, "num_input_tokens_seen": 6634448, "step": 10895 }, { "epoch": 3.381942289792119, "grad_norm": 0.12950503826141357, "learning_rate": 9.855331481790423e-06, "loss": 0.2254, "num_input_tokens_seen": 6637040, "step": 10900 }, { "epoch": 3.3834936394663355, "grad_norm": 0.10098337382078171, "learning_rate": 9.85500800084261e-06, "loss": 0.2323, "num_input_tokens_seen": 6639344, "step": 10905 }, { "epoch": 3.3850449891405523, "grad_norm": 0.08664771169424057, "learning_rate": 9.854684163965218e-06, "loss": 0.2286, "num_input_tokens_seen": 6642448, "step": 10910 }, { "epoch": 3.386596338814769, "grad_norm": 0.1319928914308548, "learning_rate": 9.854359971181988e-06, "loss": 0.2274, "num_input_tokens_seen": 6645936, "step": 10915 }, { "epoch": 3.3881476884889854, "grad_norm": 0.29331785440444946, "learning_rate": 9.854035422516688e-06, "loss": 0.2315, "num_input_tokens_seen": 6648784, "step": 10920 }, { "epoch": 3.389699038163202, "grad_norm": 0.15645509958267212, "learning_rate": 9.853710517993113e-06, "loss": 0.2315, "num_input_tokens_seen": 6653488, "step": 10925 }, { "epoch": 3.3912503878374185, "grad_norm": 0.1454153060913086, "learning_rate": 9.85338525763508e-06, "loss": 0.2353, "num_input_tokens_seen": 6656560, "step": 10930 }, { "epoch": 3.3928017375116353, "grad_norm": 0.15041737258434296, "learning_rate": 9.853059641466433e-06, "loss": 0.2298, "num_input_tokens_seen": 6659952, "step": 10935 }, { "epoch": 3.3943530871858516, "grad_norm": 0.15015241503715515, "learning_rate": 9.852733669511047e-06, "loss": 0.2315, "num_input_tokens_seen": 6662928, "step": 10940 }, { "epoch": 3.3959044368600684, "grad_norm": 0.06299540400505066, "learning_rate": 9.852407341792817e-06, "loss": 0.2284, "num_input_tokens_seen": 6665520, "step": 10945 }, { "epoch": 3.3974557865342847, "grad_norm": 0.14203821122646332, "learning_rate": 9.852080658335669e-06, "loss": 0.2336, "num_input_tokens_seen": 6668432, "step": 10950 }, { "epoch": 3.3990071362085015, "grad_norm": 0.07700122147798538, "learning_rate": 9.851753619163552e-06, "loss": 0.2365, "num_input_tokens_seen": 6671376, "step": 10955 }, { "epoch": 3.400558485882718, "grad_norm": 0.13379420340061188, "learning_rate": 9.85142622430044e-06, "loss": 0.2324, "num_input_tokens_seen": 6674128, "step": 10960 }, { "epoch": 3.4021098355569346, "grad_norm": 0.08975081890821457, "learning_rate": 9.851098473770336e-06, "loss": 0.2345, "num_input_tokens_seen": 6677136, "step": 10965 }, { "epoch": 3.403661185231151, "grad_norm": 0.04115692898631096, "learning_rate": 9.85077036759727e-06, "loss": 0.2231, "num_input_tokens_seen": 6680048, "step": 10970 }, { "epoch": 3.4052125349053677, "grad_norm": 0.1812852919101715, "learning_rate": 9.850441905805292e-06, "loss": 0.2284, "num_input_tokens_seen": 6682192, "step": 10975 }, { "epoch": 3.4067638845795845, "grad_norm": 0.14115220308303833, "learning_rate": 9.850113088418486e-06, "loss": 0.2281, "num_input_tokens_seen": 6685232, "step": 10980 }, { "epoch": 3.408315234253801, "grad_norm": 0.19319681823253632, "learning_rate": 9.849783915460957e-06, "loss": 0.2396, "num_input_tokens_seen": 6687504, "step": 10985 }, { "epoch": 3.4098665839280176, "grad_norm": 0.14417307078838348, "learning_rate": 9.849454386956836e-06, "loss": 0.2259, "num_input_tokens_seen": 6690160, "step": 10990 }, { "epoch": 3.411417933602234, "grad_norm": 0.18205708265304565, "learning_rate": 9.849124502930282e-06, "loss": 0.2397, "num_input_tokens_seen": 6693328, "step": 10995 }, { "epoch": 3.4129692832764507, "grad_norm": 0.16521209478378296, "learning_rate": 9.848794263405481e-06, "loss": 0.2321, "num_input_tokens_seen": 6696528, "step": 11000 }, { "epoch": 3.414520632950667, "grad_norm": 0.13123711943626404, "learning_rate": 9.84846366840664e-06, "loss": 0.2323, "num_input_tokens_seen": 6699472, "step": 11005 }, { "epoch": 3.4160719826248838, "grad_norm": 0.04461527615785599, "learning_rate": 9.848132717958002e-06, "loss": 0.2284, "num_input_tokens_seen": 6702064, "step": 11010 }, { "epoch": 3.4176233322991, "grad_norm": 0.15867501497268677, "learning_rate": 9.847801412083821e-06, "loss": 0.2296, "num_input_tokens_seen": 6705808, "step": 11015 }, { "epoch": 3.419174681973317, "grad_norm": 0.15187351405620575, "learning_rate": 9.847469750808392e-06, "loss": 0.2341, "num_input_tokens_seen": 6709040, "step": 11020 }, { "epoch": 3.420726031647533, "grad_norm": 0.1531003713607788, "learning_rate": 9.847137734156028e-06, "loss": 0.2412, "num_input_tokens_seen": 6712208, "step": 11025 }, { "epoch": 3.42227738132175, "grad_norm": 0.1361733078956604, "learning_rate": 9.846805362151067e-06, "loss": 0.232, "num_input_tokens_seen": 6715088, "step": 11030 }, { "epoch": 3.4238287309959663, "grad_norm": 0.0573514886200428, "learning_rate": 9.84647263481788e-06, "loss": 0.2303, "num_input_tokens_seen": 6718448, "step": 11035 }, { "epoch": 3.425380080670183, "grad_norm": 0.12338168174028397, "learning_rate": 9.846139552180858e-06, "loss": 0.2264, "num_input_tokens_seen": 6722224, "step": 11040 }, { "epoch": 3.4269314303444, "grad_norm": 0.3004142940044403, "learning_rate": 9.845806114264419e-06, "loss": 0.2316, "num_input_tokens_seen": 6724624, "step": 11045 }, { "epoch": 3.428482780018616, "grad_norm": 0.24615927040576935, "learning_rate": 9.84547232109301e-06, "loss": 0.2328, "num_input_tokens_seen": 6727856, "step": 11050 }, { "epoch": 3.430034129692833, "grad_norm": 0.39056164026260376, "learning_rate": 9.8451381726911e-06, "loss": 0.2519, "num_input_tokens_seen": 6731120, "step": 11055 }, { "epoch": 3.4315854793670493, "grad_norm": 0.23139876127243042, "learning_rate": 9.844803669083188e-06, "loss": 0.2349, "num_input_tokens_seen": 6735344, "step": 11060 }, { "epoch": 3.433136829041266, "grad_norm": 0.10162738710641861, "learning_rate": 9.844468810293794e-06, "loss": 0.2343, "num_input_tokens_seen": 6738320, "step": 11065 }, { "epoch": 3.4346881787154824, "grad_norm": 0.2788376808166504, "learning_rate": 9.844133596347471e-06, "loss": 0.2281, "num_input_tokens_seen": 6741296, "step": 11070 }, { "epoch": 3.436239528389699, "grad_norm": 0.3753094971179962, "learning_rate": 9.843798027268791e-06, "loss": 0.2465, "num_input_tokens_seen": 6744176, "step": 11075 }, { "epoch": 3.4377908780639155, "grad_norm": 0.06424913555383682, "learning_rate": 9.843462103082359e-06, "loss": 0.2308, "num_input_tokens_seen": 6746672, "step": 11080 }, { "epoch": 3.4393422277381323, "grad_norm": 0.2843053340911865, "learning_rate": 9.843125823812798e-06, "loss": 0.2299, "num_input_tokens_seen": 6749872, "step": 11085 }, { "epoch": 3.4408935774123486, "grad_norm": 0.13563977181911469, "learning_rate": 9.842789189484763e-06, "loss": 0.2327, "num_input_tokens_seen": 6752720, "step": 11090 }, { "epoch": 3.4424449270865654, "grad_norm": 0.11910931020975113, "learning_rate": 9.842452200122931e-06, "loss": 0.2247, "num_input_tokens_seen": 6755888, "step": 11095 }, { "epoch": 3.4439962767607817, "grad_norm": 0.13342437148094177, "learning_rate": 9.842114855752013e-06, "loss": 0.2276, "num_input_tokens_seen": 6759216, "step": 11100 }, { "epoch": 3.4455476264349985, "grad_norm": 0.19591209292411804, "learning_rate": 9.841777156396734e-06, "loss": 0.2338, "num_input_tokens_seen": 6762096, "step": 11105 }, { "epoch": 3.4470989761092152, "grad_norm": 0.19455058872699738, "learning_rate": 9.841439102081857e-06, "loss": 0.2373, "num_input_tokens_seen": 6764784, "step": 11110 }, { "epoch": 3.4486503257834316, "grad_norm": 0.1522979587316513, "learning_rate": 9.841100692832163e-06, "loss": 0.2322, "num_input_tokens_seen": 6769520, "step": 11115 }, { "epoch": 3.4502016754576483, "grad_norm": 0.12388896942138672, "learning_rate": 9.840761928672458e-06, "loss": 0.232, "num_input_tokens_seen": 6771888, "step": 11120 }, { "epoch": 3.4517530251318647, "grad_norm": 0.13505107164382935, "learning_rate": 9.840422809627581e-06, "loss": 0.2347, "num_input_tokens_seen": 6775472, "step": 11125 }, { "epoch": 3.4533043748060814, "grad_norm": 0.13212217390537262, "learning_rate": 9.840083335722394e-06, "loss": 0.2326, "num_input_tokens_seen": 6778544, "step": 11130 }, { "epoch": 3.4548557244802978, "grad_norm": 0.05181629955768585, "learning_rate": 9.839743506981783e-06, "loss": 0.2389, "num_input_tokens_seen": 6781776, "step": 11135 }, { "epoch": 3.4564070741545145, "grad_norm": 0.047969672828912735, "learning_rate": 9.83940332343066e-06, "loss": 0.2315, "num_input_tokens_seen": 6784336, "step": 11140 }, { "epoch": 3.457958423828731, "grad_norm": 0.060651473701000214, "learning_rate": 9.83906278509397e-06, "loss": 0.2337, "num_input_tokens_seen": 6787472, "step": 11145 }, { "epoch": 3.4595097735029476, "grad_norm": 0.055510763078927994, "learning_rate": 9.838721891996672e-06, "loss": 0.229, "num_input_tokens_seen": 6790224, "step": 11150 }, { "epoch": 3.461061123177164, "grad_norm": 0.24018913507461548, "learning_rate": 9.838380644163761e-06, "loss": 0.2269, "num_input_tokens_seen": 6792944, "step": 11155 }, { "epoch": 3.4626124728513807, "grad_norm": 0.27445411682128906, "learning_rate": 9.838039041620253e-06, "loss": 0.2384, "num_input_tokens_seen": 6796304, "step": 11160 }, { "epoch": 3.464163822525597, "grad_norm": 0.17376089096069336, "learning_rate": 9.837697084391193e-06, "loss": 0.2397, "num_input_tokens_seen": 6799216, "step": 11165 }, { "epoch": 3.465715172199814, "grad_norm": 0.03752828761935234, "learning_rate": 9.83735477250165e-06, "loss": 0.2339, "num_input_tokens_seen": 6802032, "step": 11170 }, { "epoch": 3.4672665218740306, "grad_norm": 0.12365829199552536, "learning_rate": 9.83701210597672e-06, "loss": 0.2284, "num_input_tokens_seen": 6804912, "step": 11175 }, { "epoch": 3.468817871548247, "grad_norm": 0.13631169497966766, "learning_rate": 9.836669084841522e-06, "loss": 0.2366, "num_input_tokens_seen": 6807312, "step": 11180 }, { "epoch": 3.4703692212224637, "grad_norm": 0.07062117010354996, "learning_rate": 9.836325709121205e-06, "loss": 0.234, "num_input_tokens_seen": 6812048, "step": 11185 }, { "epoch": 3.47192057089668, "grad_norm": 0.12645494937896729, "learning_rate": 9.835981978840945e-06, "loss": 0.2335, "num_input_tokens_seen": 6814896, "step": 11190 }, { "epoch": 3.473471920570897, "grad_norm": 0.12284950166940689, "learning_rate": 9.83563789402594e-06, "loss": 0.2309, "num_input_tokens_seen": 6818608, "step": 11195 }, { "epoch": 3.475023270245113, "grad_norm": 0.11792898178100586, "learning_rate": 9.835293454701413e-06, "loss": 0.2298, "num_input_tokens_seen": 6821520, "step": 11200 }, { "epoch": 3.47657461991933, "grad_norm": 0.045200739055871964, "learning_rate": 9.834948660892618e-06, "loss": 0.231, "num_input_tokens_seen": 6824816, "step": 11205 }, { "epoch": 3.4781259695935463, "grad_norm": 0.22239156067371368, "learning_rate": 9.834603512624832e-06, "loss": 0.2289, "num_input_tokens_seen": 6827792, "step": 11210 }, { "epoch": 3.479677319267763, "grad_norm": 0.22596712410449982, "learning_rate": 9.834258009923357e-06, "loss": 0.2299, "num_input_tokens_seen": 6830576, "step": 11215 }, { "epoch": 3.4812286689419794, "grad_norm": 0.049056828022003174, "learning_rate": 9.833912152813524e-06, "loss": 0.2315, "num_input_tokens_seen": 6833488, "step": 11220 }, { "epoch": 3.482780018616196, "grad_norm": 0.2430625855922699, "learning_rate": 9.83356594132069e-06, "loss": 0.2326, "num_input_tokens_seen": 6835760, "step": 11225 }, { "epoch": 3.4843313682904125, "grad_norm": 0.2444777488708496, "learning_rate": 9.833219375470234e-06, "loss": 0.2344, "num_input_tokens_seen": 6839120, "step": 11230 }, { "epoch": 3.4858827179646292, "grad_norm": 0.060915809124708176, "learning_rate": 9.832872455287562e-06, "loss": 0.2311, "num_input_tokens_seen": 6841808, "step": 11235 }, { "epoch": 3.487434067638846, "grad_norm": 0.23576608300209045, "learning_rate": 9.83252518079811e-06, "loss": 0.2308, "num_input_tokens_seen": 6844720, "step": 11240 }, { "epoch": 3.4889854173130623, "grad_norm": 0.22858501970767975, "learning_rate": 9.832177552027338e-06, "loss": 0.231, "num_input_tokens_seen": 6847280, "step": 11245 }, { "epoch": 3.490536766987279, "grad_norm": 0.1299629509449005, "learning_rate": 9.831829569000729e-06, "loss": 0.2335, "num_input_tokens_seen": 6850672, "step": 11250 }, { "epoch": 3.4920881166614954, "grad_norm": 0.1794375330209732, "learning_rate": 9.831481231743793e-06, "loss": 0.234, "num_input_tokens_seen": 6853520, "step": 11255 }, { "epoch": 3.493639466335712, "grad_norm": 0.06192610040307045, "learning_rate": 9.831132540282072e-06, "loss": 0.2303, "num_input_tokens_seen": 6856912, "step": 11260 }, { "epoch": 3.4951908160099285, "grad_norm": 0.07078041136264801, "learning_rate": 9.830783494641126e-06, "loss": 0.234, "num_input_tokens_seen": 6859792, "step": 11265 }, { "epoch": 3.4967421656841453, "grad_norm": 0.12154120951890945, "learning_rate": 9.830434094846544e-06, "loss": 0.2361, "num_input_tokens_seen": 6864592, "step": 11270 }, { "epoch": 3.4982935153583616, "grad_norm": 0.06338126957416534, "learning_rate": 9.830084340923945e-06, "loss": 0.2304, "num_input_tokens_seen": 6867536, "step": 11275 }, { "epoch": 3.4998448650325784, "grad_norm": 0.13063660264015198, "learning_rate": 9.829734232898964e-06, "loss": 0.2283, "num_input_tokens_seen": 6870480, "step": 11280 }, { "epoch": 3.501396214706795, "grad_norm": 0.14029178023338318, "learning_rate": 9.829383770797271e-06, "loss": 0.2296, "num_input_tokens_seen": 6872848, "step": 11285 }, { "epoch": 3.5029475643810115, "grad_norm": 0.06659162044525146, "learning_rate": 9.829032954644561e-06, "loss": 0.2214, "num_input_tokens_seen": 6875344, "step": 11290 }, { "epoch": 3.504498914055228, "grad_norm": 0.11085540056228638, "learning_rate": 9.82868178446655e-06, "loss": 0.2324, "num_input_tokens_seen": 6877648, "step": 11295 }, { "epoch": 3.5060502637294446, "grad_norm": 0.18709096312522888, "learning_rate": 9.828330260288984e-06, "loss": 0.2211, "num_input_tokens_seen": 6880144, "step": 11300 }, { "epoch": 3.5076016134036614, "grad_norm": 0.23978778719902039, "learning_rate": 9.827978382137635e-06, "loss": 0.2225, "num_input_tokens_seen": 6882960, "step": 11305 }, { "epoch": 3.5091529630778777, "grad_norm": 0.2096099555492401, "learning_rate": 9.827626150038297e-06, "loss": 0.2321, "num_input_tokens_seen": 6885968, "step": 11310 }, { "epoch": 3.510704312752094, "grad_norm": 0.6527460217475891, "learning_rate": 9.827273564016796e-06, "loss": 0.2219, "num_input_tokens_seen": 6888528, "step": 11315 }, { "epoch": 3.512255662426311, "grad_norm": 0.1817292422056198, "learning_rate": 9.826920624098978e-06, "loss": 0.2642, "num_input_tokens_seen": 6891600, "step": 11320 }, { "epoch": 3.5138070121005276, "grad_norm": 0.08581163734197617, "learning_rate": 9.826567330310721e-06, "loss": 0.2252, "num_input_tokens_seen": 6894448, "step": 11325 }, { "epoch": 3.515358361774744, "grad_norm": 0.07017761468887329, "learning_rate": 9.826213682677921e-06, "loss": 0.2348, "num_input_tokens_seen": 6897584, "step": 11330 }, { "epoch": 3.5169097114489607, "grad_norm": 0.07187194377183914, "learning_rate": 9.82585968122651e-06, "loss": 0.2198, "num_input_tokens_seen": 6900112, "step": 11335 }, { "epoch": 3.518461061123177, "grad_norm": 0.04027051478624344, "learning_rate": 9.825505325982437e-06, "loss": 0.2375, "num_input_tokens_seen": 6903600, "step": 11340 }, { "epoch": 3.520012410797394, "grad_norm": 0.1244565024971962, "learning_rate": 9.825150616971679e-06, "loss": 0.2347, "num_input_tokens_seen": 6906256, "step": 11345 }, { "epoch": 3.5215637604716106, "grad_norm": 0.15489937365055084, "learning_rate": 9.824795554220243e-06, "loss": 0.2337, "num_input_tokens_seen": 6908944, "step": 11350 }, { "epoch": 3.523115110145827, "grad_norm": 0.1402658224105835, "learning_rate": 9.82444013775416e-06, "loss": 0.2362, "num_input_tokens_seen": 6911632, "step": 11355 }, { "epoch": 3.5246664598200432, "grad_norm": 0.11975599080324173, "learning_rate": 9.824084367599484e-06, "loss": 0.2351, "num_input_tokens_seen": 6914576, "step": 11360 }, { "epoch": 3.52621780949426, "grad_norm": 0.03143859654664993, "learning_rate": 9.823728243782298e-06, "loss": 0.235, "num_input_tokens_seen": 6917680, "step": 11365 }, { "epoch": 3.5277691591684768, "grad_norm": 0.15336349606513977, "learning_rate": 9.82337176632871e-06, "loss": 0.2325, "num_input_tokens_seen": 6919696, "step": 11370 }, { "epoch": 3.529320508842693, "grad_norm": 0.16268764436244965, "learning_rate": 9.823014935264856e-06, "loss": 0.231, "num_input_tokens_seen": 6922544, "step": 11375 }, { "epoch": 3.5308718585169094, "grad_norm": 0.26839765906333923, "learning_rate": 9.822657750616894e-06, "loss": 0.2383, "num_input_tokens_seen": 6925008, "step": 11380 }, { "epoch": 3.532423208191126, "grad_norm": 0.1323300004005432, "learning_rate": 9.822300212411009e-06, "loss": 0.232, "num_input_tokens_seen": 6927920, "step": 11385 }, { "epoch": 3.533974557865343, "grad_norm": 0.12778572738170624, "learning_rate": 9.821942320673414e-06, "loss": 0.2309, "num_input_tokens_seen": 6930608, "step": 11390 }, { "epoch": 3.5355259075395593, "grad_norm": 0.22208815813064575, "learning_rate": 9.821584075430348e-06, "loss": 0.2304, "num_input_tokens_seen": 6933264, "step": 11395 }, { "epoch": 3.537077257213776, "grad_norm": 0.2192222625017166, "learning_rate": 9.821225476708071e-06, "loss": 0.2304, "num_input_tokens_seen": 6936208, "step": 11400 }, { "epoch": 3.5386286068879924, "grad_norm": 0.12177303433418274, "learning_rate": 9.820866524532876e-06, "loss": 0.2324, "num_input_tokens_seen": 6939024, "step": 11405 }, { "epoch": 3.540179956562209, "grad_norm": 0.2208721786737442, "learning_rate": 9.820507218931077e-06, "loss": 0.2309, "num_input_tokens_seen": 6942256, "step": 11410 }, { "epoch": 3.541731306236426, "grad_norm": 0.12175446003675461, "learning_rate": 9.820147559929014e-06, "loss": 0.2325, "num_input_tokens_seen": 6945712, "step": 11415 }, { "epoch": 3.5432826559106423, "grad_norm": 0.11770587414503098, "learning_rate": 9.819787547553058e-06, "loss": 0.2298, "num_input_tokens_seen": 6948176, "step": 11420 }, { "epoch": 3.5448340055848586, "grad_norm": 0.04150763154029846, "learning_rate": 9.819427181829598e-06, "loss": 0.2326, "num_input_tokens_seen": 6951344, "step": 11425 }, { "epoch": 3.5463853552590754, "grad_norm": 0.22658926248550415, "learning_rate": 9.819066462785056e-06, "loss": 0.2326, "num_input_tokens_seen": 6954672, "step": 11430 }, { "epoch": 3.547936704933292, "grad_norm": 0.11974841356277466, "learning_rate": 9.818705390445876e-06, "loss": 0.2315, "num_input_tokens_seen": 6957680, "step": 11435 }, { "epoch": 3.5494880546075085, "grad_norm": 0.1159321740269661, "learning_rate": 9.81834396483853e-06, "loss": 0.2304, "num_input_tokens_seen": 6960592, "step": 11440 }, { "epoch": 3.5510394042817253, "grad_norm": 0.12515485286712646, "learning_rate": 9.817982185989511e-06, "loss": 0.2314, "num_input_tokens_seen": 6964080, "step": 11445 }, { "epoch": 3.5525907539559416, "grad_norm": 0.21559666097164154, "learning_rate": 9.817620053925347e-06, "loss": 0.2299, "num_input_tokens_seen": 6967632, "step": 11450 }, { "epoch": 3.5541421036301584, "grad_norm": 0.12484196573495865, "learning_rate": 9.817257568672582e-06, "loss": 0.2337, "num_input_tokens_seen": 6970736, "step": 11455 }, { "epoch": 3.5556934533043747, "grad_norm": 0.04193166270852089, "learning_rate": 9.816894730257793e-06, "loss": 0.2358, "num_input_tokens_seen": 6974192, "step": 11460 }, { "epoch": 3.5572448029785915, "grad_norm": 0.05276286602020264, "learning_rate": 9.81653153870758e-06, "loss": 0.2284, "num_input_tokens_seen": 6976912, "step": 11465 }, { "epoch": 3.558796152652808, "grad_norm": 0.12119525671005249, "learning_rate": 9.816167994048569e-06, "loss": 0.2305, "num_input_tokens_seen": 6980784, "step": 11470 }, { "epoch": 3.5603475023270246, "grad_norm": 0.11518339812755585, "learning_rate": 9.81580409630741e-06, "loss": 0.2294, "num_input_tokens_seen": 6983792, "step": 11475 }, { "epoch": 3.5618988520012413, "grad_norm": 0.03968868777155876, "learning_rate": 9.815439845510786e-06, "loss": 0.2325, "num_input_tokens_seen": 6986000, "step": 11480 }, { "epoch": 3.5634502016754577, "grad_norm": 0.03140319138765335, "learning_rate": 9.815075241685397e-06, "loss": 0.2325, "num_input_tokens_seen": 6988688, "step": 11485 }, { "epoch": 3.565001551349674, "grad_norm": 0.0354788564145565, "learning_rate": 9.814710284857973e-06, "loss": 0.2325, "num_input_tokens_seen": 6991408, "step": 11490 }, { "epoch": 3.5665529010238908, "grad_norm": 0.11571168899536133, "learning_rate": 9.814344975055273e-06, "loss": 0.2319, "num_input_tokens_seen": 6993744, "step": 11495 }, { "epoch": 3.5681042506981075, "grad_norm": 0.034568894654512405, "learning_rate": 9.813979312304072e-06, "loss": 0.2283, "num_input_tokens_seen": 6996048, "step": 11500 }, { "epoch": 3.569655600372324, "grad_norm": 0.12050691992044449, "learning_rate": 9.813613296631183e-06, "loss": 0.2299, "num_input_tokens_seen": 6998704, "step": 11505 }, { "epoch": 3.5712069500465407, "grad_norm": 0.22697843611240387, "learning_rate": 9.81324692806344e-06, "loss": 0.2315, "num_input_tokens_seen": 7001488, "step": 11510 }, { "epoch": 3.572758299720757, "grad_norm": 0.1214093416929245, "learning_rate": 9.812880206627698e-06, "loss": 0.2325, "num_input_tokens_seen": 7004784, "step": 11515 }, { "epoch": 3.5743096493949738, "grad_norm": 0.12697914242744446, "learning_rate": 9.812513132350843e-06, "loss": 0.2325, "num_input_tokens_seen": 7007440, "step": 11520 }, { "epoch": 3.57586099906919, "grad_norm": 0.043618541210889816, "learning_rate": 9.812145705259786e-06, "loss": 0.2351, "num_input_tokens_seen": 7011024, "step": 11525 }, { "epoch": 3.577412348743407, "grad_norm": 0.12882550060749054, "learning_rate": 9.811777925381467e-06, "loss": 0.233, "num_input_tokens_seen": 7013616, "step": 11530 }, { "epoch": 3.578963698417623, "grad_norm": 0.12012266367673874, "learning_rate": 9.811409792742845e-06, "loss": 0.2294, "num_input_tokens_seen": 7016880, "step": 11535 }, { "epoch": 3.58051504809184, "grad_norm": 0.05608828365802765, "learning_rate": 9.81104130737091e-06, "loss": 0.2337, "num_input_tokens_seen": 7021904, "step": 11540 }, { "epoch": 3.5820663977660567, "grad_norm": 0.12866656482219696, "learning_rate": 9.810672469292674e-06, "loss": 0.2358, "num_input_tokens_seen": 7024880, "step": 11545 }, { "epoch": 3.583617747440273, "grad_norm": 0.12033302336931229, "learning_rate": 9.810303278535182e-06, "loss": 0.2295, "num_input_tokens_seen": 7027312, "step": 11550 }, { "epoch": 3.5851690971144894, "grad_norm": 0.0519808754324913, "learning_rate": 9.809933735125495e-06, "loss": 0.2253, "num_input_tokens_seen": 7029744, "step": 11555 }, { "epoch": 3.586720446788706, "grad_norm": 0.2567502558231354, "learning_rate": 9.809563839090708e-06, "loss": 0.2397, "num_input_tokens_seen": 7032496, "step": 11560 }, { "epoch": 3.588271796462923, "grad_norm": 0.12594659626483917, "learning_rate": 9.809193590457936e-06, "loss": 0.2338, "num_input_tokens_seen": 7035184, "step": 11565 }, { "epoch": 3.5898231461371393, "grad_norm": 0.05425047129392624, "learning_rate": 9.808822989254328e-06, "loss": 0.2295, "num_input_tokens_seen": 7038832, "step": 11570 }, { "epoch": 3.591374495811356, "grad_norm": 0.12329286336898804, "learning_rate": 9.808452035507048e-06, "loss": 0.2259, "num_input_tokens_seen": 7042032, "step": 11575 }, { "epoch": 3.5929258454855724, "grad_norm": 0.22894519567489624, "learning_rate": 9.808080729243292e-06, "loss": 0.2287, "num_input_tokens_seen": 7044560, "step": 11580 }, { "epoch": 3.594477195159789, "grad_norm": 0.11447533220052719, "learning_rate": 9.807709070490284e-06, "loss": 0.2219, "num_input_tokens_seen": 7049264, "step": 11585 }, { "epoch": 3.5960285448340055, "grad_norm": 0.10166049003601074, "learning_rate": 9.807337059275269e-06, "loss": 0.2217, "num_input_tokens_seen": 7053968, "step": 11590 }, { "epoch": 3.5975798945082222, "grad_norm": 0.10685990750789642, "learning_rate": 9.806964695625521e-06, "loss": 0.221, "num_input_tokens_seen": 7056496, "step": 11595 }, { "epoch": 3.5991312441824386, "grad_norm": 0.058200228959321976, "learning_rate": 9.806591979568335e-06, "loss": 0.2311, "num_input_tokens_seen": 7059376, "step": 11600 }, { "epoch": 3.6006825938566553, "grad_norm": 0.0997689962387085, "learning_rate": 9.806218911131041e-06, "loss": 0.2313, "num_input_tokens_seen": 7061744, "step": 11605 }, { "epoch": 3.602233943530872, "grad_norm": 0.1898859590291977, "learning_rate": 9.805845490340987e-06, "loss": 0.2416, "num_input_tokens_seen": 7064400, "step": 11610 }, { "epoch": 3.6037852932050884, "grad_norm": 0.2946339249610901, "learning_rate": 9.805471717225548e-06, "loss": 0.2589, "num_input_tokens_seen": 7068208, "step": 11615 }, { "epoch": 3.6053366428793048, "grad_norm": 0.1034546047449112, "learning_rate": 9.805097591812126e-06, "loss": 0.2322, "num_input_tokens_seen": 7070896, "step": 11620 }, { "epoch": 3.6068879925535215, "grad_norm": 0.10114197432994843, "learning_rate": 9.80472311412815e-06, "loss": 0.2298, "num_input_tokens_seen": 7074352, "step": 11625 }, { "epoch": 3.6084393422277383, "grad_norm": 0.10434820502996445, "learning_rate": 9.804348284201073e-06, "loss": 0.2244, "num_input_tokens_seen": 7077008, "step": 11630 }, { "epoch": 3.6099906919019547, "grad_norm": 0.1309669464826584, "learning_rate": 9.803973102058376e-06, "loss": 0.2335, "num_input_tokens_seen": 7080048, "step": 11635 }, { "epoch": 3.6115420415761714, "grad_norm": 0.10269255191087723, "learning_rate": 9.803597567727562e-06, "loss": 0.2345, "num_input_tokens_seen": 7083088, "step": 11640 }, { "epoch": 3.6130933912503878, "grad_norm": 0.10584384202957153, "learning_rate": 9.803221681236164e-06, "loss": 0.2323, "num_input_tokens_seen": 7086768, "step": 11645 }, { "epoch": 3.6146447409246045, "grad_norm": 0.11085471510887146, "learning_rate": 9.802845442611737e-06, "loss": 0.2322, "num_input_tokens_seen": 7089200, "step": 11650 }, { "epoch": 3.616196090598821, "grad_norm": 0.11754696071147919, "learning_rate": 9.802468851881866e-06, "loss": 0.2309, "num_input_tokens_seen": 7091280, "step": 11655 }, { "epoch": 3.6177474402730376, "grad_norm": 0.11798548698425293, "learning_rate": 9.802091909074158e-06, "loss": 0.2346, "num_input_tokens_seen": 7094480, "step": 11660 }, { "epoch": 3.619298789947254, "grad_norm": 0.10869251936674118, "learning_rate": 9.801714614216246e-06, "loss": 0.2293, "num_input_tokens_seen": 7096880, "step": 11665 }, { "epoch": 3.6208501396214707, "grad_norm": 0.04052231088280678, "learning_rate": 9.801336967335796e-06, "loss": 0.2309, "num_input_tokens_seen": 7100176, "step": 11670 }, { "epoch": 3.6224014892956875, "grad_norm": 0.11287807673215866, "learning_rate": 9.800958968460485e-06, "loss": 0.2299, "num_input_tokens_seen": 7105008, "step": 11675 }, { "epoch": 3.623952838969904, "grad_norm": 0.10503769665956497, "learning_rate": 9.800580617618033e-06, "loss": 0.2311, "num_input_tokens_seen": 7107216, "step": 11680 }, { "epoch": 3.62550418864412, "grad_norm": 0.10830026865005493, "learning_rate": 9.800201914836174e-06, "loss": 0.2274, "num_input_tokens_seen": 7109808, "step": 11685 }, { "epoch": 3.627055538318337, "grad_norm": 0.22920426726341248, "learning_rate": 9.799822860142672e-06, "loss": 0.2371, "num_input_tokens_seen": 7112848, "step": 11690 }, { "epoch": 3.6286068879925537, "grad_norm": 0.12917184829711914, "learning_rate": 9.799443453565315e-06, "loss": 0.2366, "num_input_tokens_seen": 7115152, "step": 11695 }, { "epoch": 3.63015823766677, "grad_norm": 0.11554313451051712, "learning_rate": 9.79906369513192e-06, "loss": 0.2281, "num_input_tokens_seen": 7117584, "step": 11700 }, { "epoch": 3.631709587340987, "grad_norm": 0.05046612769365311, "learning_rate": 9.798683584870326e-06, "loss": 0.228, "num_input_tokens_seen": 7120528, "step": 11705 }, { "epoch": 3.633260937015203, "grad_norm": 0.050926774740219116, "learning_rate": 9.798303122808399e-06, "loss": 0.229, "num_input_tokens_seen": 7123568, "step": 11710 }, { "epoch": 3.63481228668942, "grad_norm": 0.1312248706817627, "learning_rate": 9.797922308974034e-06, "loss": 0.2337, "num_input_tokens_seen": 7126064, "step": 11715 }, { "epoch": 3.6363636363636362, "grad_norm": 0.13666664063930511, "learning_rate": 9.797541143395149e-06, "loss": 0.23, "num_input_tokens_seen": 7129040, "step": 11720 }, { "epoch": 3.637914986037853, "grad_norm": 0.1207251250743866, "learning_rate": 9.797159626099686e-06, "loss": 0.231, "num_input_tokens_seen": 7132112, "step": 11725 }, { "epoch": 3.6394663357120693, "grad_norm": 0.06278972327709198, "learning_rate": 9.796777757115614e-06, "loss": 0.2352, "num_input_tokens_seen": 7134480, "step": 11730 }, { "epoch": 3.641017685386286, "grad_norm": 0.05034060776233673, "learning_rate": 9.796395536470932e-06, "loss": 0.2321, "num_input_tokens_seen": 7138416, "step": 11735 }, { "epoch": 3.642569035060503, "grad_norm": 0.056029852479696274, "learning_rate": 9.79601296419366e-06, "loss": 0.23, "num_input_tokens_seen": 7141904, "step": 11740 }, { "epoch": 3.644120384734719, "grad_norm": 0.1453656256198883, "learning_rate": 9.795630040311842e-06, "loss": 0.2312, "num_input_tokens_seen": 7145328, "step": 11745 }, { "epoch": 3.6456717344089355, "grad_norm": 0.15378060936927795, "learning_rate": 9.795246764853555e-06, "loss": 0.2297, "num_input_tokens_seen": 7149072, "step": 11750 }, { "epoch": 3.6472230840831523, "grad_norm": 0.05687493830919266, "learning_rate": 9.794863137846894e-06, "loss": 0.2327, "num_input_tokens_seen": 7153008, "step": 11755 }, { "epoch": 3.648774433757369, "grad_norm": 0.14628198742866516, "learning_rate": 9.794479159319987e-06, "loss": 0.2328, "num_input_tokens_seen": 7155280, "step": 11760 }, { "epoch": 3.6503257834315854, "grad_norm": 0.12234713137149811, "learning_rate": 9.794094829300982e-06, "loss": 0.2229, "num_input_tokens_seen": 7158000, "step": 11765 }, { "epoch": 3.651877133105802, "grad_norm": 0.06916996836662292, "learning_rate": 9.793710147818056e-06, "loss": 0.2308, "num_input_tokens_seen": 7161200, "step": 11770 }, { "epoch": 3.6534284827800185, "grad_norm": 0.26755011081695557, "learning_rate": 9.79332511489941e-06, "loss": 0.2392, "num_input_tokens_seen": 7164592, "step": 11775 }, { "epoch": 3.6549798324542353, "grad_norm": 0.06871028244495392, "learning_rate": 9.792939730573272e-06, "loss": 0.2218, "num_input_tokens_seen": 7167376, "step": 11780 }, { "epoch": 3.6565311821284516, "grad_norm": 0.11589297652244568, "learning_rate": 9.792553994867893e-06, "loss": 0.2308, "num_input_tokens_seen": 7169712, "step": 11785 }, { "epoch": 3.6580825318026684, "grad_norm": 0.10630038380622864, "learning_rate": 9.792167907811556e-06, "loss": 0.2221, "num_input_tokens_seen": 7172784, "step": 11790 }, { "epoch": 3.6596338814768847, "grad_norm": 0.06254756450653076, "learning_rate": 9.791781469432562e-06, "loss": 0.2232, "num_input_tokens_seen": 7175184, "step": 11795 }, { "epoch": 3.6611852311511015, "grad_norm": 0.12945176661014557, "learning_rate": 9.791394679759244e-06, "loss": 0.2243, "num_input_tokens_seen": 7178288, "step": 11800 }, { "epoch": 3.6627365808253183, "grad_norm": 0.2512963116168976, "learning_rate": 9.791007538819956e-06, "loss": 0.216, "num_input_tokens_seen": 7182000, "step": 11805 }, { "epoch": 3.6642879304995346, "grad_norm": 0.21474552154541016, "learning_rate": 9.790620046643083e-06, "loss": 0.253, "num_input_tokens_seen": 7186128, "step": 11810 }, { "epoch": 3.665839280173751, "grad_norm": 0.22234630584716797, "learning_rate": 9.79023220325703e-06, "loss": 0.2482, "num_input_tokens_seen": 7188912, "step": 11815 }, { "epoch": 3.6673906298479677, "grad_norm": 0.18384204804897308, "learning_rate": 9.789844008690234e-06, "loss": 0.2283, "num_input_tokens_seen": 7192304, "step": 11820 }, { "epoch": 3.6689419795221845, "grad_norm": 0.05571703985333443, "learning_rate": 9.789455462971148e-06, "loss": 0.2252, "num_input_tokens_seen": 7194416, "step": 11825 }, { "epoch": 3.670493329196401, "grad_norm": 0.28538778424263, "learning_rate": 9.789066566128265e-06, "loss": 0.235, "num_input_tokens_seen": 7197264, "step": 11830 }, { "epoch": 3.6720446788706176, "grad_norm": 0.16656555235385895, "learning_rate": 9.78867731819009e-06, "loss": 0.2306, "num_input_tokens_seen": 7199728, "step": 11835 }, { "epoch": 3.673596028544834, "grad_norm": 0.09023293107748032, "learning_rate": 9.78828771918516e-06, "loss": 0.2313, "num_input_tokens_seen": 7203376, "step": 11840 }, { "epoch": 3.6751473782190507, "grad_norm": 0.07284724712371826, "learning_rate": 9.78789776914204e-06, "loss": 0.2323, "num_input_tokens_seen": 7206000, "step": 11845 }, { "epoch": 3.676698727893267, "grad_norm": 0.1566399782896042, "learning_rate": 9.787507468089317e-06, "loss": 0.2308, "num_input_tokens_seen": 7208240, "step": 11850 }, { "epoch": 3.678250077567484, "grad_norm": 0.17376694083213806, "learning_rate": 9.787116816055603e-06, "loss": 0.2348, "num_input_tokens_seen": 7210512, "step": 11855 }, { "epoch": 3.6798014272417, "grad_norm": 0.16933639347553253, "learning_rate": 9.786725813069537e-06, "loss": 0.2335, "num_input_tokens_seen": 7213360, "step": 11860 }, { "epoch": 3.681352776915917, "grad_norm": 0.19400399923324585, "learning_rate": 9.786334459159787e-06, "loss": 0.2278, "num_input_tokens_seen": 7215984, "step": 11865 }, { "epoch": 3.6829041265901337, "grad_norm": 0.09132444113492966, "learning_rate": 9.785942754355043e-06, "loss": 0.2352, "num_input_tokens_seen": 7218512, "step": 11870 }, { "epoch": 3.68445547626435, "grad_norm": 0.26512405276298523, "learning_rate": 9.785550698684019e-06, "loss": 0.239, "num_input_tokens_seen": 7221712, "step": 11875 }, { "epoch": 3.6860068259385663, "grad_norm": 0.17215986549854279, "learning_rate": 9.785158292175461e-06, "loss": 0.2312, "num_input_tokens_seen": 7225360, "step": 11880 }, { "epoch": 3.687558175612783, "grad_norm": 0.13541032373905182, "learning_rate": 9.784765534858135e-06, "loss": 0.2362, "num_input_tokens_seen": 7228560, "step": 11885 }, { "epoch": 3.689109525287, "grad_norm": 0.13079622387886047, "learning_rate": 9.784372426760836e-06, "loss": 0.2369, "num_input_tokens_seen": 7231632, "step": 11890 }, { "epoch": 3.690660874961216, "grad_norm": 0.05834071710705757, "learning_rate": 9.78397896791238e-06, "loss": 0.2336, "num_input_tokens_seen": 7236048, "step": 11895 }, { "epoch": 3.692212224635433, "grad_norm": 0.13095992803573608, "learning_rate": 9.783585158341618e-06, "loss": 0.231, "num_input_tokens_seen": 7239184, "step": 11900 }, { "epoch": 3.6937635743096493, "grad_norm": 0.12848000228405, "learning_rate": 9.783190998077417e-06, "loss": 0.2351, "num_input_tokens_seen": 7242160, "step": 11905 }, { "epoch": 3.695314923983866, "grad_norm": 0.045202020555734634, "learning_rate": 9.782796487148673e-06, "loss": 0.233, "num_input_tokens_seen": 7244912, "step": 11910 }, { "epoch": 3.6968662736580824, "grad_norm": 0.11708780378103256, "learning_rate": 9.78240162558431e-06, "loss": 0.233, "num_input_tokens_seen": 7247792, "step": 11915 }, { "epoch": 3.698417623332299, "grad_norm": 0.06833053380250931, "learning_rate": 9.782006413413278e-06, "loss": 0.2303, "num_input_tokens_seen": 7250704, "step": 11920 }, { "epoch": 3.6999689730065155, "grad_norm": 0.11190641671419144, "learning_rate": 9.781610850664547e-06, "loss": 0.2297, "num_input_tokens_seen": 7253456, "step": 11925 }, { "epoch": 3.7015203226807323, "grad_norm": 0.22782544791698456, "learning_rate": 9.781214937367118e-06, "loss": 0.2339, "num_input_tokens_seen": 7257040, "step": 11930 }, { "epoch": 3.703071672354949, "grad_norm": 0.05130481347441673, "learning_rate": 9.780818673550016e-06, "loss": 0.2319, "num_input_tokens_seen": 7260304, "step": 11935 }, { "epoch": 3.7046230220291654, "grad_norm": 0.05160543695092201, "learning_rate": 9.780422059242291e-06, "loss": 0.2274, "num_input_tokens_seen": 7263280, "step": 11940 }, { "epoch": 3.7061743717033817, "grad_norm": 0.1260513961315155, "learning_rate": 9.780025094473021e-06, "loss": 0.2357, "num_input_tokens_seen": 7266448, "step": 11945 }, { "epoch": 3.7077257213775985, "grad_norm": 0.1281314492225647, "learning_rate": 9.779627779271308e-06, "loss": 0.2319, "num_input_tokens_seen": 7269840, "step": 11950 }, { "epoch": 3.7092770710518153, "grad_norm": 0.11565223336219788, "learning_rate": 9.77923011366628e-06, "loss": 0.2361, "num_input_tokens_seen": 7273072, "step": 11955 }, { "epoch": 3.7108284207260316, "grad_norm": 0.1352090835571289, "learning_rate": 9.778832097687088e-06, "loss": 0.2318, "num_input_tokens_seen": 7276240, "step": 11960 }, { "epoch": 3.7123797704002484, "grad_norm": 0.12406610697507858, "learning_rate": 9.778433731362915e-06, "loss": 0.2278, "num_input_tokens_seen": 7281264, "step": 11965 }, { "epoch": 3.7139311200744647, "grad_norm": 0.031948190182447433, "learning_rate": 9.778035014722963e-06, "loss": 0.2315, "num_input_tokens_seen": 7284208, "step": 11970 }, { "epoch": 3.7154824697486815, "grad_norm": 0.06850134581327438, "learning_rate": 9.777635947796466e-06, "loss": 0.225, "num_input_tokens_seen": 7287152, "step": 11975 }, { "epoch": 3.717033819422898, "grad_norm": 0.10611595958471298, "learning_rate": 9.777236530612679e-06, "loss": 0.2319, "num_input_tokens_seen": 7290576, "step": 11980 }, { "epoch": 3.7185851690971146, "grad_norm": 0.11546394973993301, "learning_rate": 9.776836763200881e-06, "loss": 0.2327, "num_input_tokens_seen": 7293456, "step": 11985 }, { "epoch": 3.720136518771331, "grad_norm": 0.11110017448663712, "learning_rate": 9.776436645590383e-06, "loss": 0.216, "num_input_tokens_seen": 7296560, "step": 11990 }, { "epoch": 3.7216878684455477, "grad_norm": 0.06336124986410141, "learning_rate": 9.776036177810518e-06, "loss": 0.2412, "num_input_tokens_seen": 7299792, "step": 11995 }, { "epoch": 3.7232392181197644, "grad_norm": 0.15884891152381897, "learning_rate": 9.775635359890643e-06, "loss": 0.2377, "num_input_tokens_seen": 7302800, "step": 12000 }, { "epoch": 3.7247905677939808, "grad_norm": 0.14898952841758728, "learning_rate": 9.775234191860144e-06, "loss": 0.2374, "num_input_tokens_seen": 7305584, "step": 12005 }, { "epoch": 3.726341917468197, "grad_norm": 0.0655369982123375, "learning_rate": 9.774832673748433e-06, "loss": 0.2276, "num_input_tokens_seen": 7308432, "step": 12010 }, { "epoch": 3.727893267142414, "grad_norm": 0.029412254691123962, "learning_rate": 9.774430805584945e-06, "loss": 0.2321, "num_input_tokens_seen": 7311184, "step": 12015 }, { "epoch": 3.7294446168166306, "grad_norm": 0.03783287853002548, "learning_rate": 9.77402858739914e-06, "loss": 0.2334, "num_input_tokens_seen": 7314576, "step": 12020 }, { "epoch": 3.730995966490847, "grad_norm": 0.05723639205098152, "learning_rate": 9.773626019220506e-06, "loss": 0.2324, "num_input_tokens_seen": 7318128, "step": 12025 }, { "epoch": 3.7325473161650637, "grad_norm": 0.11545396596193314, "learning_rate": 9.773223101078557e-06, "loss": 0.2288, "num_input_tokens_seen": 7320880, "step": 12030 }, { "epoch": 3.73409866583928, "grad_norm": 0.12092932313680649, "learning_rate": 9.772819833002832e-06, "loss": 0.2268, "num_input_tokens_seen": 7324016, "step": 12035 }, { "epoch": 3.735650015513497, "grad_norm": 0.11125874519348145, "learning_rate": 9.772416215022893e-06, "loss": 0.2354, "num_input_tokens_seen": 7327536, "step": 12040 }, { "epoch": 3.737201365187713, "grad_norm": 0.10679413378238678, "learning_rate": 9.772012247168334e-06, "loss": 0.2255, "num_input_tokens_seen": 7329968, "step": 12045 }, { "epoch": 3.73875271486193, "grad_norm": 0.037123244255781174, "learning_rate": 9.771607929468768e-06, "loss": 0.2351, "num_input_tokens_seen": 7332720, "step": 12050 }, { "epoch": 3.7403040645361463, "grad_norm": 0.1462327390909195, "learning_rate": 9.771203261953835e-06, "loss": 0.237, "num_input_tokens_seen": 7336432, "step": 12055 }, { "epoch": 3.741855414210363, "grad_norm": 0.04746760427951813, "learning_rate": 9.770798244653203e-06, "loss": 0.2324, "num_input_tokens_seen": 7339440, "step": 12060 }, { "epoch": 3.74340676388458, "grad_norm": 0.13042660057544708, "learning_rate": 9.770392877596566e-06, "loss": 0.2322, "num_input_tokens_seen": 7342640, "step": 12065 }, { "epoch": 3.744958113558796, "grad_norm": 0.12225829064846039, "learning_rate": 9.769987160813641e-06, "loss": 0.2268, "num_input_tokens_seen": 7345392, "step": 12070 }, { "epoch": 3.7465094632330125, "grad_norm": 0.04970259591937065, "learning_rate": 9.769581094334173e-06, "loss": 0.2327, "num_input_tokens_seen": 7348080, "step": 12075 }, { "epoch": 3.7480608129072293, "grad_norm": 0.10773493349552155, "learning_rate": 9.769174678187929e-06, "loss": 0.23, "num_input_tokens_seen": 7350992, "step": 12080 }, { "epoch": 3.749612162581446, "grad_norm": 0.0412178710103035, "learning_rate": 9.768767912404706e-06, "loss": 0.2307, "num_input_tokens_seen": 7353712, "step": 12085 }, { "epoch": 3.7511635122556624, "grad_norm": 0.21756300330162048, "learning_rate": 9.768360797014325e-06, "loss": 0.2276, "num_input_tokens_seen": 7358096, "step": 12090 }, { "epoch": 3.752714861929879, "grad_norm": 0.11062147468328476, "learning_rate": 9.767953332046631e-06, "loss": 0.2364, "num_input_tokens_seen": 7361104, "step": 12095 }, { "epoch": 3.7542662116040955, "grad_norm": 0.2194194197654724, "learning_rate": 9.767545517531495e-06, "loss": 0.2284, "num_input_tokens_seen": 7363568, "step": 12100 }, { "epoch": 3.7558175612783122, "grad_norm": 0.24965685606002808, "learning_rate": 9.76713735349882e-06, "loss": 0.24, "num_input_tokens_seen": 7366544, "step": 12105 }, { "epoch": 3.7573689109525286, "grad_norm": 0.11925462633371353, "learning_rate": 9.766728839978524e-06, "loss": 0.2313, "num_input_tokens_seen": 7369552, "step": 12110 }, { "epoch": 3.7589202606267453, "grad_norm": 0.04114270955324173, "learning_rate": 9.766319977000558e-06, "loss": 0.2365, "num_input_tokens_seen": 7372400, "step": 12115 }, { "epoch": 3.7604716103009617, "grad_norm": 0.04989838972687721, "learning_rate": 9.765910764594895e-06, "loss": 0.2291, "num_input_tokens_seen": 7375664, "step": 12120 }, { "epoch": 3.7620229599751784, "grad_norm": 0.038933537900447845, "learning_rate": 9.765501202791537e-06, "loss": 0.238, "num_input_tokens_seen": 7377840, "step": 12125 }, { "epoch": 3.763574309649395, "grad_norm": 0.22865094244480133, "learning_rate": 9.765091291620507e-06, "loss": 0.2309, "num_input_tokens_seen": 7380528, "step": 12130 }, { "epoch": 3.7651256593236115, "grad_norm": 0.12267875671386719, "learning_rate": 9.76468103111186e-06, "loss": 0.2319, "num_input_tokens_seen": 7383664, "step": 12135 }, { "epoch": 3.766677008997828, "grad_norm": 0.03621415048837662, "learning_rate": 9.764270421295672e-06, "loss": 0.2283, "num_input_tokens_seen": 7386672, "step": 12140 }, { "epoch": 3.7682283586720446, "grad_norm": 0.2296590805053711, "learning_rate": 9.763859462202043e-06, "loss": 0.2293, "num_input_tokens_seen": 7389328, "step": 12145 }, { "epoch": 3.7697797083462614, "grad_norm": 0.020298413932323456, "learning_rate": 9.763448153861104e-06, "loss": 0.231, "num_input_tokens_seen": 7391632, "step": 12150 }, { "epoch": 3.7713310580204777, "grad_norm": 0.10867490619421005, "learning_rate": 9.763036496303007e-06, "loss": 0.229, "num_input_tokens_seen": 7395088, "step": 12155 }, { "epoch": 3.7728824076946945, "grad_norm": 0.13309428095817566, "learning_rate": 9.762624489557933e-06, "loss": 0.2334, "num_input_tokens_seen": 7398000, "step": 12160 }, { "epoch": 3.774433757368911, "grad_norm": 0.1431334763765335, "learning_rate": 9.762212133656084e-06, "loss": 0.2374, "num_input_tokens_seen": 7401104, "step": 12165 }, { "epoch": 3.7759851070431276, "grad_norm": 0.04355806112289429, "learning_rate": 9.761799428627693e-06, "loss": 0.2382, "num_input_tokens_seen": 7404176, "step": 12170 }, { "epoch": 3.777536456717344, "grad_norm": 0.20536984503269196, "learning_rate": 9.761386374503017e-06, "loss": 0.2236, "num_input_tokens_seen": 7407280, "step": 12175 }, { "epoch": 3.7790878063915607, "grad_norm": 0.034009192138910294, "learning_rate": 9.760972971312337e-06, "loss": 0.2355, "num_input_tokens_seen": 7410384, "step": 12180 }, { "epoch": 3.780639156065777, "grad_norm": 0.12190283834934235, "learning_rate": 9.760559219085958e-06, "loss": 0.2308, "num_input_tokens_seen": 7413456, "step": 12185 }, { "epoch": 3.782190505739994, "grad_norm": 0.04598477482795715, "learning_rate": 9.760145117854216e-06, "loss": 0.2288, "num_input_tokens_seen": 7416016, "step": 12190 }, { "epoch": 3.7837418554142106, "grad_norm": 0.10623445361852646, "learning_rate": 9.759730667647467e-06, "loss": 0.2231, "num_input_tokens_seen": 7418864, "step": 12195 }, { "epoch": 3.785293205088427, "grad_norm": 0.24000713229179382, "learning_rate": 9.759315868496097e-06, "loss": 0.2346, "num_input_tokens_seen": 7421168, "step": 12200 }, { "epoch": 3.7868445547626433, "grad_norm": 0.13662706315517426, "learning_rate": 9.758900720430516e-06, "loss": 0.2319, "num_input_tokens_seen": 7424560, "step": 12205 }, { "epoch": 3.78839590443686, "grad_norm": 0.13985733687877655, "learning_rate": 9.758485223481158e-06, "loss": 0.2364, "num_input_tokens_seen": 7427824, "step": 12210 }, { "epoch": 3.789947254111077, "grad_norm": 0.03122127614915371, "learning_rate": 9.758069377678485e-06, "loss": 0.2289, "num_input_tokens_seen": 7430864, "step": 12215 }, { "epoch": 3.791498603785293, "grad_norm": 0.12297002971172333, "learning_rate": 9.757653183052981e-06, "loss": 0.2343, "num_input_tokens_seen": 7434416, "step": 12220 }, { "epoch": 3.79304995345951, "grad_norm": 0.10542221367359161, "learning_rate": 9.75723663963516e-06, "loss": 0.2281, "num_input_tokens_seen": 7436816, "step": 12225 }, { "epoch": 3.7946013031337262, "grad_norm": 0.04377548396587372, "learning_rate": 9.756819747455559e-06, "loss": 0.2326, "num_input_tokens_seen": 7439120, "step": 12230 }, { "epoch": 3.796152652807943, "grad_norm": 0.11692739278078079, "learning_rate": 9.756402506544743e-06, "loss": 0.2331, "num_input_tokens_seen": 7441584, "step": 12235 }, { "epoch": 3.7977040024821593, "grad_norm": 0.03588327765464783, "learning_rate": 9.755984916933298e-06, "loss": 0.23, "num_input_tokens_seen": 7445136, "step": 12240 }, { "epoch": 3.799255352156376, "grad_norm": 0.1225811094045639, "learning_rate": 9.755566978651837e-06, "loss": 0.2324, "num_input_tokens_seen": 7447952, "step": 12245 }, { "epoch": 3.8008067018305924, "grad_norm": 0.10519039630889893, "learning_rate": 9.755148691731007e-06, "loss": 0.2298, "num_input_tokens_seen": 7451664, "step": 12250 }, { "epoch": 3.802358051504809, "grad_norm": 0.2239844799041748, "learning_rate": 9.754730056201465e-06, "loss": 0.2314, "num_input_tokens_seen": 7455120, "step": 12255 }, { "epoch": 3.803909401179026, "grad_norm": 0.02640867419540882, "learning_rate": 9.754311072093907e-06, "loss": 0.2325, "num_input_tokens_seen": 7458192, "step": 12260 }, { "epoch": 3.8054607508532423, "grad_norm": 0.1061512678861618, "learning_rate": 9.753891739439047e-06, "loss": 0.2321, "num_input_tokens_seen": 7461040, "step": 12265 }, { "epoch": 3.8070121005274586, "grad_norm": 0.13384094834327698, "learning_rate": 9.753472058267628e-06, "loss": 0.2345, "num_input_tokens_seen": 7463664, "step": 12270 }, { "epoch": 3.8085634502016754, "grad_norm": 0.11339543014764786, "learning_rate": 9.753052028610418e-06, "loss": 0.232, "num_input_tokens_seen": 7466224, "step": 12275 }, { "epoch": 3.810114799875892, "grad_norm": 0.035697709769010544, "learning_rate": 9.75263165049821e-06, "loss": 0.2305, "num_input_tokens_seen": 7469680, "step": 12280 }, { "epoch": 3.8116661495501085, "grad_norm": 0.10708664357662201, "learning_rate": 9.752210923961821e-06, "loss": 0.2295, "num_input_tokens_seen": 7472720, "step": 12285 }, { "epoch": 3.8132174992243253, "grad_norm": 0.12181372940540314, "learning_rate": 9.751789849032098e-06, "loss": 0.2337, "num_input_tokens_seen": 7475632, "step": 12290 }, { "epoch": 3.8147688488985416, "grad_norm": 0.21383869647979736, "learning_rate": 9.751368425739908e-06, "loss": 0.2316, "num_input_tokens_seen": 7478736, "step": 12295 }, { "epoch": 3.8163201985727584, "grad_norm": 0.051612235605716705, "learning_rate": 9.75094665411615e-06, "loss": 0.233, "num_input_tokens_seen": 7483056, "step": 12300 }, { "epoch": 3.8178715482469747, "grad_norm": 0.12238199263811111, "learning_rate": 9.750524534191741e-06, "loss": 0.2288, "num_input_tokens_seen": 7485680, "step": 12305 }, { "epoch": 3.8194228979211915, "grad_norm": 0.03656063973903656, "learning_rate": 9.750102065997631e-06, "loss": 0.2294, "num_input_tokens_seen": 7489264, "step": 12310 }, { "epoch": 3.820974247595408, "grad_norm": 0.12166289985179901, "learning_rate": 9.749679249564787e-06, "loss": 0.2259, "num_input_tokens_seen": 7491728, "step": 12315 }, { "epoch": 3.8225255972696246, "grad_norm": 0.16046260297298431, "learning_rate": 9.749256084924212e-06, "loss": 0.2314, "num_input_tokens_seen": 7496848, "step": 12320 }, { "epoch": 3.8240769469438414, "grad_norm": 0.145257830619812, "learning_rate": 9.748832572106925e-06, "loss": 0.232, "num_input_tokens_seen": 7499824, "step": 12325 }, { "epoch": 3.8256282966180577, "grad_norm": 0.11738588660955429, "learning_rate": 9.748408711143977e-06, "loss": 0.2338, "num_input_tokens_seen": 7503568, "step": 12330 }, { "epoch": 3.827179646292274, "grad_norm": 0.11124683171510696, "learning_rate": 9.747984502066438e-06, "loss": 0.2265, "num_input_tokens_seen": 7506416, "step": 12335 }, { "epoch": 3.828730995966491, "grad_norm": 0.04219284653663635, "learning_rate": 9.747559944905413e-06, "loss": 0.2333, "num_input_tokens_seen": 7509136, "step": 12340 }, { "epoch": 3.8302823456407076, "grad_norm": 0.03956867754459381, "learning_rate": 9.747135039692024e-06, "loss": 0.2318, "num_input_tokens_seen": 7512144, "step": 12345 }, { "epoch": 3.831833695314924, "grad_norm": 0.21616943180561066, "learning_rate": 9.746709786457422e-06, "loss": 0.2324, "num_input_tokens_seen": 7515440, "step": 12350 }, { "epoch": 3.8333850449891407, "grad_norm": 0.13238786160945892, "learning_rate": 9.746284185232783e-06, "loss": 0.2314, "num_input_tokens_seen": 7518384, "step": 12355 }, { "epoch": 3.834936394663357, "grad_norm": 0.04976145550608635, "learning_rate": 9.74585823604931e-06, "loss": 0.2333, "num_input_tokens_seen": 7520912, "step": 12360 }, { "epoch": 3.836487744337574, "grad_norm": 0.11436694860458374, "learning_rate": 9.745431938938227e-06, "loss": 0.2341, "num_input_tokens_seen": 7523632, "step": 12365 }, { "epoch": 3.83803909401179, "grad_norm": 0.11652632057666779, "learning_rate": 9.745005293930791e-06, "loss": 0.232, "num_input_tokens_seen": 7526064, "step": 12370 }, { "epoch": 3.839590443686007, "grad_norm": 0.062187258154153824, "learning_rate": 9.744578301058276e-06, "loss": 0.2329, "num_input_tokens_seen": 7529680, "step": 12375 }, { "epoch": 3.841141793360223, "grad_norm": 0.21965055167675018, "learning_rate": 9.744150960351989e-06, "loss": 0.2299, "num_input_tokens_seen": 7532176, "step": 12380 }, { "epoch": 3.84269314303444, "grad_norm": 0.056813303381204605, "learning_rate": 9.743723271843255e-06, "loss": 0.233, "num_input_tokens_seen": 7535216, "step": 12385 }, { "epoch": 3.8442444927086568, "grad_norm": 0.09842151403427124, "learning_rate": 9.743295235563432e-06, "loss": 0.2331, "num_input_tokens_seen": 7537840, "step": 12390 }, { "epoch": 3.845795842382873, "grad_norm": 0.0357135534286499, "learning_rate": 9.742866851543899e-06, "loss": 0.231, "num_input_tokens_seen": 7540112, "step": 12395 }, { "epoch": 3.8473471920570894, "grad_norm": 0.05180566385388374, "learning_rate": 9.742438119816062e-06, "loss": 0.2299, "num_input_tokens_seen": 7544592, "step": 12400 }, { "epoch": 3.848898541731306, "grad_norm": 0.06388399004936218, "learning_rate": 9.742009040411353e-06, "loss": 0.2357, "num_input_tokens_seen": 7547536, "step": 12405 }, { "epoch": 3.850449891405523, "grad_norm": 0.12433787435293198, "learning_rate": 9.741579613361227e-06, "loss": 0.2342, "num_input_tokens_seen": 7551184, "step": 12410 }, { "epoch": 3.8520012410797393, "grad_norm": 0.10693667829036713, "learning_rate": 9.741149838697165e-06, "loss": 0.2336, "num_input_tokens_seen": 7553616, "step": 12415 }, { "epoch": 3.853552590753956, "grad_norm": 0.11679494380950928, "learning_rate": 9.740719716450679e-06, "loss": 0.2293, "num_input_tokens_seen": 7556720, "step": 12420 }, { "epoch": 3.8551039404281724, "grad_norm": 0.22310936450958252, "learning_rate": 9.740289246653298e-06, "loss": 0.2299, "num_input_tokens_seen": 7559664, "step": 12425 }, { "epoch": 3.856655290102389, "grad_norm": 0.11521657556295395, "learning_rate": 9.73985842933658e-06, "loss": 0.2309, "num_input_tokens_seen": 7563056, "step": 12430 }, { "epoch": 3.8582066397766055, "grad_norm": 0.11106418073177338, "learning_rate": 9.739427264532114e-06, "loss": 0.2314, "num_input_tokens_seen": 7565904, "step": 12435 }, { "epoch": 3.8597579894508223, "grad_norm": 0.11714570224285126, "learning_rate": 9.738995752271505e-06, "loss": 0.2314, "num_input_tokens_seen": 7568400, "step": 12440 }, { "epoch": 3.8613093391250386, "grad_norm": 0.0362064354121685, "learning_rate": 9.738563892586388e-06, "loss": 0.2341, "num_input_tokens_seen": 7571760, "step": 12445 }, { "epoch": 3.8628606887992554, "grad_norm": 0.0423109345138073, "learning_rate": 9.738131685508427e-06, "loss": 0.233, "num_input_tokens_seen": 7574416, "step": 12450 }, { "epoch": 3.864412038473472, "grad_norm": 0.11311633884906769, "learning_rate": 9.737699131069303e-06, "loss": 0.2304, "num_input_tokens_seen": 7577232, "step": 12455 }, { "epoch": 3.8659633881476885, "grad_norm": 0.11658310145139694, "learning_rate": 9.73726622930073e-06, "loss": 0.2346, "num_input_tokens_seen": 7579376, "step": 12460 }, { "epoch": 3.867514737821905, "grad_norm": 0.11318280547857285, "learning_rate": 9.736832980234447e-06, "loss": 0.2278, "num_input_tokens_seen": 7581904, "step": 12465 }, { "epoch": 3.8690660874961216, "grad_norm": 0.11014215648174286, "learning_rate": 9.736399383902213e-06, "loss": 0.2294, "num_input_tokens_seen": 7584656, "step": 12470 }, { "epoch": 3.8706174371703383, "grad_norm": 0.1131046935915947, "learning_rate": 9.735965440335818e-06, "loss": 0.2303, "num_input_tokens_seen": 7588272, "step": 12475 }, { "epoch": 3.8721687868445547, "grad_norm": 0.11686914414167404, "learning_rate": 9.735531149567072e-06, "loss": 0.2304, "num_input_tokens_seen": 7592304, "step": 12480 }, { "epoch": 3.8737201365187715, "grad_norm": 0.04148191213607788, "learning_rate": 9.735096511627817e-06, "loss": 0.234, "num_input_tokens_seen": 7595504, "step": 12485 }, { "epoch": 3.875271486192988, "grad_norm": 0.11706028878688812, "learning_rate": 9.734661526549916e-06, "loss": 0.2309, "num_input_tokens_seen": 7598032, "step": 12490 }, { "epoch": 3.8768228358672046, "grad_norm": 0.05536722019314766, "learning_rate": 9.734226194365257e-06, "loss": 0.2274, "num_input_tokens_seen": 7600752, "step": 12495 }, { "epoch": 3.878374185541421, "grad_norm": 0.11208425462245941, "learning_rate": 9.733790515105756e-06, "loss": 0.2306, "num_input_tokens_seen": 7603536, "step": 12500 }, { "epoch": 3.8799255352156377, "grad_norm": 0.13315464556217194, "learning_rate": 9.733354488803356e-06, "loss": 0.2305, "num_input_tokens_seen": 7605936, "step": 12505 }, { "epoch": 3.881476884889854, "grad_norm": 0.129570871591568, "learning_rate": 9.732918115490017e-06, "loss": 0.2387, "num_input_tokens_seen": 7609232, "step": 12510 }, { "epoch": 3.8830282345640708, "grad_norm": 0.03702912479639053, "learning_rate": 9.732481395197736e-06, "loss": 0.233, "num_input_tokens_seen": 7611696, "step": 12515 }, { "epoch": 3.8845795842382875, "grad_norm": 0.04819978401064873, "learning_rate": 9.73204432795853e-06, "loss": 0.2375, "num_input_tokens_seen": 7614480, "step": 12520 }, { "epoch": 3.886130933912504, "grad_norm": 0.11229295283555984, "learning_rate": 9.731606913804437e-06, "loss": 0.2289, "num_input_tokens_seen": 7617040, "step": 12525 }, { "epoch": 3.88768228358672, "grad_norm": 0.03457857295870781, "learning_rate": 9.731169152767526e-06, "loss": 0.233, "num_input_tokens_seen": 7620272, "step": 12530 }, { "epoch": 3.889233633260937, "grad_norm": 0.04284020885825157, "learning_rate": 9.730731044879891e-06, "loss": 0.2304, "num_input_tokens_seen": 7623632, "step": 12535 }, { "epoch": 3.8907849829351537, "grad_norm": 0.12229108810424805, "learning_rate": 9.73029259017365e-06, "loss": 0.2283, "num_input_tokens_seen": 7626928, "step": 12540 }, { "epoch": 3.89233633260937, "grad_norm": 0.11528097093105316, "learning_rate": 9.72985378868095e-06, "loss": 0.233, "num_input_tokens_seen": 7629296, "step": 12545 }, { "epoch": 3.893887682283587, "grad_norm": 0.11936123669147491, "learning_rate": 9.729414640433955e-06, "loss": 0.2345, "num_input_tokens_seen": 7632976, "step": 12550 }, { "epoch": 3.895439031957803, "grad_norm": 0.21740034222602844, "learning_rate": 9.72897514546486e-06, "loss": 0.2336, "num_input_tokens_seen": 7635600, "step": 12555 }, { "epoch": 3.89699038163202, "grad_norm": 0.10596324503421783, "learning_rate": 9.72853530380589e-06, "loss": 0.231, "num_input_tokens_seen": 7640272, "step": 12560 }, { "epoch": 3.8985417313062363, "grad_norm": 0.11019153892993927, "learning_rate": 9.728095115489289e-06, "loss": 0.2315, "num_input_tokens_seen": 7643280, "step": 12565 }, { "epoch": 3.900093080980453, "grad_norm": 0.11244742572307587, "learning_rate": 9.727654580547325e-06, "loss": 0.2331, "num_input_tokens_seen": 7648688, "step": 12570 }, { "epoch": 3.9016444306546694, "grad_norm": 0.11585606634616852, "learning_rate": 9.727213699012296e-06, "loss": 0.233, "num_input_tokens_seen": 7653296, "step": 12575 }, { "epoch": 3.903195780328886, "grad_norm": 0.11535921692848206, "learning_rate": 9.726772470916527e-06, "loss": 0.2284, "num_input_tokens_seen": 7656016, "step": 12580 }, { "epoch": 3.904747130003103, "grad_norm": 0.12666550278663635, "learning_rate": 9.72633089629236e-06, "loss": 0.2299, "num_input_tokens_seen": 7659120, "step": 12585 }, { "epoch": 3.9062984796773192, "grad_norm": 0.12291531264781952, "learning_rate": 9.725888975172171e-06, "loss": 0.2337, "num_input_tokens_seen": 7663440, "step": 12590 }, { "epoch": 3.9078498293515356, "grad_norm": 0.11021821945905685, "learning_rate": 9.725446707588357e-06, "loss": 0.2331, "num_input_tokens_seen": 7666352, "step": 12595 }, { "epoch": 3.9094011790257523, "grad_norm": 0.21602945029735565, "learning_rate": 9.725004093573343e-06, "loss": 0.2336, "num_input_tokens_seen": 7669072, "step": 12600 }, { "epoch": 3.910952528699969, "grad_norm": 0.11872327327728271, "learning_rate": 9.724561133159576e-06, "loss": 0.2335, "num_input_tokens_seen": 7671952, "step": 12605 }, { "epoch": 3.9125038783741855, "grad_norm": 0.11234267801046371, "learning_rate": 9.72411782637953e-06, "loss": 0.2315, "num_input_tokens_seen": 7675920, "step": 12610 }, { "epoch": 3.9140552280484022, "grad_norm": 0.04439824819564819, "learning_rate": 9.723674173265706e-06, "loss": 0.2357, "num_input_tokens_seen": 7678544, "step": 12615 }, { "epoch": 3.9156065777226186, "grad_norm": 0.06085581332445145, "learning_rate": 9.723230173850628e-06, "loss": 0.2363, "num_input_tokens_seen": 7682064, "step": 12620 }, { "epoch": 3.9171579273968353, "grad_norm": 0.03504890576004982, "learning_rate": 9.722785828166847e-06, "loss": 0.23, "num_input_tokens_seen": 7686480, "step": 12625 }, { "epoch": 3.9187092770710517, "grad_norm": 0.11128289252519608, "learning_rate": 9.722341136246937e-06, "loss": 0.2335, "num_input_tokens_seen": 7688624, "step": 12630 }, { "epoch": 3.9202606267452684, "grad_norm": 0.11711550503969193, "learning_rate": 9.721896098123505e-06, "loss": 0.232, "num_input_tokens_seen": 7693520, "step": 12635 }, { "epoch": 3.9218119764194848, "grad_norm": 0.10912735015153885, "learning_rate": 9.721450713829171e-06, "loss": 0.2332, "num_input_tokens_seen": 7695664, "step": 12640 }, { "epoch": 3.9233633260937015, "grad_norm": 0.0676194503903389, "learning_rate": 9.721004983396588e-06, "loss": 0.2311, "num_input_tokens_seen": 7698800, "step": 12645 }, { "epoch": 3.9249146757679183, "grad_norm": 0.10399173945188522, "learning_rate": 9.720558906858435e-06, "loss": 0.2272, "num_input_tokens_seen": 7701680, "step": 12650 }, { "epoch": 3.9264660254421346, "grad_norm": 0.049860622733831406, "learning_rate": 9.720112484247414e-06, "loss": 0.2299, "num_input_tokens_seen": 7704368, "step": 12655 }, { "epoch": 3.928017375116351, "grad_norm": 0.14600777626037598, "learning_rate": 9.719665715596257e-06, "loss": 0.2356, "num_input_tokens_seen": 7706960, "step": 12660 }, { "epoch": 3.9295687247905677, "grad_norm": 0.03515235707163811, "learning_rate": 9.71921860093771e-06, "loss": 0.235, "num_input_tokens_seen": 7709744, "step": 12665 }, { "epoch": 3.9311200744647845, "grad_norm": 0.23680008947849274, "learning_rate": 9.718771140304557e-06, "loss": 0.2366, "num_input_tokens_seen": 7712944, "step": 12670 }, { "epoch": 3.932671424139001, "grad_norm": 0.13152308762073517, "learning_rate": 9.718323333729602e-06, "loss": 0.2342, "num_input_tokens_seen": 7715728, "step": 12675 }, { "epoch": 3.9342227738132176, "grad_norm": 0.1180848702788353, "learning_rate": 9.717875181245671e-06, "loss": 0.2345, "num_input_tokens_seen": 7718608, "step": 12680 }, { "epoch": 3.935774123487434, "grad_norm": 0.03974370285868645, "learning_rate": 9.717426682885623e-06, "loss": 0.2321, "num_input_tokens_seen": 7721072, "step": 12685 }, { "epoch": 3.9373254731616507, "grad_norm": 0.2825953960418701, "learning_rate": 9.716977838682335e-06, "loss": 0.2299, "num_input_tokens_seen": 7723664, "step": 12690 }, { "epoch": 3.938876822835867, "grad_norm": 0.0351499579846859, "learning_rate": 9.716528648668716e-06, "loss": 0.2409, "num_input_tokens_seen": 7726704, "step": 12695 }, { "epoch": 3.940428172510084, "grad_norm": 0.11919280886650085, "learning_rate": 9.716079112877695e-06, "loss": 0.2328, "num_input_tokens_seen": 7730256, "step": 12700 }, { "epoch": 3.9419795221843, "grad_norm": 0.21891985833644867, "learning_rate": 9.715629231342226e-06, "loss": 0.2321, "num_input_tokens_seen": 7734384, "step": 12705 }, { "epoch": 3.943530871858517, "grad_norm": 0.042741287499666214, "learning_rate": 9.715179004095295e-06, "loss": 0.2294, "num_input_tokens_seen": 7737296, "step": 12710 }, { "epoch": 3.9450822215327337, "grad_norm": 0.11016181111335754, "learning_rate": 9.714728431169904e-06, "loss": 0.2304, "num_input_tokens_seen": 7741104, "step": 12715 }, { "epoch": 3.94663357120695, "grad_norm": 0.21203583478927612, "learning_rate": 9.714277512599091e-06, "loss": 0.2314, "num_input_tokens_seen": 7743792, "step": 12720 }, { "epoch": 3.9481849208811663, "grad_norm": 0.039804913103580475, "learning_rate": 9.713826248415911e-06, "loss": 0.2299, "num_input_tokens_seen": 7746832, "step": 12725 }, { "epoch": 3.949736270555383, "grad_norm": 0.20424820482730865, "learning_rate": 9.713374638653445e-06, "loss": 0.2283, "num_input_tokens_seen": 7750032, "step": 12730 }, { "epoch": 3.9512876202296, "grad_norm": 0.1229216530919075, "learning_rate": 9.712922683344806e-06, "loss": 0.2312, "num_input_tokens_seen": 7752304, "step": 12735 }, { "epoch": 3.9528389699038162, "grad_norm": 0.09980400651693344, "learning_rate": 9.712470382523123e-06, "loss": 0.2301, "num_input_tokens_seen": 7754896, "step": 12740 }, { "epoch": 3.954390319578033, "grad_norm": 0.09359890967607498, "learning_rate": 9.712017736221557e-06, "loss": 0.2255, "num_input_tokens_seen": 7757424, "step": 12745 }, { "epoch": 3.9559416692522493, "grad_norm": 0.20199701189994812, "learning_rate": 9.711564744473294e-06, "loss": 0.2269, "num_input_tokens_seen": 7760304, "step": 12750 }, { "epoch": 3.957493018926466, "grad_norm": 0.23575997352600098, "learning_rate": 9.71111140731154e-06, "loss": 0.2383, "num_input_tokens_seen": 7763312, "step": 12755 }, { "epoch": 3.9590443686006824, "grad_norm": 0.04748249426484108, "learning_rate": 9.710657724769532e-06, "loss": 0.2384, "num_input_tokens_seen": 7766512, "step": 12760 }, { "epoch": 3.960595718274899, "grad_norm": 0.10026376694440842, "learning_rate": 9.710203696880532e-06, "loss": 0.2257, "num_input_tokens_seen": 7769360, "step": 12765 }, { "epoch": 3.9621470679491155, "grad_norm": 0.1023336723446846, "learning_rate": 9.709749323677822e-06, "loss": 0.2344, "num_input_tokens_seen": 7772528, "step": 12770 }, { "epoch": 3.9636984176233323, "grad_norm": 0.19148248434066772, "learning_rate": 9.709294605194714e-06, "loss": 0.2324, "num_input_tokens_seen": 7776112, "step": 12775 }, { "epoch": 3.965249767297549, "grad_norm": 0.09678471833467484, "learning_rate": 9.708839541464547e-06, "loss": 0.2287, "num_input_tokens_seen": 7778544, "step": 12780 }, { "epoch": 3.9668011169717654, "grad_norm": 0.03436288610100746, "learning_rate": 9.708384132520681e-06, "loss": 0.2283, "num_input_tokens_seen": 7781584, "step": 12785 }, { "epoch": 3.9683524666459817, "grad_norm": 0.06325648725032806, "learning_rate": 9.707928378396501e-06, "loss": 0.228, "num_input_tokens_seen": 7785936, "step": 12790 }, { "epoch": 3.9699038163201985, "grad_norm": 0.1314982920885086, "learning_rate": 9.707472279125422e-06, "loss": 0.2371, "num_input_tokens_seen": 7789360, "step": 12795 }, { "epoch": 3.9714551659944153, "grad_norm": 0.03821085765957832, "learning_rate": 9.707015834740878e-06, "loss": 0.2325, "num_input_tokens_seen": 7791920, "step": 12800 }, { "epoch": 3.9730065156686316, "grad_norm": 0.235836461186409, "learning_rate": 9.706559045276336e-06, "loss": 0.233, "num_input_tokens_seen": 7795856, "step": 12805 }, { "epoch": 3.9745578653428484, "grad_norm": 0.0990707278251648, "learning_rate": 9.706101910765283e-06, "loss": 0.2348, "num_input_tokens_seen": 7798096, "step": 12810 }, { "epoch": 3.9761092150170647, "grad_norm": 0.21533748507499695, "learning_rate": 9.705644431241227e-06, "loss": 0.2337, "num_input_tokens_seen": 7801040, "step": 12815 }, { "epoch": 3.9776605646912815, "grad_norm": 0.04151826351881027, "learning_rate": 9.705186606737715e-06, "loss": 0.2263, "num_input_tokens_seen": 7804880, "step": 12820 }, { "epoch": 3.9792119143654983, "grad_norm": 0.10510412603616714, "learning_rate": 9.704728437288305e-06, "loss": 0.2324, "num_input_tokens_seen": 7810704, "step": 12825 }, { "epoch": 3.9807632640397146, "grad_norm": 0.11844950914382935, "learning_rate": 9.704269922926591e-06, "loss": 0.2341, "num_input_tokens_seen": 7813520, "step": 12830 }, { "epoch": 3.982314613713931, "grad_norm": 0.03073319047689438, "learning_rate": 9.703811063686182e-06, "loss": 0.2369, "num_input_tokens_seen": 7817840, "step": 12835 }, { "epoch": 3.9838659633881477, "grad_norm": 0.11441955715417862, "learning_rate": 9.703351859600721e-06, "loss": 0.2322, "num_input_tokens_seen": 7820656, "step": 12840 }, { "epoch": 3.9854173130623645, "grad_norm": 0.1108928844332695, "learning_rate": 9.702892310703874e-06, "loss": 0.2352, "num_input_tokens_seen": 7825584, "step": 12845 }, { "epoch": 3.986968662736581, "grad_norm": 0.1004263237118721, "learning_rate": 9.702432417029327e-06, "loss": 0.2325, "num_input_tokens_seen": 7828272, "step": 12850 }, { "epoch": 3.988520012410797, "grad_norm": 0.11451250314712524, "learning_rate": 9.701972178610802e-06, "loss": 0.2294, "num_input_tokens_seen": 7831152, "step": 12855 }, { "epoch": 3.990071362085014, "grad_norm": 0.044719148427248, "learning_rate": 9.701511595482035e-06, "loss": 0.2335, "num_input_tokens_seen": 7833872, "step": 12860 }, { "epoch": 3.9916227117592307, "grad_norm": 0.03412773832678795, "learning_rate": 9.701050667676793e-06, "loss": 0.2304, "num_input_tokens_seen": 7836368, "step": 12865 }, { "epoch": 3.993174061433447, "grad_norm": 0.09795916080474854, "learning_rate": 9.70058939522887e-06, "loss": 0.2263, "num_input_tokens_seen": 7838736, "step": 12870 }, { "epoch": 3.9947254111076638, "grad_norm": 0.033343054354190826, "learning_rate": 9.70012777817208e-06, "loss": 0.2304, "num_input_tokens_seen": 7841776, "step": 12875 }, { "epoch": 3.99627676078188, "grad_norm": 0.02816491387784481, "learning_rate": 9.699665816540267e-06, "loss": 0.2335, "num_input_tokens_seen": 7843824, "step": 12880 }, { "epoch": 3.997828110456097, "grad_norm": 0.10360660403966904, "learning_rate": 9.699203510367297e-06, "loss": 0.2305, "num_input_tokens_seen": 7846960, "step": 12885 }, { "epoch": 3.9993794601303136, "grad_norm": 0.04431036114692688, "learning_rate": 9.69874085968706e-06, "loss": 0.2373, "num_input_tokens_seen": 7850736, "step": 12890 }, { "epoch": 4.0, "eval_loss": 0.23131409287452698, "eval_runtime": 34.4713, "eval_samples_per_second": 93.498, "eval_steps_per_second": 23.382, "num_input_tokens_seen": 7851504, "step": 12892 }, { "epoch": 4.0009308098045295, "grad_norm": 0.04322264343500137, "learning_rate": 9.69827786453348e-06, "loss": 0.2326, "num_input_tokens_seen": 7853072, "step": 12895 }, { "epoch": 4.002482159478746, "grad_norm": 0.05657974258065224, "learning_rate": 9.697814524940496e-06, "loss": 0.232, "num_input_tokens_seen": 7856368, "step": 12900 }, { "epoch": 4.004033509152963, "grad_norm": 0.02872118540108204, "learning_rate": 9.697350840942077e-06, "loss": 0.2324, "num_input_tokens_seen": 7860208, "step": 12905 }, { "epoch": 4.00558485882718, "grad_norm": 0.036303043365478516, "learning_rate": 9.696886812572217e-06, "loss": 0.2298, "num_input_tokens_seen": 7863344, "step": 12910 }, { "epoch": 4.007136208501397, "grad_norm": 0.038629401475191116, "learning_rate": 9.696422439864932e-06, "loss": 0.2283, "num_input_tokens_seen": 7866640, "step": 12915 }, { "epoch": 4.0086875581756125, "grad_norm": 0.10624056309461594, "learning_rate": 9.695957722854269e-06, "loss": 0.2293, "num_input_tokens_seen": 7869360, "step": 12920 }, { "epoch": 4.010238907849829, "grad_norm": 0.04035560041666031, "learning_rate": 9.695492661574298e-06, "loss": 0.2314, "num_input_tokens_seen": 7873712, "step": 12925 }, { "epoch": 4.011790257524046, "grad_norm": 0.04574258625507355, "learning_rate": 9.69502725605911e-06, "loss": 0.2341, "num_input_tokens_seen": 7876112, "step": 12930 }, { "epoch": 4.013341607198263, "grad_norm": 0.09734190255403519, "learning_rate": 9.694561506342828e-06, "loss": 0.2285, "num_input_tokens_seen": 7878672, "step": 12935 }, { "epoch": 4.014892956872479, "grad_norm": 0.04081209748983383, "learning_rate": 9.694095412459594e-06, "loss": 0.2262, "num_input_tokens_seen": 7881456, "step": 12940 }, { "epoch": 4.0164443065466955, "grad_norm": 0.10409050434827805, "learning_rate": 9.69362897444358e-06, "loss": 0.2355, "num_input_tokens_seen": 7884624, "step": 12945 }, { "epoch": 4.017995656220912, "grad_norm": 0.054607830941677094, "learning_rate": 9.693162192328983e-06, "loss": 0.2256, "num_input_tokens_seen": 7887088, "step": 12950 }, { "epoch": 4.019547005895129, "grad_norm": 0.2344217300415039, "learning_rate": 9.692695066150021e-06, "loss": 0.2296, "num_input_tokens_seen": 7889552, "step": 12955 }, { "epoch": 4.021098355569345, "grad_norm": 0.051086440682411194, "learning_rate": 9.692227595940942e-06, "loss": 0.2243, "num_input_tokens_seen": 7892688, "step": 12960 }, { "epoch": 4.022649705243562, "grad_norm": 0.15691956877708435, "learning_rate": 9.691759781736015e-06, "loss": 0.2476, "num_input_tokens_seen": 7896464, "step": 12965 }, { "epoch": 4.0242010549177785, "grad_norm": 0.04774864763021469, "learning_rate": 9.691291623569537e-06, "loss": 0.2293, "num_input_tokens_seen": 7899408, "step": 12970 }, { "epoch": 4.025752404591995, "grad_norm": 0.12297189235687256, "learning_rate": 9.69082312147583e-06, "loss": 0.2316, "num_input_tokens_seen": 7902480, "step": 12975 }, { "epoch": 4.027303754266212, "grad_norm": 0.10539617389440536, "learning_rate": 9.69035427548924e-06, "loss": 0.23, "num_input_tokens_seen": 7904688, "step": 12980 }, { "epoch": 4.028855103940428, "grad_norm": 0.13847079873085022, "learning_rate": 9.689885085644139e-06, "loss": 0.2298, "num_input_tokens_seen": 7908688, "step": 12985 }, { "epoch": 4.030406453614645, "grad_norm": 0.14251010119915009, "learning_rate": 9.689415551974927e-06, "loss": 0.2364, "num_input_tokens_seen": 7910896, "step": 12990 }, { "epoch": 4.031957803288861, "grad_norm": 0.12077353149652481, "learning_rate": 9.688945674516023e-06, "loss": 0.2332, "num_input_tokens_seen": 7913872, "step": 12995 }, { "epoch": 4.033509152963078, "grad_norm": 0.11648263037204742, "learning_rate": 9.688475453301876e-06, "loss": 0.23, "num_input_tokens_seen": 7917168, "step": 13000 }, { "epoch": 4.035060502637294, "grad_norm": 0.043426159769296646, "learning_rate": 9.688004888366956e-06, "loss": 0.2379, "num_input_tokens_seen": 7920176, "step": 13005 }, { "epoch": 4.036611852311511, "grad_norm": 0.10942310094833374, "learning_rate": 9.687533979745766e-06, "loss": 0.2332, "num_input_tokens_seen": 7922800, "step": 13010 }, { "epoch": 4.038163201985728, "grad_norm": 0.04119979217648506, "learning_rate": 9.687062727472826e-06, "loss": 0.2358, "num_input_tokens_seen": 7925936, "step": 13015 }, { "epoch": 4.039714551659944, "grad_norm": 0.10655750334262848, "learning_rate": 9.686591131582684e-06, "loss": 0.2259, "num_input_tokens_seen": 7928912, "step": 13020 }, { "epoch": 4.04126590133416, "grad_norm": 0.10055170953273773, "learning_rate": 9.686119192109916e-06, "loss": 0.2343, "num_input_tokens_seen": 7931120, "step": 13025 }, { "epoch": 4.042817251008377, "grad_norm": 0.04841354489326477, "learning_rate": 9.685646909089119e-06, "loss": 0.2412, "num_input_tokens_seen": 7933520, "step": 13030 }, { "epoch": 4.044368600682594, "grad_norm": 0.03223349153995514, "learning_rate": 9.685174282554915e-06, "loss": 0.2326, "num_input_tokens_seen": 7936304, "step": 13035 }, { "epoch": 4.045919950356811, "grad_norm": 0.03632846474647522, "learning_rate": 9.684701312541957e-06, "loss": 0.233, "num_input_tokens_seen": 7938416, "step": 13040 }, { "epoch": 4.047471300031027, "grad_norm": 0.031436480581760406, "learning_rate": 9.68422799908492e-06, "loss": 0.23, "num_input_tokens_seen": 7942288, "step": 13045 }, { "epoch": 4.049022649705243, "grad_norm": 0.11841578781604767, "learning_rate": 9.683754342218496e-06, "loss": 0.226, "num_input_tokens_seen": 7944752, "step": 13050 }, { "epoch": 4.05057399937946, "grad_norm": 0.13320285081863403, "learning_rate": 9.683280341977419e-06, "loss": 0.2422, "num_input_tokens_seen": 7947536, "step": 13055 }, { "epoch": 4.052125349053677, "grad_norm": 0.11024259030818939, "learning_rate": 9.682805998396432e-06, "loss": 0.2299, "num_input_tokens_seen": 7950032, "step": 13060 }, { "epoch": 4.053676698727894, "grad_norm": 0.11549681425094604, "learning_rate": 9.682331311510314e-06, "loss": 0.2294, "num_input_tokens_seen": 7952432, "step": 13065 }, { "epoch": 4.0552280484021095, "grad_norm": 0.04810095205903053, "learning_rate": 9.681856281353865e-06, "loss": 0.2317, "num_input_tokens_seen": 7955024, "step": 13070 }, { "epoch": 4.056779398076326, "grad_norm": 0.1495751142501831, "learning_rate": 9.681380907961906e-06, "loss": 0.2344, "num_input_tokens_seen": 7957616, "step": 13075 }, { "epoch": 4.058330747750543, "grad_norm": 0.10806575417518616, "learning_rate": 9.680905191369293e-06, "loss": 0.2309, "num_input_tokens_seen": 7960720, "step": 13080 }, { "epoch": 4.05988209742476, "grad_norm": 0.1538877785205841, "learning_rate": 9.680429131610898e-06, "loss": 0.228, "num_input_tokens_seen": 7963344, "step": 13085 }, { "epoch": 4.061433447098976, "grad_norm": 0.23951666057109833, "learning_rate": 9.679952728721624e-06, "loss": 0.2299, "num_input_tokens_seen": 7966192, "step": 13090 }, { "epoch": 4.0629847967731925, "grad_norm": 0.17368091642856598, "learning_rate": 9.679475982736395e-06, "loss": 0.2343, "num_input_tokens_seen": 7969136, "step": 13095 }, { "epoch": 4.064536146447409, "grad_norm": 0.04708410054445267, "learning_rate": 9.678998893690164e-06, "loss": 0.2231, "num_input_tokens_seen": 7971824, "step": 13100 }, { "epoch": 4.066087496121626, "grad_norm": 0.06912622600793839, "learning_rate": 9.678521461617908e-06, "loss": 0.2365, "num_input_tokens_seen": 7975088, "step": 13105 }, { "epoch": 4.067638845795843, "grad_norm": 0.2592995762825012, "learning_rate": 9.678043686554624e-06, "loss": 0.225, "num_input_tokens_seen": 7978416, "step": 13110 }, { "epoch": 4.069190195470059, "grad_norm": 0.2812253534793854, "learning_rate": 9.677565568535343e-06, "loss": 0.2122, "num_input_tokens_seen": 7980816, "step": 13115 }, { "epoch": 4.070741545144275, "grad_norm": 0.12931613624095917, "learning_rate": 9.677087107595113e-06, "loss": 0.2187, "num_input_tokens_seen": 7983504, "step": 13120 }, { "epoch": 4.072292894818492, "grad_norm": 0.32782793045043945, "learning_rate": 9.676608303769015e-06, "loss": 0.2463, "num_input_tokens_seen": 7986448, "step": 13125 }, { "epoch": 4.073844244492709, "grad_norm": 0.04980318620800972, "learning_rate": 9.67612915709215e-06, "loss": 0.2649, "num_input_tokens_seen": 7988784, "step": 13130 }, { "epoch": 4.075395594166925, "grad_norm": 0.052012547850608826, "learning_rate": 9.675649667599643e-06, "loss": 0.2535, "num_input_tokens_seen": 7993232, "step": 13135 }, { "epoch": 4.076946943841142, "grad_norm": 0.12505558133125305, "learning_rate": 9.675169835326647e-06, "loss": 0.2331, "num_input_tokens_seen": 7996816, "step": 13140 }, { "epoch": 4.078498293515358, "grad_norm": 0.11006627231836319, "learning_rate": 9.67468966030834e-06, "loss": 0.2361, "num_input_tokens_seen": 7999376, "step": 13145 }, { "epoch": 4.080049643189575, "grad_norm": 0.05355359613895416, "learning_rate": 9.674209142579925e-06, "loss": 0.2284, "num_input_tokens_seen": 8002032, "step": 13150 }, { "epoch": 4.081600992863791, "grad_norm": 0.05747560039162636, "learning_rate": 9.673728282176626e-06, "loss": 0.2377, "num_input_tokens_seen": 8005904, "step": 13155 }, { "epoch": 4.083152342538008, "grad_norm": 0.11478939652442932, "learning_rate": 9.673247079133702e-06, "loss": 0.2355, "num_input_tokens_seen": 8009008, "step": 13160 }, { "epoch": 4.084703692212225, "grad_norm": 0.04490077123045921, "learning_rate": 9.672765533486427e-06, "loss": 0.2314, "num_input_tokens_seen": 8014576, "step": 13165 }, { "epoch": 4.086255041886441, "grad_norm": 0.21124956011772156, "learning_rate": 9.672283645270105e-06, "loss": 0.2332, "num_input_tokens_seen": 8017104, "step": 13170 }, { "epoch": 4.087806391560658, "grad_norm": 0.11724337190389633, "learning_rate": 9.671801414520061e-06, "loss": 0.2339, "num_input_tokens_seen": 8020144, "step": 13175 }, { "epoch": 4.089357741234874, "grad_norm": 0.14796286821365356, "learning_rate": 9.671318841271654e-06, "loss": 0.233, "num_input_tokens_seen": 8023536, "step": 13180 }, { "epoch": 4.090909090909091, "grad_norm": 0.038423649966716766, "learning_rate": 9.670835925560257e-06, "loss": 0.2389, "num_input_tokens_seen": 8027536, "step": 13185 }, { "epoch": 4.092460440583308, "grad_norm": 0.13231733441352844, "learning_rate": 9.670352667421276e-06, "loss": 0.2305, "num_input_tokens_seen": 8029872, "step": 13190 }, { "epoch": 4.094011790257524, "grad_norm": 0.1248178705573082, "learning_rate": 9.66986906689014e-06, "loss": 0.2369, "num_input_tokens_seen": 8032528, "step": 13195 }, { "epoch": 4.09556313993174, "grad_norm": 0.1213509812951088, "learning_rate": 9.669385124002301e-06, "loss": 0.2295, "num_input_tokens_seen": 8036784, "step": 13200 }, { "epoch": 4.097114489605957, "grad_norm": 0.1139359325170517, "learning_rate": 9.668900838793241e-06, "loss": 0.2352, "num_input_tokens_seen": 8040944, "step": 13205 }, { "epoch": 4.098665839280174, "grad_norm": 0.041733819991350174, "learning_rate": 9.66841621129846e-06, "loss": 0.233, "num_input_tokens_seen": 8044688, "step": 13210 }, { "epoch": 4.100217188954391, "grad_norm": 0.12317850440740585, "learning_rate": 9.667931241553487e-06, "loss": 0.2294, "num_input_tokens_seen": 8047120, "step": 13215 }, { "epoch": 4.1017685386286065, "grad_norm": 0.11236461251974106, "learning_rate": 9.66744592959388e-06, "loss": 0.2289, "num_input_tokens_seen": 8050256, "step": 13220 }, { "epoch": 4.103319888302823, "grad_norm": 0.11088734120130539, "learning_rate": 9.666960275455212e-06, "loss": 0.225, "num_input_tokens_seen": 8053552, "step": 13225 }, { "epoch": 4.10487123797704, "grad_norm": 0.03484678640961647, "learning_rate": 9.666474279173095e-06, "loss": 0.2348, "num_input_tokens_seen": 8056336, "step": 13230 }, { "epoch": 4.106422587651257, "grad_norm": 0.0424012616276741, "learning_rate": 9.665987940783152e-06, "loss": 0.2361, "num_input_tokens_seen": 8059216, "step": 13235 }, { "epoch": 4.107973937325474, "grad_norm": 0.11451917141675949, "learning_rate": 9.665501260321038e-06, "loss": 0.2299, "num_input_tokens_seen": 8063088, "step": 13240 }, { "epoch": 4.109525286999689, "grad_norm": 0.28195643424987793, "learning_rate": 9.665014237822435e-06, "loss": 0.2265, "num_input_tokens_seen": 8065648, "step": 13245 }, { "epoch": 4.111076636673906, "grad_norm": 0.035718560218811035, "learning_rate": 9.664526873323047e-06, "loss": 0.2395, "num_input_tokens_seen": 8070928, "step": 13250 }, { "epoch": 4.112627986348123, "grad_norm": 0.1340446025133133, "learning_rate": 9.664039166858604e-06, "loss": 0.2355, "num_input_tokens_seen": 8073104, "step": 13255 }, { "epoch": 4.11417933602234, "grad_norm": 0.25547710061073303, "learning_rate": 9.663551118464855e-06, "loss": 0.2322, "num_input_tokens_seen": 8075952, "step": 13260 }, { "epoch": 4.115730685696556, "grad_norm": 0.040548086166381836, "learning_rate": 9.663062728177587e-06, "loss": 0.2377, "num_input_tokens_seen": 8078960, "step": 13265 }, { "epoch": 4.117282035370772, "grad_norm": 0.032003480941057205, "learning_rate": 9.662573996032603e-06, "loss": 0.2337, "num_input_tokens_seen": 8081904, "step": 13270 }, { "epoch": 4.118833385044989, "grad_norm": 0.12178381532430649, "learning_rate": 9.662084922065733e-06, "loss": 0.2351, "num_input_tokens_seen": 8084368, "step": 13275 }, { "epoch": 4.120384734719206, "grad_norm": 0.11572323739528656, "learning_rate": 9.661595506312828e-06, "loss": 0.2362, "num_input_tokens_seen": 8087600, "step": 13280 }, { "epoch": 4.121936084393422, "grad_norm": 0.042517878115177155, "learning_rate": 9.661105748809772e-06, "loss": 0.2346, "num_input_tokens_seen": 8090448, "step": 13285 }, { "epoch": 4.123487434067639, "grad_norm": 0.11573540419340134, "learning_rate": 9.660615649592469e-06, "loss": 0.2304, "num_input_tokens_seen": 8094640, "step": 13290 }, { "epoch": 4.125038783741855, "grad_norm": 0.0393361933529377, "learning_rate": 9.660125208696849e-06, "loss": 0.2324, "num_input_tokens_seen": 8097168, "step": 13295 }, { "epoch": 4.126590133416072, "grad_norm": 0.10471148043870926, "learning_rate": 9.659634426158867e-06, "loss": 0.2294, "num_input_tokens_seen": 8099920, "step": 13300 }, { "epoch": 4.128141483090289, "grad_norm": 0.029214058071374893, "learning_rate": 9.659143302014503e-06, "loss": 0.2332, "num_input_tokens_seen": 8102768, "step": 13305 }, { "epoch": 4.129692832764505, "grad_norm": 0.10466323792934418, "learning_rate": 9.658651836299763e-06, "loss": 0.2296, "num_input_tokens_seen": 8105264, "step": 13310 }, { "epoch": 4.131244182438722, "grad_norm": 0.21154411137104034, "learning_rate": 9.658160029050676e-06, "loss": 0.2305, "num_input_tokens_seen": 8107696, "step": 13315 }, { "epoch": 4.132795532112938, "grad_norm": 0.14577332139015198, "learning_rate": 9.657667880303299e-06, "loss": 0.2273, "num_input_tokens_seen": 8111088, "step": 13320 }, { "epoch": 4.134346881787155, "grad_norm": 0.04905332252383232, "learning_rate": 9.657175390093708e-06, "loss": 0.225, "num_input_tokens_seen": 8116016, "step": 13325 }, { "epoch": 4.135898231461371, "grad_norm": 0.1543150097131729, "learning_rate": 9.656682558458015e-06, "loss": 0.2297, "num_input_tokens_seen": 8118768, "step": 13330 }, { "epoch": 4.137449581135588, "grad_norm": 0.04378588870167732, "learning_rate": 9.656189385432347e-06, "loss": 0.24, "num_input_tokens_seen": 8121136, "step": 13335 }, { "epoch": 4.139000930809805, "grad_norm": 0.10284591466188431, "learning_rate": 9.65569587105286e-06, "loss": 0.2239, "num_input_tokens_seen": 8123888, "step": 13340 }, { "epoch": 4.140552280484021, "grad_norm": 0.09891744703054428, "learning_rate": 9.655202015355732e-06, "loss": 0.2288, "num_input_tokens_seen": 8126896, "step": 13345 }, { "epoch": 4.142103630158237, "grad_norm": 0.14195352792739868, "learning_rate": 9.654707818377172e-06, "loss": 0.2391, "num_input_tokens_seen": 8129488, "step": 13350 }, { "epoch": 4.143654979832454, "grad_norm": 0.21392640471458435, "learning_rate": 9.654213280153408e-06, "loss": 0.2315, "num_input_tokens_seen": 8131952, "step": 13355 }, { "epoch": 4.145206329506671, "grad_norm": 0.10041865706443787, "learning_rate": 9.653718400720696e-06, "loss": 0.2317, "num_input_tokens_seen": 8135120, "step": 13360 }, { "epoch": 4.146757679180888, "grad_norm": 0.10715066641569138, "learning_rate": 9.65322318011532e-06, "loss": 0.2306, "num_input_tokens_seen": 8137680, "step": 13365 }, { "epoch": 4.148309028855104, "grad_norm": 0.13922172784805298, "learning_rate": 9.652727618373578e-06, "loss": 0.2326, "num_input_tokens_seen": 8139728, "step": 13370 }, { "epoch": 4.14986037852932, "grad_norm": 0.13648909330368042, "learning_rate": 9.652231715531808e-06, "loss": 0.2319, "num_input_tokens_seen": 8142480, "step": 13375 }, { "epoch": 4.151411728203537, "grad_norm": 0.10777807235717773, "learning_rate": 9.651735471626361e-06, "loss": 0.2357, "num_input_tokens_seen": 8145136, "step": 13380 }, { "epoch": 4.152963077877754, "grad_norm": 0.04870372265577316, "learning_rate": 9.65123888669362e-06, "loss": 0.2348, "num_input_tokens_seen": 8147920, "step": 13385 }, { "epoch": 4.1545144275519705, "grad_norm": 0.043670568615198135, "learning_rate": 9.65074196076999e-06, "loss": 0.2337, "num_input_tokens_seen": 8150768, "step": 13390 }, { "epoch": 4.156065777226186, "grad_norm": 0.10851085931062698, "learning_rate": 9.650244693891901e-06, "loss": 0.2309, "num_input_tokens_seen": 8154160, "step": 13395 }, { "epoch": 4.157617126900403, "grad_norm": 0.037060365080833435, "learning_rate": 9.649747086095809e-06, "loss": 0.2293, "num_input_tokens_seen": 8156432, "step": 13400 }, { "epoch": 4.15916847657462, "grad_norm": 0.11680066585540771, "learning_rate": 9.649249137418193e-06, "loss": 0.2308, "num_input_tokens_seen": 8159216, "step": 13405 }, { "epoch": 4.160719826248837, "grad_norm": 0.12469100207090378, "learning_rate": 9.64875084789556e-06, "loss": 0.2304, "num_input_tokens_seen": 8161904, "step": 13410 }, { "epoch": 4.162271175923053, "grad_norm": 0.1335511952638626, "learning_rate": 9.648252217564442e-06, "loss": 0.227, "num_input_tokens_seen": 8166576, "step": 13415 }, { "epoch": 4.163822525597269, "grad_norm": 0.15254981815814972, "learning_rate": 9.647753246461393e-06, "loss": 0.2323, "num_input_tokens_seen": 8168816, "step": 13420 }, { "epoch": 4.165373875271486, "grad_norm": 0.11636842042207718, "learning_rate": 9.647253934622993e-06, "loss": 0.2269, "num_input_tokens_seen": 8171216, "step": 13425 }, { "epoch": 4.166925224945703, "grad_norm": 0.11341748386621475, "learning_rate": 9.646754282085846e-06, "loss": 0.2338, "num_input_tokens_seen": 8174896, "step": 13430 }, { "epoch": 4.16847657461992, "grad_norm": 0.14734962582588196, "learning_rate": 9.646254288886586e-06, "loss": 0.2387, "num_input_tokens_seen": 8177392, "step": 13435 }, { "epoch": 4.170027924294136, "grad_norm": 0.1537982076406479, "learning_rate": 9.645753955061867e-06, "loss": 0.2267, "num_input_tokens_seen": 8180560, "step": 13440 }, { "epoch": 4.171579273968352, "grad_norm": 0.2535419762134552, "learning_rate": 9.645253280648367e-06, "loss": 0.2314, "num_input_tokens_seen": 8184080, "step": 13445 }, { "epoch": 4.173130623642569, "grad_norm": 0.04818422719836235, "learning_rate": 9.644752265682797e-06, "loss": 0.2322, "num_input_tokens_seen": 8186736, "step": 13450 }, { "epoch": 4.174681973316786, "grad_norm": 0.04301873594522476, "learning_rate": 9.644250910201883e-06, "loss": 0.2242, "num_input_tokens_seen": 8189552, "step": 13455 }, { "epoch": 4.176233322991002, "grad_norm": 0.13130073249340057, "learning_rate": 9.643749214242379e-06, "loss": 0.2329, "num_input_tokens_seen": 8192976, "step": 13460 }, { "epoch": 4.177784672665219, "grad_norm": 0.2473168969154358, "learning_rate": 9.643247177841069e-06, "loss": 0.2391, "num_input_tokens_seen": 8195632, "step": 13465 }, { "epoch": 4.179336022339435, "grad_norm": 0.11827093362808228, "learning_rate": 9.642744801034756e-06, "loss": 0.2328, "num_input_tokens_seen": 8199728, "step": 13470 }, { "epoch": 4.180887372013652, "grad_norm": 0.11932337284088135, "learning_rate": 9.64224208386027e-06, "loss": 0.2289, "num_input_tokens_seen": 8202992, "step": 13475 }, { "epoch": 4.182438721687868, "grad_norm": 0.11531716585159302, "learning_rate": 9.64173902635447e-06, "loss": 0.2314, "num_input_tokens_seen": 8205840, "step": 13480 }, { "epoch": 4.183990071362085, "grad_norm": 0.11504152417182922, "learning_rate": 9.641235628554228e-06, "loss": 0.2292, "num_input_tokens_seen": 8208336, "step": 13485 }, { "epoch": 4.185541421036302, "grad_norm": 0.14383429288864136, "learning_rate": 9.640731890496458e-06, "loss": 0.2319, "num_input_tokens_seen": 8211664, "step": 13490 }, { "epoch": 4.187092770710518, "grad_norm": 0.05496819317340851, "learning_rate": 9.640227812218084e-06, "loss": 0.232, "num_input_tokens_seen": 8213808, "step": 13495 }, { "epoch": 4.188644120384735, "grad_norm": 0.11590800434350967, "learning_rate": 9.639723393756064e-06, "loss": 0.2384, "num_input_tokens_seen": 8216368, "step": 13500 }, { "epoch": 4.190195470058951, "grad_norm": 0.11212185770273209, "learning_rate": 9.639218635147376e-06, "loss": 0.235, "num_input_tokens_seen": 8219280, "step": 13505 }, { "epoch": 4.191746819733168, "grad_norm": 0.11260765045881271, "learning_rate": 9.638713536429025e-06, "loss": 0.2304, "num_input_tokens_seen": 8222000, "step": 13510 }, { "epoch": 4.1932981694073845, "grad_norm": 0.041610486805438995, "learning_rate": 9.638208097638042e-06, "loss": 0.2345, "num_input_tokens_seen": 8225424, "step": 13515 }, { "epoch": 4.194849519081601, "grad_norm": 0.21573469042778015, "learning_rate": 9.63770231881148e-06, "loss": 0.2298, "num_input_tokens_seen": 8230384, "step": 13520 }, { "epoch": 4.196400868755817, "grad_norm": 0.12628668546676636, "learning_rate": 9.63719619998642e-06, "loss": 0.2319, "num_input_tokens_seen": 8233008, "step": 13525 }, { "epoch": 4.197952218430034, "grad_norm": 0.04384395107626915, "learning_rate": 9.636689741199967e-06, "loss": 0.2293, "num_input_tokens_seen": 8235824, "step": 13530 }, { "epoch": 4.199503568104251, "grad_norm": 0.11165662109851837, "learning_rate": 9.636182942489247e-06, "loss": 0.2304, "num_input_tokens_seen": 8239344, "step": 13535 }, { "epoch": 4.2010549177784675, "grad_norm": 0.06534019112586975, "learning_rate": 9.63567580389142e-06, "loss": 0.2336, "num_input_tokens_seen": 8245296, "step": 13540 }, { "epoch": 4.202606267452683, "grad_norm": 0.039857249706983566, "learning_rate": 9.63516832544366e-06, "loss": 0.2335, "num_input_tokens_seen": 8248016, "step": 13545 }, { "epoch": 4.2041576171269, "grad_norm": 0.10467449575662613, "learning_rate": 9.634660507183171e-06, "loss": 0.2324, "num_input_tokens_seen": 8250800, "step": 13550 }, { "epoch": 4.205708966801117, "grad_norm": 0.10773011296987534, "learning_rate": 9.634152349147188e-06, "loss": 0.2309, "num_input_tokens_seen": 8253520, "step": 13555 }, { "epoch": 4.207260316475334, "grad_norm": 0.11940081417560577, "learning_rate": 9.633643851372959e-06, "loss": 0.2341, "num_input_tokens_seen": 8256080, "step": 13560 }, { "epoch": 4.2088116661495505, "grad_norm": 0.035009536892175674, "learning_rate": 9.633135013897766e-06, "loss": 0.2314, "num_input_tokens_seen": 8258864, "step": 13565 }, { "epoch": 4.210363015823766, "grad_norm": 0.03505924344062805, "learning_rate": 9.63262583675891e-06, "loss": 0.2294, "num_input_tokens_seen": 8261744, "step": 13570 }, { "epoch": 4.211914365497983, "grad_norm": 0.12110363692045212, "learning_rate": 9.632116319993726e-06, "loss": 0.2337, "num_input_tokens_seen": 8265168, "step": 13575 }, { "epoch": 4.2134657151722, "grad_norm": 0.12244557589292526, "learning_rate": 9.63160646363956e-06, "loss": 0.2321, "num_input_tokens_seen": 8267952, "step": 13580 }, { "epoch": 4.215017064846417, "grad_norm": 0.21672116219997406, "learning_rate": 9.631096267733793e-06, "loss": 0.2331, "num_input_tokens_seen": 8271312, "step": 13585 }, { "epoch": 4.216568414520633, "grad_norm": 0.04385360702872276, "learning_rate": 9.63058573231383e-06, "loss": 0.232, "num_input_tokens_seen": 8274608, "step": 13590 }, { "epoch": 4.218119764194849, "grad_norm": 0.10300233960151672, "learning_rate": 9.6300748574171e-06, "loss": 0.2269, "num_input_tokens_seen": 8276976, "step": 13595 }, { "epoch": 4.219671113869066, "grad_norm": 0.1301506906747818, "learning_rate": 9.629563643081055e-06, "loss": 0.2368, "num_input_tokens_seen": 8279568, "step": 13600 }, { "epoch": 4.221222463543283, "grad_norm": 0.05123094096779823, "learning_rate": 9.629052089343171e-06, "loss": 0.2353, "num_input_tokens_seen": 8282000, "step": 13605 }, { "epoch": 4.222773813217499, "grad_norm": 0.045050106942653656, "learning_rate": 9.628540196240953e-06, "loss": 0.2239, "num_input_tokens_seen": 8286384, "step": 13610 }, { "epoch": 4.224325162891716, "grad_norm": 0.12728251516819, "learning_rate": 9.628027963811928e-06, "loss": 0.2365, "num_input_tokens_seen": 8290384, "step": 13615 }, { "epoch": 4.225876512565932, "grad_norm": 0.05655066296458244, "learning_rate": 9.62751539209365e-06, "loss": 0.2345, "num_input_tokens_seen": 8292624, "step": 13620 }, { "epoch": 4.227427862240149, "grad_norm": 0.03281504660844803, "learning_rate": 9.627002481123696e-06, "loss": 0.2395, "num_input_tokens_seen": 8295760, "step": 13625 }, { "epoch": 4.228979211914366, "grad_norm": 0.1141592264175415, "learning_rate": 9.626489230939669e-06, "loss": 0.235, "num_input_tokens_seen": 8298704, "step": 13630 }, { "epoch": 4.230530561588582, "grad_norm": 0.10689437389373779, "learning_rate": 9.625975641579193e-06, "loss": 0.2321, "num_input_tokens_seen": 8302064, "step": 13635 }, { "epoch": 4.2320819112627985, "grad_norm": 0.10813425481319427, "learning_rate": 9.625461713079924e-06, "loss": 0.231, "num_input_tokens_seen": 8304688, "step": 13640 }, { "epoch": 4.233633260937015, "grad_norm": 0.10217594355344772, "learning_rate": 9.624947445479538e-06, "loss": 0.2314, "num_input_tokens_seen": 8309488, "step": 13645 }, { "epoch": 4.235184610611232, "grad_norm": 0.03046317771077156, "learning_rate": 9.624432838815736e-06, "loss": 0.2299, "num_input_tokens_seen": 8312496, "step": 13650 }, { "epoch": 4.236735960285448, "grad_norm": 0.04057746380567551, "learning_rate": 9.623917893126247e-06, "loss": 0.2315, "num_input_tokens_seen": 8316624, "step": 13655 }, { "epoch": 4.238287309959665, "grad_norm": 0.09838995337486267, "learning_rate": 9.62340260844882e-06, "loss": 0.2315, "num_input_tokens_seen": 8318992, "step": 13660 }, { "epoch": 4.2398386596338815, "grad_norm": 0.049148716032505035, "learning_rate": 9.622886984821234e-06, "loss": 0.2321, "num_input_tokens_seen": 8322512, "step": 13665 }, { "epoch": 4.241390009308098, "grad_norm": 0.04696498066186905, "learning_rate": 9.622371022281287e-06, "loss": 0.2311, "num_input_tokens_seen": 8326608, "step": 13670 }, { "epoch": 4.242941358982314, "grad_norm": 0.1051630824804306, "learning_rate": 9.621854720866806e-06, "loss": 0.2274, "num_input_tokens_seen": 8329424, "step": 13675 }, { "epoch": 4.244492708656531, "grad_norm": 0.12287742644548416, "learning_rate": 9.621338080615645e-06, "loss": 0.226, "num_input_tokens_seen": 8332144, "step": 13680 }, { "epoch": 4.246044058330748, "grad_norm": 0.12294932454824448, "learning_rate": 9.620821101565678e-06, "loss": 0.237, "num_input_tokens_seen": 8334864, "step": 13685 }, { "epoch": 4.2475954080049645, "grad_norm": 0.03274325653910637, "learning_rate": 9.620303783754805e-06, "loss": 0.2365, "num_input_tokens_seen": 8337168, "step": 13690 }, { "epoch": 4.249146757679181, "grad_norm": 0.10916777700185776, "learning_rate": 9.619786127220951e-06, "loss": 0.2325, "num_input_tokens_seen": 8339888, "step": 13695 }, { "epoch": 4.250698107353397, "grad_norm": 0.0336516909301281, "learning_rate": 9.619268132002069e-06, "loss": 0.2299, "num_input_tokens_seen": 8342192, "step": 13700 }, { "epoch": 4.252249457027614, "grad_norm": 0.04014017805457115, "learning_rate": 9.618749798136131e-06, "loss": 0.2304, "num_input_tokens_seen": 8346320, "step": 13705 }, { "epoch": 4.253800806701831, "grad_norm": 0.033738087862730026, "learning_rate": 9.618231125661141e-06, "loss": 0.2315, "num_input_tokens_seen": 8350128, "step": 13710 }, { "epoch": 4.2553521563760475, "grad_norm": 0.21560271084308624, "learning_rate": 9.617712114615119e-06, "loss": 0.2293, "num_input_tokens_seen": 8353200, "step": 13715 }, { "epoch": 4.256903506050263, "grad_norm": 0.12436478585004807, "learning_rate": 9.617192765036119e-06, "loss": 0.2295, "num_input_tokens_seen": 8356176, "step": 13720 }, { "epoch": 4.25845485572448, "grad_norm": 0.05297096073627472, "learning_rate": 9.616673076962213e-06, "loss": 0.2257, "num_input_tokens_seen": 8359344, "step": 13725 }, { "epoch": 4.260006205398697, "grad_norm": 0.12407777458429337, "learning_rate": 9.6161530504315e-06, "loss": 0.2393, "num_input_tokens_seen": 8362576, "step": 13730 }, { "epoch": 4.261557555072914, "grad_norm": 0.057936955243349075, "learning_rate": 9.615632685482106e-06, "loss": 0.2309, "num_input_tokens_seen": 8367152, "step": 13735 }, { "epoch": 4.26310890474713, "grad_norm": 0.133900985121727, "learning_rate": 9.615111982152177e-06, "loss": 0.2337, "num_input_tokens_seen": 8370640, "step": 13740 }, { "epoch": 4.264660254421346, "grad_norm": 0.04474259540438652, "learning_rate": 9.61459094047989e-06, "loss": 0.2355, "num_input_tokens_seen": 8373584, "step": 13745 }, { "epoch": 4.266211604095563, "grad_norm": 0.20476627349853516, "learning_rate": 9.614069560503443e-06, "loss": 0.2275, "num_input_tokens_seen": 8376080, "step": 13750 }, { "epoch": 4.26776295376978, "grad_norm": 0.12075857073068619, "learning_rate": 9.613547842261057e-06, "loss": 0.2311, "num_input_tokens_seen": 8379792, "step": 13755 }, { "epoch": 4.269314303443997, "grad_norm": 0.10321872681379318, "learning_rate": 9.613025785790984e-06, "loss": 0.2326, "num_input_tokens_seen": 8382992, "step": 13760 }, { "epoch": 4.2708656531182125, "grad_norm": 0.19561536610126495, "learning_rate": 9.612503391131492e-06, "loss": 0.2243, "num_input_tokens_seen": 8385872, "step": 13765 }, { "epoch": 4.272417002792429, "grad_norm": 0.19662249088287354, "learning_rate": 9.611980658320882e-06, "loss": 0.2294, "num_input_tokens_seen": 8388432, "step": 13770 }, { "epoch": 4.273968352466646, "grad_norm": 0.04658951982855797, "learning_rate": 9.611457587397474e-06, "loss": 0.2291, "num_input_tokens_seen": 8391248, "step": 13775 }, { "epoch": 4.275519702140863, "grad_norm": 0.05376431345939636, "learning_rate": 9.61093417839962e-06, "loss": 0.2295, "num_input_tokens_seen": 8393840, "step": 13780 }, { "epoch": 4.277071051815079, "grad_norm": 0.13674107193946838, "learning_rate": 9.610410431365688e-06, "loss": 0.2278, "num_input_tokens_seen": 8396496, "step": 13785 }, { "epoch": 4.2786224014892955, "grad_norm": 0.1348845213651657, "learning_rate": 9.609886346334075e-06, "loss": 0.2369, "num_input_tokens_seen": 8398960, "step": 13790 }, { "epoch": 4.280173751163512, "grad_norm": 0.10420148074626923, "learning_rate": 9.609361923343206e-06, "loss": 0.235, "num_input_tokens_seen": 8403056, "step": 13795 }, { "epoch": 4.281725100837729, "grad_norm": 0.12266954034566879, "learning_rate": 9.608837162431522e-06, "loss": 0.2358, "num_input_tokens_seen": 8405936, "step": 13800 }, { "epoch": 4.283276450511945, "grad_norm": 0.03319878503680229, "learning_rate": 9.6083120636375e-06, "loss": 0.2302, "num_input_tokens_seen": 8408976, "step": 13805 }, { "epoch": 4.284827800186162, "grad_norm": 0.0975218117237091, "learning_rate": 9.607786626999632e-06, "loss": 0.2346, "num_input_tokens_seen": 8411376, "step": 13810 }, { "epoch": 4.2863791498603785, "grad_norm": 0.10948837548494339, "learning_rate": 9.607260852556438e-06, "loss": 0.2295, "num_input_tokens_seen": 8415056, "step": 13815 }, { "epoch": 4.287930499534595, "grad_norm": 0.03829440474510193, "learning_rate": 9.606734740346466e-06, "loss": 0.2304, "num_input_tokens_seen": 8417808, "step": 13820 }, { "epoch": 4.289481849208812, "grad_norm": 0.045116592198610306, "learning_rate": 9.606208290408288e-06, "loss": 0.2309, "num_input_tokens_seen": 8420688, "step": 13825 }, { "epoch": 4.291033198883028, "grad_norm": 0.11745308339595795, "learning_rate": 9.605681502780495e-06, "loss": 0.2293, "num_input_tokens_seen": 8423280, "step": 13830 }, { "epoch": 4.292584548557245, "grad_norm": 0.040126148611307144, "learning_rate": 9.605154377501709e-06, "loss": 0.2283, "num_input_tokens_seen": 8426224, "step": 13835 }, { "epoch": 4.2941358982314615, "grad_norm": 0.10857382416725159, "learning_rate": 9.604626914610573e-06, "loss": 0.2324, "num_input_tokens_seen": 8429232, "step": 13840 }, { "epoch": 4.295687247905678, "grad_norm": 0.2106412798166275, "learning_rate": 9.604099114145757e-06, "loss": 0.2277, "num_input_tokens_seen": 8431920, "step": 13845 }, { "epoch": 4.297238597579894, "grad_norm": 0.12294883280992508, "learning_rate": 9.603570976145958e-06, "loss": 0.2278, "num_input_tokens_seen": 8435888, "step": 13850 }, { "epoch": 4.298789947254111, "grad_norm": 0.07838909327983856, "learning_rate": 9.603042500649888e-06, "loss": 0.2303, "num_input_tokens_seen": 8440336, "step": 13855 }, { "epoch": 4.300341296928328, "grad_norm": 0.1180918887257576, "learning_rate": 9.602513687696298e-06, "loss": 0.2323, "num_input_tokens_seen": 8443088, "step": 13860 }, { "epoch": 4.3018926466025444, "grad_norm": 0.10688266903162003, "learning_rate": 9.601984537323951e-06, "loss": 0.232, "num_input_tokens_seen": 8446192, "step": 13865 }, { "epoch": 4.30344399627676, "grad_norm": 0.1145123764872551, "learning_rate": 9.601455049571642e-06, "loss": 0.2346, "num_input_tokens_seen": 8449040, "step": 13870 }, { "epoch": 4.304995345950977, "grad_norm": 0.22522960603237152, "learning_rate": 9.600925224478188e-06, "loss": 0.2404, "num_input_tokens_seen": 8452624, "step": 13875 }, { "epoch": 4.306546695625194, "grad_norm": 0.1077989861369133, "learning_rate": 9.600395062082433e-06, "loss": 0.2314, "num_input_tokens_seen": 8455280, "step": 13880 }, { "epoch": 4.308098045299411, "grad_norm": 0.04647769033908844, "learning_rate": 9.599864562423242e-06, "loss": 0.2288, "num_input_tokens_seen": 8458192, "step": 13885 }, { "epoch": 4.309649394973627, "grad_norm": 0.04153535142540932, "learning_rate": 9.59933372553951e-06, "loss": 0.2325, "num_input_tokens_seen": 8460944, "step": 13890 }, { "epoch": 4.311200744647843, "grad_norm": 0.04253935441374779, "learning_rate": 9.59880255147015e-06, "loss": 0.2333, "num_input_tokens_seen": 8463728, "step": 13895 }, { "epoch": 4.31275209432206, "grad_norm": 0.11620529741048813, "learning_rate": 9.598271040254106e-06, "loss": 0.2318, "num_input_tokens_seen": 8466800, "step": 13900 }, { "epoch": 4.314303443996277, "grad_norm": 0.0460701659321785, "learning_rate": 9.597739191930342e-06, "loss": 0.2349, "num_input_tokens_seen": 8469776, "step": 13905 }, { "epoch": 4.315854793670494, "grad_norm": 0.05988026037812233, "learning_rate": 9.59720700653785e-06, "loss": 0.2274, "num_input_tokens_seen": 8472176, "step": 13910 }, { "epoch": 4.3174061433447095, "grad_norm": 0.11002083122730255, "learning_rate": 9.596674484115646e-06, "loss": 0.2274, "num_input_tokens_seen": 8476080, "step": 13915 }, { "epoch": 4.318957493018926, "grad_norm": 0.03524283319711685, "learning_rate": 9.596141624702769e-06, "loss": 0.2336, "num_input_tokens_seen": 8478960, "step": 13920 }, { "epoch": 4.320508842693143, "grad_norm": 0.057470276951789856, "learning_rate": 9.595608428338285e-06, "loss": 0.2305, "num_input_tokens_seen": 8482064, "step": 13925 }, { "epoch": 4.32206019236736, "grad_norm": 0.20466123521327972, "learning_rate": 9.595074895061282e-06, "loss": 0.228, "num_input_tokens_seen": 8484880, "step": 13930 }, { "epoch": 4.323611542041576, "grad_norm": 0.03959719091653824, "learning_rate": 9.594541024910876e-06, "loss": 0.2248, "num_input_tokens_seen": 8487568, "step": 13935 }, { "epoch": 4.3251628917157925, "grad_norm": 0.24132311344146729, "learning_rate": 9.594006817926206e-06, "loss": 0.2316, "num_input_tokens_seen": 8490096, "step": 13940 }, { "epoch": 4.326714241390009, "grad_norm": 0.050817981362342834, "learning_rate": 9.593472274146434e-06, "loss": 0.2304, "num_input_tokens_seen": 8492912, "step": 13945 }, { "epoch": 4.328265591064226, "grad_norm": 0.1358383446931839, "learning_rate": 9.592937393610751e-06, "loss": 0.2358, "num_input_tokens_seen": 8497328, "step": 13950 }, { "epoch": 4.329816940738443, "grad_norm": 0.10289572179317474, "learning_rate": 9.592402176358367e-06, "loss": 0.2291, "num_input_tokens_seen": 8499920, "step": 13955 }, { "epoch": 4.331368290412659, "grad_norm": 0.0969790369272232, "learning_rate": 9.591866622428521e-06, "loss": 0.2307, "num_input_tokens_seen": 8502608, "step": 13960 }, { "epoch": 4.3329196400868755, "grad_norm": 0.05451877415180206, "learning_rate": 9.591330731860478e-06, "loss": 0.2288, "num_input_tokens_seen": 8506096, "step": 13965 }, { "epoch": 4.334470989761092, "grad_norm": 0.051386237144470215, "learning_rate": 9.590794504693521e-06, "loss": 0.2381, "num_input_tokens_seen": 8509840, "step": 13970 }, { "epoch": 4.336022339435309, "grad_norm": 0.12434318661689758, "learning_rate": 9.590257940966964e-06, "loss": 0.2301, "num_input_tokens_seen": 8515120, "step": 13975 }, { "epoch": 4.337573689109525, "grad_norm": 0.15530046820640564, "learning_rate": 9.589721040720143e-06, "loss": 0.2354, "num_input_tokens_seen": 8518448, "step": 13980 }, { "epoch": 4.339125038783742, "grad_norm": 0.1417132169008255, "learning_rate": 9.58918380399242e-06, "loss": 0.2371, "num_input_tokens_seen": 8521296, "step": 13985 }, { "epoch": 4.3406763884579584, "grad_norm": 0.25736135244369507, "learning_rate": 9.588646230823179e-06, "loss": 0.2347, "num_input_tokens_seen": 8524208, "step": 13990 }, { "epoch": 4.342227738132175, "grad_norm": 0.13093318045139313, "learning_rate": 9.588108321251832e-06, "loss": 0.2294, "num_input_tokens_seen": 8526544, "step": 13995 }, { "epoch": 4.343779087806391, "grad_norm": 0.050074152648448944, "learning_rate": 9.587570075317813e-06, "loss": 0.2284, "num_input_tokens_seen": 8528976, "step": 14000 }, { "epoch": 4.345330437480608, "grad_norm": 0.04834885895252228, "learning_rate": 9.587031493060585e-06, "loss": 0.2327, "num_input_tokens_seen": 8532080, "step": 14005 }, { "epoch": 4.346881787154825, "grad_norm": 0.04828425124287605, "learning_rate": 9.586492574519628e-06, "loss": 0.229, "num_input_tokens_seen": 8534192, "step": 14010 }, { "epoch": 4.348433136829041, "grad_norm": 0.2667260766029358, "learning_rate": 9.585953319734454e-06, "loss": 0.2353, "num_input_tokens_seen": 8536592, "step": 14015 }, { "epoch": 4.349984486503258, "grad_norm": 0.04157206416130066, "learning_rate": 9.585413728744595e-06, "loss": 0.2294, "num_input_tokens_seen": 8539216, "step": 14020 }, { "epoch": 4.351535836177474, "grad_norm": 0.23930802941322327, "learning_rate": 9.58487380158961e-06, "loss": 0.2274, "num_input_tokens_seen": 8542320, "step": 14025 }, { "epoch": 4.353087185851691, "grad_norm": 0.0525834858417511, "learning_rate": 9.584333538309083e-06, "loss": 0.2295, "num_input_tokens_seen": 8544784, "step": 14030 }, { "epoch": 4.354638535525908, "grad_norm": 0.12227725237607956, "learning_rate": 9.583792938942622e-06, "loss": 0.2295, "num_input_tokens_seen": 8547888, "step": 14035 }, { "epoch": 4.356189885200124, "grad_norm": 0.047248441725969315, "learning_rate": 9.583252003529856e-06, "loss": 0.2336, "num_input_tokens_seen": 8550192, "step": 14040 }, { "epoch": 4.35774123487434, "grad_norm": 0.12431290000677109, "learning_rate": 9.582710732110446e-06, "loss": 0.231, "num_input_tokens_seen": 8553680, "step": 14045 }, { "epoch": 4.359292584548557, "grad_norm": 0.03318170830607414, "learning_rate": 9.582169124724071e-06, "loss": 0.23, "num_input_tokens_seen": 8556368, "step": 14050 }, { "epoch": 4.360843934222774, "grad_norm": 0.058791302144527435, "learning_rate": 9.58162718141044e-06, "loss": 0.2315, "num_input_tokens_seen": 8560464, "step": 14055 }, { "epoch": 4.362395283896991, "grad_norm": 0.11404898017644882, "learning_rate": 9.581084902209278e-06, "loss": 0.2316, "num_input_tokens_seen": 8563216, "step": 14060 }, { "epoch": 4.3639466335712065, "grad_norm": 0.03857605904340744, "learning_rate": 9.580542287160348e-06, "loss": 0.2323, "num_input_tokens_seen": 8567280, "step": 14065 }, { "epoch": 4.365497983245423, "grad_norm": 0.11541605740785599, "learning_rate": 9.579999336303427e-06, "loss": 0.2322, "num_input_tokens_seen": 8570000, "step": 14070 }, { "epoch": 4.36704933291964, "grad_norm": 0.05438017100095749, "learning_rate": 9.579456049678318e-06, "loss": 0.2308, "num_input_tokens_seen": 8572784, "step": 14075 }, { "epoch": 4.368600682593857, "grad_norm": 0.13680319488048553, "learning_rate": 9.578912427324851e-06, "loss": 0.2427, "num_input_tokens_seen": 8575248, "step": 14080 }, { "epoch": 4.370152032268074, "grad_norm": 0.13805446028709412, "learning_rate": 9.578368469282882e-06, "loss": 0.232, "num_input_tokens_seen": 8577552, "step": 14085 }, { "epoch": 4.3717033819422895, "grad_norm": 0.13061082363128662, "learning_rate": 9.577824175592287e-06, "loss": 0.2282, "num_input_tokens_seen": 8580176, "step": 14090 }, { "epoch": 4.373254731616506, "grad_norm": 0.04861839860677719, "learning_rate": 9.577279546292972e-06, "loss": 0.2325, "num_input_tokens_seen": 8583504, "step": 14095 }, { "epoch": 4.374806081290723, "grad_norm": 0.04453451186418533, "learning_rate": 9.576734581424863e-06, "loss": 0.2291, "num_input_tokens_seen": 8586448, "step": 14100 }, { "epoch": 4.37635743096494, "grad_norm": 0.12755268812179565, "learning_rate": 9.57618928102791e-06, "loss": 0.2297, "num_input_tokens_seen": 8588976, "step": 14105 }, { "epoch": 4.377908780639156, "grad_norm": 0.1042853444814682, "learning_rate": 9.575643645142095e-06, "loss": 0.2231, "num_input_tokens_seen": 8591696, "step": 14110 }, { "epoch": 4.3794601303133724, "grad_norm": 0.12916550040245056, "learning_rate": 9.575097673807416e-06, "loss": 0.2361, "num_input_tokens_seen": 8594864, "step": 14115 }, { "epoch": 4.381011479987589, "grad_norm": 0.14015375077724457, "learning_rate": 9.574551367063902e-06, "loss": 0.2352, "num_input_tokens_seen": 8597936, "step": 14120 }, { "epoch": 4.382562829661806, "grad_norm": 0.12981119751930237, "learning_rate": 9.5740047249516e-06, "loss": 0.2309, "num_input_tokens_seen": 8600496, "step": 14125 }, { "epoch": 4.384114179336022, "grad_norm": 0.06671428680419922, "learning_rate": 9.573457747510589e-06, "loss": 0.2278, "num_input_tokens_seen": 8603568, "step": 14130 }, { "epoch": 4.385665529010239, "grad_norm": 0.12316977232694626, "learning_rate": 9.572910434780965e-06, "loss": 0.2289, "num_input_tokens_seen": 8606992, "step": 14135 }, { "epoch": 4.387216878684455, "grad_norm": 0.12758861482143402, "learning_rate": 9.572362786802856e-06, "loss": 0.2307, "num_input_tokens_seen": 8610736, "step": 14140 }, { "epoch": 4.388768228358672, "grad_norm": 0.22893011569976807, "learning_rate": 9.57181480361641e-06, "loss": 0.2399, "num_input_tokens_seen": 8612784, "step": 14145 }, { "epoch": 4.390319578032889, "grad_norm": 0.12066496163606644, "learning_rate": 9.5712664852618e-06, "loss": 0.232, "num_input_tokens_seen": 8616144, "step": 14150 }, { "epoch": 4.391870927707105, "grad_norm": 0.11527804285287857, "learning_rate": 9.570717831779225e-06, "loss": 0.23, "num_input_tokens_seen": 8619408, "step": 14155 }, { "epoch": 4.393422277381322, "grad_norm": 0.12007381021976471, "learning_rate": 9.570168843208908e-06, "loss": 0.2321, "num_input_tokens_seen": 8623024, "step": 14160 }, { "epoch": 4.394973627055538, "grad_norm": 0.10954219847917557, "learning_rate": 9.569619519591095e-06, "loss": 0.2334, "num_input_tokens_seen": 8625520, "step": 14165 }, { "epoch": 4.396524976729755, "grad_norm": 0.1017882227897644, "learning_rate": 9.56906986096606e-06, "loss": 0.2308, "num_input_tokens_seen": 8628304, "step": 14170 }, { "epoch": 4.398076326403971, "grad_norm": 0.21695537865161896, "learning_rate": 9.568519867374098e-06, "loss": 0.2297, "num_input_tokens_seen": 8631376, "step": 14175 }, { "epoch": 4.399627676078188, "grad_norm": 0.056286729872226715, "learning_rate": 9.56796953885553e-06, "loss": 0.2342, "num_input_tokens_seen": 8634384, "step": 14180 }, { "epoch": 4.401179025752405, "grad_norm": 0.1295560598373413, "learning_rate": 9.567418875450701e-06, "loss": 0.2338, "num_input_tokens_seen": 8637680, "step": 14185 }, { "epoch": 4.402730375426621, "grad_norm": 0.22595016658306122, "learning_rate": 9.566867877199983e-06, "loss": 0.228, "num_input_tokens_seen": 8640944, "step": 14190 }, { "epoch": 4.404281725100837, "grad_norm": 0.22050467133522034, "learning_rate": 9.56631654414377e-06, "loss": 0.2311, "num_input_tokens_seen": 8644720, "step": 14195 }, { "epoch": 4.405833074775054, "grad_norm": 0.21185043454170227, "learning_rate": 9.565764876322482e-06, "loss": 0.2233, "num_input_tokens_seen": 8648272, "step": 14200 }, { "epoch": 4.407384424449271, "grad_norm": 0.12129569053649902, "learning_rate": 9.56521287377656e-06, "loss": 0.2347, "num_input_tokens_seen": 8650768, "step": 14205 }, { "epoch": 4.408935774123488, "grad_norm": 0.043826594948768616, "learning_rate": 9.564660536546475e-06, "loss": 0.2379, "num_input_tokens_seen": 8654544, "step": 14210 }, { "epoch": 4.410487123797704, "grad_norm": 0.04199371114373207, "learning_rate": 9.564107864672718e-06, "loss": 0.231, "num_input_tokens_seen": 8657808, "step": 14215 }, { "epoch": 4.41203847347192, "grad_norm": 0.047520704567432404, "learning_rate": 9.563554858195808e-06, "loss": 0.2263, "num_input_tokens_seen": 8661296, "step": 14220 }, { "epoch": 4.413589823146137, "grad_norm": 0.11253049224615097, "learning_rate": 9.563001517156287e-06, "loss": 0.2294, "num_input_tokens_seen": 8664432, "step": 14225 }, { "epoch": 4.415141172820354, "grad_norm": 0.1224411353468895, "learning_rate": 9.56244784159472e-06, "loss": 0.2315, "num_input_tokens_seen": 8668400, "step": 14230 }, { "epoch": 4.416692522494571, "grad_norm": 0.043934889137744904, "learning_rate": 9.561893831551699e-06, "loss": 0.2278, "num_input_tokens_seen": 8670832, "step": 14235 }, { "epoch": 4.4182438721687864, "grad_norm": 0.11743637174367905, "learning_rate": 9.56133948706784e-06, "loss": 0.2324, "num_input_tokens_seen": 8675280, "step": 14240 }, { "epoch": 4.419795221843003, "grad_norm": 0.11569277942180634, "learning_rate": 9.56078480818378e-06, "loss": 0.231, "num_input_tokens_seen": 8679824, "step": 14245 }, { "epoch": 4.42134657151722, "grad_norm": 0.05038028582930565, "learning_rate": 9.560229794940189e-06, "loss": 0.2295, "num_input_tokens_seen": 8682672, "step": 14250 }, { "epoch": 4.422897921191437, "grad_norm": 0.11090163141489029, "learning_rate": 9.55967444737775e-06, "loss": 0.2261, "num_input_tokens_seen": 8687632, "step": 14255 }, { "epoch": 4.424449270865653, "grad_norm": 0.11388064175844193, "learning_rate": 9.55911876553718e-06, "loss": 0.219, "num_input_tokens_seen": 8691568, "step": 14260 }, { "epoch": 4.426000620539869, "grad_norm": 0.05935598909854889, "learning_rate": 9.558562749459219e-06, "loss": 0.2433, "num_input_tokens_seen": 8694192, "step": 14265 }, { "epoch": 4.427551970214086, "grad_norm": 0.05061854049563408, "learning_rate": 9.558006399184624e-06, "loss": 0.23, "num_input_tokens_seen": 8697328, "step": 14270 }, { "epoch": 4.429103319888303, "grad_norm": 0.048023954033851624, "learning_rate": 9.557449714754187e-06, "loss": 0.2203, "num_input_tokens_seen": 8699632, "step": 14275 }, { "epoch": 4.43065466956252, "grad_norm": 0.10453673452138901, "learning_rate": 9.556892696208717e-06, "loss": 0.2275, "num_input_tokens_seen": 8702128, "step": 14280 }, { "epoch": 4.432206019236736, "grad_norm": 0.06453680247068405, "learning_rate": 9.55633534358905e-06, "loss": 0.2239, "num_input_tokens_seen": 8705840, "step": 14285 }, { "epoch": 4.433757368910952, "grad_norm": 0.1580449640750885, "learning_rate": 9.555777656936048e-06, "loss": 0.2302, "num_input_tokens_seen": 8708848, "step": 14290 }, { "epoch": 4.435308718585169, "grad_norm": 0.3202005922794342, "learning_rate": 9.555219636290594e-06, "loss": 0.2375, "num_input_tokens_seen": 8711248, "step": 14295 }, { "epoch": 4.436860068259386, "grad_norm": 0.06389549374580383, "learning_rate": 9.5546612816936e-06, "loss": 0.2414, "num_input_tokens_seen": 8714608, "step": 14300 }, { "epoch": 4.438411417933602, "grad_norm": 0.06891347467899323, "learning_rate": 9.554102593185998e-06, "loss": 0.2284, "num_input_tokens_seen": 8717584, "step": 14305 }, { "epoch": 4.439962767607819, "grad_norm": 0.2265293449163437, "learning_rate": 9.553543570808747e-06, "loss": 0.2307, "num_input_tokens_seen": 8721168, "step": 14310 }, { "epoch": 4.441514117282035, "grad_norm": 0.07732713222503662, "learning_rate": 9.552984214602832e-06, "loss": 0.2299, "num_input_tokens_seen": 8723568, "step": 14315 }, { "epoch": 4.443065466956252, "grad_norm": 0.293612003326416, "learning_rate": 9.552424524609259e-06, "loss": 0.2287, "num_input_tokens_seen": 8726704, "step": 14320 }, { "epoch": 4.444616816630468, "grad_norm": 0.1399945616722107, "learning_rate": 9.551864500869056e-06, "loss": 0.2367, "num_input_tokens_seen": 8729776, "step": 14325 }, { "epoch": 4.446168166304685, "grad_norm": 0.12462151050567627, "learning_rate": 9.551304143423287e-06, "loss": 0.2318, "num_input_tokens_seen": 8732080, "step": 14330 }, { "epoch": 4.447719515978902, "grad_norm": 0.0442403219640255, "learning_rate": 9.550743452313026e-06, "loss": 0.2321, "num_input_tokens_seen": 8735984, "step": 14335 }, { "epoch": 4.449270865653118, "grad_norm": 0.06901956349611282, "learning_rate": 9.550182427579382e-06, "loss": 0.2285, "num_input_tokens_seen": 8738256, "step": 14340 }, { "epoch": 4.450822215327335, "grad_norm": 0.05154138058423996, "learning_rate": 9.549621069263483e-06, "loss": 0.2291, "num_input_tokens_seen": 8740848, "step": 14345 }, { "epoch": 4.452373565001551, "grad_norm": 0.22323299944400787, "learning_rate": 9.549059377406486e-06, "loss": 0.2348, "num_input_tokens_seen": 8744016, "step": 14350 }, { "epoch": 4.453924914675768, "grad_norm": 0.13544584810733795, "learning_rate": 9.548497352049567e-06, "loss": 0.2341, "num_input_tokens_seen": 8747696, "step": 14355 }, { "epoch": 4.455476264349985, "grad_norm": 0.22108638286590576, "learning_rate": 9.54793499323393e-06, "loss": 0.2295, "num_input_tokens_seen": 8750864, "step": 14360 }, { "epoch": 4.457027614024201, "grad_norm": 0.03654290363192558, "learning_rate": 9.547372301000801e-06, "loss": 0.2289, "num_input_tokens_seen": 8754160, "step": 14365 }, { "epoch": 4.458578963698417, "grad_norm": 0.12064630538225174, "learning_rate": 9.546809275391436e-06, "loss": 0.229, "num_input_tokens_seen": 8757168, "step": 14370 }, { "epoch": 4.460130313372634, "grad_norm": 0.11422985047101974, "learning_rate": 9.546245916447106e-06, "loss": 0.2265, "num_input_tokens_seen": 8760976, "step": 14375 }, { "epoch": 4.461681663046851, "grad_norm": 0.1149328202009201, "learning_rate": 9.545682224209116e-06, "loss": 0.2306, "num_input_tokens_seen": 8764016, "step": 14380 }, { "epoch": 4.4632330127210675, "grad_norm": 0.05247374251484871, "learning_rate": 9.54511819871879e-06, "loss": 0.2259, "num_input_tokens_seen": 8767344, "step": 14385 }, { "epoch": 4.464784362395284, "grad_norm": 0.11371970176696777, "learning_rate": 9.544553840017476e-06, "loss": 0.2387, "num_input_tokens_seen": 8769744, "step": 14390 }, { "epoch": 4.4663357120695, "grad_norm": 0.10730724036693573, "learning_rate": 9.543989148146552e-06, "loss": 0.2268, "num_input_tokens_seen": 8772240, "step": 14395 }, { "epoch": 4.467887061743717, "grad_norm": 0.13785749673843384, "learning_rate": 9.543424123147416e-06, "loss": 0.2257, "num_input_tokens_seen": 8775920, "step": 14400 }, { "epoch": 4.469438411417934, "grad_norm": 0.040411174297332764, "learning_rate": 9.542858765061486e-06, "loss": 0.2362, "num_input_tokens_seen": 8778128, "step": 14405 }, { "epoch": 4.4709897610921505, "grad_norm": 0.14368078112602234, "learning_rate": 9.542293073930216e-06, "loss": 0.233, "num_input_tokens_seen": 8781200, "step": 14410 }, { "epoch": 4.472541110766366, "grad_norm": 0.138649582862854, "learning_rate": 9.541727049795072e-06, "loss": 0.2299, "num_input_tokens_seen": 8783856, "step": 14415 }, { "epoch": 4.474092460440583, "grad_norm": 0.24555861949920654, "learning_rate": 9.541160692697556e-06, "loss": 0.2407, "num_input_tokens_seen": 8786864, "step": 14420 }, { "epoch": 4.4756438101148, "grad_norm": 0.07603304088115692, "learning_rate": 9.540594002679184e-06, "loss": 0.2343, "num_input_tokens_seen": 8789328, "step": 14425 }, { "epoch": 4.477195159789017, "grad_norm": 0.12361834198236465, "learning_rate": 9.540026979781505e-06, "loss": 0.2282, "num_input_tokens_seen": 8793200, "step": 14430 }, { "epoch": 4.478746509463233, "grad_norm": 0.05858355015516281, "learning_rate": 9.539459624046083e-06, "loss": 0.2315, "num_input_tokens_seen": 8795280, "step": 14435 }, { "epoch": 4.480297859137449, "grad_norm": 0.12046100944280624, "learning_rate": 9.538891935514519e-06, "loss": 0.2344, "num_input_tokens_seen": 8797552, "step": 14440 }, { "epoch": 4.481849208811666, "grad_norm": 0.14435437321662903, "learning_rate": 9.538323914228426e-06, "loss": 0.2295, "num_input_tokens_seen": 8800560, "step": 14445 }, { "epoch": 4.483400558485883, "grad_norm": 0.13638220727443695, "learning_rate": 9.537755560229447e-06, "loss": 0.2268, "num_input_tokens_seen": 8803280, "step": 14450 }, { "epoch": 4.4849519081601, "grad_norm": 0.22883889079093933, "learning_rate": 9.537186873559252e-06, "loss": 0.2331, "num_input_tokens_seen": 8807568, "step": 14455 }, { "epoch": 4.486503257834316, "grad_norm": 0.2269754409790039, "learning_rate": 9.536617854259532e-06, "loss": 0.2326, "num_input_tokens_seen": 8811344, "step": 14460 }, { "epoch": 4.488054607508532, "grad_norm": 0.0683334618806839, "learning_rate": 9.536048502371998e-06, "loss": 0.2264, "num_input_tokens_seen": 8814448, "step": 14465 }, { "epoch": 4.489605957182749, "grad_norm": 0.10465306043624878, "learning_rate": 9.535478817938397e-06, "loss": 0.2309, "num_input_tokens_seen": 8817680, "step": 14470 }, { "epoch": 4.491157306856966, "grad_norm": 0.14089764654636383, "learning_rate": 9.534908801000489e-06, "loss": 0.2315, "num_input_tokens_seen": 8821616, "step": 14475 }, { "epoch": 4.492708656531182, "grad_norm": 0.22368009388446808, "learning_rate": 9.534338451600065e-06, "loss": 0.2311, "num_input_tokens_seen": 8826512, "step": 14480 }, { "epoch": 4.494260006205399, "grad_norm": 0.11769373714923859, "learning_rate": 9.53376776977894e-06, "loss": 0.2327, "num_input_tokens_seen": 8829232, "step": 14485 }, { "epoch": 4.495811355879615, "grad_norm": 0.12920276820659637, "learning_rate": 9.533196755578947e-06, "loss": 0.2243, "num_input_tokens_seen": 8831888, "step": 14490 }, { "epoch": 4.497362705553832, "grad_norm": 0.11683529615402222, "learning_rate": 9.53262540904195e-06, "loss": 0.2274, "num_input_tokens_seen": 8834736, "step": 14495 }, { "epoch": 4.498914055228048, "grad_norm": 0.12139938771724701, "learning_rate": 9.532053730209837e-06, "loss": 0.2311, "num_input_tokens_seen": 8837744, "step": 14500 }, { "epoch": 4.500465404902265, "grad_norm": 0.057593200355768204, "learning_rate": 9.53148171912452e-06, "loss": 0.2321, "num_input_tokens_seen": 8841648, "step": 14505 }, { "epoch": 4.5020167545764815, "grad_norm": 0.05642315000295639, "learning_rate": 9.530909375827931e-06, "loss": 0.2321, "num_input_tokens_seen": 8844784, "step": 14510 }, { "epoch": 4.503568104250698, "grad_norm": 0.22520603239536285, "learning_rate": 9.53033670036203e-06, "loss": 0.231, "num_input_tokens_seen": 8847280, "step": 14515 }, { "epoch": 4.505119453924914, "grad_norm": 0.11745858937501907, "learning_rate": 9.529763692768802e-06, "loss": 0.2284, "num_input_tokens_seen": 8851344, "step": 14520 }, { "epoch": 4.506670803599131, "grad_norm": 0.10899137705564499, "learning_rate": 9.529190353090256e-06, "loss": 0.2251, "num_input_tokens_seen": 8854096, "step": 14525 }, { "epoch": 4.508222153273348, "grad_norm": 0.13078422844409943, "learning_rate": 9.528616681368422e-06, "loss": 0.2231, "num_input_tokens_seen": 8856784, "step": 14530 }, { "epoch": 4.5097735029475645, "grad_norm": 0.10832265764474869, "learning_rate": 9.52804267764536e-06, "loss": 0.2202, "num_input_tokens_seen": 8859760, "step": 14535 }, { "epoch": 4.511324852621781, "grad_norm": 0.08931054919958115, "learning_rate": 9.527468341963148e-06, "loss": 0.2384, "num_input_tokens_seen": 8863824, "step": 14540 }, { "epoch": 4.512876202295997, "grad_norm": 0.06602787226438522, "learning_rate": 9.526893674363893e-06, "loss": 0.2285, "num_input_tokens_seen": 8867184, "step": 14545 }, { "epoch": 4.514427551970214, "grad_norm": 0.143090158700943, "learning_rate": 9.526318674889728e-06, "loss": 0.2242, "num_input_tokens_seen": 8870608, "step": 14550 }, { "epoch": 4.515978901644431, "grad_norm": 0.07980065792798996, "learning_rate": 9.525743343582801e-06, "loss": 0.2291, "num_input_tokens_seen": 8873232, "step": 14555 }, { "epoch": 4.5175302513186475, "grad_norm": 0.38086220622062683, "learning_rate": 9.525167680485297e-06, "loss": 0.2356, "num_input_tokens_seen": 8876816, "step": 14560 }, { "epoch": 4.519081600992864, "grad_norm": 0.1820303052663803, "learning_rate": 9.524591685639414e-06, "loss": 0.245, "num_input_tokens_seen": 8879408, "step": 14565 }, { "epoch": 4.52063295066708, "grad_norm": 0.051798056811094284, "learning_rate": 9.524015359087382e-06, "loss": 0.235, "num_input_tokens_seen": 8882352, "step": 14570 }, { "epoch": 4.522184300341297, "grad_norm": 0.042510297149419785, "learning_rate": 9.523438700871452e-06, "loss": 0.2406, "num_input_tokens_seen": 8885296, "step": 14575 }, { "epoch": 4.523735650015514, "grad_norm": 0.20349127054214478, "learning_rate": 9.5228617110339e-06, "loss": 0.2286, "num_input_tokens_seen": 8888304, "step": 14580 }, { "epoch": 4.52528699968973, "grad_norm": 0.15974092483520508, "learning_rate": 9.522284389617024e-06, "loss": 0.2338, "num_input_tokens_seen": 8893360, "step": 14585 }, { "epoch": 4.526838349363946, "grad_norm": 0.12642943859100342, "learning_rate": 9.521706736663154e-06, "loss": 0.227, "num_input_tokens_seen": 8896784, "step": 14590 }, { "epoch": 4.528389699038163, "grad_norm": 0.13189385831356049, "learning_rate": 9.521128752214632e-06, "loss": 0.2333, "num_input_tokens_seen": 8899216, "step": 14595 }, { "epoch": 4.52994104871238, "grad_norm": 0.10082178562879562, "learning_rate": 9.520550436313835e-06, "loss": 0.2328, "num_input_tokens_seen": 8902288, "step": 14600 }, { "epoch": 4.531492398386597, "grad_norm": 0.20900528132915497, "learning_rate": 9.51997178900316e-06, "loss": 0.2278, "num_input_tokens_seen": 8905232, "step": 14605 }, { "epoch": 4.533043748060813, "grad_norm": 0.1189073920249939, "learning_rate": 9.519392810325028e-06, "loss": 0.2328, "num_input_tokens_seen": 8907408, "step": 14610 }, { "epoch": 4.534595097735029, "grad_norm": 0.03567558526992798, "learning_rate": 9.518813500321886e-06, "loss": 0.2356, "num_input_tokens_seen": 8910416, "step": 14615 }, { "epoch": 4.536146447409246, "grad_norm": 0.12438146770000458, "learning_rate": 9.518233859036204e-06, "loss": 0.2314, "num_input_tokens_seen": 8913456, "step": 14620 }, { "epoch": 4.537697797083463, "grad_norm": 0.06444555521011353, "learning_rate": 9.517653886510476e-06, "loss": 0.2304, "num_input_tokens_seen": 8916432, "step": 14625 }, { "epoch": 4.53924914675768, "grad_norm": 0.05509554594755173, "learning_rate": 9.517073582787221e-06, "loss": 0.2309, "num_input_tokens_seen": 8919088, "step": 14630 }, { "epoch": 4.5408004964318955, "grad_norm": 0.13198770582675934, "learning_rate": 9.516492947908982e-06, "loss": 0.2263, "num_input_tokens_seen": 8922384, "step": 14635 }, { "epoch": 4.542351846106112, "grad_norm": 0.04791148379445076, "learning_rate": 9.515911981918326e-06, "loss": 0.229, "num_input_tokens_seen": 8924912, "step": 14640 }, { "epoch": 4.543903195780329, "grad_norm": 0.05168185383081436, "learning_rate": 9.515330684857846e-06, "loss": 0.2267, "num_input_tokens_seen": 8928144, "step": 14645 }, { "epoch": 4.545454545454545, "grad_norm": 0.1335839182138443, "learning_rate": 9.514749056770159e-06, "loss": 0.224, "num_input_tokens_seen": 8930800, "step": 14650 }, { "epoch": 4.547005895128762, "grad_norm": 0.0764492005109787, "learning_rate": 9.5141670976979e-06, "loss": 0.233, "num_input_tokens_seen": 8934320, "step": 14655 }, { "epoch": 4.5485572448029785, "grad_norm": 0.05065225437283516, "learning_rate": 9.51358480768374e-06, "loss": 0.2335, "num_input_tokens_seen": 8936912, "step": 14660 }, { "epoch": 4.550108594477195, "grad_norm": 0.11915320158004761, "learning_rate": 9.513002186770364e-06, "loss": 0.229, "num_input_tokens_seen": 8939600, "step": 14665 }, { "epoch": 4.551659944151412, "grad_norm": 0.11972381919622421, "learning_rate": 9.512419235000485e-06, "loss": 0.2331, "num_input_tokens_seen": 8941872, "step": 14670 }, { "epoch": 4.553211293825628, "grad_norm": 0.15850238502025604, "learning_rate": 9.511835952416841e-06, "loss": 0.2384, "num_input_tokens_seen": 8944976, "step": 14675 }, { "epoch": 4.554762643499845, "grad_norm": 0.05463625490665436, "learning_rate": 9.511252339062193e-06, "loss": 0.2292, "num_input_tokens_seen": 8947056, "step": 14680 }, { "epoch": 4.5563139931740615, "grad_norm": 0.152663916349411, "learning_rate": 9.510668394979328e-06, "loss": 0.2351, "num_input_tokens_seen": 8949616, "step": 14685 }, { "epoch": 4.557865342848278, "grad_norm": 0.10422153770923615, "learning_rate": 9.510084120211057e-06, "loss": 0.2277, "num_input_tokens_seen": 8953808, "step": 14690 }, { "epoch": 4.559416692522495, "grad_norm": 0.11485446244478226, "learning_rate": 9.509499514800211e-06, "loss": 0.2271, "num_input_tokens_seen": 8957136, "step": 14695 }, { "epoch": 4.560968042196711, "grad_norm": 0.14232520759105682, "learning_rate": 9.50891457878965e-06, "loss": 0.2319, "num_input_tokens_seen": 8960080, "step": 14700 }, { "epoch": 4.562519391870928, "grad_norm": 0.13662156462669373, "learning_rate": 9.508329312222258e-06, "loss": 0.2365, "num_input_tokens_seen": 8962800, "step": 14705 }, { "epoch": 4.5640707415451445, "grad_norm": 0.042491473257541656, "learning_rate": 9.507743715140939e-06, "loss": 0.2342, "num_input_tokens_seen": 8965072, "step": 14710 }, { "epoch": 4.56562209121936, "grad_norm": 0.03538811206817627, "learning_rate": 9.507157787588627e-06, "loss": 0.2353, "num_input_tokens_seen": 8967824, "step": 14715 }, { "epoch": 4.567173440893577, "grad_norm": 0.04838940501213074, "learning_rate": 9.506571529608277e-06, "loss": 0.2286, "num_input_tokens_seen": 8970928, "step": 14720 }, { "epoch": 4.568724790567794, "grad_norm": 0.11529036611318588, "learning_rate": 9.505984941242867e-06, "loss": 0.2304, "num_input_tokens_seen": 8973008, "step": 14725 }, { "epoch": 4.570276140242011, "grad_norm": 0.12341935187578201, "learning_rate": 9.505398022535403e-06, "loss": 0.2278, "num_input_tokens_seen": 8975952, "step": 14730 }, { "epoch": 4.5718274899162274, "grad_norm": 0.130713552236557, "learning_rate": 9.50481077352891e-06, "loss": 0.2377, "num_input_tokens_seen": 8979216, "step": 14735 }, { "epoch": 4.573378839590443, "grad_norm": 0.10403963923454285, "learning_rate": 9.504223194266445e-06, "loss": 0.2269, "num_input_tokens_seen": 8982384, "step": 14740 }, { "epoch": 4.57493018926466, "grad_norm": 0.051996324211359024, "learning_rate": 9.50363528479108e-06, "loss": 0.2304, "num_input_tokens_seen": 8985232, "step": 14745 }, { "epoch": 4.576481538938877, "grad_norm": 0.10765295475721359, "learning_rate": 9.503047045145918e-06, "loss": 0.2319, "num_input_tokens_seen": 8988624, "step": 14750 }, { "epoch": 4.578032888613094, "grad_norm": 0.1363040804862976, "learning_rate": 9.502458475374085e-06, "loss": 0.2358, "num_input_tokens_seen": 8991920, "step": 14755 }, { "epoch": 4.57958423828731, "grad_norm": 0.044189710170030594, "learning_rate": 9.501869575518728e-06, "loss": 0.2315, "num_input_tokens_seen": 8993968, "step": 14760 }, { "epoch": 4.581135587961526, "grad_norm": 0.06664394587278366, "learning_rate": 9.501280345623019e-06, "loss": 0.2343, "num_input_tokens_seen": 8997168, "step": 14765 }, { "epoch": 4.582686937635743, "grad_norm": 0.05574585497379303, "learning_rate": 9.50069078573016e-06, "loss": 0.2296, "num_input_tokens_seen": 8999856, "step": 14770 }, { "epoch": 4.58423828730996, "grad_norm": 0.1562042534351349, "learning_rate": 9.50010089588337e-06, "loss": 0.2297, "num_input_tokens_seen": 9003760, "step": 14775 }, { "epoch": 4.585789636984176, "grad_norm": 0.12192399054765701, "learning_rate": 9.499510676125893e-06, "loss": 0.2289, "num_input_tokens_seen": 9007248, "step": 14780 }, { "epoch": 4.5873409866583925, "grad_norm": 0.22085833549499512, "learning_rate": 9.498920126501004e-06, "loss": 0.2274, "num_input_tokens_seen": 9010192, "step": 14785 }, { "epoch": 4.588892336332609, "grad_norm": 0.07300614565610886, "learning_rate": 9.498329247051994e-06, "loss": 0.2336, "num_input_tokens_seen": 9012880, "step": 14790 }, { "epoch": 4.590443686006826, "grad_norm": 0.1396126002073288, "learning_rate": 9.497738037822179e-06, "loss": 0.2266, "num_input_tokens_seen": 9015824, "step": 14795 }, { "epoch": 4.591995035681043, "grad_norm": 0.1463688611984253, "learning_rate": 9.497146498854908e-06, "loss": 0.2355, "num_input_tokens_seen": 9018992, "step": 14800 }, { "epoch": 4.593546385355259, "grad_norm": 0.06581821292638779, "learning_rate": 9.496554630193543e-06, "loss": 0.2234, "num_input_tokens_seen": 9022128, "step": 14805 }, { "epoch": 4.5950977350294755, "grad_norm": 0.06038011237978935, "learning_rate": 9.495962431881478e-06, "loss": 0.227, "num_input_tokens_seen": 9025264, "step": 14810 }, { "epoch": 4.596649084703692, "grad_norm": 0.10602767765522003, "learning_rate": 9.495369903962124e-06, "loss": 0.2241, "num_input_tokens_seen": 9028560, "step": 14815 }, { "epoch": 4.598200434377909, "grad_norm": 0.06096765398979187, "learning_rate": 9.494777046478925e-06, "loss": 0.217, "num_input_tokens_seen": 9031408, "step": 14820 }, { "epoch": 4.599751784052126, "grad_norm": 0.13006474077701569, "learning_rate": 9.494183859475341e-06, "loss": 0.2204, "num_input_tokens_seen": 9034672, "step": 14825 }, { "epoch": 4.601303133726342, "grad_norm": 0.12824615836143494, "learning_rate": 9.493590342994863e-06, "loss": 0.2051, "num_input_tokens_seen": 9038256, "step": 14830 }, { "epoch": 4.6028544834005585, "grad_norm": 0.4504031538963318, "learning_rate": 9.492996497081e-06, "loss": 0.2123, "num_input_tokens_seen": 9040432, "step": 14835 }, { "epoch": 4.604405833074775, "grad_norm": 0.36203518509864807, "learning_rate": 9.492402321777288e-06, "loss": 0.2671, "num_input_tokens_seen": 9042704, "step": 14840 }, { "epoch": 4.605957182748991, "grad_norm": 0.12471498548984528, "learning_rate": 9.491807817127287e-06, "loss": 0.2419, "num_input_tokens_seen": 9045840, "step": 14845 }, { "epoch": 4.607508532423208, "grad_norm": 0.1929989457130432, "learning_rate": 9.491212983174582e-06, "loss": 0.2335, "num_input_tokens_seen": 9048528, "step": 14850 }, { "epoch": 4.609059882097425, "grad_norm": 0.22144150733947754, "learning_rate": 9.490617819962784e-06, "loss": 0.2289, "num_input_tokens_seen": 9051600, "step": 14855 }, { "epoch": 4.6106112317716414, "grad_norm": 0.16447266936302185, "learning_rate": 9.49002232753552e-06, "loss": 0.2396, "num_input_tokens_seen": 9055184, "step": 14860 }, { "epoch": 4.612162581445858, "grad_norm": 0.13856062293052673, "learning_rate": 9.489426505936453e-06, "loss": 0.2145, "num_input_tokens_seen": 9057840, "step": 14865 }, { "epoch": 4.613713931120074, "grad_norm": 0.15235188603401184, "learning_rate": 9.488830355209257e-06, "loss": 0.234, "num_input_tokens_seen": 9060752, "step": 14870 }, { "epoch": 4.615265280794291, "grad_norm": 0.15615031123161316, "learning_rate": 9.488233875397642e-06, "loss": 0.2339, "num_input_tokens_seen": 9063536, "step": 14875 }, { "epoch": 4.616816630468508, "grad_norm": 0.07596307247877121, "learning_rate": 9.487637066545334e-06, "loss": 0.2342, "num_input_tokens_seen": 9066576, "step": 14880 }, { "epoch": 4.618367980142724, "grad_norm": 0.30220308899879456, "learning_rate": 9.487039928696089e-06, "loss": 0.2307, "num_input_tokens_seen": 9069456, "step": 14885 }, { "epoch": 4.619919329816941, "grad_norm": 0.16573970019817352, "learning_rate": 9.486442461893683e-06, "loss": 0.2296, "num_input_tokens_seen": 9072880, "step": 14890 }, { "epoch": 4.621470679491157, "grad_norm": 0.26852887868881226, "learning_rate": 9.485844666181916e-06, "loss": 0.2282, "num_input_tokens_seen": 9075312, "step": 14895 }, { "epoch": 4.623022029165374, "grad_norm": 0.061764299869537354, "learning_rate": 9.485246541604614e-06, "loss": 0.2238, "num_input_tokens_seen": 9078608, "step": 14900 }, { "epoch": 4.624573378839591, "grad_norm": 0.14078974723815918, "learning_rate": 9.484648088205627e-06, "loss": 0.236, "num_input_tokens_seen": 9081712, "step": 14905 }, { "epoch": 4.6261247285138065, "grad_norm": 0.22438117861747742, "learning_rate": 9.484049306028831e-06, "loss": 0.2261, "num_input_tokens_seen": 9084528, "step": 14910 }, { "epoch": 4.627676078188023, "grad_norm": 0.17488062381744385, "learning_rate": 9.483450195118122e-06, "loss": 0.2335, "num_input_tokens_seen": 9087376, "step": 14915 }, { "epoch": 4.62922742786224, "grad_norm": 0.09868773818016052, "learning_rate": 9.482850755517419e-06, "loss": 0.2241, "num_input_tokens_seen": 9089904, "step": 14920 }, { "epoch": 4.630778777536457, "grad_norm": 0.10316652059555054, "learning_rate": 9.482250987270672e-06, "loss": 0.2369, "num_input_tokens_seen": 9092240, "step": 14925 }, { "epoch": 4.632330127210674, "grad_norm": 0.1703682392835617, "learning_rate": 9.48165089042185e-06, "loss": 0.2418, "num_input_tokens_seen": 9095408, "step": 14930 }, { "epoch": 4.6338814768848895, "grad_norm": 0.2936844825744629, "learning_rate": 9.481050465014947e-06, "loss": 0.2333, "num_input_tokens_seen": 9098832, "step": 14935 }, { "epoch": 4.635432826559106, "grad_norm": 0.11322388797998428, "learning_rate": 9.480449711093982e-06, "loss": 0.2227, "num_input_tokens_seen": 9101488, "step": 14940 }, { "epoch": 4.636984176233323, "grad_norm": 0.3182373642921448, "learning_rate": 9.479848628702997e-06, "loss": 0.233, "num_input_tokens_seen": 9104560, "step": 14945 }, { "epoch": 4.63853552590754, "grad_norm": 0.14747686684131622, "learning_rate": 9.479247217886057e-06, "loss": 0.2381, "num_input_tokens_seen": 9109168, "step": 14950 }, { "epoch": 4.640086875581757, "grad_norm": 0.13326574862003326, "learning_rate": 9.478645478687254e-06, "loss": 0.2342, "num_input_tokens_seen": 9112528, "step": 14955 }, { "epoch": 4.6416382252559725, "grad_norm": 0.2317216545343399, "learning_rate": 9.478043411150701e-06, "loss": 0.2298, "num_input_tokens_seen": 9115760, "step": 14960 }, { "epoch": 4.643189574930189, "grad_norm": 0.11645358800888062, "learning_rate": 9.477441015320541e-06, "loss": 0.2283, "num_input_tokens_seen": 9118864, "step": 14965 }, { "epoch": 4.644740924604406, "grad_norm": 0.2276582270860672, "learning_rate": 9.476838291240931e-06, "loss": 0.232, "num_input_tokens_seen": 9121680, "step": 14970 }, { "epoch": 4.646292274278622, "grad_norm": 0.05587061867117882, "learning_rate": 9.476235238956061e-06, "loss": 0.2337, "num_input_tokens_seen": 9125392, "step": 14975 }, { "epoch": 4.647843623952839, "grad_norm": 0.11812333017587662, "learning_rate": 9.475631858510143e-06, "loss": 0.2284, "num_input_tokens_seen": 9129680, "step": 14980 }, { "epoch": 4.6493949736270554, "grad_norm": 0.11417393386363983, "learning_rate": 9.47502814994741e-06, "loss": 0.2288, "num_input_tokens_seen": 9131984, "step": 14985 }, { "epoch": 4.650946323301272, "grad_norm": 0.09517930448055267, "learning_rate": 9.47442411331212e-06, "loss": 0.2317, "num_input_tokens_seen": 9135056, "step": 14990 }, { "epoch": 4.652497672975489, "grad_norm": 0.12853579223155975, "learning_rate": 9.473819748648559e-06, "loss": 0.2288, "num_input_tokens_seen": 9138320, "step": 14995 }, { "epoch": 4.654049022649705, "grad_norm": 0.14024750888347626, "learning_rate": 9.47321505600103e-06, "loss": 0.2373, "num_input_tokens_seen": 9142096, "step": 15000 }, { "epoch": 4.655600372323922, "grad_norm": 0.04708339646458626, "learning_rate": 9.47261003541387e-06, "loss": 0.2226, "num_input_tokens_seen": 9144304, "step": 15005 }, { "epoch": 4.657151721998138, "grad_norm": 0.13600151240825653, "learning_rate": 9.472004686931429e-06, "loss": 0.2321, "num_input_tokens_seen": 9147536, "step": 15010 }, { "epoch": 4.658703071672355, "grad_norm": 0.040627073496580124, "learning_rate": 9.471399010598088e-06, "loss": 0.2214, "num_input_tokens_seen": 9151600, "step": 15015 }, { "epoch": 4.660254421346572, "grad_norm": 0.05762796103954315, "learning_rate": 9.47079300645825e-06, "loss": 0.2307, "num_input_tokens_seen": 9154640, "step": 15020 }, { "epoch": 4.661805771020788, "grad_norm": 0.1097845733165741, "learning_rate": 9.470186674556342e-06, "loss": 0.2291, "num_input_tokens_seen": 9157968, "step": 15025 }, { "epoch": 4.663357120695005, "grad_norm": 0.06328091770410538, "learning_rate": 9.469580014936816e-06, "loss": 0.2377, "num_input_tokens_seen": 9161232, "step": 15030 }, { "epoch": 4.664908470369221, "grad_norm": 0.06047124043107033, "learning_rate": 9.468973027644148e-06, "loss": 0.2268, "num_input_tokens_seen": 9163920, "step": 15035 }, { "epoch": 4.666459820043438, "grad_norm": 0.10997919738292694, "learning_rate": 9.468365712722838e-06, "loss": 0.2166, "num_input_tokens_seen": 9166128, "step": 15040 }, { "epoch": 4.668011169717654, "grad_norm": 0.1041056364774704, "learning_rate": 9.467758070217406e-06, "loss": 0.225, "num_input_tokens_seen": 9169456, "step": 15045 }, { "epoch": 4.669562519391871, "grad_norm": 0.06467296928167343, "learning_rate": 9.4671501001724e-06, "loss": 0.2285, "num_input_tokens_seen": 9172176, "step": 15050 }, { "epoch": 4.671113869066088, "grad_norm": 0.10850438475608826, "learning_rate": 9.466541802632394e-06, "loss": 0.2214, "num_input_tokens_seen": 9174320, "step": 15055 }, { "epoch": 4.672665218740304, "grad_norm": 0.17292508482933044, "learning_rate": 9.465933177641981e-06, "loss": 0.2578, "num_input_tokens_seen": 9176560, "step": 15060 }, { "epoch": 4.67421656841452, "grad_norm": 0.09418310970067978, "learning_rate": 9.465324225245784e-06, "loss": 0.2261, "num_input_tokens_seen": 9178992, "step": 15065 }, { "epoch": 4.675767918088737, "grad_norm": 0.2676616907119751, "learning_rate": 9.464714945488443e-06, "loss": 0.2483, "num_input_tokens_seen": 9182224, "step": 15070 }, { "epoch": 4.677319267762954, "grad_norm": 0.13377320766448975, "learning_rate": 9.464105338414626e-06, "loss": 0.2392, "num_input_tokens_seen": 9184592, "step": 15075 }, { "epoch": 4.678870617437171, "grad_norm": 0.0981411412358284, "learning_rate": 9.463495404069026e-06, "loss": 0.2343, "num_input_tokens_seen": 9186832, "step": 15080 }, { "epoch": 4.680421967111387, "grad_norm": 0.1321197897195816, "learning_rate": 9.462885142496354e-06, "loss": 0.2339, "num_input_tokens_seen": 9189808, "step": 15085 }, { "epoch": 4.681973316785603, "grad_norm": 0.11151238530874252, "learning_rate": 9.462274553741355e-06, "loss": 0.2259, "num_input_tokens_seen": 9192496, "step": 15090 }, { "epoch": 4.68352466645982, "grad_norm": 0.10639221966266632, "learning_rate": 9.461663637848791e-06, "loss": 0.2341, "num_input_tokens_seen": 9195152, "step": 15095 }, { "epoch": 4.685076016134037, "grad_norm": 0.042150530964136124, "learning_rate": 9.461052394863447e-06, "loss": 0.232, "num_input_tokens_seen": 9198032, "step": 15100 }, { "epoch": 4.686627365808254, "grad_norm": 0.05766459181904793, "learning_rate": 9.460440824830135e-06, "loss": 0.2347, "num_input_tokens_seen": 9200624, "step": 15105 }, { "epoch": 4.6881787154824694, "grad_norm": 0.1120477169752121, "learning_rate": 9.45982892779369e-06, "loss": 0.23, "num_input_tokens_seen": 9203568, "step": 15110 }, { "epoch": 4.689730065156686, "grad_norm": 0.13086938858032227, "learning_rate": 9.459216703798974e-06, "loss": 0.2307, "num_input_tokens_seen": 9206576, "step": 15115 }, { "epoch": 4.691281414830903, "grad_norm": 0.20647165179252625, "learning_rate": 9.458604152890869e-06, "loss": 0.2253, "num_input_tokens_seen": 9209904, "step": 15120 }, { "epoch": 4.69283276450512, "grad_norm": 0.04150263965129852, "learning_rate": 9.457991275114282e-06, "loss": 0.2208, "num_input_tokens_seen": 9213328, "step": 15125 }, { "epoch": 4.694384114179336, "grad_norm": 0.04488705098628998, "learning_rate": 9.457378070514143e-06, "loss": 0.2433, "num_input_tokens_seen": 9216176, "step": 15130 }, { "epoch": 4.695935463853552, "grad_norm": 0.12796297669410706, "learning_rate": 9.456764539135408e-06, "loss": 0.2338, "num_input_tokens_seen": 9218928, "step": 15135 }, { "epoch": 4.697486813527769, "grad_norm": 0.09096890687942505, "learning_rate": 9.456150681023057e-06, "loss": 0.2361, "num_input_tokens_seen": 9221680, "step": 15140 }, { "epoch": 4.699038163201986, "grad_norm": 0.04972629249095917, "learning_rate": 9.455536496222093e-06, "loss": 0.2371, "num_input_tokens_seen": 9225104, "step": 15145 }, { "epoch": 4.700589512876203, "grad_norm": 0.10930422693490982, "learning_rate": 9.45492198477754e-06, "loss": 0.2396, "num_input_tokens_seen": 9228912, "step": 15150 }, { "epoch": 4.702140862550419, "grad_norm": 0.04978274181485176, "learning_rate": 9.454307146734454e-06, "loss": 0.2297, "num_input_tokens_seen": 9232720, "step": 15155 }, { "epoch": 4.703692212224635, "grad_norm": 0.11977823823690414, "learning_rate": 9.453691982137905e-06, "loss": 0.2384, "num_input_tokens_seen": 9235504, "step": 15160 }, { "epoch": 4.705243561898852, "grad_norm": 0.10438612848520279, "learning_rate": 9.453076491032996e-06, "loss": 0.2379, "num_input_tokens_seen": 9237872, "step": 15165 }, { "epoch": 4.706794911573069, "grad_norm": 0.12152125686407089, "learning_rate": 9.452460673464848e-06, "loss": 0.2294, "num_input_tokens_seen": 9240784, "step": 15170 }, { "epoch": 4.708346261247285, "grad_norm": 0.10471683740615845, "learning_rate": 9.451844529478607e-06, "loss": 0.2264, "num_input_tokens_seen": 9243760, "step": 15175 }, { "epoch": 4.709897610921502, "grad_norm": 0.1380278319120407, "learning_rate": 9.451228059119444e-06, "loss": 0.2275, "num_input_tokens_seen": 9246800, "step": 15180 }, { "epoch": 4.711448960595718, "grad_norm": 0.05201920494437218, "learning_rate": 9.450611262432553e-06, "loss": 0.2342, "num_input_tokens_seen": 9250640, "step": 15185 }, { "epoch": 4.713000310269935, "grad_norm": 0.04108155146241188, "learning_rate": 9.449994139463154e-06, "loss": 0.2376, "num_input_tokens_seen": 9253296, "step": 15190 }, { "epoch": 4.714551659944151, "grad_norm": 0.11423756182193756, "learning_rate": 9.449376690256489e-06, "loss": 0.2316, "num_input_tokens_seen": 9256592, "step": 15195 }, { "epoch": 4.716103009618368, "grad_norm": 0.1157970055937767, "learning_rate": 9.448758914857825e-06, "loss": 0.2336, "num_input_tokens_seen": 9258864, "step": 15200 }, { "epoch": 4.717654359292585, "grad_norm": 0.12739188969135284, "learning_rate": 9.44814081331245e-06, "loss": 0.229, "num_input_tokens_seen": 9261520, "step": 15205 }, { "epoch": 4.719205708966801, "grad_norm": 0.22666001319885254, "learning_rate": 9.447522385665679e-06, "loss": 0.2296, "num_input_tokens_seen": 9263760, "step": 15210 }, { "epoch": 4.720757058641018, "grad_norm": 0.13699427247047424, "learning_rate": 9.446903631962853e-06, "loss": 0.2325, "num_input_tokens_seen": 9266512, "step": 15215 }, { "epoch": 4.722308408315234, "grad_norm": 0.14272071421146393, "learning_rate": 9.44628455224933e-06, "loss": 0.2353, "num_input_tokens_seen": 9268848, "step": 15220 }, { "epoch": 4.723859757989451, "grad_norm": 0.2625575065612793, "learning_rate": 9.445665146570497e-06, "loss": 0.237, "num_input_tokens_seen": 9271984, "step": 15225 }, { "epoch": 4.725411107663668, "grad_norm": 0.06229299306869507, "learning_rate": 9.445045414971764e-06, "loss": 0.2308, "num_input_tokens_seen": 9274736, "step": 15230 }, { "epoch": 4.726962457337884, "grad_norm": 0.11898180097341537, "learning_rate": 9.444425357498565e-06, "loss": 0.2269, "num_input_tokens_seen": 9277776, "step": 15235 }, { "epoch": 4.7285138070121, "grad_norm": 0.2782919108867645, "learning_rate": 9.44380497419636e-06, "loss": 0.2339, "num_input_tokens_seen": 9281360, "step": 15240 }, { "epoch": 4.730065156686317, "grad_norm": 0.11440097540616989, "learning_rate": 9.443184265110626e-06, "loss": 0.2354, "num_input_tokens_seen": 9284496, "step": 15245 }, { "epoch": 4.731616506360534, "grad_norm": 0.05513155832886696, "learning_rate": 9.44256323028687e-06, "loss": 0.2433, "num_input_tokens_seen": 9287856, "step": 15250 }, { "epoch": 4.7331678560347505, "grad_norm": 0.12900808453559875, "learning_rate": 9.44194186977062e-06, "loss": 0.2351, "num_input_tokens_seen": 9290960, "step": 15255 }, { "epoch": 4.734719205708966, "grad_norm": 0.12849099934101105, "learning_rate": 9.441320183607433e-06, "loss": 0.2332, "num_input_tokens_seen": 9293520, "step": 15260 }, { "epoch": 4.736270555383183, "grad_norm": 0.12545828521251678, "learning_rate": 9.440698171842882e-06, "loss": 0.233, "num_input_tokens_seen": 9296560, "step": 15265 }, { "epoch": 4.7378219050574, "grad_norm": 0.04542696848511696, "learning_rate": 9.44007583452257e-06, "loss": 0.2294, "num_input_tokens_seen": 9299376, "step": 15270 }, { "epoch": 4.739373254731617, "grad_norm": 0.1450260877609253, "learning_rate": 9.439453171692121e-06, "loss": 0.2326, "num_input_tokens_seen": 9302448, "step": 15275 }, { "epoch": 4.7409246044058335, "grad_norm": 0.03744841739535332, "learning_rate": 9.438830183397182e-06, "loss": 0.2315, "num_input_tokens_seen": 9304944, "step": 15280 }, { "epoch": 4.742475954080049, "grad_norm": 0.13344161212444305, "learning_rate": 9.438206869683428e-06, "loss": 0.2298, "num_input_tokens_seen": 9308048, "step": 15285 }, { "epoch": 4.744027303754266, "grad_norm": 0.14159010350704193, "learning_rate": 9.437583230596556e-06, "loss": 0.2396, "num_input_tokens_seen": 9311120, "step": 15290 }, { "epoch": 4.745578653428483, "grad_norm": 0.035018760710954666, "learning_rate": 9.436959266182281e-06, "loss": 0.2302, "num_input_tokens_seen": 9314000, "step": 15295 }, { "epoch": 4.7471300031027, "grad_norm": 0.047235917299985886, "learning_rate": 9.436334976486354e-06, "loss": 0.2379, "num_input_tokens_seen": 9317168, "step": 15300 }, { "epoch": 4.748681352776916, "grad_norm": 0.049266550689935684, "learning_rate": 9.435710361554537e-06, "loss": 0.2305, "num_input_tokens_seen": 9319760, "step": 15305 }, { "epoch": 4.750232702451132, "grad_norm": 0.04758508875966072, "learning_rate": 9.435085421432623e-06, "loss": 0.2253, "num_input_tokens_seen": 9322832, "step": 15310 }, { "epoch": 4.751784052125349, "grad_norm": 0.14243647456169128, "learning_rate": 9.43446015616643e-06, "loss": 0.2327, "num_input_tokens_seen": 9325776, "step": 15315 }, { "epoch": 4.753335401799566, "grad_norm": 0.1180635392665863, "learning_rate": 9.433834565801796e-06, "loss": 0.2303, "num_input_tokens_seen": 9329104, "step": 15320 }, { "epoch": 4.754886751473782, "grad_norm": 0.12530933320522308, "learning_rate": 9.433208650384582e-06, "loss": 0.2331, "num_input_tokens_seen": 9331664, "step": 15325 }, { "epoch": 4.756438101147999, "grad_norm": 0.1262074112892151, "learning_rate": 9.432582409960678e-06, "loss": 0.2306, "num_input_tokens_seen": 9334416, "step": 15330 }, { "epoch": 4.757989450822215, "grad_norm": 0.06009398028254509, "learning_rate": 9.431955844575993e-06, "loss": 0.229, "num_input_tokens_seen": 9336976, "step": 15335 }, { "epoch": 4.759540800496432, "grad_norm": 0.2201462835073471, "learning_rate": 9.431328954276464e-06, "loss": 0.2357, "num_input_tokens_seen": 9340624, "step": 15340 }, { "epoch": 4.761092150170649, "grad_norm": 0.11043746024370193, "learning_rate": 9.430701739108047e-06, "loss": 0.2305, "num_input_tokens_seen": 9343760, "step": 15345 }, { "epoch": 4.762643499844865, "grad_norm": 0.13277080655097961, "learning_rate": 9.430074199116723e-06, "loss": 0.2337, "num_input_tokens_seen": 9346256, "step": 15350 }, { "epoch": 4.764194849519082, "grad_norm": 0.11952229589223862, "learning_rate": 9.429446334348503e-06, "loss": 0.2367, "num_input_tokens_seen": 9348336, "step": 15355 }, { "epoch": 4.765746199193298, "grad_norm": 0.10913373529911041, "learning_rate": 9.428818144849413e-06, "loss": 0.23, "num_input_tokens_seen": 9351888, "step": 15360 }, { "epoch": 4.767297548867515, "grad_norm": 0.06279905885457993, "learning_rate": 9.428189630665508e-06, "loss": 0.2339, "num_input_tokens_seen": 9354128, "step": 15365 }, { "epoch": 4.768848898541731, "grad_norm": 0.05411583557724953, "learning_rate": 9.427560791842867e-06, "loss": 0.2329, "num_input_tokens_seen": 9357200, "step": 15370 }, { "epoch": 4.770400248215948, "grad_norm": 0.10669253021478653, "learning_rate": 9.426931628427588e-06, "loss": 0.2344, "num_input_tokens_seen": 9360176, "step": 15375 }, { "epoch": 4.7719515978901645, "grad_norm": 0.05492249131202698, "learning_rate": 9.426302140465798e-06, "loss": 0.2326, "num_input_tokens_seen": 9363216, "step": 15380 }, { "epoch": 4.773502947564381, "grad_norm": 0.1120419055223465, "learning_rate": 9.425672328003646e-06, "loss": 0.2269, "num_input_tokens_seen": 9366160, "step": 15385 }, { "epoch": 4.775054297238597, "grad_norm": 0.05545026808977127, "learning_rate": 9.425042191087306e-06, "loss": 0.2342, "num_input_tokens_seen": 9370064, "step": 15390 }, { "epoch": 4.776605646912814, "grad_norm": 0.21173161268234253, "learning_rate": 9.424411729762973e-06, "loss": 0.2315, "num_input_tokens_seen": 9375248, "step": 15395 }, { "epoch": 4.778156996587031, "grad_norm": 0.11486800760030746, "learning_rate": 9.423780944076866e-06, "loss": 0.236, "num_input_tokens_seen": 9378416, "step": 15400 }, { "epoch": 4.7797083462612475, "grad_norm": 0.1165163591504097, "learning_rate": 9.423149834075229e-06, "loss": 0.2377, "num_input_tokens_seen": 9380976, "step": 15405 }, { "epoch": 4.781259695935464, "grad_norm": 0.05279428884387016, "learning_rate": 9.422518399804333e-06, "loss": 0.2324, "num_input_tokens_seen": 9384080, "step": 15410 }, { "epoch": 4.78281104560968, "grad_norm": 0.10756021738052368, "learning_rate": 9.42188664131047e-06, "loss": 0.2333, "num_input_tokens_seen": 9388688, "step": 15415 }, { "epoch": 4.784362395283897, "grad_norm": 0.10504788905382156, "learning_rate": 9.42125455863995e-06, "loss": 0.2322, "num_input_tokens_seen": 9391920, "step": 15420 }, { "epoch": 4.785913744958114, "grad_norm": 0.10876276344060898, "learning_rate": 9.420622151839115e-06, "loss": 0.2326, "num_input_tokens_seen": 9394704, "step": 15425 }, { "epoch": 4.7874650946323305, "grad_norm": 0.09772980958223343, "learning_rate": 9.41998942095433e-06, "loss": 0.2353, "num_input_tokens_seen": 9398672, "step": 15430 }, { "epoch": 4.789016444306546, "grad_norm": 0.10068583488464355, "learning_rate": 9.41935636603198e-06, "loss": 0.2327, "num_input_tokens_seen": 9402256, "step": 15435 }, { "epoch": 4.790567793980763, "grad_norm": 0.11574850976467133, "learning_rate": 9.418722987118475e-06, "loss": 0.2342, "num_input_tokens_seen": 9405136, "step": 15440 }, { "epoch": 4.79211914365498, "grad_norm": 0.07368747144937515, "learning_rate": 9.41808928426025e-06, "loss": 0.2289, "num_input_tokens_seen": 9410736, "step": 15445 }, { "epoch": 4.793670493329197, "grad_norm": 0.09629276394844055, "learning_rate": 9.417455257503762e-06, "loss": 0.2304, "num_input_tokens_seen": 9412624, "step": 15450 }, { "epoch": 4.795221843003413, "grad_norm": 0.10339704155921936, "learning_rate": 9.416820906895492e-06, "loss": 0.2341, "num_input_tokens_seen": 9415664, "step": 15455 }, { "epoch": 4.796773192677629, "grad_norm": 0.1041281521320343, "learning_rate": 9.416186232481948e-06, "loss": 0.231, "num_input_tokens_seen": 9418960, "step": 15460 }, { "epoch": 4.798324542351846, "grad_norm": 0.052263788878917694, "learning_rate": 9.415551234309658e-06, "loss": 0.2341, "num_input_tokens_seen": 9422320, "step": 15465 }, { "epoch": 4.799875892026063, "grad_norm": 0.036258917301893234, "learning_rate": 9.414915912425173e-06, "loss": 0.2289, "num_input_tokens_seen": 9425072, "step": 15470 }, { "epoch": 4.80142724170028, "grad_norm": 0.11700480431318283, "learning_rate": 9.414280266875073e-06, "loss": 0.232, "num_input_tokens_seen": 9432464, "step": 15475 }, { "epoch": 4.802978591374496, "grad_norm": 0.10671624541282654, "learning_rate": 9.413644297705955e-06, "loss": 0.2343, "num_input_tokens_seen": 9435248, "step": 15480 }, { "epoch": 4.804529941048712, "grad_norm": 0.10326887667179108, "learning_rate": 9.413008004964446e-06, "loss": 0.2229, "num_input_tokens_seen": 9438192, "step": 15485 }, { "epoch": 4.806081290722929, "grad_norm": 0.04430686682462692, "learning_rate": 9.412371388697191e-06, "loss": 0.2341, "num_input_tokens_seen": 9440496, "step": 15490 }, { "epoch": 4.807632640397146, "grad_norm": 0.05366969853639603, "learning_rate": 9.411734448950864e-06, "loss": 0.2295, "num_input_tokens_seen": 9445008, "step": 15495 }, { "epoch": 4.809183990071362, "grad_norm": 0.1050923764705658, "learning_rate": 9.411097185772158e-06, "loss": 0.2268, "num_input_tokens_seen": 9447568, "step": 15500 }, { "epoch": 4.8107353397455785, "grad_norm": 0.11790214478969574, "learning_rate": 9.410459599207794e-06, "loss": 0.2331, "num_input_tokens_seen": 9449584, "step": 15505 }, { "epoch": 4.812286689419795, "grad_norm": 0.1088293194770813, "learning_rate": 9.409821689304513e-06, "loss": 0.2337, "num_input_tokens_seen": 9452976, "step": 15510 }, { "epoch": 4.813838039094012, "grad_norm": 0.20039112865924835, "learning_rate": 9.409183456109083e-06, "loss": 0.2352, "num_input_tokens_seen": 9456688, "step": 15515 }, { "epoch": 4.815389388768228, "grad_norm": 0.10816167294979095, "learning_rate": 9.408544899668293e-06, "loss": 0.2305, "num_input_tokens_seen": 9460048, "step": 15520 }, { "epoch": 4.816940738442445, "grad_norm": 0.046168114989995956, "learning_rate": 9.407906020028956e-06, "loss": 0.2279, "num_input_tokens_seen": 9462576, "step": 15525 }, { "epoch": 4.8184920881166615, "grad_norm": 0.037604913115501404, "learning_rate": 9.40726681723791e-06, "loss": 0.2281, "num_input_tokens_seen": 9465456, "step": 15530 }, { "epoch": 4.820043437790878, "grad_norm": 0.10527822375297546, "learning_rate": 9.406627291342018e-06, "loss": 0.2342, "num_input_tokens_seen": 9468560, "step": 15535 }, { "epoch": 4.821594787465095, "grad_norm": 0.08303363621234894, "learning_rate": 9.405987442388163e-06, "loss": 0.2225, "num_input_tokens_seen": 9471344, "step": 15540 }, { "epoch": 4.823146137139311, "grad_norm": 0.2233494222164154, "learning_rate": 9.405347270423252e-06, "loss": 0.2379, "num_input_tokens_seen": 9473968, "step": 15545 }, { "epoch": 4.824697486813528, "grad_norm": 0.2336968630552292, "learning_rate": 9.404706775494221e-06, "loss": 0.2292, "num_input_tokens_seen": 9476496, "step": 15550 }, { "epoch": 4.8262488364877445, "grad_norm": 0.1215471550822258, "learning_rate": 9.404065957648023e-06, "loss": 0.2402, "num_input_tokens_seen": 9479440, "step": 15555 }, { "epoch": 4.827800186161961, "grad_norm": 0.11078425496816635, "learning_rate": 9.403424816931639e-06, "loss": 0.2311, "num_input_tokens_seen": 9482320, "step": 15560 }, { "epoch": 4.829351535836177, "grad_norm": 0.1189805343747139, "learning_rate": 9.40278335339207e-06, "loss": 0.2299, "num_input_tokens_seen": 9485296, "step": 15565 }, { "epoch": 4.830902885510394, "grad_norm": 0.07364513725042343, "learning_rate": 9.402141567076345e-06, "loss": 0.2363, "num_input_tokens_seen": 9488496, "step": 15570 }, { "epoch": 4.832454235184611, "grad_norm": 0.18713800609111786, "learning_rate": 9.401499458031515e-06, "loss": 0.2316, "num_input_tokens_seen": 9491056, "step": 15575 }, { "epoch": 4.8340055848588275, "grad_norm": 0.08281702548265457, "learning_rate": 9.400857026304655e-06, "loss": 0.2275, "num_input_tokens_seen": 9493424, "step": 15580 }, { "epoch": 4.835556934533043, "grad_norm": 0.16134366393089294, "learning_rate": 9.400214271942859e-06, "loss": 0.2362, "num_input_tokens_seen": 9496272, "step": 15585 }, { "epoch": 4.83710828420726, "grad_norm": 0.10089106857776642, "learning_rate": 9.399571194993249e-06, "loss": 0.2312, "num_input_tokens_seen": 9498960, "step": 15590 }, { "epoch": 4.838659633881477, "grad_norm": 0.2912026047706604, "learning_rate": 9.398927795502972e-06, "loss": 0.226, "num_input_tokens_seen": 9502160, "step": 15595 }, { "epoch": 4.840210983555694, "grad_norm": 0.2848764657974243, "learning_rate": 9.398284073519198e-06, "loss": 0.2286, "num_input_tokens_seen": 9506000, "step": 15600 }, { "epoch": 4.8417623332299105, "grad_norm": 0.24997788667678833, "learning_rate": 9.397640029089116e-06, "loss": 0.2234, "num_input_tokens_seen": 9509616, "step": 15605 }, { "epoch": 4.843313682904126, "grad_norm": 0.23633500933647156, "learning_rate": 9.396995662259946e-06, "loss": 0.2284, "num_input_tokens_seen": 9512976, "step": 15610 }, { "epoch": 4.844865032578343, "grad_norm": 0.4282030165195465, "learning_rate": 9.396350973078926e-06, "loss": 0.2203, "num_input_tokens_seen": 9515888, "step": 15615 }, { "epoch": 4.84641638225256, "grad_norm": 0.35402148962020874, "learning_rate": 9.395705961593317e-06, "loss": 0.2294, "num_input_tokens_seen": 9517904, "step": 15620 }, { "epoch": 4.847967731926777, "grad_norm": 0.5010839700698853, "learning_rate": 9.39506062785041e-06, "loss": 0.227, "num_input_tokens_seen": 9521168, "step": 15625 }, { "epoch": 4.8495190816009925, "grad_norm": 0.7975861430168152, "learning_rate": 9.394414971897514e-06, "loss": 0.2262, "num_input_tokens_seen": 9523728, "step": 15630 }, { "epoch": 4.851070431275209, "grad_norm": 1.7500228881835938, "learning_rate": 9.393768993781962e-06, "loss": 0.2146, "num_input_tokens_seen": 9526576, "step": 15635 }, { "epoch": 4.852621780949426, "grad_norm": 0.9519813060760498, "learning_rate": 9.393122693551113e-06, "loss": 0.2366, "num_input_tokens_seen": 9529648, "step": 15640 }, { "epoch": 4.854173130623643, "grad_norm": 0.7465935945510864, "learning_rate": 9.392476071252347e-06, "loss": 0.254, "num_input_tokens_seen": 9533776, "step": 15645 }, { "epoch": 4.855724480297859, "grad_norm": 0.49007290601730347, "learning_rate": 9.391829126933069e-06, "loss": 0.2265, "num_input_tokens_seen": 9536784, "step": 15650 }, { "epoch": 4.8572758299720755, "grad_norm": 0.258515864610672, "learning_rate": 9.391181860640712e-06, "loss": 0.2313, "num_input_tokens_seen": 9540304, "step": 15655 }, { "epoch": 4.858827179646292, "grad_norm": 0.26656025648117065, "learning_rate": 9.390534272422724e-06, "loss": 0.2285, "num_input_tokens_seen": 9543216, "step": 15660 }, { "epoch": 4.860378529320509, "grad_norm": 0.1414017379283905, "learning_rate": 9.389886362326583e-06, "loss": 0.2358, "num_input_tokens_seen": 9546096, "step": 15665 }, { "epoch": 4.861929878994726, "grad_norm": 0.1956585943698883, "learning_rate": 9.389238130399788e-06, "loss": 0.232, "num_input_tokens_seen": 9549200, "step": 15670 }, { "epoch": 4.863481228668942, "grad_norm": 0.21227234601974487, "learning_rate": 9.38858957668986e-06, "loss": 0.2316, "num_input_tokens_seen": 9552528, "step": 15675 }, { "epoch": 4.8650325783431585, "grad_norm": 0.2166386991739273, "learning_rate": 9.387940701244348e-06, "loss": 0.2391, "num_input_tokens_seen": 9557488, "step": 15680 }, { "epoch": 4.866583928017375, "grad_norm": 0.29878365993499756, "learning_rate": 9.387291504110823e-06, "loss": 0.2375, "num_input_tokens_seen": 9561168, "step": 15685 }, { "epoch": 4.868135277691592, "grad_norm": 0.29692932963371277, "learning_rate": 9.386641985336875e-06, "loss": 0.2331, "num_input_tokens_seen": 9563536, "step": 15690 }, { "epoch": 4.869686627365808, "grad_norm": 0.0639873743057251, "learning_rate": 9.385992144970124e-06, "loss": 0.2315, "num_input_tokens_seen": 9566064, "step": 15695 }, { "epoch": 4.871237977040025, "grad_norm": 0.050368595868349075, "learning_rate": 9.385341983058212e-06, "loss": 0.2358, "num_input_tokens_seen": 9568880, "step": 15700 }, { "epoch": 4.8727893267142415, "grad_norm": 0.06527377665042877, "learning_rate": 9.384691499648803e-06, "loss": 0.2363, "num_input_tokens_seen": 9571376, "step": 15705 }, { "epoch": 4.874340676388458, "grad_norm": 0.12910157442092896, "learning_rate": 9.384040694789585e-06, "loss": 0.2331, "num_input_tokens_seen": 9575376, "step": 15710 }, { "epoch": 4.875892026062674, "grad_norm": 0.13503949344158173, "learning_rate": 9.383389568528267e-06, "loss": 0.2322, "num_input_tokens_seen": 9578128, "step": 15715 }, { "epoch": 4.877443375736891, "grad_norm": 0.1329255998134613, "learning_rate": 9.382738120912588e-06, "loss": 0.2255, "num_input_tokens_seen": 9581008, "step": 15720 }, { "epoch": 4.878994725411108, "grad_norm": 0.30031540989875793, "learning_rate": 9.382086351990306e-06, "loss": 0.2371, "num_input_tokens_seen": 9583952, "step": 15725 }, { "epoch": 4.8805460750853245, "grad_norm": 0.152054563164711, "learning_rate": 9.3814342618092e-06, "loss": 0.2318, "num_input_tokens_seen": 9586352, "step": 15730 }, { "epoch": 4.882097424759541, "grad_norm": 0.043336037546396255, "learning_rate": 9.38078185041708e-06, "loss": 0.2315, "num_input_tokens_seen": 9589648, "step": 15735 }, { "epoch": 4.883648774433757, "grad_norm": 0.04979484900832176, "learning_rate": 9.380129117861775e-06, "loss": 0.2294, "num_input_tokens_seen": 9592144, "step": 15740 }, { "epoch": 4.885200124107974, "grad_norm": 0.05303725227713585, "learning_rate": 9.379476064191137e-06, "loss": 0.2299, "num_input_tokens_seen": 9595568, "step": 15745 }, { "epoch": 4.886751473782191, "grad_norm": 0.1375114619731903, "learning_rate": 9.378822689453043e-06, "loss": 0.2316, "num_input_tokens_seen": 9598416, "step": 15750 }, { "epoch": 4.888302823456407, "grad_norm": 0.15939605236053467, "learning_rate": 9.378168993695391e-06, "loss": 0.2301, "num_input_tokens_seen": 9601520, "step": 15755 }, { "epoch": 4.889854173130623, "grad_norm": 0.1715235412120819, "learning_rate": 9.377514976966109e-06, "loss": 0.2334, "num_input_tokens_seen": 9604400, "step": 15760 }, { "epoch": 4.89140552280484, "grad_norm": 0.14098341763019562, "learning_rate": 9.376860639313139e-06, "loss": 0.2324, "num_input_tokens_seen": 9606704, "step": 15765 }, { "epoch": 4.892956872479057, "grad_norm": 0.04497254639863968, "learning_rate": 9.376205980784456e-06, "loss": 0.2273, "num_input_tokens_seen": 9609488, "step": 15770 }, { "epoch": 4.894508222153274, "grad_norm": 0.2957552969455719, "learning_rate": 9.375551001428053e-06, "loss": 0.233, "num_input_tokens_seen": 9612944, "step": 15775 }, { "epoch": 4.8960595718274895, "grad_norm": 0.1605653017759323, "learning_rate": 9.374895701291945e-06, "loss": 0.2329, "num_input_tokens_seen": 9615856, "step": 15780 }, { "epoch": 4.897610921501706, "grad_norm": 0.14074242115020752, "learning_rate": 9.374240080424178e-06, "loss": 0.2316, "num_input_tokens_seen": 9618416, "step": 15785 }, { "epoch": 4.899162271175923, "grad_norm": 0.05051097646355629, "learning_rate": 9.373584138872813e-06, "loss": 0.2323, "num_input_tokens_seen": 9620880, "step": 15790 }, { "epoch": 4.90071362085014, "grad_norm": 0.15138041973114014, "learning_rate": 9.372927876685941e-06, "loss": 0.2316, "num_input_tokens_seen": 9623888, "step": 15795 }, { "epoch": 4.902264970524357, "grad_norm": 0.13795511424541473, "learning_rate": 9.372271293911672e-06, "loss": 0.2314, "num_input_tokens_seen": 9625936, "step": 15800 }, { "epoch": 4.9038163201985725, "grad_norm": 0.13370297849178314, "learning_rate": 9.371614390598141e-06, "loss": 0.2315, "num_input_tokens_seen": 9628528, "step": 15805 }, { "epoch": 4.905367669872789, "grad_norm": 0.05540516972541809, "learning_rate": 9.370957166793508e-06, "loss": 0.228, "num_input_tokens_seen": 9632528, "step": 15810 }, { "epoch": 4.906919019547006, "grad_norm": 0.1345766931772232, "learning_rate": 9.370299622545955e-06, "loss": 0.2307, "num_input_tokens_seen": 9635632, "step": 15815 }, { "epoch": 4.908470369221223, "grad_norm": 0.14829102158546448, "learning_rate": 9.369641757903687e-06, "loss": 0.234, "num_input_tokens_seen": 9637936, "step": 15820 }, { "epoch": 4.910021718895439, "grad_norm": 0.14987607300281525, "learning_rate": 9.368983572914933e-06, "loss": 0.2323, "num_input_tokens_seen": 9641072, "step": 15825 }, { "epoch": 4.9115730685696555, "grad_norm": 0.03604636341333389, "learning_rate": 9.368325067627949e-06, "loss": 0.2341, "num_input_tokens_seen": 9643696, "step": 15830 }, { "epoch": 4.913124418243872, "grad_norm": 0.05022779107093811, "learning_rate": 9.367666242091007e-06, "loss": 0.2289, "num_input_tokens_seen": 9646864, "step": 15835 }, { "epoch": 4.914675767918089, "grad_norm": 0.12931841611862183, "learning_rate": 9.36700709635241e-06, "loss": 0.2299, "num_input_tokens_seen": 9650576, "step": 15840 }, { "epoch": 4.916227117592305, "grad_norm": 0.04816512018442154, "learning_rate": 9.366347630460477e-06, "loss": 0.2368, "num_input_tokens_seen": 9654352, "step": 15845 }, { "epoch": 4.917778467266522, "grad_norm": 0.041730161756277084, "learning_rate": 9.36568784446356e-06, "loss": 0.2346, "num_input_tokens_seen": 9656880, "step": 15850 }, { "epoch": 4.9193298169407385, "grad_norm": 0.13033953309059143, "learning_rate": 9.365027738410026e-06, "loss": 0.2356, "num_input_tokens_seen": 9659792, "step": 15855 }, { "epoch": 4.920881166614955, "grad_norm": 0.05601510778069496, "learning_rate": 9.364367312348267e-06, "loss": 0.2309, "num_input_tokens_seen": 9662608, "step": 15860 }, { "epoch": 4.922432516289172, "grad_norm": 0.04147116839885712, "learning_rate": 9.363706566326706e-06, "loss": 0.2358, "num_input_tokens_seen": 9665552, "step": 15865 }, { "epoch": 4.923983865963388, "grad_norm": 0.05278780311346054, "learning_rate": 9.363045500393777e-06, "loss": 0.231, "num_input_tokens_seen": 9668496, "step": 15870 }, { "epoch": 4.925535215637605, "grad_norm": 0.05473915860056877, "learning_rate": 9.362384114597947e-06, "loss": 0.2362, "num_input_tokens_seen": 9670832, "step": 15875 }, { "epoch": 4.927086565311821, "grad_norm": 0.05698508396744728, "learning_rate": 9.361722408987703e-06, "loss": 0.2284, "num_input_tokens_seen": 9674160, "step": 15880 }, { "epoch": 4.928637914986038, "grad_norm": 0.06838351488113403, "learning_rate": 9.361060383611557e-06, "loss": 0.2269, "num_input_tokens_seen": 9676464, "step": 15885 }, { "epoch": 4.930189264660254, "grad_norm": 0.1388397514820099, "learning_rate": 9.360398038518041e-06, "loss": 0.2305, "num_input_tokens_seen": 9678992, "step": 15890 }, { "epoch": 4.931740614334471, "grad_norm": 0.06311114877462387, "learning_rate": 9.359735373755716e-06, "loss": 0.2369, "num_input_tokens_seen": 9682032, "step": 15895 }, { "epoch": 4.933291964008688, "grad_norm": 0.07382053881883621, "learning_rate": 9.359072389373158e-06, "loss": 0.2268, "num_input_tokens_seen": 9685072, "step": 15900 }, { "epoch": 4.934843313682904, "grad_norm": 0.05847828462719917, "learning_rate": 9.358409085418976e-06, "loss": 0.233, "num_input_tokens_seen": 9687600, "step": 15905 }, { "epoch": 4.93639466335712, "grad_norm": 0.12853246927261353, "learning_rate": 9.357745461941798e-06, "loss": 0.2271, "num_input_tokens_seen": 9690576, "step": 15910 }, { "epoch": 4.937946013031337, "grad_norm": 0.2931790053844452, "learning_rate": 9.357081518990274e-06, "loss": 0.2319, "num_input_tokens_seen": 9693392, "step": 15915 }, { "epoch": 4.939497362705554, "grad_norm": 0.24959219992160797, "learning_rate": 9.356417256613077e-06, "loss": 0.2228, "num_input_tokens_seen": 9697616, "step": 15920 }, { "epoch": 4.941048712379771, "grad_norm": 0.059211406856775284, "learning_rate": 9.35575267485891e-06, "loss": 0.2304, "num_input_tokens_seen": 9700432, "step": 15925 }, { "epoch": 4.942600062053987, "grad_norm": 0.12213299423456192, "learning_rate": 9.35508777377649e-06, "loss": 0.2307, "num_input_tokens_seen": 9703632, "step": 15930 }, { "epoch": 4.944151411728203, "grad_norm": 0.07070647925138474, "learning_rate": 9.354422553414565e-06, "loss": 0.2332, "num_input_tokens_seen": 9706224, "step": 15935 }, { "epoch": 4.94570276140242, "grad_norm": 0.1265215426683426, "learning_rate": 9.353757013821903e-06, "loss": 0.2242, "num_input_tokens_seen": 9708496, "step": 15940 }, { "epoch": 4.947254111076637, "grad_norm": 0.30886411666870117, "learning_rate": 9.353091155047295e-06, "loss": 0.2379, "num_input_tokens_seen": 9712304, "step": 15945 }, { "epoch": 4.948805460750854, "grad_norm": 0.11414899677038193, "learning_rate": 9.352424977139559e-06, "loss": 0.2303, "num_input_tokens_seen": 9714768, "step": 15950 }, { "epoch": 4.9503568104250695, "grad_norm": 0.11400870978832245, "learning_rate": 9.351758480147529e-06, "loss": 0.2281, "num_input_tokens_seen": 9717776, "step": 15955 }, { "epoch": 4.951908160099286, "grad_norm": 0.05835762247443199, "learning_rate": 9.351091664120072e-06, "loss": 0.2432, "num_input_tokens_seen": 9720976, "step": 15960 }, { "epoch": 4.953459509773503, "grad_norm": 0.27496635913848877, "learning_rate": 9.35042452910607e-06, "loss": 0.2441, "num_input_tokens_seen": 9723888, "step": 15965 }, { "epoch": 4.95501085944772, "grad_norm": 0.24898235499858856, "learning_rate": 9.349757075154434e-06, "loss": 0.229, "num_input_tokens_seen": 9726928, "step": 15970 }, { "epoch": 4.956562209121936, "grad_norm": 0.04670589417219162, "learning_rate": 9.349089302314094e-06, "loss": 0.2295, "num_input_tokens_seen": 9729904, "step": 15975 }, { "epoch": 4.9581135587961525, "grad_norm": 0.13386870920658112, "learning_rate": 9.348421210634008e-06, "loss": 0.2295, "num_input_tokens_seen": 9733392, "step": 15980 }, { "epoch": 4.959664908470369, "grad_norm": 0.1292804628610611, "learning_rate": 9.347752800163156e-06, "loss": 0.2314, "num_input_tokens_seen": 9736144, "step": 15985 }, { "epoch": 4.961216258144586, "grad_norm": 0.055436477065086365, "learning_rate": 9.347084070950538e-06, "loss": 0.2304, "num_input_tokens_seen": 9738928, "step": 15990 }, { "epoch": 4.962767607818803, "grad_norm": 0.04318033903837204, "learning_rate": 9.346415023045178e-06, "loss": 0.2325, "num_input_tokens_seen": 9741520, "step": 15995 }, { "epoch": 4.964318957493019, "grad_norm": 0.04267216473817825, "learning_rate": 9.34574565649613e-06, "loss": 0.2345, "num_input_tokens_seen": 9745552, "step": 16000 }, { "epoch": 4.965870307167235, "grad_norm": 0.050793372094631195, "learning_rate": 9.345075971352464e-06, "loss": 0.2299, "num_input_tokens_seen": 9748016, "step": 16005 }, { "epoch": 4.967421656841452, "grad_norm": 0.06232389435172081, "learning_rate": 9.344405967663275e-06, "loss": 0.2303, "num_input_tokens_seen": 9751664, "step": 16010 }, { "epoch": 4.968973006515669, "grad_norm": 0.03918719291687012, "learning_rate": 9.343735645477684e-06, "loss": 0.2341, "num_input_tokens_seen": 9756592, "step": 16015 }, { "epoch": 4.970524356189885, "grad_norm": 0.04294850304722786, "learning_rate": 9.343065004844832e-06, "loss": 0.2288, "num_input_tokens_seen": 9759664, "step": 16020 }, { "epoch": 4.972075705864102, "grad_norm": 0.23654805123806, "learning_rate": 9.342394045813887e-06, "loss": 0.2332, "num_input_tokens_seen": 9762672, "step": 16025 }, { "epoch": 4.973627055538318, "grad_norm": 0.12051774561405182, "learning_rate": 9.341722768434034e-06, "loss": 0.2283, "num_input_tokens_seen": 9765744, "step": 16030 }, { "epoch": 4.975178405212535, "grad_norm": 0.1220950335264206, "learning_rate": 9.34105117275449e-06, "loss": 0.2314, "num_input_tokens_seen": 9770032, "step": 16035 }, { "epoch": 4.976729754886751, "grad_norm": 0.12229893356561661, "learning_rate": 9.34037925882449e-06, "loss": 0.2303, "num_input_tokens_seen": 9772592, "step": 16040 }, { "epoch": 4.978281104560968, "grad_norm": 0.12474238872528076, "learning_rate": 9.339707026693292e-06, "loss": 0.2324, "num_input_tokens_seen": 9775632, "step": 16045 }, { "epoch": 4.979832454235185, "grad_norm": 0.1190161183476448, "learning_rate": 9.339034476410177e-06, "loss": 0.2293, "num_input_tokens_seen": 9779760, "step": 16050 }, { "epoch": 4.981383803909401, "grad_norm": 0.2425748109817505, "learning_rate": 9.338361608024456e-06, "loss": 0.2345, "num_input_tokens_seen": 9783952, "step": 16055 }, { "epoch": 4.982935153583618, "grad_norm": 0.11850958317518234, "learning_rate": 9.337688421585455e-06, "loss": 0.2258, "num_input_tokens_seen": 9788144, "step": 16060 }, { "epoch": 4.984486503257834, "grad_norm": 0.24317127466201782, "learning_rate": 9.337014917142526e-06, "loss": 0.2356, "num_input_tokens_seen": 9790448, "step": 16065 }, { "epoch": 4.986037852932051, "grad_norm": 0.11266729235649109, "learning_rate": 9.336341094745044e-06, "loss": 0.2373, "num_input_tokens_seen": 9792816, "step": 16070 }, { "epoch": 4.987589202606268, "grad_norm": 0.12519392371177673, "learning_rate": 9.335666954442413e-06, "loss": 0.2325, "num_input_tokens_seen": 9796176, "step": 16075 }, { "epoch": 4.989140552280484, "grad_norm": 0.12464827299118042, "learning_rate": 9.33499249628405e-06, "loss": 0.2345, "num_input_tokens_seen": 9798736, "step": 16080 }, { "epoch": 4.9906919019547, "grad_norm": 0.049457255750894547, "learning_rate": 9.334317720319403e-06, "loss": 0.2303, "num_input_tokens_seen": 9802704, "step": 16085 }, { "epoch": 4.992243251628917, "grad_norm": 0.13218280673027039, "learning_rate": 9.333642626597942e-06, "loss": 0.23, "num_input_tokens_seen": 9805264, "step": 16090 }, { "epoch": 4.993794601303134, "grad_norm": 0.11542253941297531, "learning_rate": 9.332967215169157e-06, "loss": 0.2313, "num_input_tokens_seen": 9808528, "step": 16095 }, { "epoch": 4.995345950977351, "grad_norm": 0.04575575143098831, "learning_rate": 9.332291486082568e-06, "loss": 0.2217, "num_input_tokens_seen": 9812208, "step": 16100 }, { "epoch": 4.9968973006515665, "grad_norm": 0.19070248305797577, "learning_rate": 9.33161543938771e-06, "loss": 0.2364, "num_input_tokens_seen": 9814896, "step": 16105 }, { "epoch": 4.998448650325783, "grad_norm": 0.30498331785202026, "learning_rate": 9.330939075134144e-06, "loss": 0.2451, "num_input_tokens_seen": 9818352, "step": 16110 }, { "epoch": 5.0, "grad_norm": 0.23523904383182526, "learning_rate": 9.330262393371461e-06, "loss": 0.2297, "num_input_tokens_seen": 9821920, "step": 16115 }, { "epoch": 5.001551349674217, "grad_norm": 0.12136494368314743, "learning_rate": 9.329585394149264e-06, "loss": 0.2295, "num_input_tokens_seen": 9825824, "step": 16120 }, { "epoch": 5.0031026993484335, "grad_norm": 0.1294672042131424, "learning_rate": 9.328908077517189e-06, "loss": 0.2348, "num_input_tokens_seen": 9828864, "step": 16125 }, { "epoch": 5.004654049022649, "grad_norm": 0.21089914441108704, "learning_rate": 9.32823044352489e-06, "loss": 0.2297, "num_input_tokens_seen": 9831904, "step": 16130 }, { "epoch": 5.006205398696866, "grad_norm": 0.10834994167089462, "learning_rate": 9.327552492222046e-06, "loss": 0.2301, "num_input_tokens_seen": 9834848, "step": 16135 }, { "epoch": 5.007756748371083, "grad_norm": 0.13013306260108948, "learning_rate": 9.326874223658356e-06, "loss": 0.2323, "num_input_tokens_seen": 9837504, "step": 16140 }, { "epoch": 5.0093080980453, "grad_norm": 0.11528048664331436, "learning_rate": 9.32619563788355e-06, "loss": 0.2285, "num_input_tokens_seen": 9841440, "step": 16145 }, { "epoch": 5.010859447719516, "grad_norm": 0.045729007571935654, "learning_rate": 9.325516734947372e-06, "loss": 0.2359, "num_input_tokens_seen": 9844576, "step": 16150 }, { "epoch": 5.012410797393732, "grad_norm": 0.05302665755152702, "learning_rate": 9.324837514899598e-06, "loss": 0.2295, "num_input_tokens_seen": 9847616, "step": 16155 }, { "epoch": 5.013962147067949, "grad_norm": 0.12348455935716629, "learning_rate": 9.324157977790018e-06, "loss": 0.2305, "num_input_tokens_seen": 9853088, "step": 16160 }, { "epoch": 5.015513496742166, "grad_norm": 0.03843802958726883, "learning_rate": 9.323478123668455e-06, "loss": 0.2325, "num_input_tokens_seen": 9856128, "step": 16165 }, { "epoch": 5.017064846416382, "grad_norm": 0.11636534333229065, "learning_rate": 9.322797952584745e-06, "loss": 0.235, "num_input_tokens_seen": 9859360, "step": 16170 }, { "epoch": 5.018616196090599, "grad_norm": 0.032877642661333084, "learning_rate": 9.322117464588757e-06, "loss": 0.2288, "num_input_tokens_seen": 9862304, "step": 16175 }, { "epoch": 5.020167545764815, "grad_norm": 0.12206089496612549, "learning_rate": 9.321436659730378e-06, "loss": 0.2347, "num_input_tokens_seen": 9864832, "step": 16180 }, { "epoch": 5.021718895439032, "grad_norm": 0.12214300781488419, "learning_rate": 9.320755538059519e-06, "loss": 0.2299, "num_input_tokens_seen": 9868544, "step": 16185 }, { "epoch": 5.023270245113249, "grad_norm": 0.11491883546113968, "learning_rate": 9.320074099626112e-06, "loss": 0.2298, "num_input_tokens_seen": 9870848, "step": 16190 }, { "epoch": 5.024821594787465, "grad_norm": 0.1246728003025055, "learning_rate": 9.319392344480116e-06, "loss": 0.2284, "num_input_tokens_seen": 9874176, "step": 16195 }, { "epoch": 5.026372944461682, "grad_norm": 0.10859304666519165, "learning_rate": 9.318710272671514e-06, "loss": 0.2285, "num_input_tokens_seen": 9877504, "step": 16200 }, { "epoch": 5.027924294135898, "grad_norm": 0.12836894392967224, "learning_rate": 9.318027884250304e-06, "loss": 0.2323, "num_input_tokens_seen": 9880832, "step": 16205 }, { "epoch": 5.029475643810115, "grad_norm": 0.03800211101770401, "learning_rate": 9.317345179266522e-06, "loss": 0.2338, "num_input_tokens_seen": 9883744, "step": 16210 }, { "epoch": 5.031026993484331, "grad_norm": 0.10925821959972382, "learning_rate": 9.31666215777021e-06, "loss": 0.224, "num_input_tokens_seen": 9888736, "step": 16215 }, { "epoch": 5.032578343158548, "grad_norm": 0.10495569556951523, "learning_rate": 9.315978819811445e-06, "loss": 0.2311, "num_input_tokens_seen": 9891424, "step": 16220 }, { "epoch": 5.034129692832765, "grad_norm": 0.20324824750423431, "learning_rate": 9.315295165440324e-06, "loss": 0.2257, "num_input_tokens_seen": 9893728, "step": 16225 }, { "epoch": 5.035681042506981, "grad_norm": 0.04095056280493736, "learning_rate": 9.314611194706966e-06, "loss": 0.2367, "num_input_tokens_seen": 9896416, "step": 16230 }, { "epoch": 5.037232392181197, "grad_norm": 0.13789425790309906, "learning_rate": 9.313926907661518e-06, "loss": 0.2263, "num_input_tokens_seen": 9899072, "step": 16235 }, { "epoch": 5.038783741855414, "grad_norm": 0.05142098665237427, "learning_rate": 9.31324230435414e-06, "loss": 0.2413, "num_input_tokens_seen": 9901664, "step": 16240 }, { "epoch": 5.040335091529631, "grad_norm": 0.05390617623925209, "learning_rate": 9.312557384835025e-06, "loss": 0.2329, "num_input_tokens_seen": 9904704, "step": 16245 }, { "epoch": 5.0418864412038475, "grad_norm": 0.22604946792125702, "learning_rate": 9.311872149154386e-06, "loss": 0.2404, "num_input_tokens_seen": 9908672, "step": 16250 }, { "epoch": 5.043437790878064, "grad_norm": 0.03120468184351921, "learning_rate": 9.311186597362458e-06, "loss": 0.233, "num_input_tokens_seen": 9911072, "step": 16255 }, { "epoch": 5.04498914055228, "grad_norm": 0.04241444170475006, "learning_rate": 9.3105007295095e-06, "loss": 0.2314, "num_input_tokens_seen": 9913600, "step": 16260 }, { "epoch": 5.046540490226497, "grad_norm": 0.03211934119462967, "learning_rate": 9.309814545645794e-06, "loss": 0.2267, "num_input_tokens_seen": 9916448, "step": 16265 }, { "epoch": 5.048091839900714, "grad_norm": 0.10520292073488235, "learning_rate": 9.309128045821649e-06, "loss": 0.2283, "num_input_tokens_seen": 9919008, "step": 16270 }, { "epoch": 5.0496431895749305, "grad_norm": 0.04481491446495056, "learning_rate": 9.308441230087389e-06, "loss": 0.2299, "num_input_tokens_seen": 9921888, "step": 16275 }, { "epoch": 5.051194539249146, "grad_norm": 0.09416848421096802, "learning_rate": 9.307754098493364e-06, "loss": 0.2295, "num_input_tokens_seen": 9924416, "step": 16280 }, { "epoch": 5.052745888923363, "grad_norm": 0.03898242861032486, "learning_rate": 9.307066651089955e-06, "loss": 0.2357, "num_input_tokens_seen": 9927872, "step": 16285 }, { "epoch": 5.05429723859758, "grad_norm": 0.10990217328071594, "learning_rate": 9.306378887927558e-06, "loss": 0.227, "num_input_tokens_seen": 9930176, "step": 16290 }, { "epoch": 5.055848588271797, "grad_norm": 0.11496009677648544, "learning_rate": 9.305690809056592e-06, "loss": 0.2373, "num_input_tokens_seen": 9932384, "step": 16295 }, { "epoch": 5.057399937946013, "grad_norm": 0.21009600162506104, "learning_rate": 9.305002414527505e-06, "loss": 0.2283, "num_input_tokens_seen": 9935520, "step": 16300 }, { "epoch": 5.058951287620229, "grad_norm": 0.1168862134218216, "learning_rate": 9.30431370439076e-06, "loss": 0.2326, "num_input_tokens_seen": 9938848, "step": 16305 }, { "epoch": 5.060502637294446, "grad_norm": 0.03740636259317398, "learning_rate": 9.303624678696849e-06, "loss": 0.2284, "num_input_tokens_seen": 9940896, "step": 16310 }, { "epoch": 5.062053986968663, "grad_norm": 0.11699020862579346, "learning_rate": 9.302935337496288e-06, "loss": 0.234, "num_input_tokens_seen": 9944160, "step": 16315 }, { "epoch": 5.06360533664288, "grad_norm": 0.052635557949543, "learning_rate": 9.302245680839611e-06, "loss": 0.231, "num_input_tokens_seen": 9946880, "step": 16320 }, { "epoch": 5.065156686317096, "grad_norm": 0.03908523544669151, "learning_rate": 9.301555708777381e-06, "loss": 0.2275, "num_input_tokens_seen": 9949536, "step": 16325 }, { "epoch": 5.066708035991312, "grad_norm": 0.048475608229637146, "learning_rate": 9.300865421360179e-06, "loss": 0.2336, "num_input_tokens_seen": 9952192, "step": 16330 }, { "epoch": 5.068259385665529, "grad_norm": 0.10499021410942078, "learning_rate": 9.30017481863861e-06, "loss": 0.2311, "num_input_tokens_seen": 9955136, "step": 16335 }, { "epoch": 5.069810735339746, "grad_norm": 0.11228791624307632, "learning_rate": 9.299483900663307e-06, "loss": 0.2332, "num_input_tokens_seen": 9957536, "step": 16340 }, { "epoch": 5.071362085013962, "grad_norm": 0.1055055484175682, "learning_rate": 9.29879266748492e-06, "loss": 0.2312, "num_input_tokens_seen": 9960288, "step": 16345 }, { "epoch": 5.072913434688179, "grad_norm": 0.10246764123439789, "learning_rate": 9.298101119154126e-06, "loss": 0.2308, "num_input_tokens_seen": 9963488, "step": 16350 }, { "epoch": 5.074464784362395, "grad_norm": 0.11555895954370499, "learning_rate": 9.297409255721622e-06, "loss": 0.2359, "num_input_tokens_seen": 9966400, "step": 16355 }, { "epoch": 5.076016134036612, "grad_norm": 0.04007483273744583, "learning_rate": 9.296717077238131e-06, "loss": 0.2339, "num_input_tokens_seen": 9969312, "step": 16360 }, { "epoch": 5.077567483710828, "grad_norm": 0.03286530077457428, "learning_rate": 9.296024583754397e-06, "loss": 0.2357, "num_input_tokens_seen": 9971712, "step": 16365 }, { "epoch": 5.079118833385045, "grad_norm": 0.12103966623544693, "learning_rate": 9.29533177532119e-06, "loss": 0.23, "num_input_tokens_seen": 9974016, "step": 16370 }, { "epoch": 5.0806701830592615, "grad_norm": 0.209815114736557, "learning_rate": 9.294638651989298e-06, "loss": 0.2326, "num_input_tokens_seen": 9976544, "step": 16375 }, { "epoch": 5.082221532733478, "grad_norm": 0.11461333930492401, "learning_rate": 9.293945213809536e-06, "loss": 0.23, "num_input_tokens_seen": 9979680, "step": 16380 }, { "epoch": 5.083772882407695, "grad_norm": 0.04073602333664894, "learning_rate": 9.293251460832744e-06, "loss": 0.2295, "num_input_tokens_seen": 9983296, "step": 16385 }, { "epoch": 5.085324232081911, "grad_norm": 0.11125779151916504, "learning_rate": 9.292557393109779e-06, "loss": 0.2332, "num_input_tokens_seen": 9985568, "step": 16390 }, { "epoch": 5.086875581756128, "grad_norm": 0.13159088790416718, "learning_rate": 9.291863010691525e-06, "loss": 0.2366, "num_input_tokens_seen": 9989568, "step": 16395 }, { "epoch": 5.0884269314303445, "grad_norm": 0.1166040301322937, "learning_rate": 9.29116831362889e-06, "loss": 0.2333, "num_input_tokens_seen": 9992736, "step": 16400 }, { "epoch": 5.089978281104561, "grad_norm": 0.10485243797302246, "learning_rate": 9.290473301972802e-06, "loss": 0.227, "num_input_tokens_seen": 9995040, "step": 16405 }, { "epoch": 5.091529630778777, "grad_norm": 0.11677604913711548, "learning_rate": 9.289777975774214e-06, "loss": 0.2333, "num_input_tokens_seen": 9998208, "step": 16410 }, { "epoch": 5.093080980452994, "grad_norm": 0.1411379724740982, "learning_rate": 9.289082335084102e-06, "loss": 0.2254, "num_input_tokens_seen": 10001376, "step": 16415 }, { "epoch": 5.094632330127211, "grad_norm": 0.11159054934978485, "learning_rate": 9.288386379953467e-06, "loss": 0.2302, "num_input_tokens_seen": 10005088, "step": 16420 }, { "epoch": 5.0961836798014275, "grad_norm": 0.22834841907024384, "learning_rate": 9.287690110433325e-06, "loss": 0.2349, "num_input_tokens_seen": 10008992, "step": 16425 }, { "epoch": 5.097735029475643, "grad_norm": 0.04768911376595497, "learning_rate": 9.286993526574725e-06, "loss": 0.2308, "num_input_tokens_seen": 10011360, "step": 16430 }, { "epoch": 5.09928637914986, "grad_norm": 0.12165108323097229, "learning_rate": 9.286296628428735e-06, "loss": 0.235, "num_input_tokens_seen": 10014144, "step": 16435 }, { "epoch": 5.100837728824077, "grad_norm": 0.044170912355184555, "learning_rate": 9.285599416046443e-06, "loss": 0.2285, "num_input_tokens_seen": 10017152, "step": 16440 }, { "epoch": 5.102389078498294, "grad_norm": 0.21153408288955688, "learning_rate": 9.284901889478965e-06, "loss": 0.2322, "num_input_tokens_seen": 10020288, "step": 16445 }, { "epoch": 5.1039404281725105, "grad_norm": 0.11621267348527908, "learning_rate": 9.284204048777438e-06, "loss": 0.2279, "num_input_tokens_seen": 10024000, "step": 16450 }, { "epoch": 5.105491777846726, "grad_norm": 0.22495712339878082, "learning_rate": 9.283505893993024e-06, "loss": 0.2304, "num_input_tokens_seen": 10026784, "step": 16455 }, { "epoch": 5.107043127520943, "grad_norm": 0.052711401134729385, "learning_rate": 9.2828074251769e-06, "loss": 0.2279, "num_input_tokens_seen": 10029760, "step": 16460 }, { "epoch": 5.10859447719516, "grad_norm": 0.11746495217084885, "learning_rate": 9.282108642380279e-06, "loss": 0.2319, "num_input_tokens_seen": 10033376, "step": 16465 }, { "epoch": 5.110145826869377, "grad_norm": 0.25847136974334717, "learning_rate": 9.281409545654385e-06, "loss": 0.2294, "num_input_tokens_seen": 10036128, "step": 16470 }, { "epoch": 5.111697176543593, "grad_norm": 0.05106485262513161, "learning_rate": 9.28071013505047e-06, "loss": 0.2333, "num_input_tokens_seen": 10038656, "step": 16475 }, { "epoch": 5.113248526217809, "grad_norm": 0.1432371586561203, "learning_rate": 9.280010410619812e-06, "loss": 0.2419, "num_input_tokens_seen": 10042432, "step": 16480 }, { "epoch": 5.114799875892026, "grad_norm": 0.0526241660118103, "learning_rate": 9.27931037241371e-06, "loss": 0.2285, "num_input_tokens_seen": 10046048, "step": 16485 }, { "epoch": 5.116351225566243, "grad_norm": 0.10174153000116348, "learning_rate": 9.278610020483483e-06, "loss": 0.2247, "num_input_tokens_seen": 10048864, "step": 16490 }, { "epoch": 5.117902575240459, "grad_norm": 0.05237482115626335, "learning_rate": 9.277909354880473e-06, "loss": 0.2336, "num_input_tokens_seen": 10052000, "step": 16495 }, { "epoch": 5.1194539249146755, "grad_norm": 0.12778089940547943, "learning_rate": 9.277208375656051e-06, "loss": 0.2242, "num_input_tokens_seen": 10054816, "step": 16500 }, { "epoch": 5.121005274588892, "grad_norm": 0.03143162652850151, "learning_rate": 9.276507082861604e-06, "loss": 0.2303, "num_input_tokens_seen": 10058176, "step": 16505 }, { "epoch": 5.122556624263109, "grad_norm": 0.11188463121652603, "learning_rate": 9.275805476548548e-06, "loss": 0.2283, "num_input_tokens_seen": 10060672, "step": 16510 }, { "epoch": 5.124107973937326, "grad_norm": 0.21431268751621246, "learning_rate": 9.275103556768316e-06, "loss": 0.2286, "num_input_tokens_seen": 10063680, "step": 16515 }, { "epoch": 5.125659323611542, "grad_norm": 0.055590905249118805, "learning_rate": 9.274401323572368e-06, "loss": 0.2256, "num_input_tokens_seen": 10067584, "step": 16520 }, { "epoch": 5.1272106732857585, "grad_norm": 0.2262844294309616, "learning_rate": 9.273698777012188e-06, "loss": 0.2177, "num_input_tokens_seen": 10070080, "step": 16525 }, { "epoch": 5.128762022959975, "grad_norm": 0.04747360199689865, "learning_rate": 9.272995917139278e-06, "loss": 0.2383, "num_input_tokens_seen": 10073472, "step": 16530 }, { "epoch": 5.130313372634192, "grad_norm": 0.24169524013996124, "learning_rate": 9.27229274400517e-06, "loss": 0.2286, "num_input_tokens_seen": 10076256, "step": 16535 }, { "epoch": 5.131864722308408, "grad_norm": 0.05851972475647926, "learning_rate": 9.27158925766141e-06, "loss": 0.2434, "num_input_tokens_seen": 10078944, "step": 16540 }, { "epoch": 5.133416071982625, "grad_norm": 0.11590662598609924, "learning_rate": 9.270885458159576e-06, "loss": 0.233, "num_input_tokens_seen": 10081376, "step": 16545 }, { "epoch": 5.1349674216568415, "grad_norm": 0.11084474623203278, "learning_rate": 9.270181345551261e-06, "loss": 0.2297, "num_input_tokens_seen": 10087008, "step": 16550 }, { "epoch": 5.136518771331058, "grad_norm": 0.1378335803747177, "learning_rate": 9.269476919888087e-06, "loss": 0.241, "num_input_tokens_seen": 10090496, "step": 16555 }, { "epoch": 5.138070121005274, "grad_norm": 0.13223521411418915, "learning_rate": 9.2687721812217e-06, "loss": 0.2276, "num_input_tokens_seen": 10093984, "step": 16560 }, { "epoch": 5.139621470679491, "grad_norm": 0.10791841894388199, "learning_rate": 9.268067129603759e-06, "loss": 0.2322, "num_input_tokens_seen": 10096928, "step": 16565 }, { "epoch": 5.141172820353708, "grad_norm": 0.21824383735656738, "learning_rate": 9.267361765085956e-06, "loss": 0.226, "num_input_tokens_seen": 10099808, "step": 16570 }, { "epoch": 5.1427241700279245, "grad_norm": 0.11461703479290009, "learning_rate": 9.266656087720003e-06, "loss": 0.2369, "num_input_tokens_seen": 10102464, "step": 16575 }, { "epoch": 5.144275519702141, "grad_norm": 0.11862483620643616, "learning_rate": 9.265950097557634e-06, "loss": 0.23, "num_input_tokens_seen": 10105376, "step": 16580 }, { "epoch": 5.145826869376357, "grad_norm": 0.056033216416835785, "learning_rate": 9.265243794650607e-06, "loss": 0.2309, "num_input_tokens_seen": 10107840, "step": 16585 }, { "epoch": 5.147378219050574, "grad_norm": 0.21706420183181763, "learning_rate": 9.2645371790507e-06, "loss": 0.2284, "num_input_tokens_seen": 10111360, "step": 16590 }, { "epoch": 5.148929568724791, "grad_norm": 0.11001770198345184, "learning_rate": 9.263830250809722e-06, "loss": 0.228, "num_input_tokens_seen": 10115200, "step": 16595 }, { "epoch": 5.1504809183990075, "grad_norm": 0.1298963874578476, "learning_rate": 9.26312300997949e-06, "loss": 0.2297, "num_input_tokens_seen": 10119008, "step": 16600 }, { "epoch": 5.152032268073223, "grad_norm": 0.12852279841899872, "learning_rate": 9.262415456611862e-06, "loss": 0.2373, "num_input_tokens_seen": 10121280, "step": 16605 }, { "epoch": 5.15358361774744, "grad_norm": 0.05723283067345619, "learning_rate": 9.261707590758706e-06, "loss": 0.2324, "num_input_tokens_seen": 10123936, "step": 16610 }, { "epoch": 5.155134967421657, "grad_norm": 0.0530671626329422, "learning_rate": 9.260999412471918e-06, "loss": 0.231, "num_input_tokens_seen": 10126912, "step": 16615 }, { "epoch": 5.156686317095874, "grad_norm": 0.11885865777730942, "learning_rate": 9.260290921803415e-06, "loss": 0.2249, "num_input_tokens_seen": 10130112, "step": 16620 }, { "epoch": 5.1582376667700895, "grad_norm": 0.1337662786245346, "learning_rate": 9.259582118805137e-06, "loss": 0.2257, "num_input_tokens_seen": 10134112, "step": 16625 }, { "epoch": 5.159789016444306, "grad_norm": 0.1411302238702774, "learning_rate": 9.258873003529049e-06, "loss": 0.2158, "num_input_tokens_seen": 10136864, "step": 16630 }, { "epoch": 5.161340366118523, "grad_norm": 0.056882716715335846, "learning_rate": 9.258163576027137e-06, "loss": 0.2344, "num_input_tokens_seen": 10140576, "step": 16635 }, { "epoch": 5.16289171579274, "grad_norm": 0.16402949392795563, "learning_rate": 9.257453836351412e-06, "loss": 0.2289, "num_input_tokens_seen": 10143936, "step": 16640 }, { "epoch": 5.164443065466957, "grad_norm": 0.1764994114637375, "learning_rate": 9.256743784553905e-06, "loss": 0.2331, "num_input_tokens_seen": 10146688, "step": 16645 }, { "epoch": 5.1659944151411725, "grad_norm": 0.07830201834440231, "learning_rate": 9.25603342068667e-06, "loss": 0.2288, "num_input_tokens_seen": 10149760, "step": 16650 }, { "epoch": 5.167545764815389, "grad_norm": 0.09232094883918762, "learning_rate": 9.255322744801787e-06, "loss": 0.242, "num_input_tokens_seen": 10152736, "step": 16655 }, { "epoch": 5.169097114489606, "grad_norm": 0.22764410078525543, "learning_rate": 9.254611756951355e-06, "loss": 0.2177, "num_input_tokens_seen": 10155648, "step": 16660 }, { "epoch": 5.170648464163823, "grad_norm": 0.121961809694767, "learning_rate": 9.253900457187498e-06, "loss": 0.2199, "num_input_tokens_seen": 10158688, "step": 16665 }, { "epoch": 5.172199813838039, "grad_norm": 0.11830425262451172, "learning_rate": 9.253188845562368e-06, "loss": 0.2466, "num_input_tokens_seen": 10162112, "step": 16670 }, { "epoch": 5.1737511635122555, "grad_norm": 0.23388217389583588, "learning_rate": 9.252476922128128e-06, "loss": 0.2273, "num_input_tokens_seen": 10165216, "step": 16675 }, { "epoch": 5.175302513186472, "grad_norm": 0.22759638726711273, "learning_rate": 9.251764686936973e-06, "loss": 0.229, "num_input_tokens_seen": 10168480, "step": 16680 }, { "epoch": 5.176853862860689, "grad_norm": 0.15517327189445496, "learning_rate": 9.251052140041117e-06, "loss": 0.2476, "num_input_tokens_seen": 10171008, "step": 16685 }, { "epoch": 5.178405212534905, "grad_norm": 0.2113618403673172, "learning_rate": 9.250339281492801e-06, "loss": 0.2136, "num_input_tokens_seen": 10173792, "step": 16690 }, { "epoch": 5.179956562209122, "grad_norm": 0.14864814281463623, "learning_rate": 9.249626111344283e-06, "loss": 0.2285, "num_input_tokens_seen": 10176224, "step": 16695 }, { "epoch": 5.1815079118833385, "grad_norm": 0.07285897433757782, "learning_rate": 9.248912629647849e-06, "loss": 0.2234, "num_input_tokens_seen": 10178688, "step": 16700 }, { "epoch": 5.183059261557555, "grad_norm": 0.06551947444677353, "learning_rate": 9.248198836455801e-06, "loss": 0.2201, "num_input_tokens_seen": 10181728, "step": 16705 }, { "epoch": 5.184610611231772, "grad_norm": 0.2777717411518097, "learning_rate": 9.247484731820475e-06, "loss": 0.2405, "num_input_tokens_seen": 10185248, "step": 16710 }, { "epoch": 5.186161960905988, "grad_norm": 0.19468380510807037, "learning_rate": 9.24677031579422e-06, "loss": 0.2408, "num_input_tokens_seen": 10187584, "step": 16715 }, { "epoch": 5.187713310580205, "grad_norm": 0.15104728937149048, "learning_rate": 9.246055588429412e-06, "loss": 0.2284, "num_input_tokens_seen": 10190176, "step": 16720 }, { "epoch": 5.1892646602544215, "grad_norm": 0.10453502088785172, "learning_rate": 9.24534054977845e-06, "loss": 0.2443, "num_input_tokens_seen": 10192576, "step": 16725 }, { "epoch": 5.190816009928638, "grad_norm": 0.09619386494159698, "learning_rate": 9.244625199893751e-06, "loss": 0.2201, "num_input_tokens_seen": 10195680, "step": 16730 }, { "epoch": 5.192367359602854, "grad_norm": 0.03881825506687164, "learning_rate": 9.24390953882776e-06, "loss": 0.243, "num_input_tokens_seen": 10198432, "step": 16735 }, { "epoch": 5.193918709277071, "grad_norm": 0.12542693316936493, "learning_rate": 9.243193566632947e-06, "loss": 0.2222, "num_input_tokens_seen": 10201152, "step": 16740 }, { "epoch": 5.195470058951288, "grad_norm": 0.047126635909080505, "learning_rate": 9.242477283361796e-06, "loss": 0.2231, "num_input_tokens_seen": 10203552, "step": 16745 }, { "epoch": 5.197021408625504, "grad_norm": 0.10385385155677795, "learning_rate": 9.241760689066826e-06, "loss": 0.2272, "num_input_tokens_seen": 10205888, "step": 16750 }, { "epoch": 5.19857275829972, "grad_norm": 0.1101737841963768, "learning_rate": 9.241043783800563e-06, "loss": 0.2311, "num_input_tokens_seen": 10209248, "step": 16755 }, { "epoch": 5.200124107973937, "grad_norm": 0.18843726813793182, "learning_rate": 9.240326567615572e-06, "loss": 0.2352, "num_input_tokens_seen": 10212000, "step": 16760 }, { "epoch": 5.201675457648154, "grad_norm": 0.12079007923603058, "learning_rate": 9.23960904056443e-06, "loss": 0.2353, "num_input_tokens_seen": 10214464, "step": 16765 }, { "epoch": 5.203226807322371, "grad_norm": 0.12268321961164474, "learning_rate": 9.23889120269974e-06, "loss": 0.2198, "num_input_tokens_seen": 10217056, "step": 16770 }, { "epoch": 5.204778156996587, "grad_norm": 0.10215108096599579, "learning_rate": 9.23817305407413e-06, "loss": 0.2297, "num_input_tokens_seen": 10219744, "step": 16775 }, { "epoch": 5.206329506670803, "grad_norm": 0.1910707652568817, "learning_rate": 9.237454594740245e-06, "loss": 0.2361, "num_input_tokens_seen": 10222752, "step": 16780 }, { "epoch": 5.20788085634502, "grad_norm": 0.12299222499132156, "learning_rate": 9.236735824750763e-06, "loss": 0.2378, "num_input_tokens_seen": 10225248, "step": 16785 }, { "epoch": 5.209432206019237, "grad_norm": 0.10116302967071533, "learning_rate": 9.236016744158371e-06, "loss": 0.2368, "num_input_tokens_seen": 10227744, "step": 16790 }, { "epoch": 5.210983555693454, "grad_norm": 0.11173579096794128, "learning_rate": 9.235297353015793e-06, "loss": 0.2323, "num_input_tokens_seen": 10230912, "step": 16795 }, { "epoch": 5.2125349053676695, "grad_norm": 0.04579833894968033, "learning_rate": 9.234577651375763e-06, "loss": 0.234, "num_input_tokens_seen": 10233824, "step": 16800 }, { "epoch": 5.214086255041886, "grad_norm": 0.0420941524207592, "learning_rate": 9.233857639291048e-06, "loss": 0.2309, "num_input_tokens_seen": 10236576, "step": 16805 }, { "epoch": 5.215637604716103, "grad_norm": 0.11672081798315048, "learning_rate": 9.233137316814431e-06, "loss": 0.2339, "num_input_tokens_seen": 10239264, "step": 16810 }, { "epoch": 5.21718895439032, "grad_norm": 0.10244487971067429, "learning_rate": 9.23241668399872e-06, "loss": 0.2325, "num_input_tokens_seen": 10243456, "step": 16815 }, { "epoch": 5.218740304064536, "grad_norm": 0.1997620314359665, "learning_rate": 9.231695740896749e-06, "loss": 0.2303, "num_input_tokens_seen": 10248448, "step": 16820 }, { "epoch": 5.2202916537387525, "grad_norm": 0.051435716450214386, "learning_rate": 9.230974487561367e-06, "loss": 0.2345, "num_input_tokens_seen": 10250784, "step": 16825 }, { "epoch": 5.221843003412969, "grad_norm": 0.20451854169368744, "learning_rate": 9.230252924045455e-06, "loss": 0.2329, "num_input_tokens_seen": 10253600, "step": 16830 }, { "epoch": 5.223394353087186, "grad_norm": 0.10648122429847717, "learning_rate": 9.229531050401908e-06, "loss": 0.2298, "num_input_tokens_seen": 10256960, "step": 16835 }, { "epoch": 5.224945702761403, "grad_norm": 0.03746258467435837, "learning_rate": 9.22880886668365e-06, "loss": 0.2309, "num_input_tokens_seen": 10259776, "step": 16840 }, { "epoch": 5.226497052435619, "grad_norm": 0.05063353478908539, "learning_rate": 9.228086372943627e-06, "loss": 0.234, "num_input_tokens_seen": 10262464, "step": 16845 }, { "epoch": 5.2280484021098355, "grad_norm": 0.03747524321079254, "learning_rate": 9.227363569234804e-06, "loss": 0.231, "num_input_tokens_seen": 10266240, "step": 16850 }, { "epoch": 5.229599751784052, "grad_norm": 0.10723564028739929, "learning_rate": 9.226640455610173e-06, "loss": 0.2314, "num_input_tokens_seen": 10269856, "step": 16855 }, { "epoch": 5.231151101458269, "grad_norm": 0.12029244750738144, "learning_rate": 9.225917032122743e-06, "loss": 0.2301, "num_input_tokens_seen": 10272768, "step": 16860 }, { "epoch": 5.232702451132485, "grad_norm": 0.040184054523706436, "learning_rate": 9.225193298825555e-06, "loss": 0.23, "num_input_tokens_seen": 10274880, "step": 16865 }, { "epoch": 5.234253800806702, "grad_norm": 0.034067630767822266, "learning_rate": 9.224469255771663e-06, "loss": 0.2357, "num_input_tokens_seen": 10277632, "step": 16870 }, { "epoch": 5.235805150480918, "grad_norm": 0.10196562111377716, "learning_rate": 9.223744903014149e-06, "loss": 0.2309, "num_input_tokens_seen": 10280480, "step": 16875 }, { "epoch": 5.237356500155135, "grad_norm": 0.03112536109983921, "learning_rate": 9.223020240606118e-06, "loss": 0.2309, "num_input_tokens_seen": 10283264, "step": 16880 }, { "epoch": 5.238907849829351, "grad_norm": 0.10885962843894958, "learning_rate": 9.222295268600695e-06, "loss": 0.2298, "num_input_tokens_seen": 10286016, "step": 16885 }, { "epoch": 5.240459199503568, "grad_norm": 0.09805825352668762, "learning_rate": 9.221569987051029e-06, "loss": 0.2309, "num_input_tokens_seen": 10288320, "step": 16890 }, { "epoch": 5.242010549177785, "grad_norm": 0.11105028539896011, "learning_rate": 9.220844396010292e-06, "loss": 0.2373, "num_input_tokens_seen": 10291968, "step": 16895 }, { "epoch": 5.243561898852001, "grad_norm": 0.10275639593601227, "learning_rate": 9.220118495531678e-06, "loss": 0.233, "num_input_tokens_seen": 10295008, "step": 16900 }, { "epoch": 5.245113248526218, "grad_norm": 0.04524046555161476, "learning_rate": 9.219392285668407e-06, "loss": 0.2304, "num_input_tokens_seen": 10297472, "step": 16905 }, { "epoch": 5.246664598200434, "grad_norm": 0.09992797672748566, "learning_rate": 9.218665766473713e-06, "loss": 0.2315, "num_input_tokens_seen": 10299840, "step": 16910 }, { "epoch": 5.248215947874651, "grad_norm": 0.11030504107475281, "learning_rate": 9.217938938000865e-06, "loss": 0.2283, "num_input_tokens_seen": 10302656, "step": 16915 }, { "epoch": 5.249767297548868, "grad_norm": 0.10711304098367691, "learning_rate": 9.217211800303143e-06, "loss": 0.2309, "num_input_tokens_seen": 10305312, "step": 16920 }, { "epoch": 5.251318647223084, "grad_norm": 0.03726789355278015, "learning_rate": 9.216484353433857e-06, "loss": 0.2295, "num_input_tokens_seen": 10307680, "step": 16925 }, { "epoch": 5.2528699968973, "grad_norm": 0.10888425260782242, "learning_rate": 9.215756597446338e-06, "loss": 0.2363, "num_input_tokens_seen": 10312000, "step": 16930 }, { "epoch": 5.254421346571517, "grad_norm": 0.12166691571474075, "learning_rate": 9.215028532393939e-06, "loss": 0.2295, "num_input_tokens_seen": 10316416, "step": 16935 }, { "epoch": 5.255972696245734, "grad_norm": 0.0451333224773407, "learning_rate": 9.214300158330034e-06, "loss": 0.2348, "num_input_tokens_seen": 10318432, "step": 16940 }, { "epoch": 5.257524045919951, "grad_norm": 0.1046961098909378, "learning_rate": 9.213571475308024e-06, "loss": 0.2304, "num_input_tokens_seen": 10320992, "step": 16945 }, { "epoch": 5.2590753955941665, "grad_norm": 0.05875053629279137, "learning_rate": 9.212842483381326e-06, "loss": 0.2326, "num_input_tokens_seen": 10323456, "step": 16950 }, { "epoch": 5.260626745268383, "grad_norm": 0.10992012917995453, "learning_rate": 9.212113182603388e-06, "loss": 0.23, "num_input_tokens_seen": 10328032, "step": 16955 }, { "epoch": 5.2621780949426, "grad_norm": 0.10882599651813507, "learning_rate": 9.211383573027675e-06, "loss": 0.2299, "num_input_tokens_seen": 10331136, "step": 16960 }, { "epoch": 5.263729444616817, "grad_norm": 0.048925526440143585, "learning_rate": 9.210653654707675e-06, "loss": 0.2331, "num_input_tokens_seen": 10334272, "step": 16965 }, { "epoch": 5.265280794291034, "grad_norm": 0.19995835423469543, "learning_rate": 9.2099234276969e-06, "loss": 0.232, "num_input_tokens_seen": 10336640, "step": 16970 }, { "epoch": 5.2668321439652495, "grad_norm": 0.04933738708496094, "learning_rate": 9.209192892048887e-06, "loss": 0.2335, "num_input_tokens_seen": 10339104, "step": 16975 }, { "epoch": 5.268383493639466, "grad_norm": 0.040556710213422775, "learning_rate": 9.208462047817191e-06, "loss": 0.231, "num_input_tokens_seen": 10342560, "step": 16980 }, { "epoch": 5.269934843313683, "grad_norm": 0.10436587035655975, "learning_rate": 9.207730895055389e-06, "loss": 0.2325, "num_input_tokens_seen": 10344768, "step": 16985 }, { "epoch": 5.2714861929879, "grad_norm": 0.046071428805589676, "learning_rate": 9.206999433817086e-06, "loss": 0.2295, "num_input_tokens_seen": 10347648, "step": 16990 }, { "epoch": 5.273037542662116, "grad_norm": 0.11089227348566055, "learning_rate": 9.206267664155906e-06, "loss": 0.2347, "num_input_tokens_seen": 10351232, "step": 16995 }, { "epoch": 5.274588892336332, "grad_norm": 0.10258685797452927, "learning_rate": 9.205535586125498e-06, "loss": 0.2336, "num_input_tokens_seen": 10354176, "step": 17000 }, { "epoch": 5.276140242010549, "grad_norm": 0.10578728467226028, "learning_rate": 9.20480319977953e-06, "loss": 0.2278, "num_input_tokens_seen": 10357376, "step": 17005 }, { "epoch": 5.277691591684766, "grad_norm": 0.03757962957024574, "learning_rate": 9.204070505171693e-06, "loss": 0.2341, "num_input_tokens_seen": 10360576, "step": 17010 }, { "epoch": 5.279242941358982, "grad_norm": 0.035140082240104675, "learning_rate": 9.203337502355707e-06, "loss": 0.2299, "num_input_tokens_seen": 10363104, "step": 17015 }, { "epoch": 5.280794291033199, "grad_norm": 0.04541613534092903, "learning_rate": 9.202604191385307e-06, "loss": 0.2331, "num_input_tokens_seen": 10365984, "step": 17020 }, { "epoch": 5.282345640707415, "grad_norm": 0.2108655869960785, "learning_rate": 9.201870572314252e-06, "loss": 0.239, "num_input_tokens_seen": 10368800, "step": 17025 }, { "epoch": 5.283896990381632, "grad_norm": 0.10622218251228333, "learning_rate": 9.201136645196327e-06, "loss": 0.2284, "num_input_tokens_seen": 10372000, "step": 17030 }, { "epoch": 5.285448340055849, "grad_norm": 0.19860097765922546, "learning_rate": 9.200402410085338e-06, "loss": 0.2289, "num_input_tokens_seen": 10374880, "step": 17035 }, { "epoch": 5.286999689730065, "grad_norm": 0.12132783234119415, "learning_rate": 9.199667867035111e-06, "loss": 0.2305, "num_input_tokens_seen": 10377568, "step": 17040 }, { "epoch": 5.288551039404282, "grad_norm": 0.10721871256828308, "learning_rate": 9.198933016099499e-06, "loss": 0.2351, "num_input_tokens_seen": 10380640, "step": 17045 }, { "epoch": 5.290102389078498, "grad_norm": 0.043435074388980865, "learning_rate": 9.198197857332371e-06, "loss": 0.231, "num_input_tokens_seen": 10383200, "step": 17050 }, { "epoch": 5.291653738752715, "grad_norm": 0.0338975265622139, "learning_rate": 9.19746239078763e-06, "loss": 0.2309, "num_input_tokens_seen": 10387968, "step": 17055 }, { "epoch": 5.293205088426931, "grad_norm": 0.045007675886154175, "learning_rate": 9.196726616519188e-06, "loss": 0.2353, "num_input_tokens_seen": 10390944, "step": 17060 }, { "epoch": 5.294756438101148, "grad_norm": 0.09760642796754837, "learning_rate": 9.195990534580988e-06, "loss": 0.2294, "num_input_tokens_seen": 10394688, "step": 17065 }, { "epoch": 5.296307787775365, "grad_norm": 0.1019154042005539, "learning_rate": 9.195254145026995e-06, "loss": 0.2311, "num_input_tokens_seen": 10397216, "step": 17070 }, { "epoch": 5.297859137449581, "grad_norm": 0.0986398383975029, "learning_rate": 9.194517447911193e-06, "loss": 0.2322, "num_input_tokens_seen": 10399808, "step": 17075 }, { "epoch": 5.299410487123797, "grad_norm": 0.10258622467517853, "learning_rate": 9.193780443287593e-06, "loss": 0.2274, "num_input_tokens_seen": 10402464, "step": 17080 }, { "epoch": 5.300961836798014, "grad_norm": 0.10736941546201706, "learning_rate": 9.193043131210224e-06, "loss": 0.2296, "num_input_tokens_seen": 10406016, "step": 17085 }, { "epoch": 5.302513186472231, "grad_norm": 0.11439725756645203, "learning_rate": 9.192305511733141e-06, "loss": 0.231, "num_input_tokens_seen": 10409536, "step": 17090 }, { "epoch": 5.304064536146448, "grad_norm": 0.0483684241771698, "learning_rate": 9.191567584910418e-06, "loss": 0.2276, "num_input_tokens_seen": 10412672, "step": 17095 }, { "epoch": 5.305615885820664, "grad_norm": 0.044160887598991394, "learning_rate": 9.190829350796157e-06, "loss": 0.2309, "num_input_tokens_seen": 10415392, "step": 17100 }, { "epoch": 5.30716723549488, "grad_norm": 0.08879978209733963, "learning_rate": 9.190090809444476e-06, "loss": 0.2192, "num_input_tokens_seen": 10418048, "step": 17105 }, { "epoch": 5.308718585169097, "grad_norm": 0.09954020380973816, "learning_rate": 9.189351960909524e-06, "loss": 0.2184, "num_input_tokens_seen": 10420608, "step": 17110 }, { "epoch": 5.310269934843314, "grad_norm": 0.10232016444206238, "learning_rate": 9.188612805245461e-06, "loss": 0.2278, "num_input_tokens_seen": 10423712, "step": 17115 }, { "epoch": 5.3118212845175306, "grad_norm": 0.06867601722478867, "learning_rate": 9.18787334250648e-06, "loss": 0.2364, "num_input_tokens_seen": 10430400, "step": 17120 }, { "epoch": 5.313372634191746, "grad_norm": 0.1207551509141922, "learning_rate": 9.18713357274679e-06, "loss": 0.2161, "num_input_tokens_seen": 10433248, "step": 17125 }, { "epoch": 5.314923983865963, "grad_norm": 0.2028234750032425, "learning_rate": 9.186393496020627e-06, "loss": 0.2328, "num_input_tokens_seen": 10435328, "step": 17130 }, { "epoch": 5.31647533354018, "grad_norm": 0.25334954261779785, "learning_rate": 9.185653112382245e-06, "loss": 0.235, "num_input_tokens_seen": 10438272, "step": 17135 }, { "epoch": 5.318026683214397, "grad_norm": 0.10716773569583893, "learning_rate": 9.184912421885925e-06, "loss": 0.2285, "num_input_tokens_seen": 10442016, "step": 17140 }, { "epoch": 5.319578032888613, "grad_norm": 0.18119433522224426, "learning_rate": 9.184171424585968e-06, "loss": 0.2343, "num_input_tokens_seen": 10445568, "step": 17145 }, { "epoch": 5.321129382562829, "grad_norm": 0.13599546253681183, "learning_rate": 9.183430120536698e-06, "loss": 0.2374, "num_input_tokens_seen": 10447904, "step": 17150 }, { "epoch": 5.322680732237046, "grad_norm": 0.26113855838775635, "learning_rate": 9.182688509792461e-06, "loss": 0.2242, "num_input_tokens_seen": 10450688, "step": 17155 }, { "epoch": 5.324232081911263, "grad_norm": 0.2400532215833664, "learning_rate": 9.181946592407625e-06, "loss": 0.2414, "num_input_tokens_seen": 10454240, "step": 17160 }, { "epoch": 5.32578343158548, "grad_norm": 0.22136355936527252, "learning_rate": 9.181204368436583e-06, "loss": 0.2306, "num_input_tokens_seen": 10456928, "step": 17165 }, { "epoch": 5.327334781259696, "grad_norm": 0.14883849024772644, "learning_rate": 9.180461837933748e-06, "loss": 0.2297, "num_input_tokens_seen": 10462496, "step": 17170 }, { "epoch": 5.328886130933912, "grad_norm": 0.08357031643390656, "learning_rate": 9.179719000953556e-06, "loss": 0.2325, "num_input_tokens_seen": 10465696, "step": 17175 }, { "epoch": 5.330437480608129, "grad_norm": 0.11399345099925995, "learning_rate": 9.178975857550465e-06, "loss": 0.2184, "num_input_tokens_seen": 10468352, "step": 17180 }, { "epoch": 5.331988830282346, "grad_norm": 0.15299035608768463, "learning_rate": 9.178232407778958e-06, "loss": 0.2339, "num_input_tokens_seen": 10470944, "step": 17185 }, { "epoch": 5.333540179956562, "grad_norm": 0.14209826290607452, "learning_rate": 9.177488651693536e-06, "loss": 0.2345, "num_input_tokens_seen": 10474432, "step": 17190 }, { "epoch": 5.335091529630779, "grad_norm": 0.05393711104989052, "learning_rate": 9.17674458934873e-06, "loss": 0.2382, "num_input_tokens_seen": 10477824, "step": 17195 }, { "epoch": 5.336642879304995, "grad_norm": 0.09340473264455795, "learning_rate": 9.176000220799084e-06, "loss": 0.2311, "num_input_tokens_seen": 10479872, "step": 17200 }, { "epoch": 5.338194228979212, "grad_norm": 0.10654692351818085, "learning_rate": 9.175255546099172e-06, "loss": 0.2316, "num_input_tokens_seen": 10484032, "step": 17205 }, { "epoch": 5.339745578653428, "grad_norm": 0.09505093842744827, "learning_rate": 9.174510565303583e-06, "loss": 0.2381, "num_input_tokens_seen": 10486560, "step": 17210 }, { "epoch": 5.341296928327645, "grad_norm": 0.10499429702758789, "learning_rate": 9.173765278466938e-06, "loss": 0.222, "num_input_tokens_seen": 10488992, "step": 17215 }, { "epoch": 5.342848278001862, "grad_norm": 0.12164442241191864, "learning_rate": 9.173019685643874e-06, "loss": 0.2281, "num_input_tokens_seen": 10491584, "step": 17220 }, { "epoch": 5.344399627676078, "grad_norm": 0.19254393875598907, "learning_rate": 9.172273786889049e-06, "loss": 0.2254, "num_input_tokens_seen": 10495168, "step": 17225 }, { "epoch": 5.345950977350295, "grad_norm": 0.1016748920083046, "learning_rate": 9.171527582257149e-06, "loss": 0.232, "num_input_tokens_seen": 10498304, "step": 17230 }, { "epoch": 5.347502327024511, "grad_norm": 0.09651654958724976, "learning_rate": 9.17078107180288e-06, "loss": 0.2372, "num_input_tokens_seen": 10501120, "step": 17235 }, { "epoch": 5.349053676698728, "grad_norm": 0.045063212513923645, "learning_rate": 9.170034255580969e-06, "loss": 0.23, "num_input_tokens_seen": 10503648, "step": 17240 }, { "epoch": 5.3506050263729446, "grad_norm": 0.10080066323280334, "learning_rate": 9.169287133646166e-06, "loss": 0.2255, "num_input_tokens_seen": 10506016, "step": 17245 }, { "epoch": 5.352156376047161, "grad_norm": 0.09295714646577835, "learning_rate": 9.168539706053243e-06, "loss": 0.234, "num_input_tokens_seen": 10509408, "step": 17250 }, { "epoch": 5.353707725721377, "grad_norm": 0.055599041283130646, "learning_rate": 9.167791972856997e-06, "loss": 0.2349, "num_input_tokens_seen": 10512992, "step": 17255 }, { "epoch": 5.355259075395594, "grad_norm": 0.04305022582411766, "learning_rate": 9.167043934112247e-06, "loss": 0.2373, "num_input_tokens_seen": 10514912, "step": 17260 }, { "epoch": 5.356810425069811, "grad_norm": 0.06586156040430069, "learning_rate": 9.166295589873831e-06, "loss": 0.2331, "num_input_tokens_seen": 10518656, "step": 17265 }, { "epoch": 5.3583617747440275, "grad_norm": 0.15830521285533905, "learning_rate": 9.165546940196612e-06, "loss": 0.234, "num_input_tokens_seen": 10522208, "step": 17270 }, { "epoch": 5.359913124418243, "grad_norm": 0.130682110786438, "learning_rate": 9.164797985135473e-06, "loss": 0.2247, "num_input_tokens_seen": 10524832, "step": 17275 }, { "epoch": 5.36146447409246, "grad_norm": 0.07199768722057343, "learning_rate": 9.164048724745325e-06, "loss": 0.2223, "num_input_tokens_seen": 10528128, "step": 17280 }, { "epoch": 5.363015823766677, "grad_norm": 0.12757508456707, "learning_rate": 9.163299159081097e-06, "loss": 0.2349, "num_input_tokens_seen": 10531296, "step": 17285 }, { "epoch": 5.364567173440894, "grad_norm": 0.17426839470863342, "learning_rate": 9.162549288197736e-06, "loss": 0.2282, "num_input_tokens_seen": 10535648, "step": 17290 }, { "epoch": 5.3661185231151105, "grad_norm": 0.035259559750556946, "learning_rate": 9.161799112150223e-06, "loss": 0.2461, "num_input_tokens_seen": 10538144, "step": 17295 }, { "epoch": 5.367669872789326, "grad_norm": 0.12234264612197876, "learning_rate": 9.16104863099355e-06, "loss": 0.2245, "num_input_tokens_seen": 10540928, "step": 17300 }, { "epoch": 5.369221222463543, "grad_norm": 0.13606354594230652, "learning_rate": 9.160297844782741e-06, "loss": 0.2298, "num_input_tokens_seen": 10543392, "step": 17305 }, { "epoch": 5.37077257213776, "grad_norm": 0.11575990915298462, "learning_rate": 9.159546753572833e-06, "loss": 0.2291, "num_input_tokens_seen": 10548416, "step": 17310 }, { "epoch": 5.372323921811977, "grad_norm": 0.07156983762979507, "learning_rate": 9.158795357418892e-06, "loss": 0.2316, "num_input_tokens_seen": 10550912, "step": 17315 }, { "epoch": 5.373875271486193, "grad_norm": 0.1589903086423874, "learning_rate": 9.158043656376004e-06, "loss": 0.2248, "num_input_tokens_seen": 10553568, "step": 17320 }, { "epoch": 5.375426621160409, "grad_norm": 0.17970792949199677, "learning_rate": 9.157291650499275e-06, "loss": 0.2342, "num_input_tokens_seen": 10555840, "step": 17325 }, { "epoch": 5.376977970834626, "grad_norm": 0.11759643256664276, "learning_rate": 9.156539339843841e-06, "loss": 0.2338, "num_input_tokens_seen": 10559328, "step": 17330 }, { "epoch": 5.378529320508843, "grad_norm": 0.14207735657691956, "learning_rate": 9.155786724464852e-06, "loss": 0.2314, "num_input_tokens_seen": 10563040, "step": 17335 }, { "epoch": 5.380080670183059, "grad_norm": 0.09303699433803558, "learning_rate": 9.155033804417483e-06, "loss": 0.2262, "num_input_tokens_seen": 10566240, "step": 17340 }, { "epoch": 5.381632019857276, "grad_norm": 0.14182470738887787, "learning_rate": 9.154280579756935e-06, "loss": 0.2358, "num_input_tokens_seen": 10569664, "step": 17345 }, { "epoch": 5.383183369531492, "grad_norm": 0.1304168850183487, "learning_rate": 9.153527050538426e-06, "loss": 0.2359, "num_input_tokens_seen": 10573312, "step": 17350 }, { "epoch": 5.384734719205709, "grad_norm": 0.14636710286140442, "learning_rate": 9.1527732168172e-06, "loss": 0.2312, "num_input_tokens_seen": 10575968, "step": 17355 }, { "epoch": 5.386286068879926, "grad_norm": 0.09716436266899109, "learning_rate": 9.152019078648517e-06, "loss": 0.2289, "num_input_tokens_seen": 10578432, "step": 17360 }, { "epoch": 5.387837418554142, "grad_norm": 0.092684805393219, "learning_rate": 9.151264636087672e-06, "loss": 0.2278, "num_input_tokens_seen": 10581376, "step": 17365 }, { "epoch": 5.3893887682283586, "grad_norm": 0.11697535216808319, "learning_rate": 9.15050988918997e-06, "loss": 0.2213, "num_input_tokens_seen": 10583712, "step": 17370 }, { "epoch": 5.390940117902575, "grad_norm": 0.26617032289505005, "learning_rate": 9.149754838010745e-06, "loss": 0.245, "num_input_tokens_seen": 10587232, "step": 17375 }, { "epoch": 5.392491467576792, "grad_norm": 0.058037009090185165, "learning_rate": 9.148999482605347e-06, "loss": 0.2336, "num_input_tokens_seen": 10590176, "step": 17380 }, { "epoch": 5.394042817251008, "grad_norm": 0.06861680001020432, "learning_rate": 9.148243823029158e-06, "loss": 0.248, "num_input_tokens_seen": 10593024, "step": 17385 }, { "epoch": 5.395594166925225, "grad_norm": 0.1310323029756546, "learning_rate": 9.14748785933757e-06, "loss": 0.2286, "num_input_tokens_seen": 10596800, "step": 17390 }, { "epoch": 5.3971455165994415, "grad_norm": 0.10828619450330734, "learning_rate": 9.146731591586013e-06, "loss": 0.2302, "num_input_tokens_seen": 10601056, "step": 17395 }, { "epoch": 5.398696866273658, "grad_norm": 0.1076161190867424, "learning_rate": 9.145975019829924e-06, "loss": 0.2329, "num_input_tokens_seen": 10603680, "step": 17400 }, { "epoch": 5.400248215947874, "grad_norm": 0.1099807620048523, "learning_rate": 9.145218144124771e-06, "loss": 0.2348, "num_input_tokens_seen": 10606272, "step": 17405 }, { "epoch": 5.401799565622091, "grad_norm": 0.1864926517009735, "learning_rate": 9.14446096452604e-06, "loss": 0.2275, "num_input_tokens_seen": 10609184, "step": 17410 }, { "epoch": 5.403350915296308, "grad_norm": 0.12580537796020508, "learning_rate": 9.143703481089244e-06, "loss": 0.2312, "num_input_tokens_seen": 10612320, "step": 17415 }, { "epoch": 5.4049022649705245, "grad_norm": 0.11466796696186066, "learning_rate": 9.142945693869914e-06, "loss": 0.2337, "num_input_tokens_seen": 10614720, "step": 17420 }, { "epoch": 5.406453614644741, "grad_norm": 0.10630300641059875, "learning_rate": 9.142187602923605e-06, "loss": 0.2321, "num_input_tokens_seen": 10617504, "step": 17425 }, { "epoch": 5.408004964318957, "grad_norm": 0.11186117678880692, "learning_rate": 9.141429208305893e-06, "loss": 0.2336, "num_input_tokens_seen": 10620704, "step": 17430 }, { "epoch": 5.409556313993174, "grad_norm": 0.10892726480960846, "learning_rate": 9.140670510072379e-06, "loss": 0.23, "num_input_tokens_seen": 10623872, "step": 17435 }, { "epoch": 5.411107663667391, "grad_norm": 0.04938654229044914, "learning_rate": 9.139911508278682e-06, "loss": 0.2314, "num_input_tokens_seen": 10625984, "step": 17440 }, { "epoch": 5.4126590133416075, "grad_norm": 0.10038121789693832, "learning_rate": 9.139152202980449e-06, "loss": 0.2335, "num_input_tokens_seen": 10629152, "step": 17445 }, { "epoch": 5.414210363015823, "grad_norm": 0.19664835929870605, "learning_rate": 9.138392594233344e-06, "loss": 0.232, "num_input_tokens_seen": 10631296, "step": 17450 }, { "epoch": 5.41576171269004, "grad_norm": 0.09779029339551926, "learning_rate": 9.137632682093055e-06, "loss": 0.2316, "num_input_tokens_seen": 10636000, "step": 17455 }, { "epoch": 5.417313062364257, "grad_norm": 0.1895933598279953, "learning_rate": 9.136872466615294e-06, "loss": 0.2283, "num_input_tokens_seen": 10638880, "step": 17460 }, { "epoch": 5.418864412038474, "grad_norm": 0.05334104225039482, "learning_rate": 9.136111947855795e-06, "loss": 0.231, "num_input_tokens_seen": 10641856, "step": 17465 }, { "epoch": 5.42041576171269, "grad_norm": 0.1089697778224945, "learning_rate": 9.13535112587031e-06, "loss": 0.2314, "num_input_tokens_seen": 10645056, "step": 17470 }, { "epoch": 5.421967111386906, "grad_norm": 0.06613040715456009, "learning_rate": 9.134590000714618e-06, "loss": 0.2315, "num_input_tokens_seen": 10648224, "step": 17475 }, { "epoch": 5.423518461061123, "grad_norm": 0.06547456234693527, "learning_rate": 9.133828572444519e-06, "loss": 0.2309, "num_input_tokens_seen": 10651488, "step": 17480 }, { "epoch": 5.42506981073534, "grad_norm": 0.06895498186349869, "learning_rate": 9.133066841115832e-06, "loss": 0.2253, "num_input_tokens_seen": 10654400, "step": 17485 }, { "epoch": 5.426621160409557, "grad_norm": 0.12493254989385605, "learning_rate": 9.132304806784403e-06, "loss": 0.2298, "num_input_tokens_seen": 10657728, "step": 17490 }, { "epoch": 5.4281725100837726, "grad_norm": 0.08056644350290298, "learning_rate": 9.1315424695061e-06, "loss": 0.233, "num_input_tokens_seen": 10660960, "step": 17495 }, { "epoch": 5.429723859757989, "grad_norm": 0.10986290127038956, "learning_rate": 9.130779829336806e-06, "loss": 0.233, "num_input_tokens_seen": 10665440, "step": 17500 }, { "epoch": 5.431275209432206, "grad_norm": 0.2320689707994461, "learning_rate": 9.130016886332436e-06, "loss": 0.2384, "num_input_tokens_seen": 10667744, "step": 17505 }, { "epoch": 5.432826559106423, "grad_norm": 0.13065803050994873, "learning_rate": 9.129253640548923e-06, "loss": 0.229, "num_input_tokens_seen": 10670528, "step": 17510 }, { "epoch": 5.434377908780639, "grad_norm": 0.07545211911201477, "learning_rate": 9.128490092042219e-06, "loss": 0.2362, "num_input_tokens_seen": 10673472, "step": 17515 }, { "epoch": 5.4359292584548555, "grad_norm": 0.13069523870944977, "learning_rate": 9.127726240868301e-06, "loss": 0.2351, "num_input_tokens_seen": 10676992, "step": 17520 }, { "epoch": 5.437480608129072, "grad_norm": 0.10665684193372726, "learning_rate": 9.126962087083173e-06, "loss": 0.2306, "num_input_tokens_seen": 10679744, "step": 17525 }, { "epoch": 5.439031957803289, "grad_norm": 0.12763649225234985, "learning_rate": 9.126197630742853e-06, "loss": 0.2346, "num_input_tokens_seen": 10683264, "step": 17530 }, { "epoch": 5.440583307477505, "grad_norm": 0.05246877297759056, "learning_rate": 9.125432871903383e-06, "loss": 0.2277, "num_input_tokens_seen": 10685568, "step": 17535 }, { "epoch": 5.442134657151722, "grad_norm": 0.13588283956050873, "learning_rate": 9.124667810620833e-06, "loss": 0.231, "num_input_tokens_seen": 10688032, "step": 17540 }, { "epoch": 5.4436860068259385, "grad_norm": 0.10429506003856659, "learning_rate": 9.123902446951288e-06, "loss": 0.2322, "num_input_tokens_seen": 10690688, "step": 17545 }, { "epoch": 5.445237356500155, "grad_norm": 0.2105349451303482, "learning_rate": 9.123136780950861e-06, "loss": 0.2338, "num_input_tokens_seen": 10693376, "step": 17550 }, { "epoch": 5.446788706174372, "grad_norm": 0.11243179440498352, "learning_rate": 9.12237081267568e-06, "loss": 0.2321, "num_input_tokens_seen": 10695968, "step": 17555 }, { "epoch": 5.448340055848588, "grad_norm": 0.1125284731388092, "learning_rate": 9.121604542181905e-06, "loss": 0.2288, "num_input_tokens_seen": 10699552, "step": 17560 }, { "epoch": 5.449891405522805, "grad_norm": 0.11587865650653839, "learning_rate": 9.12083796952571e-06, "loss": 0.2283, "num_input_tokens_seen": 10702656, "step": 17565 }, { "epoch": 5.4514427551970215, "grad_norm": 0.11908268928527832, "learning_rate": 9.120071094763292e-06, "loss": 0.2288, "num_input_tokens_seen": 10705440, "step": 17570 }, { "epoch": 5.452994104871238, "grad_norm": 0.07955043762922287, "learning_rate": 9.119303917950875e-06, "loss": 0.2243, "num_input_tokens_seen": 10708768, "step": 17575 }, { "epoch": 5.454545454545454, "grad_norm": 0.1325649619102478, "learning_rate": 9.1185364391447e-06, "loss": 0.2314, "num_input_tokens_seen": 10711200, "step": 17580 }, { "epoch": 5.456096804219671, "grad_norm": 0.12947499752044678, "learning_rate": 9.117768658401033e-06, "loss": 0.2277, "num_input_tokens_seen": 10714176, "step": 17585 }, { "epoch": 5.457648153893888, "grad_norm": 0.14942443370819092, "learning_rate": 9.117000575776163e-06, "loss": 0.2227, "num_input_tokens_seen": 10717888, "step": 17590 }, { "epoch": 5.4591995035681045, "grad_norm": 0.2951872646808624, "learning_rate": 9.116232191326398e-06, "loss": 0.2425, "num_input_tokens_seen": 10720736, "step": 17595 }, { "epoch": 5.460750853242321, "grad_norm": 0.20235563814640045, "learning_rate": 9.115463505108069e-06, "loss": 0.2277, "num_input_tokens_seen": 10724032, "step": 17600 }, { "epoch": 5.462302202916537, "grad_norm": 0.10507979989051819, "learning_rate": 9.114694517177531e-06, "loss": 0.2406, "num_input_tokens_seen": 10726560, "step": 17605 }, { "epoch": 5.463853552590754, "grad_norm": 0.2521401643753052, "learning_rate": 9.113925227591159e-06, "loss": 0.2298, "num_input_tokens_seen": 10729248, "step": 17610 }, { "epoch": 5.465404902264971, "grad_norm": 0.12336944788694382, "learning_rate": 9.113155636405353e-06, "loss": 0.2343, "num_input_tokens_seen": 10732416, "step": 17615 }, { "epoch": 5.466956251939187, "grad_norm": 0.20515181124210358, "learning_rate": 9.11238574367653e-06, "loss": 0.2333, "num_input_tokens_seen": 10735616, "step": 17620 }, { "epoch": 5.468507601613403, "grad_norm": 0.2583394944667816, "learning_rate": 9.111615549461137e-06, "loss": 0.2325, "num_input_tokens_seen": 10738944, "step": 17625 }, { "epoch": 5.47005895128762, "grad_norm": 0.09572862088680267, "learning_rate": 9.110845053815634e-06, "loss": 0.2258, "num_input_tokens_seen": 10741664, "step": 17630 }, { "epoch": 5.471610300961837, "grad_norm": 0.26495611667633057, "learning_rate": 9.110074256796508e-06, "loss": 0.2318, "num_input_tokens_seen": 10744512, "step": 17635 }, { "epoch": 5.473161650636054, "grad_norm": 0.21932680904865265, "learning_rate": 9.109303158460268e-06, "loss": 0.2309, "num_input_tokens_seen": 10747424, "step": 17640 }, { "epoch": 5.4747130003102695, "grad_norm": 0.07711485773324966, "learning_rate": 9.108531758863445e-06, "loss": 0.2336, "num_input_tokens_seen": 10750080, "step": 17645 }, { "epoch": 5.476264349984486, "grad_norm": 0.12657137215137482, "learning_rate": 9.107760058062594e-06, "loss": 0.2242, "num_input_tokens_seen": 10753184, "step": 17650 }, { "epoch": 5.477815699658703, "grad_norm": 0.36953070759773254, "learning_rate": 9.106988056114288e-06, "loss": 0.2408, "num_input_tokens_seen": 10757184, "step": 17655 }, { "epoch": 5.47936704933292, "grad_norm": 0.061551522463560104, "learning_rate": 9.10621575307512e-06, "loss": 0.2313, "num_input_tokens_seen": 10760160, "step": 17660 }, { "epoch": 5.480918399007137, "grad_norm": 0.11945432424545288, "learning_rate": 9.105443149001715e-06, "loss": 0.2322, "num_input_tokens_seen": 10762272, "step": 17665 }, { "epoch": 5.4824697486813525, "grad_norm": 0.11313597112894058, "learning_rate": 9.10467024395071e-06, "loss": 0.2351, "num_input_tokens_seen": 10765152, "step": 17670 }, { "epoch": 5.484021098355569, "grad_norm": 0.058055900037288666, "learning_rate": 9.10389703797877e-06, "loss": 0.2274, "num_input_tokens_seen": 10767776, "step": 17675 }, { "epoch": 5.485572448029786, "grad_norm": 0.07512157410383224, "learning_rate": 9.103123531142581e-06, "loss": 0.2279, "num_input_tokens_seen": 10771168, "step": 17680 }, { "epoch": 5.487123797704003, "grad_norm": 0.12774550914764404, "learning_rate": 9.102349723498848e-06, "loss": 0.2304, "num_input_tokens_seen": 10774880, "step": 17685 }, { "epoch": 5.488675147378219, "grad_norm": 0.15547016263008118, "learning_rate": 9.1015756151043e-06, "loss": 0.2335, "num_input_tokens_seen": 10777312, "step": 17690 }, { "epoch": 5.4902264970524355, "grad_norm": 0.07574041187763214, "learning_rate": 9.100801206015693e-06, "loss": 0.2278, "num_input_tokens_seen": 10780448, "step": 17695 }, { "epoch": 5.491777846726652, "grad_norm": 0.11648766696453094, "learning_rate": 9.100026496289793e-06, "loss": 0.2334, "num_input_tokens_seen": 10783328, "step": 17700 }, { "epoch": 5.493329196400869, "grad_norm": 0.1363741159439087, "learning_rate": 9.0992514859834e-06, "loss": 0.2318, "num_input_tokens_seen": 10785952, "step": 17705 }, { "epoch": 5.494880546075085, "grad_norm": 0.10592947900295258, "learning_rate": 9.098476175153332e-06, "loss": 0.2329, "num_input_tokens_seen": 10788864, "step": 17710 }, { "epoch": 5.496431895749302, "grad_norm": 0.08100187033414841, "learning_rate": 9.097700563856427e-06, "loss": 0.228, "num_input_tokens_seen": 10791968, "step": 17715 }, { "epoch": 5.4979832454235185, "grad_norm": 0.13666991889476776, "learning_rate": 9.096924652149546e-06, "loss": 0.2309, "num_input_tokens_seen": 10794688, "step": 17720 }, { "epoch": 5.499534595097735, "grad_norm": 0.06568887829780579, "learning_rate": 9.096148440089573e-06, "loss": 0.2369, "num_input_tokens_seen": 10797856, "step": 17725 }, { "epoch": 5.501085944771951, "grad_norm": 0.11304040253162384, "learning_rate": 9.095371927733413e-06, "loss": 0.2332, "num_input_tokens_seen": 10801056, "step": 17730 }, { "epoch": 5.502637294446168, "grad_norm": 0.12332592904567719, "learning_rate": 9.094595115137996e-06, "loss": 0.2301, "num_input_tokens_seen": 10803424, "step": 17735 }, { "epoch": 5.504188644120385, "grad_norm": 0.11174962669610977, "learning_rate": 9.09381800236027e-06, "loss": 0.2294, "num_input_tokens_seen": 10806144, "step": 17740 }, { "epoch": 5.505739993794601, "grad_norm": 0.12942670285701752, "learning_rate": 9.093040589457204e-06, "loss": 0.2357, "num_input_tokens_seen": 10810944, "step": 17745 }, { "epoch": 5.507291343468818, "grad_norm": 0.13223029673099518, "learning_rate": 9.092262876485796e-06, "loss": 0.2284, "num_input_tokens_seen": 10813216, "step": 17750 }, { "epoch": 5.508842693143034, "grad_norm": 0.1559402197599411, "learning_rate": 9.09148486350306e-06, "loss": 0.2353, "num_input_tokens_seen": 10817056, "step": 17755 }, { "epoch": 5.510394042817251, "grad_norm": 0.08563759922981262, "learning_rate": 9.090706550566031e-06, "loss": 0.2322, "num_input_tokens_seen": 10821248, "step": 17760 }, { "epoch": 5.511945392491468, "grad_norm": 0.12215087562799454, "learning_rate": 9.089927937731771e-06, "loss": 0.2286, "num_input_tokens_seen": 10823936, "step": 17765 }, { "epoch": 5.513496742165684, "grad_norm": 0.1103164479136467, "learning_rate": 9.089149025057362e-06, "loss": 0.2307, "num_input_tokens_seen": 10826560, "step": 17770 }, { "epoch": 5.515048091839901, "grad_norm": 0.10130072385072708, "learning_rate": 9.088369812599907e-06, "loss": 0.2318, "num_input_tokens_seen": 10828864, "step": 17775 }, { "epoch": 5.516599441514117, "grad_norm": 0.11714616417884827, "learning_rate": 9.087590300416532e-06, "loss": 0.2281, "num_input_tokens_seen": 10831840, "step": 17780 }, { "epoch": 5.518150791188334, "grad_norm": 0.14727900922298431, "learning_rate": 9.086810488564382e-06, "loss": 0.2371, "num_input_tokens_seen": 10835072, "step": 17785 }, { "epoch": 5.519702140862551, "grad_norm": 0.12421000003814697, "learning_rate": 9.086030377100628e-06, "loss": 0.229, "num_input_tokens_seen": 10838592, "step": 17790 }, { "epoch": 5.5212534905367665, "grad_norm": 0.13178209960460663, "learning_rate": 9.08524996608246e-06, "loss": 0.2281, "num_input_tokens_seen": 10841120, "step": 17795 }, { "epoch": 5.522804840210983, "grad_norm": 0.15864737331867218, "learning_rate": 9.084469255567097e-06, "loss": 0.2304, "num_input_tokens_seen": 10843488, "step": 17800 }, { "epoch": 5.5243561898852, "grad_norm": 0.24576736986637115, "learning_rate": 9.083688245611767e-06, "loss": 0.2333, "num_input_tokens_seen": 10846176, "step": 17805 }, { "epoch": 5.525907539559417, "grad_norm": 0.06345576792955399, "learning_rate": 9.082906936273732e-06, "loss": 0.2256, "num_input_tokens_seen": 10848704, "step": 17810 }, { "epoch": 5.527458889233634, "grad_norm": 0.21469944715499878, "learning_rate": 9.08212532761027e-06, "loss": 0.2275, "num_input_tokens_seen": 10851040, "step": 17815 }, { "epoch": 5.5290102389078495, "grad_norm": 0.14702129364013672, "learning_rate": 9.081343419678679e-06, "loss": 0.2306, "num_input_tokens_seen": 10854560, "step": 17820 }, { "epoch": 5.530561588582066, "grad_norm": 0.1153293326497078, "learning_rate": 9.080561212536288e-06, "loss": 0.2332, "num_input_tokens_seen": 10857024, "step": 17825 }, { "epoch": 5.532112938256283, "grad_norm": 0.08678766340017319, "learning_rate": 9.079778706240439e-06, "loss": 0.2328, "num_input_tokens_seen": 10859776, "step": 17830 }, { "epoch": 5.5336642879305, "grad_norm": 0.11965308338403702, "learning_rate": 9.078995900848498e-06, "loss": 0.2227, "num_input_tokens_seen": 10862368, "step": 17835 }, { "epoch": 5.535215637604717, "grad_norm": 0.05006133019924164, "learning_rate": 9.078212796417855e-06, "loss": 0.2359, "num_input_tokens_seen": 10864736, "step": 17840 }, { "epoch": 5.5367669872789325, "grad_norm": 0.20548127591609955, "learning_rate": 9.077429393005922e-06, "loss": 0.2282, "num_input_tokens_seen": 10867840, "step": 17845 }, { "epoch": 5.538318336953149, "grad_norm": 0.14879608154296875, "learning_rate": 9.076645690670127e-06, "loss": 0.2383, "num_input_tokens_seen": 10870880, "step": 17850 }, { "epoch": 5.539869686627366, "grad_norm": 0.11426673084497452, "learning_rate": 9.075861689467932e-06, "loss": 0.2259, "num_input_tokens_seen": 10874272, "step": 17855 }, { "epoch": 5.541421036301582, "grad_norm": 0.10103987902402878, "learning_rate": 9.075077389456807e-06, "loss": 0.2301, "num_input_tokens_seen": 10876736, "step": 17860 }, { "epoch": 5.542972385975799, "grad_norm": 0.0505150631070137, "learning_rate": 9.074292790694255e-06, "loss": 0.2321, "num_input_tokens_seen": 10879680, "step": 17865 }, { "epoch": 5.544523735650015, "grad_norm": 0.2688184976577759, "learning_rate": 9.073507893237795e-06, "loss": 0.2314, "num_input_tokens_seen": 10882688, "step": 17870 }, { "epoch": 5.546075085324232, "grad_norm": 0.12511630356311798, "learning_rate": 9.072722697144967e-06, "loss": 0.2251, "num_input_tokens_seen": 10885344, "step": 17875 }, { "epoch": 5.547626434998449, "grad_norm": 0.10140306502580643, "learning_rate": 9.07193720247334e-06, "loss": 0.2348, "num_input_tokens_seen": 10887680, "step": 17880 }, { "epoch": 5.549177784672665, "grad_norm": 0.06457822769880295, "learning_rate": 9.071151409280494e-06, "loss": 0.2267, "num_input_tokens_seen": 10889760, "step": 17885 }, { "epoch": 5.550729134346882, "grad_norm": 0.11327561736106873, "learning_rate": 9.070365317624041e-06, "loss": 0.2364, "num_input_tokens_seen": 10892224, "step": 17890 }, { "epoch": 5.552280484021098, "grad_norm": 0.15020860731601715, "learning_rate": 9.06957892756161e-06, "loss": 0.2418, "num_input_tokens_seen": 10895296, "step": 17895 }, { "epoch": 5.553831833695315, "grad_norm": 0.11803892254829407, "learning_rate": 9.068792239150852e-06, "loss": 0.2298, "num_input_tokens_seen": 10897728, "step": 17900 }, { "epoch": 5.555383183369532, "grad_norm": 0.07868704944849014, "learning_rate": 9.068005252449443e-06, "loss": 0.2249, "num_input_tokens_seen": 10902080, "step": 17905 }, { "epoch": 5.556934533043748, "grad_norm": 0.20350897312164307, "learning_rate": 9.067217967515075e-06, "loss": 0.2275, "num_input_tokens_seen": 10905280, "step": 17910 }, { "epoch": 5.558485882717965, "grad_norm": 0.11493906378746033, "learning_rate": 9.066430384405466e-06, "loss": 0.2364, "num_input_tokens_seen": 10908704, "step": 17915 }, { "epoch": 5.560037232392181, "grad_norm": 0.2070881426334381, "learning_rate": 9.065642503178359e-06, "loss": 0.2357, "num_input_tokens_seen": 10912128, "step": 17920 }, { "epoch": 5.561588582066397, "grad_norm": 0.10661948472261429, "learning_rate": 9.06485432389151e-06, "loss": 0.2346, "num_input_tokens_seen": 10915040, "step": 17925 }, { "epoch": 5.563139931740614, "grad_norm": 0.10812754184007645, "learning_rate": 9.064065846602705e-06, "loss": 0.2314, "num_input_tokens_seen": 10917760, "step": 17930 }, { "epoch": 5.564691281414831, "grad_norm": 0.21009033918380737, "learning_rate": 9.063277071369746e-06, "loss": 0.2325, "num_input_tokens_seen": 10920768, "step": 17935 }, { "epoch": 5.566242631089048, "grad_norm": 0.19935846328735352, "learning_rate": 9.062487998250464e-06, "loss": 0.2326, "num_input_tokens_seen": 10923744, "step": 17940 }, { "epoch": 5.567793980763264, "grad_norm": 0.10389207303524017, "learning_rate": 9.061698627302704e-06, "loss": 0.2311, "num_input_tokens_seen": 10926784, "step": 17945 }, { "epoch": 5.56934533043748, "grad_norm": 0.104616180062294, "learning_rate": 9.060908958584335e-06, "loss": 0.238, "num_input_tokens_seen": 10930752, "step": 17950 }, { "epoch": 5.570896680111697, "grad_norm": 0.12953461706638336, "learning_rate": 9.060118992153251e-06, "loss": 0.2331, "num_input_tokens_seen": 10933344, "step": 17955 }, { "epoch": 5.572448029785914, "grad_norm": 0.05369925871491432, "learning_rate": 9.059328728067368e-06, "loss": 0.2316, "num_input_tokens_seen": 10935680, "step": 17960 }, { "epoch": 5.573999379460131, "grad_norm": 0.10925082117319107, "learning_rate": 9.058538166384619e-06, "loss": 0.2238, "num_input_tokens_seen": 10938400, "step": 17965 }, { "epoch": 5.575550729134347, "grad_norm": 0.09968072175979614, "learning_rate": 9.057747307162961e-06, "loss": 0.2287, "num_input_tokens_seen": 10941504, "step": 17970 }, { "epoch": 5.577102078808563, "grad_norm": 0.20956873893737793, "learning_rate": 9.056956150460375e-06, "loss": 0.2324, "num_input_tokens_seen": 10945600, "step": 17975 }, { "epoch": 5.57865342848278, "grad_norm": 0.13384240865707397, "learning_rate": 9.056164696334862e-06, "loss": 0.2353, "num_input_tokens_seen": 10949568, "step": 17980 }, { "epoch": 5.580204778156997, "grad_norm": 0.045617859810590744, "learning_rate": 9.055372944844444e-06, "loss": 0.2298, "num_input_tokens_seen": 10952576, "step": 17985 }, { "epoch": 5.581756127831213, "grad_norm": 0.11928108334541321, "learning_rate": 9.054580896047168e-06, "loss": 0.2286, "num_input_tokens_seen": 10955200, "step": 17990 }, { "epoch": 5.583307477505429, "grad_norm": 0.10204792022705078, "learning_rate": 9.053788550001097e-06, "loss": 0.2212, "num_input_tokens_seen": 10957888, "step": 17995 }, { "epoch": 5.584858827179646, "grad_norm": 0.15532997250556946, "learning_rate": 9.05299590676432e-06, "loss": 0.2397, "num_input_tokens_seen": 10961216, "step": 18000 }, { "epoch": 5.586410176853863, "grad_norm": 0.05987559258937836, "learning_rate": 9.052202966394951e-06, "loss": 0.2368, "num_input_tokens_seen": 10965088, "step": 18005 }, { "epoch": 5.58796152652808, "grad_norm": 0.12697306275367737, "learning_rate": 9.051409728951119e-06, "loss": 0.2221, "num_input_tokens_seen": 10968128, "step": 18010 }, { "epoch": 5.589512876202296, "grad_norm": 0.09341985732316971, "learning_rate": 9.050616194490977e-06, "loss": 0.2308, "num_input_tokens_seen": 10971136, "step": 18015 }, { "epoch": 5.591064225876512, "grad_norm": 0.134155735373497, "learning_rate": 9.049822363072702e-06, "loss": 0.2347, "num_input_tokens_seen": 10973280, "step": 18020 }, { "epoch": 5.592615575550729, "grad_norm": 0.23865197598934174, "learning_rate": 9.049028234754491e-06, "loss": 0.235, "num_input_tokens_seen": 10976928, "step": 18025 }, { "epoch": 5.594166925224946, "grad_norm": 0.06311684846878052, "learning_rate": 9.048233809594561e-06, "loss": 0.2314, "num_input_tokens_seen": 10979456, "step": 18030 }, { "epoch": 5.595718274899163, "grad_norm": 0.048557475209236145, "learning_rate": 9.047439087651158e-06, "loss": 0.2314, "num_input_tokens_seen": 10983648, "step": 18035 }, { "epoch": 5.597269624573379, "grad_norm": 0.09779290854930878, "learning_rate": 9.046644068982539e-06, "loss": 0.2313, "num_input_tokens_seen": 10987168, "step": 18040 }, { "epoch": 5.598820974247595, "grad_norm": 0.033554453402757645, "learning_rate": 9.04584875364699e-06, "loss": 0.241, "num_input_tokens_seen": 10989504, "step": 18045 }, { "epoch": 5.600372323921812, "grad_norm": 0.11587752401828766, "learning_rate": 9.045053141702817e-06, "loss": 0.2369, "num_input_tokens_seen": 10992608, "step": 18050 }, { "epoch": 5.601923673596028, "grad_norm": 0.1133868545293808, "learning_rate": 9.044257233208348e-06, "loss": 0.237, "num_input_tokens_seen": 10995328, "step": 18055 }, { "epoch": 5.603475023270245, "grad_norm": 0.10699179768562317, "learning_rate": 9.043461028221934e-06, "loss": 0.2351, "num_input_tokens_seen": 10998528, "step": 18060 }, { "epoch": 5.605026372944462, "grad_norm": 0.05973154678940773, "learning_rate": 9.042664526801944e-06, "loss": 0.2257, "num_input_tokens_seen": 11001760, "step": 18065 }, { "epoch": 5.606577722618678, "grad_norm": 0.10322075337171555, "learning_rate": 9.041867729006772e-06, "loss": 0.2325, "num_input_tokens_seen": 11005408, "step": 18070 }, { "epoch": 5.608129072292895, "grad_norm": 0.21227967739105225, "learning_rate": 9.041070634894833e-06, "loss": 0.2268, "num_input_tokens_seen": 11008832, "step": 18075 }, { "epoch": 5.609680421967111, "grad_norm": 0.04494773969054222, "learning_rate": 9.040273244524563e-06, "loss": 0.2266, "num_input_tokens_seen": 11011392, "step": 18080 }, { "epoch": 5.611231771641328, "grad_norm": 0.04458916187286377, "learning_rate": 9.03947555795442e-06, "loss": 0.2295, "num_input_tokens_seen": 11014880, "step": 18085 }, { "epoch": 5.612783121315545, "grad_norm": 0.06630118936300278, "learning_rate": 9.038677575242884e-06, "loss": 0.2315, "num_input_tokens_seen": 11019200, "step": 18090 }, { "epoch": 5.614334470989761, "grad_norm": 0.10334596782922745, "learning_rate": 9.037879296448456e-06, "loss": 0.2379, "num_input_tokens_seen": 11022656, "step": 18095 }, { "epoch": 5.615885820663978, "grad_norm": 0.1318516582250595, "learning_rate": 9.03708072162966e-06, "loss": 0.2348, "num_input_tokens_seen": 11025344, "step": 18100 }, { "epoch": 5.617437170338194, "grad_norm": 0.1267935186624527, "learning_rate": 9.036281850845043e-06, "loss": 0.238, "num_input_tokens_seen": 11027808, "step": 18105 }, { "epoch": 5.618988520012411, "grad_norm": 0.03879417106509209, "learning_rate": 9.035482684153168e-06, "loss": 0.2327, "num_input_tokens_seen": 11031488, "step": 18110 }, { "epoch": 5.6205398696866276, "grad_norm": 0.04028159752488136, "learning_rate": 9.034683221612623e-06, "loss": 0.2367, "num_input_tokens_seen": 11034336, "step": 18115 }, { "epoch": 5.622091219360843, "grad_norm": 0.038746289908885956, "learning_rate": 9.033883463282023e-06, "loss": 0.233, "num_input_tokens_seen": 11038048, "step": 18120 }, { "epoch": 5.62364256903506, "grad_norm": 0.11682795733213425, "learning_rate": 9.033083409219996e-06, "loss": 0.2283, "num_input_tokens_seen": 11042528, "step": 18125 }, { "epoch": 5.625193918709277, "grad_norm": 0.04527188092470169, "learning_rate": 9.032283059485196e-06, "loss": 0.2273, "num_input_tokens_seen": 11044640, "step": 18130 }, { "epoch": 5.626745268383494, "grad_norm": 0.06205295771360397, "learning_rate": 9.0314824141363e-06, "loss": 0.2321, "num_input_tokens_seen": 11047552, "step": 18135 }, { "epoch": 5.6282966180577105, "grad_norm": 0.044010698795318604, "learning_rate": 9.030681473232001e-06, "loss": 0.2318, "num_input_tokens_seen": 11049952, "step": 18140 }, { "epoch": 5.629847967731926, "grad_norm": 0.03303679823875427, "learning_rate": 9.029880236831019e-06, "loss": 0.2385, "num_input_tokens_seen": 11052384, "step": 18145 }, { "epoch": 5.631399317406143, "grad_norm": 0.09452634304761887, "learning_rate": 9.029078704992096e-06, "loss": 0.2275, "num_input_tokens_seen": 11055776, "step": 18150 }, { "epoch": 5.63295066708036, "grad_norm": 0.03516250103712082, "learning_rate": 9.028276877773994e-06, "loss": 0.2332, "num_input_tokens_seen": 11058912, "step": 18155 }, { "epoch": 5.634502016754577, "grad_norm": 0.10556972771883011, "learning_rate": 9.027474755235492e-06, "loss": 0.231, "num_input_tokens_seen": 11061856, "step": 18160 }, { "epoch": 5.6360533664287935, "grad_norm": 0.11434946209192276, "learning_rate": 9.0266723374354e-06, "loss": 0.2335, "num_input_tokens_seen": 11066112, "step": 18165 }, { "epoch": 5.637604716103009, "grad_norm": 0.04780738055706024, "learning_rate": 9.025869624432541e-06, "loss": 0.2262, "num_input_tokens_seen": 11068832, "step": 18170 }, { "epoch": 5.639156065777226, "grad_norm": 0.04540691897273064, "learning_rate": 9.025066616285766e-06, "loss": 0.2294, "num_input_tokens_seen": 11071872, "step": 18175 }, { "epoch": 5.640707415451443, "grad_norm": 0.19108304381370544, "learning_rate": 9.024263313053946e-06, "loss": 0.2293, "num_input_tokens_seen": 11074240, "step": 18180 }, { "epoch": 5.642258765125659, "grad_norm": 0.20496240258216858, "learning_rate": 9.02345971479597e-06, "loss": 0.2357, "num_input_tokens_seen": 11078240, "step": 18185 }, { "epoch": 5.643810114799876, "grad_norm": 0.046989865601062775, "learning_rate": 9.02265582157075e-06, "loss": 0.2336, "num_input_tokens_seen": 11081440, "step": 18190 }, { "epoch": 5.645361464474092, "grad_norm": 0.05579526722431183, "learning_rate": 9.021851633437223e-06, "loss": 0.23, "num_input_tokens_seen": 11083680, "step": 18195 }, { "epoch": 5.646912814148309, "grad_norm": 0.1097424104809761, "learning_rate": 9.021047150454347e-06, "loss": 0.2306, "num_input_tokens_seen": 11087488, "step": 18200 }, { "epoch": 5.648464163822526, "grad_norm": 0.12046749144792557, "learning_rate": 9.020242372681099e-06, "loss": 0.2311, "num_input_tokens_seen": 11090368, "step": 18205 }, { "epoch": 5.650015513496742, "grad_norm": 0.05398069694638252, "learning_rate": 9.019437300176477e-06, "loss": 0.2316, "num_input_tokens_seen": 11093568, "step": 18210 }, { "epoch": 5.651566863170959, "grad_norm": 0.10466572642326355, "learning_rate": 9.018631932999504e-06, "loss": 0.2317, "num_input_tokens_seen": 11096384, "step": 18215 }, { "epoch": 5.653118212845175, "grad_norm": 0.05501752719283104, "learning_rate": 9.017826271209223e-06, "loss": 0.2302, "num_input_tokens_seen": 11099360, "step": 18220 }, { "epoch": 5.654669562519392, "grad_norm": 0.049937810748815536, "learning_rate": 9.017020314864698e-06, "loss": 0.2331, "num_input_tokens_seen": 11102368, "step": 18225 }, { "epoch": 5.656220912193609, "grad_norm": 0.0611925907433033, "learning_rate": 9.016214064025013e-06, "loss": 0.2316, "num_input_tokens_seen": 11106400, "step": 18230 }, { "epoch": 5.657772261867825, "grad_norm": 0.07926931977272034, "learning_rate": 9.01540751874928e-06, "loss": 0.2385, "num_input_tokens_seen": 11110048, "step": 18235 }, { "epoch": 5.6593236115420416, "grad_norm": 0.19281543791294098, "learning_rate": 9.014600679096627e-06, "loss": 0.2305, "num_input_tokens_seen": 11112832, "step": 18240 }, { "epoch": 5.660874961216258, "grad_norm": 0.18768247961997986, "learning_rate": 9.013793545126204e-06, "loss": 0.2263, "num_input_tokens_seen": 11115552, "step": 18245 }, { "epoch": 5.662426310890475, "grad_norm": 0.09642881900072098, "learning_rate": 9.012986116897182e-06, "loss": 0.2319, "num_input_tokens_seen": 11118944, "step": 18250 }, { "epoch": 5.663977660564691, "grad_norm": 0.05597648769617081, "learning_rate": 9.012178394468757e-06, "loss": 0.2264, "num_input_tokens_seen": 11121376, "step": 18255 }, { "epoch": 5.665529010238908, "grad_norm": 0.06978771090507507, "learning_rate": 9.011370377900147e-06, "loss": 0.2304, "num_input_tokens_seen": 11125408, "step": 18260 }, { "epoch": 5.6670803599131245, "grad_norm": 0.11024436354637146, "learning_rate": 9.010562067250585e-06, "loss": 0.2349, "num_input_tokens_seen": 11128320, "step": 18265 }, { "epoch": 5.668631709587341, "grad_norm": 0.039722222834825516, "learning_rate": 9.009753462579332e-06, "loss": 0.2331, "num_input_tokens_seen": 11132448, "step": 18270 }, { "epoch": 5.670183059261557, "grad_norm": 0.037945568561553955, "learning_rate": 9.008944563945666e-06, "loss": 0.2306, "num_input_tokens_seen": 11135296, "step": 18275 }, { "epoch": 5.671734408935774, "grad_norm": 0.10199782997369766, "learning_rate": 9.008135371408892e-06, "loss": 0.233, "num_input_tokens_seen": 11139360, "step": 18280 }, { "epoch": 5.673285758609991, "grad_norm": 0.05942871421575546, "learning_rate": 9.00732588502833e-06, "loss": 0.2378, "num_input_tokens_seen": 11143808, "step": 18285 }, { "epoch": 5.6748371082842075, "grad_norm": 0.08979346603155136, "learning_rate": 9.006516104863329e-06, "loss": 0.2289, "num_input_tokens_seen": 11146752, "step": 18290 }, { "epoch": 5.676388457958424, "grad_norm": 0.09424898028373718, "learning_rate": 9.005706030973251e-06, "loss": 0.2325, "num_input_tokens_seen": 11149664, "step": 18295 }, { "epoch": 5.67793980763264, "grad_norm": 0.050603024661540985, "learning_rate": 9.004895663417489e-06, "loss": 0.2299, "num_input_tokens_seen": 11152576, "step": 18300 }, { "epoch": 5.679491157306857, "grad_norm": 0.09767705947160721, "learning_rate": 9.004085002255447e-06, "loss": 0.2325, "num_input_tokens_seen": 11154944, "step": 18305 }, { "epoch": 5.681042506981074, "grad_norm": 0.09560318291187286, "learning_rate": 9.003274047546561e-06, "loss": 0.2274, "num_input_tokens_seen": 11157600, "step": 18310 }, { "epoch": 5.6825938566552905, "grad_norm": 0.09824783354997635, "learning_rate": 9.00246279935028e-06, "loss": 0.2274, "num_input_tokens_seen": 11161184, "step": 18315 }, { "epoch": 5.684145206329506, "grad_norm": 0.09041275084018707, "learning_rate": 9.001651257726081e-06, "loss": 0.229, "num_input_tokens_seen": 11163520, "step": 18320 }, { "epoch": 5.685696556003723, "grad_norm": 0.05300065129995346, "learning_rate": 9.000839422733458e-06, "loss": 0.2336, "num_input_tokens_seen": 11166560, "step": 18325 }, { "epoch": 5.68724790567794, "grad_norm": 0.033540528267621994, "learning_rate": 9.000027294431928e-06, "loss": 0.2361, "num_input_tokens_seen": 11168896, "step": 18330 }, { "epoch": 5.688799255352157, "grad_norm": 0.05060143023729324, "learning_rate": 8.999214872881029e-06, "loss": 0.2306, "num_input_tokens_seen": 11171456, "step": 18335 }, { "epoch": 5.690350605026373, "grad_norm": 0.04850336164236069, "learning_rate": 8.998402158140322e-06, "loss": 0.2295, "num_input_tokens_seen": 11174176, "step": 18340 }, { "epoch": 5.691901954700589, "grad_norm": 0.044050391763448715, "learning_rate": 8.99758915026939e-06, "loss": 0.233, "num_input_tokens_seen": 11178400, "step": 18345 }, { "epoch": 5.693453304374806, "grad_norm": 0.05056903138756752, "learning_rate": 8.996775849327834e-06, "loss": 0.233, "num_input_tokens_seen": 11182304, "step": 18350 }, { "epoch": 5.695004654049023, "grad_norm": 0.10240495204925537, "learning_rate": 8.99596225537528e-06, "loss": 0.2331, "num_input_tokens_seen": 11186080, "step": 18355 }, { "epoch": 5.69655600372324, "grad_norm": 0.1845589131116867, "learning_rate": 8.995148368471371e-06, "loss": 0.2305, "num_input_tokens_seen": 11188672, "step": 18360 }, { "epoch": 5.6981073533974556, "grad_norm": 0.09868990629911423, "learning_rate": 8.99433418867578e-06, "loss": 0.233, "num_input_tokens_seen": 11194144, "step": 18365 }, { "epoch": 5.699658703071672, "grad_norm": 0.03504027798771858, "learning_rate": 8.99351971604819e-06, "loss": 0.2336, "num_input_tokens_seen": 11196736, "step": 18370 }, { "epoch": 5.701210052745889, "grad_norm": 0.1957520693540573, "learning_rate": 8.992704950648316e-06, "loss": 0.2311, "num_input_tokens_seen": 11199392, "step": 18375 }, { "epoch": 5.702761402420106, "grad_norm": 0.040085505694150925, "learning_rate": 8.99188989253589e-06, "loss": 0.2337, "num_input_tokens_seen": 11202336, "step": 18380 }, { "epoch": 5.704312752094322, "grad_norm": 0.09879506379365921, "learning_rate": 8.991074541770662e-06, "loss": 0.2276, "num_input_tokens_seen": 11204960, "step": 18385 }, { "epoch": 5.7058641017685385, "grad_norm": 0.09650770574808121, "learning_rate": 8.99025889841241e-06, "loss": 0.2274, "num_input_tokens_seen": 11207456, "step": 18390 }, { "epoch": 5.707415451442755, "grad_norm": 0.10856937617063522, "learning_rate": 8.989442962520927e-06, "loss": 0.2369, "num_input_tokens_seen": 11211040, "step": 18395 }, { "epoch": 5.708966801116972, "grad_norm": 0.09650017321109772, "learning_rate": 8.988626734156034e-06, "loss": 0.2322, "num_input_tokens_seen": 11213152, "step": 18400 }, { "epoch": 5.710518150791188, "grad_norm": 0.19504263997077942, "learning_rate": 8.987810213377569e-06, "loss": 0.2249, "num_input_tokens_seen": 11216000, "step": 18405 }, { "epoch": 5.712069500465405, "grad_norm": 0.03940822184085846, "learning_rate": 8.986993400245392e-06, "loss": 0.2316, "num_input_tokens_seen": 11218944, "step": 18410 }, { "epoch": 5.7136208501396215, "grad_norm": 0.09987951815128326, "learning_rate": 8.986176294819387e-06, "loss": 0.2255, "num_input_tokens_seen": 11221984, "step": 18415 }, { "epoch": 5.715172199813838, "grad_norm": 0.041028305888175964, "learning_rate": 8.985358897159455e-06, "loss": 0.239, "num_input_tokens_seen": 11224704, "step": 18420 }, { "epoch": 5.716723549488055, "grad_norm": 0.0976635292172432, "learning_rate": 8.984541207325523e-06, "loss": 0.2306, "num_input_tokens_seen": 11228352, "step": 18425 }, { "epoch": 5.718274899162271, "grad_norm": 0.10485932230949402, "learning_rate": 8.983723225377535e-06, "loss": 0.23, "num_input_tokens_seen": 11230944, "step": 18430 }, { "epoch": 5.719826248836488, "grad_norm": 0.04038723185658455, "learning_rate": 8.982904951375462e-06, "loss": 0.2362, "num_input_tokens_seen": 11234560, "step": 18435 }, { "epoch": 5.7213775985107045, "grad_norm": 0.09934163838624954, "learning_rate": 8.98208638537929e-06, "loss": 0.2315, "num_input_tokens_seen": 11236832, "step": 18440 }, { "epoch": 5.722928948184921, "grad_norm": 0.10700774192810059, "learning_rate": 8.981267527449032e-06, "loss": 0.2325, "num_input_tokens_seen": 11240768, "step": 18445 }, { "epoch": 5.724480297859137, "grad_norm": 0.05003557354211807, "learning_rate": 8.980448377644718e-06, "loss": 0.2335, "num_input_tokens_seen": 11243808, "step": 18450 }, { "epoch": 5.726031647533354, "grad_norm": 0.12365571409463882, "learning_rate": 8.979628936026404e-06, "loss": 0.2343, "num_input_tokens_seen": 11246464, "step": 18455 }, { "epoch": 5.727582997207571, "grad_norm": 0.2088419497013092, "learning_rate": 8.978809202654161e-06, "loss": 0.2249, "num_input_tokens_seen": 11248960, "step": 18460 }, { "epoch": 5.7291343468817875, "grad_norm": 0.12528106570243835, "learning_rate": 8.977989177588088e-06, "loss": 0.2277, "num_input_tokens_seen": 11252640, "step": 18465 }, { "epoch": 5.730685696556003, "grad_norm": 0.12685592472553253, "learning_rate": 8.977168860888304e-06, "loss": 0.2382, "num_input_tokens_seen": 11255456, "step": 18470 }, { "epoch": 5.73223704623022, "grad_norm": 0.15991155803203583, "learning_rate": 8.976348252614944e-06, "loss": 0.2291, "num_input_tokens_seen": 11258944, "step": 18475 }, { "epoch": 5.733788395904437, "grad_norm": 0.21323443949222565, "learning_rate": 8.975527352828169e-06, "loss": 0.2262, "num_input_tokens_seen": 11261536, "step": 18480 }, { "epoch": 5.735339745578654, "grad_norm": 0.14328333735466003, "learning_rate": 8.974706161588162e-06, "loss": 0.2379, "num_input_tokens_seen": 11264192, "step": 18485 }, { "epoch": 5.73689109525287, "grad_norm": 0.1209215298295021, "learning_rate": 8.973884678955127e-06, "loss": 0.2304, "num_input_tokens_seen": 11267072, "step": 18490 }, { "epoch": 5.738442444927086, "grad_norm": 0.041789308190345764, "learning_rate": 8.973062904989288e-06, "loss": 0.2307, "num_input_tokens_seen": 11270080, "step": 18495 }, { "epoch": 5.739993794601303, "grad_norm": 0.20032088458538055, "learning_rate": 8.972240839750888e-06, "loss": 0.2342, "num_input_tokens_seen": 11273024, "step": 18500 }, { "epoch": 5.74154514427552, "grad_norm": 0.036376260221004486, "learning_rate": 8.971418483300197e-06, "loss": 0.2316, "num_input_tokens_seen": 11275520, "step": 18505 }, { "epoch": 5.743096493949737, "grad_norm": 0.11175272613763809, "learning_rate": 8.970595835697502e-06, "loss": 0.2329, "num_input_tokens_seen": 11277824, "step": 18510 }, { "epoch": 5.7446478436239525, "grad_norm": 0.10510772466659546, "learning_rate": 8.969772897003116e-06, "loss": 0.2357, "num_input_tokens_seen": 11282848, "step": 18515 }, { "epoch": 5.746199193298169, "grad_norm": 0.09891530871391296, "learning_rate": 8.968949667277364e-06, "loss": 0.2262, "num_input_tokens_seen": 11286688, "step": 18520 }, { "epoch": 5.747750542972386, "grad_norm": 0.10314559191465378, "learning_rate": 8.968126146580602e-06, "loss": 0.2316, "num_input_tokens_seen": 11289440, "step": 18525 }, { "epoch": 5.749301892646603, "grad_norm": 0.04866233468055725, "learning_rate": 8.967302334973206e-06, "loss": 0.2316, "num_input_tokens_seen": 11292512, "step": 18530 }, { "epoch": 5.750853242320819, "grad_norm": 0.04415572062134743, "learning_rate": 8.966478232515568e-06, "loss": 0.2337, "num_input_tokens_seen": 11295104, "step": 18535 }, { "epoch": 5.7524045919950355, "grad_norm": 0.110378198325634, "learning_rate": 8.965653839268104e-06, "loss": 0.228, "num_input_tokens_seen": 11297280, "step": 18540 }, { "epoch": 5.753955941669252, "grad_norm": 0.10159920901060104, "learning_rate": 8.964829155291257e-06, "loss": 0.2324, "num_input_tokens_seen": 11300544, "step": 18545 }, { "epoch": 5.755507291343469, "grad_norm": 0.09642039984464645, "learning_rate": 8.964004180645478e-06, "loss": 0.2321, "num_input_tokens_seen": 11303584, "step": 18550 }, { "epoch": 5.757058641017686, "grad_norm": 0.10094742476940155, "learning_rate": 8.963178915391255e-06, "loss": 0.2346, "num_input_tokens_seen": 11307200, "step": 18555 }, { "epoch": 5.758609990691902, "grad_norm": 0.047980524599552155, "learning_rate": 8.962353359589085e-06, "loss": 0.23, "num_input_tokens_seen": 11310656, "step": 18560 }, { "epoch": 5.7601613403661185, "grad_norm": 0.18398602306842804, "learning_rate": 8.961527513299492e-06, "loss": 0.2319, "num_input_tokens_seen": 11315200, "step": 18565 }, { "epoch": 5.761712690040335, "grad_norm": 0.06160236522555351, "learning_rate": 8.960701376583022e-06, "loss": 0.2342, "num_input_tokens_seen": 11319008, "step": 18570 }, { "epoch": 5.763264039714552, "grad_norm": 0.035854171961545944, "learning_rate": 8.959874949500238e-06, "loss": 0.2321, "num_input_tokens_seen": 11321664, "step": 18575 }, { "epoch": 5.764815389388768, "grad_norm": 0.0479874350130558, "learning_rate": 8.959048232111728e-06, "loss": 0.2347, "num_input_tokens_seen": 11324768, "step": 18580 }, { "epoch": 5.766366739062985, "grad_norm": 0.0958964005112648, "learning_rate": 8.958221224478103e-06, "loss": 0.2253, "num_input_tokens_seen": 11327456, "step": 18585 }, { "epoch": 5.7679180887372015, "grad_norm": 0.0556010901927948, "learning_rate": 8.957393926659988e-06, "loss": 0.2315, "num_input_tokens_seen": 11330208, "step": 18590 }, { "epoch": 5.769469438411418, "grad_norm": 0.046669747680425644, "learning_rate": 8.956566338718035e-06, "loss": 0.2347, "num_input_tokens_seen": 11332512, "step": 18595 }, { "epoch": 5.771020788085634, "grad_norm": 0.09570343792438507, "learning_rate": 8.95573846071292e-06, "loss": 0.2315, "num_input_tokens_seen": 11335424, "step": 18600 }, { "epoch": 5.772572137759851, "grad_norm": 0.06208132952451706, "learning_rate": 8.95491029270533e-06, "loss": 0.2329, "num_input_tokens_seen": 11339008, "step": 18605 }, { "epoch": 5.774123487434068, "grad_norm": 0.0946897640824318, "learning_rate": 8.954081834755981e-06, "loss": 0.231, "num_input_tokens_seen": 11341536, "step": 18610 }, { "epoch": 5.775674837108284, "grad_norm": 0.1832199990749359, "learning_rate": 8.953253086925614e-06, "loss": 0.2309, "num_input_tokens_seen": 11344640, "step": 18615 }, { "epoch": 5.777226186782501, "grad_norm": 0.04415746033191681, "learning_rate": 8.95242404927498e-06, "loss": 0.2299, "num_input_tokens_seen": 11347104, "step": 18620 }, { "epoch": 5.778777536456717, "grad_norm": 0.09904265403747559, "learning_rate": 8.951594721864859e-06, "loss": 0.2254, "num_input_tokens_seen": 11351264, "step": 18625 }, { "epoch": 5.780328886130934, "grad_norm": 0.09668198972940445, "learning_rate": 8.950765104756051e-06, "loss": 0.2241, "num_input_tokens_seen": 11354080, "step": 18630 }, { "epoch": 5.781880235805151, "grad_norm": 0.03703554719686508, "learning_rate": 8.949935198009379e-06, "loss": 0.2315, "num_input_tokens_seen": 11357472, "step": 18635 }, { "epoch": 5.783431585479367, "grad_norm": 0.0986560508608818, "learning_rate": 8.949105001685681e-06, "loss": 0.223, "num_input_tokens_seen": 11360480, "step": 18640 }, { "epoch": 5.784982935153583, "grad_norm": 0.12388987094163895, "learning_rate": 8.948274515845822e-06, "loss": 0.237, "num_input_tokens_seen": 11362848, "step": 18645 }, { "epoch": 5.7865342848278, "grad_norm": 0.0975380539894104, "learning_rate": 8.947443740550687e-06, "loss": 0.2259, "num_input_tokens_seen": 11365472, "step": 18650 }, { "epoch": 5.788085634502017, "grad_norm": 0.1306838095188141, "learning_rate": 8.94661267586118e-06, "loss": 0.2246, "num_input_tokens_seen": 11368064, "step": 18655 }, { "epoch": 5.789636984176234, "grad_norm": 0.09191520512104034, "learning_rate": 8.94578132183823e-06, "loss": 0.2235, "num_input_tokens_seen": 11370720, "step": 18660 }, { "epoch": 5.7911883338504495, "grad_norm": 0.23059214651584625, "learning_rate": 8.944949678542785e-06, "loss": 0.2315, "num_input_tokens_seen": 11373408, "step": 18665 }, { "epoch": 5.792739683524666, "grad_norm": 0.12916044890880585, "learning_rate": 8.944117746035811e-06, "loss": 0.25, "num_input_tokens_seen": 11376544, "step": 18670 }, { "epoch": 5.794291033198883, "grad_norm": 0.10382752120494843, "learning_rate": 8.943285524378303e-06, "loss": 0.2376, "num_input_tokens_seen": 11380000, "step": 18675 }, { "epoch": 5.7958423828731, "grad_norm": 0.12323007732629776, "learning_rate": 8.94245301363127e-06, "loss": 0.2331, "num_input_tokens_seen": 11382784, "step": 18680 }, { "epoch": 5.797393732547317, "grad_norm": 0.10212936252355576, "learning_rate": 8.941620213855743e-06, "loss": 0.2246, "num_input_tokens_seen": 11385184, "step": 18685 }, { "epoch": 5.7989450822215325, "grad_norm": 0.06610686331987381, "learning_rate": 8.940787125112782e-06, "loss": 0.2246, "num_input_tokens_seen": 11388832, "step": 18690 }, { "epoch": 5.800496431895749, "grad_norm": 0.05317206680774689, "learning_rate": 8.939953747463457e-06, "loss": 0.2438, "num_input_tokens_seen": 11391328, "step": 18695 }, { "epoch": 5.802047781569966, "grad_norm": 0.11595939844846725, "learning_rate": 8.939120080968866e-06, "loss": 0.2318, "num_input_tokens_seen": 11395008, "step": 18700 }, { "epoch": 5.803599131244183, "grad_norm": 0.07092564553022385, "learning_rate": 8.938286125690129e-06, "loss": 0.2368, "num_input_tokens_seen": 11398464, "step": 18705 }, { "epoch": 5.805150480918399, "grad_norm": 0.1154031753540039, "learning_rate": 8.937451881688382e-06, "loss": 0.2304, "num_input_tokens_seen": 11400928, "step": 18710 }, { "epoch": 5.8067018305926155, "grad_norm": 0.11424046754837036, "learning_rate": 8.936617349024783e-06, "loss": 0.2325, "num_input_tokens_seen": 11404864, "step": 18715 }, { "epoch": 5.808253180266832, "grad_norm": 0.10815790295600891, "learning_rate": 8.93578252776052e-06, "loss": 0.2308, "num_input_tokens_seen": 11407968, "step": 18720 }, { "epoch": 5.809804529941049, "grad_norm": 0.19623298943042755, "learning_rate": 8.934947417956788e-06, "loss": 0.2309, "num_input_tokens_seen": 11410272, "step": 18725 }, { "epoch": 5.811355879615265, "grad_norm": 0.20906704664230347, "learning_rate": 8.934112019674814e-06, "loss": 0.2325, "num_input_tokens_seen": 11413120, "step": 18730 }, { "epoch": 5.812907229289482, "grad_norm": 0.12084182351827621, "learning_rate": 8.933276332975842e-06, "loss": 0.2389, "num_input_tokens_seen": 11416064, "step": 18735 }, { "epoch": 5.814458578963698, "grad_norm": 0.11748301982879639, "learning_rate": 8.93244035792114e-06, "loss": 0.2331, "num_input_tokens_seen": 11418592, "step": 18740 }, { "epoch": 5.816009928637915, "grad_norm": 0.20532308518886566, "learning_rate": 8.931604094571991e-06, "loss": 0.235, "num_input_tokens_seen": 11421376, "step": 18745 }, { "epoch": 5.817561278312132, "grad_norm": 0.09608078747987747, "learning_rate": 8.930767542989704e-06, "loss": 0.2295, "num_input_tokens_seen": 11423712, "step": 18750 }, { "epoch": 5.819112627986348, "grad_norm": 0.04665898531675339, "learning_rate": 8.929930703235609e-06, "loss": 0.2335, "num_input_tokens_seen": 11427552, "step": 18755 }, { "epoch": 5.820663977660565, "grad_norm": 0.03561440855264664, "learning_rate": 8.929093575371058e-06, "loss": 0.2277, "num_input_tokens_seen": 11430656, "step": 18760 }, { "epoch": 5.822215327334781, "grad_norm": 0.09994114190340042, "learning_rate": 8.928256159457419e-06, "loss": 0.2331, "num_input_tokens_seen": 11434016, "step": 18765 }, { "epoch": 5.823766677008998, "grad_norm": 0.18003593385219574, "learning_rate": 8.927418455556086e-06, "loss": 0.2263, "num_input_tokens_seen": 11436416, "step": 18770 }, { "epoch": 5.825318026683214, "grad_norm": 0.10293548554182053, "learning_rate": 8.926580463728472e-06, "loss": 0.2314, "num_input_tokens_seen": 11439168, "step": 18775 }, { "epoch": 5.826869376357431, "grad_norm": 0.05080695450305939, "learning_rate": 8.925742184036014e-06, "loss": 0.2342, "num_input_tokens_seen": 11443744, "step": 18780 }, { "epoch": 5.828420726031648, "grad_norm": 0.1050097644329071, "learning_rate": 8.924903616540164e-06, "loss": 0.2258, "num_input_tokens_seen": 11446208, "step": 18785 }, { "epoch": 5.829972075705864, "grad_norm": 0.056551169604063034, "learning_rate": 8.924064761302403e-06, "loss": 0.227, "num_input_tokens_seen": 11450176, "step": 18790 }, { "epoch": 5.83152342538008, "grad_norm": 0.11446058750152588, "learning_rate": 8.923225618384228e-06, "loss": 0.2281, "num_input_tokens_seen": 11452704, "step": 18795 }, { "epoch": 5.833074775054297, "grad_norm": 0.04719405621290207, "learning_rate": 8.922386187847155e-06, "loss": 0.2349, "num_input_tokens_seen": 11455328, "step": 18800 }, { "epoch": 5.834626124728514, "grad_norm": 0.04958717152476311, "learning_rate": 8.921546469752726e-06, "loss": 0.2239, "num_input_tokens_seen": 11458720, "step": 18805 }, { "epoch": 5.836177474402731, "grad_norm": 0.1741652935743332, "learning_rate": 8.920706464162506e-06, "loss": 0.2261, "num_input_tokens_seen": 11461632, "step": 18810 }, { "epoch": 5.837728824076947, "grad_norm": 0.0714186280965805, "learning_rate": 8.91986617113807e-06, "loss": 0.2291, "num_input_tokens_seen": 11465312, "step": 18815 }, { "epoch": 5.839280173751163, "grad_norm": 0.1166987270116806, "learning_rate": 8.919025590741029e-06, "loss": 0.2384, "num_input_tokens_seen": 11468064, "step": 18820 }, { "epoch": 5.84083152342538, "grad_norm": 0.04485153779387474, "learning_rate": 8.918184723033002e-06, "loss": 0.2305, "num_input_tokens_seen": 11471200, "step": 18825 }, { "epoch": 5.842382873099597, "grad_norm": 0.06883200258016586, "learning_rate": 8.917343568075636e-06, "loss": 0.226, "num_input_tokens_seen": 11474656, "step": 18830 }, { "epoch": 5.843934222773814, "grad_norm": 0.04488255828619003, "learning_rate": 8.9165021259306e-06, "loss": 0.229, "num_input_tokens_seen": 11477280, "step": 18835 }, { "epoch": 5.8454855724480295, "grad_norm": 0.048410266637802124, "learning_rate": 8.915660396659579e-06, "loss": 0.232, "num_input_tokens_seen": 11480352, "step": 18840 }, { "epoch": 5.847036922122246, "grad_norm": 0.11722786724567413, "learning_rate": 8.914818380324282e-06, "loss": 0.2283, "num_input_tokens_seen": 11483104, "step": 18845 }, { "epoch": 5.848588271796463, "grad_norm": 0.10678607225418091, "learning_rate": 8.91397607698644e-06, "loss": 0.2319, "num_input_tokens_seen": 11486592, "step": 18850 }, { "epoch": 5.85013962147068, "grad_norm": 0.12884938716888428, "learning_rate": 8.913133486707803e-06, "loss": 0.2348, "num_input_tokens_seen": 11490272, "step": 18855 }, { "epoch": 5.851690971144896, "grad_norm": 0.044175885617733, "learning_rate": 8.912290609550144e-06, "loss": 0.2297, "num_input_tokens_seen": 11492736, "step": 18860 }, { "epoch": 5.853242320819112, "grad_norm": 0.11505419760942459, "learning_rate": 8.911447445575254e-06, "loss": 0.2329, "num_input_tokens_seen": 11495072, "step": 18865 }, { "epoch": 5.854793670493329, "grad_norm": 0.1003894954919815, "learning_rate": 8.910603994844947e-06, "loss": 0.2236, "num_input_tokens_seen": 11499072, "step": 18870 }, { "epoch": 5.856345020167546, "grad_norm": 0.06766759604215622, "learning_rate": 8.909760257421061e-06, "loss": 0.2409, "num_input_tokens_seen": 11504192, "step": 18875 }, { "epoch": 5.857896369841763, "grad_norm": 0.09583132714033127, "learning_rate": 8.908916233365448e-06, "loss": 0.2221, "num_input_tokens_seen": 11507392, "step": 18880 }, { "epoch": 5.859447719515979, "grad_norm": 0.2105940282344818, "learning_rate": 8.908071922739986e-06, "loss": 0.2358, "num_input_tokens_seen": 11510528, "step": 18885 }, { "epoch": 5.860999069190195, "grad_norm": 0.049893513321876526, "learning_rate": 8.907227325606574e-06, "loss": 0.2287, "num_input_tokens_seen": 11513248, "step": 18890 }, { "epoch": 5.862550418864412, "grad_norm": 0.04753866046667099, "learning_rate": 8.906382442027131e-06, "loss": 0.2355, "num_input_tokens_seen": 11516800, "step": 18895 }, { "epoch": 5.864101768538629, "grad_norm": 0.03944943845272064, "learning_rate": 8.905537272063595e-06, "loss": 0.2298, "num_input_tokens_seen": 11518976, "step": 18900 }, { "epoch": 5.865653118212845, "grad_norm": 0.08040592819452286, "learning_rate": 8.904691815777931e-06, "loss": 0.2281, "num_input_tokens_seen": 11522464, "step": 18905 }, { "epoch": 5.867204467887062, "grad_norm": 0.13781172037124634, "learning_rate": 8.903846073232116e-06, "loss": 0.2286, "num_input_tokens_seen": 11526560, "step": 18910 }, { "epoch": 5.868755817561278, "grad_norm": 0.05650079622864723, "learning_rate": 8.903000044488157e-06, "loss": 0.2323, "num_input_tokens_seen": 11529696, "step": 18915 }, { "epoch": 5.870307167235495, "grad_norm": 0.05225732550024986, "learning_rate": 8.902153729608076e-06, "loss": 0.2335, "num_input_tokens_seen": 11532320, "step": 18920 }, { "epoch": 5.871858516909711, "grad_norm": 0.058603350073099136, "learning_rate": 8.901307128653918e-06, "loss": 0.2277, "num_input_tokens_seen": 11534976, "step": 18925 }, { "epoch": 5.873409866583928, "grad_norm": 0.1155252531170845, "learning_rate": 8.900460241687749e-06, "loss": 0.2326, "num_input_tokens_seen": 11537920, "step": 18930 }, { "epoch": 5.874961216258145, "grad_norm": 0.07371649146080017, "learning_rate": 8.899613068771658e-06, "loss": 0.2254, "num_input_tokens_seen": 11540192, "step": 18935 }, { "epoch": 5.876512565932361, "grad_norm": 0.08540840446949005, "learning_rate": 8.898765609967747e-06, "loss": 0.2324, "num_input_tokens_seen": 11545024, "step": 18940 }, { "epoch": 5.878063915606578, "grad_norm": 0.11261864751577377, "learning_rate": 8.89791786533815e-06, "loss": 0.2278, "num_input_tokens_seen": 11547552, "step": 18945 }, { "epoch": 5.879615265280794, "grad_norm": 0.1182144358754158, "learning_rate": 8.897069834945016e-06, "loss": 0.2361, "num_input_tokens_seen": 11551040, "step": 18950 }, { "epoch": 5.881166614955011, "grad_norm": 0.10896709561347961, "learning_rate": 8.896221518850515e-06, "loss": 0.2354, "num_input_tokens_seen": 11554016, "step": 18955 }, { "epoch": 5.882717964629228, "grad_norm": 0.06225145608186722, "learning_rate": 8.895372917116838e-06, "loss": 0.2283, "num_input_tokens_seen": 11556960, "step": 18960 }, { "epoch": 5.884269314303444, "grad_norm": 0.13604183495044708, "learning_rate": 8.894524029806198e-06, "loss": 0.2305, "num_input_tokens_seen": 11559968, "step": 18965 }, { "epoch": 5.88582066397766, "grad_norm": 0.11779569089412689, "learning_rate": 8.893674856980829e-06, "loss": 0.2329, "num_input_tokens_seen": 11563424, "step": 18970 }, { "epoch": 5.887372013651877, "grad_norm": 0.106948122382164, "learning_rate": 8.892825398702985e-06, "loss": 0.2278, "num_input_tokens_seen": 11566752, "step": 18975 }, { "epoch": 5.888923363326094, "grad_norm": 0.11770164221525192, "learning_rate": 8.89197565503494e-06, "loss": 0.2346, "num_input_tokens_seen": 11570240, "step": 18980 }, { "epoch": 5.890474713000311, "grad_norm": 0.10278648138046265, "learning_rate": 8.891125626038992e-06, "loss": 0.2257, "num_input_tokens_seen": 11572448, "step": 18985 }, { "epoch": 5.892026062674526, "grad_norm": 0.05586646497249603, "learning_rate": 8.89027531177746e-06, "loss": 0.2347, "num_input_tokens_seen": 11575680, "step": 18990 }, { "epoch": 5.893577412348743, "grad_norm": 0.042422547936439514, "learning_rate": 8.889424712312678e-06, "loss": 0.2315, "num_input_tokens_seen": 11578016, "step": 18995 }, { "epoch": 5.89512876202296, "grad_norm": 0.06498014181852341, "learning_rate": 8.888573827707006e-06, "loss": 0.2381, "num_input_tokens_seen": 11580960, "step": 19000 }, { "epoch": 5.896680111697177, "grad_norm": 0.04008612036705017, "learning_rate": 8.887722658022825e-06, "loss": 0.238, "num_input_tokens_seen": 11583616, "step": 19005 }, { "epoch": 5.8982314613713935, "grad_norm": 0.04291249066591263, "learning_rate": 8.886871203322538e-06, "loss": 0.2305, "num_input_tokens_seen": 11586208, "step": 19010 }, { "epoch": 5.899782811045609, "grad_norm": 0.10171794146299362, "learning_rate": 8.886019463668562e-06, "loss": 0.2265, "num_input_tokens_seen": 11589088, "step": 19015 }, { "epoch": 5.901334160719826, "grad_norm": 0.09444275498390198, "learning_rate": 8.885167439123343e-06, "loss": 0.2255, "num_input_tokens_seen": 11592352, "step": 19020 }, { "epoch": 5.902885510394043, "grad_norm": 0.09516195952892303, "learning_rate": 8.884315129749344e-06, "loss": 0.2306, "num_input_tokens_seen": 11596224, "step": 19025 }, { "epoch": 5.90443686006826, "grad_norm": 0.09190565347671509, "learning_rate": 8.883462535609046e-06, "loss": 0.2347, "num_input_tokens_seen": 11600128, "step": 19030 }, { "epoch": 5.905988209742476, "grad_norm": 0.04170345887541771, "learning_rate": 8.88260965676496e-06, "loss": 0.2359, "num_input_tokens_seen": 11602144, "step": 19035 }, { "epoch": 5.907539559416692, "grad_norm": 0.11711075156927109, "learning_rate": 8.881756493279608e-06, "loss": 0.2342, "num_input_tokens_seen": 11604832, "step": 19040 }, { "epoch": 5.909090909090909, "grad_norm": 0.0567251518368721, "learning_rate": 8.880903045215538e-06, "loss": 0.2296, "num_input_tokens_seen": 11607616, "step": 19045 }, { "epoch": 5.910642258765126, "grad_norm": 0.10878894478082657, "learning_rate": 8.88004931263532e-06, "loss": 0.2342, "num_input_tokens_seen": 11609984, "step": 19050 }, { "epoch": 5.912193608439342, "grad_norm": 0.04158922657370567, "learning_rate": 8.87919529560154e-06, "loss": 0.2324, "num_input_tokens_seen": 11613152, "step": 19055 }, { "epoch": 5.913744958113559, "grad_norm": 0.11818827688694, "learning_rate": 8.878340994176806e-06, "loss": 0.237, "num_input_tokens_seen": 11615616, "step": 19060 }, { "epoch": 5.915296307787775, "grad_norm": 0.0988079234957695, "learning_rate": 8.877486408423752e-06, "loss": 0.2314, "num_input_tokens_seen": 11619360, "step": 19065 }, { "epoch": 5.916847657461992, "grad_norm": 0.05334554240107536, "learning_rate": 8.87663153840503e-06, "loss": 0.2304, "num_input_tokens_seen": 11622016, "step": 19070 }, { "epoch": 5.918399007136209, "grad_norm": 0.13991263508796692, "learning_rate": 8.875776384183308e-06, "loss": 0.2325, "num_input_tokens_seen": 11624864, "step": 19075 }, { "epoch": 5.919950356810425, "grad_norm": 0.24097660183906555, "learning_rate": 8.874920945821282e-06, "loss": 0.2363, "num_input_tokens_seen": 11627424, "step": 19080 }, { "epoch": 5.921501706484642, "grad_norm": 0.13788455724716187, "learning_rate": 8.874065223381663e-06, "loss": 0.237, "num_input_tokens_seen": 11633056, "step": 19085 }, { "epoch": 5.923053056158858, "grad_norm": 0.06446488946676254, "learning_rate": 8.873209216927187e-06, "loss": 0.2272, "num_input_tokens_seen": 11636224, "step": 19090 }, { "epoch": 5.924604405833075, "grad_norm": 0.04974989965558052, "learning_rate": 8.872352926520612e-06, "loss": 0.2337, "num_input_tokens_seen": 11638560, "step": 19095 }, { "epoch": 5.926155755507291, "grad_norm": 0.07490135729312897, "learning_rate": 8.871496352224711e-06, "loss": 0.2363, "num_input_tokens_seen": 11641568, "step": 19100 }, { "epoch": 5.927707105181508, "grad_norm": 0.11035133898258209, "learning_rate": 8.870639494102281e-06, "loss": 0.2351, "num_input_tokens_seen": 11645536, "step": 19105 }, { "epoch": 5.929258454855725, "grad_norm": 0.07319605350494385, "learning_rate": 8.869782352216144e-06, "loss": 0.2299, "num_input_tokens_seen": 11648000, "step": 19110 }, { "epoch": 5.930809804529941, "grad_norm": 0.06560371816158295, "learning_rate": 8.868924926629133e-06, "loss": 0.2321, "num_input_tokens_seen": 11650656, "step": 19115 }, { "epoch": 5.932361154204157, "grad_norm": 0.09342917799949646, "learning_rate": 8.86806721740411e-06, "loss": 0.2291, "num_input_tokens_seen": 11653856, "step": 19120 }, { "epoch": 5.933912503878374, "grad_norm": 0.055237434804439545, "learning_rate": 8.867209224603954e-06, "loss": 0.2408, "num_input_tokens_seen": 11655936, "step": 19125 }, { "epoch": 5.935463853552591, "grad_norm": 0.09109175950288773, "learning_rate": 8.866350948291569e-06, "loss": 0.2376, "num_input_tokens_seen": 11660416, "step": 19130 }, { "epoch": 5.9370152032268075, "grad_norm": 0.041505057364702225, "learning_rate": 8.865492388529874e-06, "loss": 0.2354, "num_input_tokens_seen": 11663392, "step": 19135 }, { "epoch": 5.938566552901024, "grad_norm": 0.17432211339473724, "learning_rate": 8.864633545381816e-06, "loss": 0.2264, "num_input_tokens_seen": 11667264, "step": 19140 }, { "epoch": 5.94011790257524, "grad_norm": 0.08850950002670288, "learning_rate": 8.863774418910351e-06, "loss": 0.2269, "num_input_tokens_seen": 11670272, "step": 19145 }, { "epoch": 5.941669252249457, "grad_norm": 0.03677220270037651, "learning_rate": 8.862915009178468e-06, "loss": 0.2283, "num_input_tokens_seen": 11673024, "step": 19150 }, { "epoch": 5.943220601923674, "grad_norm": 0.09399323165416718, "learning_rate": 8.862055316249171e-06, "loss": 0.2299, "num_input_tokens_seen": 11676128, "step": 19155 }, { "epoch": 5.9447719515978905, "grad_norm": 0.10108764469623566, "learning_rate": 8.861195340185488e-06, "loss": 0.2299, "num_input_tokens_seen": 11679776, "step": 19160 }, { "epoch": 5.946323301272106, "grad_norm": 0.04249637573957443, "learning_rate": 8.860335081050461e-06, "loss": 0.2325, "num_input_tokens_seen": 11682848, "step": 19165 }, { "epoch": 5.947874650946323, "grad_norm": 0.08772658556699753, "learning_rate": 8.85947453890716e-06, "loss": 0.2309, "num_input_tokens_seen": 11685792, "step": 19170 }, { "epoch": 5.94942600062054, "grad_norm": 0.18772518634796143, "learning_rate": 8.858613713818674e-06, "loss": 0.2356, "num_input_tokens_seen": 11690592, "step": 19175 }, { "epoch": 5.950977350294757, "grad_norm": 0.09531472623348236, "learning_rate": 8.857752605848107e-06, "loss": 0.2284, "num_input_tokens_seen": 11693696, "step": 19180 }, { "epoch": 5.952528699968973, "grad_norm": 0.03394763544201851, "learning_rate": 8.856891215058593e-06, "loss": 0.231, "num_input_tokens_seen": 11696544, "step": 19185 }, { "epoch": 5.954080049643189, "grad_norm": 0.18239633738994598, "learning_rate": 8.85602954151328e-06, "loss": 0.2268, "num_input_tokens_seen": 11700096, "step": 19190 }, { "epoch": 5.955631399317406, "grad_norm": 0.11747805029153824, "learning_rate": 8.855167585275339e-06, "loss": 0.2301, "num_input_tokens_seen": 11704704, "step": 19195 }, { "epoch": 5.957182748991623, "grad_norm": 0.05374923348426819, "learning_rate": 8.854305346407964e-06, "loss": 0.2365, "num_input_tokens_seen": 11707680, "step": 19200 }, { "epoch": 5.95873409866584, "grad_norm": 0.10150772333145142, "learning_rate": 8.853442824974363e-06, "loss": 0.2322, "num_input_tokens_seen": 11709824, "step": 19205 }, { "epoch": 5.960285448340056, "grad_norm": 0.09119195491075516, "learning_rate": 8.852580021037772e-06, "loss": 0.2342, "num_input_tokens_seen": 11712448, "step": 19210 }, { "epoch": 5.961836798014272, "grad_norm": 0.10306989401578903, "learning_rate": 8.851716934661442e-06, "loss": 0.2295, "num_input_tokens_seen": 11716768, "step": 19215 }, { "epoch": 5.963388147688489, "grad_norm": 0.1793074905872345, "learning_rate": 8.850853565908653e-06, "loss": 0.2218, "num_input_tokens_seen": 11720448, "step": 19220 }, { "epoch": 5.964939497362706, "grad_norm": 0.05677447095513344, "learning_rate": 8.849989914842697e-06, "loss": 0.2273, "num_input_tokens_seen": 11723936, "step": 19225 }, { "epoch": 5.966490847036922, "grad_norm": 0.11695876717567444, "learning_rate": 8.849125981526886e-06, "loss": 0.2341, "num_input_tokens_seen": 11726784, "step": 19230 }, { "epoch": 5.968042196711139, "grad_norm": 0.03769970312714577, "learning_rate": 8.848261766024562e-06, "loss": 0.2305, "num_input_tokens_seen": 11729440, "step": 19235 }, { "epoch": 5.969593546385355, "grad_norm": 0.0669281855225563, "learning_rate": 8.84739726839908e-06, "loss": 0.2362, "num_input_tokens_seen": 11732448, "step": 19240 }, { "epoch": 5.971144896059572, "grad_norm": 0.16974540054798126, "learning_rate": 8.84653248871382e-06, "loss": 0.2289, "num_input_tokens_seen": 11735264, "step": 19245 }, { "epoch": 5.972696245733788, "grad_norm": 0.11638811230659485, "learning_rate": 8.845667427032176e-06, "loss": 0.2347, "num_input_tokens_seen": 11740032, "step": 19250 }, { "epoch": 5.974247595408005, "grad_norm": 0.037591446191072464, "learning_rate": 8.844802083417572e-06, "loss": 0.231, "num_input_tokens_seen": 11742912, "step": 19255 }, { "epoch": 5.9757989450822215, "grad_norm": 0.10126831382513046, "learning_rate": 8.843936457933447e-06, "loss": 0.2263, "num_input_tokens_seen": 11746304, "step": 19260 }, { "epoch": 5.977350294756438, "grad_norm": 0.1061214730143547, "learning_rate": 8.843070550643257e-06, "loss": 0.229, "num_input_tokens_seen": 11748768, "step": 19265 }, { "epoch": 5.978901644430655, "grad_norm": 0.11752421408891678, "learning_rate": 8.842204361610489e-06, "loss": 0.2347, "num_input_tokens_seen": 11752352, "step": 19270 }, { "epoch": 5.980452994104871, "grad_norm": 0.0924193486571312, "learning_rate": 8.841337890898643e-06, "loss": 0.2301, "num_input_tokens_seen": 11755136, "step": 19275 }, { "epoch": 5.982004343779088, "grad_norm": 0.09717623144388199, "learning_rate": 8.840471138571241e-06, "loss": 0.2272, "num_input_tokens_seen": 11758144, "step": 19280 }, { "epoch": 5.9835556934533045, "grad_norm": 0.08440200239419937, "learning_rate": 8.839604104691827e-06, "loss": 0.2331, "num_input_tokens_seen": 11761344, "step": 19285 }, { "epoch": 5.985107043127521, "grad_norm": 0.050900980830192566, "learning_rate": 8.838736789323963e-06, "loss": 0.2342, "num_input_tokens_seen": 11764448, "step": 19290 }, { "epoch": 5.986658392801737, "grad_norm": 0.08575527369976044, "learning_rate": 8.837869192531237e-06, "loss": 0.2243, "num_input_tokens_seen": 11766848, "step": 19295 }, { "epoch": 5.988209742475954, "grad_norm": 0.18381142616271973, "learning_rate": 8.837001314377252e-06, "loss": 0.227, "num_input_tokens_seen": 11769280, "step": 19300 }, { "epoch": 5.989761092150171, "grad_norm": 0.1128382459282875, "learning_rate": 8.836133154925633e-06, "loss": 0.2298, "num_input_tokens_seen": 11772704, "step": 19305 }, { "epoch": 5.9913124418243875, "grad_norm": 0.10494478791952133, "learning_rate": 8.835264714240026e-06, "loss": 0.2211, "num_input_tokens_seen": 11776992, "step": 19310 }, { "epoch": 5.992863791498603, "grad_norm": 0.05692628398537636, "learning_rate": 8.8343959923841e-06, "loss": 0.2372, "num_input_tokens_seen": 11779456, "step": 19315 }, { "epoch": 5.99441514117282, "grad_norm": 0.13921509683132172, "learning_rate": 8.833526989421541e-06, "loss": 0.2358, "num_input_tokens_seen": 11781760, "step": 19320 }, { "epoch": 5.995966490847037, "grad_norm": 0.12388968467712402, "learning_rate": 8.832657705416059e-06, "loss": 0.2375, "num_input_tokens_seen": 11784352, "step": 19325 }, { "epoch": 5.997517840521254, "grad_norm": 0.05621572211384773, "learning_rate": 8.83178814043138e-06, "loss": 0.2271, "num_input_tokens_seen": 11787136, "step": 19330 }, { "epoch": 5.9990691901954705, "grad_norm": 0.07832647860050201, "learning_rate": 8.830918294531258e-06, "loss": 0.2256, "num_input_tokens_seen": 11791424, "step": 19335 }, { "epoch": 6.0, "eval_loss": 0.23222728073596954, "eval_runtime": 34.3969, "eval_samples_per_second": 93.7, "eval_steps_per_second": 23.432, "num_input_tokens_seen": 11793056, "step": 19338 }, { "epoch": 6.000620539869686, "grad_norm": 0.08324151486158371, "learning_rate": 8.83004816777946e-06, "loss": 0.2356, "num_input_tokens_seen": 11794272, "step": 19340 }, { "epoch": 6.002171889543903, "grad_norm": 0.06290758401155472, "learning_rate": 8.829177760239774e-06, "loss": 0.235, "num_input_tokens_seen": 11797536, "step": 19345 }, { "epoch": 6.00372323921812, "grad_norm": 0.10454072058200836, "learning_rate": 8.828307071976015e-06, "loss": 0.2337, "num_input_tokens_seen": 11800384, "step": 19350 }, { "epoch": 6.005274588892337, "grad_norm": 0.07285362482070923, "learning_rate": 8.827436103052013e-06, "loss": 0.2258, "num_input_tokens_seen": 11803520, "step": 19355 }, { "epoch": 6.006825938566553, "grad_norm": 0.12244010716676712, "learning_rate": 8.826564853531622e-06, "loss": 0.2304, "num_input_tokens_seen": 11806048, "step": 19360 }, { "epoch": 6.008377288240769, "grad_norm": 0.06704964488744736, "learning_rate": 8.825693323478716e-06, "loss": 0.233, "num_input_tokens_seen": 11809280, "step": 19365 }, { "epoch": 6.009928637914986, "grad_norm": 0.2295903116464615, "learning_rate": 8.824821512957184e-06, "loss": 0.2305, "num_input_tokens_seen": 11812576, "step": 19370 }, { "epoch": 6.011479987589203, "grad_norm": 0.23870211839675903, "learning_rate": 8.823949422030943e-06, "loss": 0.2337, "num_input_tokens_seen": 11815552, "step": 19375 }, { "epoch": 6.013031337263419, "grad_norm": 0.09914165735244751, "learning_rate": 8.823077050763927e-06, "loss": 0.227, "num_input_tokens_seen": 11818784, "step": 19380 }, { "epoch": 6.0145826869376355, "grad_norm": 0.22376899421215057, "learning_rate": 8.822204399220092e-06, "loss": 0.2213, "num_input_tokens_seen": 11821216, "step": 19385 }, { "epoch": 6.016134036611852, "grad_norm": 0.38281014561653137, "learning_rate": 8.82133146746341e-06, "loss": 0.2306, "num_input_tokens_seen": 11824032, "step": 19390 }, { "epoch": 6.017685386286069, "grad_norm": 0.23785820603370667, "learning_rate": 8.820458255557882e-06, "loss": 0.2396, "num_input_tokens_seen": 11826912, "step": 19395 }, { "epoch": 6.019236735960286, "grad_norm": 0.17535698413848877, "learning_rate": 8.819584763567524e-06, "loss": 0.2195, "num_input_tokens_seen": 11830144, "step": 19400 }, { "epoch": 6.020788085634502, "grad_norm": 0.15322108566761017, "learning_rate": 8.81871099155637e-06, "loss": 0.2272, "num_input_tokens_seen": 11833344, "step": 19405 }, { "epoch": 6.0223394353087185, "grad_norm": 0.3869730532169342, "learning_rate": 8.81783693958848e-06, "loss": 0.2088, "num_input_tokens_seen": 11835808, "step": 19410 }, { "epoch": 6.023890784982935, "grad_norm": 0.5621849894523621, "learning_rate": 8.816962607727935e-06, "loss": 0.2374, "num_input_tokens_seen": 11838944, "step": 19415 }, { "epoch": 6.025442134657152, "grad_norm": 0.3127553462982178, "learning_rate": 8.816087996038829e-06, "loss": 0.238, "num_input_tokens_seen": 11842112, "step": 19420 }, { "epoch": 6.026993484331368, "grad_norm": 0.26406437158584595, "learning_rate": 8.815213104585285e-06, "loss": 0.2295, "num_input_tokens_seen": 11845088, "step": 19425 }, { "epoch": 6.028544834005585, "grad_norm": 0.19524990022182465, "learning_rate": 8.81433793343144e-06, "loss": 0.2318, "num_input_tokens_seen": 11848416, "step": 19430 }, { "epoch": 6.0300961836798015, "grad_norm": 0.2698829770088196, "learning_rate": 8.813462482641458e-06, "loss": 0.2283, "num_input_tokens_seen": 11852512, "step": 19435 }, { "epoch": 6.031647533354018, "grad_norm": 0.15783388912677765, "learning_rate": 8.812586752279516e-06, "loss": 0.2237, "num_input_tokens_seen": 11855296, "step": 19440 }, { "epoch": 6.033198883028234, "grad_norm": 0.17930816113948822, "learning_rate": 8.811710742409817e-06, "loss": 0.2301, "num_input_tokens_seen": 11857632, "step": 19445 }, { "epoch": 6.034750232702451, "grad_norm": 0.3417227864265442, "learning_rate": 8.810834453096585e-06, "loss": 0.2423, "num_input_tokens_seen": 11860352, "step": 19450 }, { "epoch": 6.036301582376668, "grad_norm": 0.1222798302769661, "learning_rate": 8.80995788440406e-06, "loss": 0.23, "num_input_tokens_seen": 11862976, "step": 19455 }, { "epoch": 6.0378529320508845, "grad_norm": 0.1207592636346817, "learning_rate": 8.809081036396506e-06, "loss": 0.2238, "num_input_tokens_seen": 11866752, "step": 19460 }, { "epoch": 6.039404281725101, "grad_norm": 0.2159477174282074, "learning_rate": 8.808203909138204e-06, "loss": 0.2351, "num_input_tokens_seen": 11869504, "step": 19465 }, { "epoch": 6.040955631399317, "grad_norm": 0.058371610939502716, "learning_rate": 8.80732650269346e-06, "loss": 0.2345, "num_input_tokens_seen": 11872224, "step": 19470 }, { "epoch": 6.042506981073534, "grad_norm": 0.11907514929771423, "learning_rate": 8.806448817126598e-06, "loss": 0.225, "num_input_tokens_seen": 11875264, "step": 19475 }, { "epoch": 6.044058330747751, "grad_norm": 0.08118309080600739, "learning_rate": 8.805570852501963e-06, "loss": 0.2309, "num_input_tokens_seen": 11877728, "step": 19480 }, { "epoch": 6.0456096804219674, "grad_norm": 0.23444317281246185, "learning_rate": 8.80469260888392e-06, "loss": 0.2333, "num_input_tokens_seen": 11881248, "step": 19485 }, { "epoch": 6.047161030096183, "grad_norm": 0.11041190475225449, "learning_rate": 8.803814086336856e-06, "loss": 0.23, "num_input_tokens_seen": 11883840, "step": 19490 }, { "epoch": 6.0487123797704, "grad_norm": 0.1222803145647049, "learning_rate": 8.802935284925174e-06, "loss": 0.2312, "num_input_tokens_seen": 11887328, "step": 19495 }, { "epoch": 6.050263729444617, "grad_norm": 0.07912945002317429, "learning_rate": 8.8020562047133e-06, "loss": 0.2317, "num_input_tokens_seen": 11889728, "step": 19500 }, { "epoch": 6.051815079118834, "grad_norm": 0.07652467489242554, "learning_rate": 8.801176845765687e-06, "loss": 0.2321, "num_input_tokens_seen": 11892512, "step": 19505 }, { "epoch": 6.0533664287930495, "grad_norm": 0.11297664791345596, "learning_rate": 8.800297208146794e-06, "loss": 0.2197, "num_input_tokens_seen": 11895680, "step": 19510 }, { "epoch": 6.054917778467266, "grad_norm": 0.16526758670806885, "learning_rate": 8.799417291921117e-06, "loss": 0.2306, "num_input_tokens_seen": 11898464, "step": 19515 }, { "epoch": 6.056469128141483, "grad_norm": 0.30633556842803955, "learning_rate": 8.79853709715316e-06, "loss": 0.2326, "num_input_tokens_seen": 11901376, "step": 19520 }, { "epoch": 6.0580204778157, "grad_norm": 0.20033948123455048, "learning_rate": 8.797656623907452e-06, "loss": 0.2499, "num_input_tokens_seen": 11904512, "step": 19525 }, { "epoch": 6.059571827489917, "grad_norm": 0.15180398523807526, "learning_rate": 8.796775872248542e-06, "loss": 0.2376, "num_input_tokens_seen": 11908288, "step": 19530 }, { "epoch": 6.0611231771641325, "grad_norm": 0.10598091036081314, "learning_rate": 8.795894842241003e-06, "loss": 0.2303, "num_input_tokens_seen": 11911520, "step": 19535 }, { "epoch": 6.062674526838349, "grad_norm": 0.2071194052696228, "learning_rate": 8.795013533949418e-06, "loss": 0.2324, "num_input_tokens_seen": 11914752, "step": 19540 }, { "epoch": 6.064225876512566, "grad_norm": 0.05436538904905319, "learning_rate": 8.794131947438402e-06, "loss": 0.2317, "num_input_tokens_seen": 11917216, "step": 19545 }, { "epoch": 6.065777226186783, "grad_norm": 0.1464253067970276, "learning_rate": 8.793250082772586e-06, "loss": 0.2315, "num_input_tokens_seen": 11919424, "step": 19550 }, { "epoch": 6.067328575860999, "grad_norm": 0.07910898327827454, "learning_rate": 8.79236794001662e-06, "loss": 0.2298, "num_input_tokens_seen": 11922368, "step": 19555 }, { "epoch": 6.0688799255352155, "grad_norm": 0.06541143357753754, "learning_rate": 8.791485519235175e-06, "loss": 0.2315, "num_input_tokens_seen": 11925312, "step": 19560 }, { "epoch": 6.070431275209432, "grad_norm": 0.10901061445474625, "learning_rate": 8.790602820492944e-06, "loss": 0.2287, "num_input_tokens_seen": 11928000, "step": 19565 }, { "epoch": 6.071982624883649, "grad_norm": 0.07171529531478882, "learning_rate": 8.78971984385464e-06, "loss": 0.2418, "num_input_tokens_seen": 11932064, "step": 19570 }, { "epoch": 6.073533974557865, "grad_norm": 0.2426697015762329, "learning_rate": 8.788836589384993e-06, "loss": 0.2383, "num_input_tokens_seen": 11934880, "step": 19575 }, { "epoch": 6.075085324232082, "grad_norm": 0.05861034616827965, "learning_rate": 8.787953057148757e-06, "loss": 0.2244, "num_input_tokens_seen": 11938240, "step": 19580 }, { "epoch": 6.0766366739062985, "grad_norm": 0.13879512250423431, "learning_rate": 8.787069247210707e-06, "loss": 0.2233, "num_input_tokens_seen": 11941248, "step": 19585 }, { "epoch": 6.078188023580515, "grad_norm": 0.07410913705825806, "learning_rate": 8.786185159635635e-06, "loss": 0.2336, "num_input_tokens_seen": 11943904, "step": 19590 }, { "epoch": 6.079739373254732, "grad_norm": 0.11751975864171982, "learning_rate": 8.785300794488357e-06, "loss": 0.2291, "num_input_tokens_seen": 11946560, "step": 19595 }, { "epoch": 6.081290722928948, "grad_norm": 0.08116522431373596, "learning_rate": 8.784416151833704e-06, "loss": 0.2285, "num_input_tokens_seen": 11949760, "step": 19600 }, { "epoch": 6.082842072603165, "grad_norm": 0.1216086894273758, "learning_rate": 8.783531231736533e-06, "loss": 0.227, "num_input_tokens_seen": 11952544, "step": 19605 }, { "epoch": 6.0843934222773814, "grad_norm": 0.12385541945695877, "learning_rate": 8.78264603426172e-06, "loss": 0.229, "num_input_tokens_seen": 11955776, "step": 19610 }, { "epoch": 6.085944771951598, "grad_norm": 0.15860718488693237, "learning_rate": 8.781760559474159e-06, "loss": 0.2384, "num_input_tokens_seen": 11958528, "step": 19615 }, { "epoch": 6.087496121625814, "grad_norm": 0.10974615067243576, "learning_rate": 8.780874807438768e-06, "loss": 0.2257, "num_input_tokens_seen": 11962112, "step": 19620 }, { "epoch": 6.089047471300031, "grad_norm": 0.10064373165369034, "learning_rate": 8.779988778220478e-06, "loss": 0.2263, "num_input_tokens_seen": 11965376, "step": 19625 }, { "epoch": 6.090598820974248, "grad_norm": 0.1273561716079712, "learning_rate": 8.779102471884253e-06, "loss": 0.2421, "num_input_tokens_seen": 11968224, "step": 19630 }, { "epoch": 6.092150170648464, "grad_norm": 0.06981692463159561, "learning_rate": 8.778215888495064e-06, "loss": 0.2294, "num_input_tokens_seen": 11970976, "step": 19635 }, { "epoch": 6.09370152032268, "grad_norm": 0.12440371513366699, "learning_rate": 8.777329028117907e-06, "loss": 0.2316, "num_input_tokens_seen": 11974464, "step": 19640 }, { "epoch": 6.095252869996897, "grad_norm": 0.1310935765504837, "learning_rate": 8.776441890817803e-06, "loss": 0.2332, "num_input_tokens_seen": 11977248, "step": 19645 }, { "epoch": 6.096804219671114, "grad_norm": 0.04467582330107689, "learning_rate": 8.77555447665979e-06, "loss": 0.2314, "num_input_tokens_seen": 11979936, "step": 19650 }, { "epoch": 6.098355569345331, "grad_norm": 0.20841756463050842, "learning_rate": 8.774666785708924e-06, "loss": 0.23, "num_input_tokens_seen": 11982880, "step": 19655 }, { "epoch": 6.099906919019547, "grad_norm": 0.1418457329273224, "learning_rate": 8.773778818030284e-06, "loss": 0.23, "num_input_tokens_seen": 11986560, "step": 19660 }, { "epoch": 6.101458268693763, "grad_norm": 0.07157142460346222, "learning_rate": 8.772890573688967e-06, "loss": 0.2271, "num_input_tokens_seen": 11989568, "step": 19665 }, { "epoch": 6.10300961836798, "grad_norm": 0.20321571826934814, "learning_rate": 8.772002052750095e-06, "loss": 0.2331, "num_input_tokens_seen": 11992320, "step": 19670 }, { "epoch": 6.104560968042197, "grad_norm": 0.13329608738422394, "learning_rate": 8.771113255278805e-06, "loss": 0.2266, "num_input_tokens_seen": 11995456, "step": 19675 }, { "epoch": 6.106112317716414, "grad_norm": 0.139851376414299, "learning_rate": 8.770224181340257e-06, "loss": 0.2344, "num_input_tokens_seen": 11998592, "step": 19680 }, { "epoch": 6.1076636673906295, "grad_norm": 0.07374460995197296, "learning_rate": 8.76933483099963e-06, "loss": 0.2364, "num_input_tokens_seen": 12001856, "step": 19685 }, { "epoch": 6.109215017064846, "grad_norm": 0.12705479562282562, "learning_rate": 8.768445204322125e-06, "loss": 0.2258, "num_input_tokens_seen": 12004384, "step": 19690 }, { "epoch": 6.110766366739063, "grad_norm": 0.11122804135084152, "learning_rate": 8.767555301372961e-06, "loss": 0.2344, "num_input_tokens_seen": 12007552, "step": 19695 }, { "epoch": 6.11231771641328, "grad_norm": 0.14056147634983063, "learning_rate": 8.766665122217379e-06, "loss": 0.2298, "num_input_tokens_seen": 12011200, "step": 19700 }, { "epoch": 6.113869066087496, "grad_norm": 0.08073665201663971, "learning_rate": 8.76577466692064e-06, "loss": 0.2308, "num_input_tokens_seen": 12013248, "step": 19705 }, { "epoch": 6.1154204157617125, "grad_norm": 0.12716233730316162, "learning_rate": 8.764883935548022e-06, "loss": 0.233, "num_input_tokens_seen": 12015808, "step": 19710 }, { "epoch": 6.116971765435929, "grad_norm": 0.20717152953147888, "learning_rate": 8.76399292816483e-06, "loss": 0.2363, "num_input_tokens_seen": 12019136, "step": 19715 }, { "epoch": 6.118523115110146, "grad_norm": 0.06340672820806503, "learning_rate": 8.763101644836385e-06, "loss": 0.2312, "num_input_tokens_seen": 12021632, "step": 19720 }, { "epoch": 6.120074464784363, "grad_norm": 0.1346171349287033, "learning_rate": 8.762210085628027e-06, "loss": 0.236, "num_input_tokens_seen": 12025440, "step": 19725 }, { "epoch": 6.121625814458579, "grad_norm": 0.09439484775066376, "learning_rate": 8.761318250605117e-06, "loss": 0.2268, "num_input_tokens_seen": 12028000, "step": 19730 }, { "epoch": 6.1231771641327954, "grad_norm": 0.06749013066291809, "learning_rate": 8.76042613983304e-06, "loss": 0.2286, "num_input_tokens_seen": 12030688, "step": 19735 }, { "epoch": 6.124728513807012, "grad_norm": 0.10718269646167755, "learning_rate": 8.759533753377199e-06, "loss": 0.2277, "num_input_tokens_seen": 12034880, "step": 19740 }, { "epoch": 6.126279863481229, "grad_norm": 0.0664016455411911, "learning_rate": 8.758641091303011e-06, "loss": 0.2393, "num_input_tokens_seen": 12038848, "step": 19745 }, { "epoch": 6.127831213155445, "grad_norm": 0.12373179197311401, "learning_rate": 8.757748153675923e-06, "loss": 0.2281, "num_input_tokens_seen": 12041024, "step": 19750 }, { "epoch": 6.129382562829662, "grad_norm": 0.129908487200737, "learning_rate": 8.756854940561397e-06, "loss": 0.2333, "num_input_tokens_seen": 12044000, "step": 19755 }, { "epoch": 6.130933912503878, "grad_norm": 0.12983214855194092, "learning_rate": 8.755961452024916e-06, "loss": 0.2315, "num_input_tokens_seen": 12047168, "step": 19760 }, { "epoch": 6.132485262178095, "grad_norm": 0.12491030246019363, "learning_rate": 8.755067688131984e-06, "loss": 0.2343, "num_input_tokens_seen": 12049600, "step": 19765 }, { "epoch": 6.134036611852311, "grad_norm": 0.10260995477437973, "learning_rate": 8.754173648948124e-06, "loss": 0.2308, "num_input_tokens_seen": 12052736, "step": 19770 }, { "epoch": 6.135587961526528, "grad_norm": 0.1115926131606102, "learning_rate": 8.753279334538878e-06, "loss": 0.2316, "num_input_tokens_seen": 12055424, "step": 19775 }, { "epoch": 6.137139311200745, "grad_norm": 0.107362300157547, "learning_rate": 8.752384744969813e-06, "loss": 0.2309, "num_input_tokens_seen": 12058624, "step": 19780 }, { "epoch": 6.138690660874961, "grad_norm": 0.11957104504108429, "learning_rate": 8.75148988030651e-06, "loss": 0.2267, "num_input_tokens_seen": 12061824, "step": 19785 }, { "epoch": 6.140242010549178, "grad_norm": 0.10866943001747131, "learning_rate": 8.750594740614575e-06, "loss": 0.2266, "num_input_tokens_seen": 12064576, "step": 19790 }, { "epoch": 6.141793360223394, "grad_norm": 0.06935630738735199, "learning_rate": 8.749699325959631e-06, "loss": 0.2329, "num_input_tokens_seen": 12067168, "step": 19795 }, { "epoch": 6.143344709897611, "grad_norm": 0.11200358718633652, "learning_rate": 8.748803636407324e-06, "loss": 0.2324, "num_input_tokens_seen": 12069792, "step": 19800 }, { "epoch": 6.144896059571828, "grad_norm": 0.12288028001785278, "learning_rate": 8.747907672023318e-06, "loss": 0.23, "num_input_tokens_seen": 12072448, "step": 19805 }, { "epoch": 6.146447409246044, "grad_norm": 0.14686031639575958, "learning_rate": 8.747011432873299e-06, "loss": 0.2296, "num_input_tokens_seen": 12075296, "step": 19810 }, { "epoch": 6.14799875892026, "grad_norm": 0.12455692887306213, "learning_rate": 8.746114919022968e-06, "loss": 0.2347, "num_input_tokens_seen": 12078208, "step": 19815 }, { "epoch": 6.149550108594477, "grad_norm": 0.20821158587932587, "learning_rate": 8.745218130538055e-06, "loss": 0.2367, "num_input_tokens_seen": 12081344, "step": 19820 }, { "epoch": 6.151101458268694, "grad_norm": 0.1109846755862236, "learning_rate": 8.744321067484302e-06, "loss": 0.2296, "num_input_tokens_seen": 12084608, "step": 19825 }, { "epoch": 6.152652807942911, "grad_norm": 0.12484687566757202, "learning_rate": 8.743423729927476e-06, "loss": 0.232, "num_input_tokens_seen": 12088160, "step": 19830 }, { "epoch": 6.1542041576171265, "grad_norm": 0.06197091192007065, "learning_rate": 8.74252611793336e-06, "loss": 0.231, "num_input_tokens_seen": 12091520, "step": 19835 }, { "epoch": 6.155755507291343, "grad_norm": 0.0749831423163414, "learning_rate": 8.741628231567765e-06, "loss": 0.2316, "num_input_tokens_seen": 12094208, "step": 19840 }, { "epoch": 6.15730685696556, "grad_norm": 0.06288913637399673, "learning_rate": 8.74073007089651e-06, "loss": 0.2289, "num_input_tokens_seen": 12096704, "step": 19845 }, { "epoch": 6.158858206639777, "grad_norm": 0.049396585673093796, "learning_rate": 8.739831635985444e-06, "loss": 0.2262, "num_input_tokens_seen": 12099456, "step": 19850 }, { "epoch": 6.160409556313994, "grad_norm": 0.050532806664705276, "learning_rate": 8.738932926900434e-06, "loss": 0.2325, "num_input_tokens_seen": 12103008, "step": 19855 }, { "epoch": 6.1619609059882094, "grad_norm": 0.11863228678703308, "learning_rate": 8.738033943707364e-06, "loss": 0.2285, "num_input_tokens_seen": 12106528, "step": 19860 }, { "epoch": 6.163512255662426, "grad_norm": 0.1232987493276596, "learning_rate": 8.737134686472144e-06, "loss": 0.2299, "num_input_tokens_seen": 12108960, "step": 19865 }, { "epoch": 6.165063605336643, "grad_norm": 0.25618836283683777, "learning_rate": 8.736235155260696e-06, "loss": 0.2435, "num_input_tokens_seen": 12111936, "step": 19870 }, { "epoch": 6.16661495501086, "grad_norm": 0.10375833511352539, "learning_rate": 8.735335350138965e-06, "loss": 0.2326, "num_input_tokens_seen": 12114464, "step": 19875 }, { "epoch": 6.168166304685076, "grad_norm": 0.17804354429244995, "learning_rate": 8.73443527117292e-06, "loss": 0.2386, "num_input_tokens_seen": 12117536, "step": 19880 }, { "epoch": 6.169717654359292, "grad_norm": 0.18779250979423523, "learning_rate": 8.73353491842855e-06, "loss": 0.2283, "num_input_tokens_seen": 12124416, "step": 19885 }, { "epoch": 6.171269004033509, "grad_norm": 0.1057521253824234, "learning_rate": 8.732634291971857e-06, "loss": 0.2273, "num_input_tokens_seen": 12128256, "step": 19890 }, { "epoch": 6.172820353707726, "grad_norm": 0.059363529086112976, "learning_rate": 8.731733391868868e-06, "loss": 0.2263, "num_input_tokens_seen": 12132320, "step": 19895 }, { "epoch": 6.174371703381942, "grad_norm": 0.11936051398515701, "learning_rate": 8.730832218185635e-06, "loss": 0.2273, "num_input_tokens_seen": 12135680, "step": 19900 }, { "epoch": 6.175923053056159, "grad_norm": 0.0777687057852745, "learning_rate": 8.729930770988218e-06, "loss": 0.2357, "num_input_tokens_seen": 12138400, "step": 19905 }, { "epoch": 6.177474402730375, "grad_norm": 0.05429534241557121, "learning_rate": 8.729029050342705e-06, "loss": 0.2337, "num_input_tokens_seen": 12142016, "step": 19910 }, { "epoch": 6.179025752404592, "grad_norm": 0.06424840539693832, "learning_rate": 8.728127056315208e-06, "loss": 0.234, "num_input_tokens_seen": 12145184, "step": 19915 }, { "epoch": 6.180577102078809, "grad_norm": 0.0675400123000145, "learning_rate": 8.727224788971848e-06, "loss": 0.23, "num_input_tokens_seen": 12147968, "step": 19920 }, { "epoch": 6.182128451753025, "grad_norm": 0.0708901584148407, "learning_rate": 8.726322248378775e-06, "loss": 0.2309, "num_input_tokens_seen": 12152320, "step": 19925 }, { "epoch": 6.183679801427242, "grad_norm": 0.07423850893974304, "learning_rate": 8.725419434602154e-06, "loss": 0.228, "num_input_tokens_seen": 12155040, "step": 19930 }, { "epoch": 6.185231151101458, "grad_norm": 0.10251504182815552, "learning_rate": 8.724516347708173e-06, "loss": 0.2259, "num_input_tokens_seen": 12158592, "step": 19935 }, { "epoch": 6.186782500775675, "grad_norm": 0.2061012089252472, "learning_rate": 8.72361298776304e-06, "loss": 0.2336, "num_input_tokens_seen": 12161024, "step": 19940 }, { "epoch": 6.188333850449891, "grad_norm": 0.1231033131480217, "learning_rate": 8.72270935483298e-06, "loss": 0.2334, "num_input_tokens_seen": 12164576, "step": 19945 }, { "epoch": 6.189885200124108, "grad_norm": 0.09889914095401764, "learning_rate": 8.721805448984239e-06, "loss": 0.2276, "num_input_tokens_seen": 12167072, "step": 19950 }, { "epoch": 6.191436549798325, "grad_norm": 0.11051309108734131, "learning_rate": 8.720901270283088e-06, "loss": 0.2415, "num_input_tokens_seen": 12170368, "step": 19955 }, { "epoch": 6.192987899472541, "grad_norm": 0.10369029641151428, "learning_rate": 8.719996818795812e-06, "loss": 0.2305, "num_input_tokens_seen": 12172544, "step": 19960 }, { "epoch": 6.194539249146757, "grad_norm": 0.0902552604675293, "learning_rate": 8.719092094588717e-06, "loss": 0.2311, "num_input_tokens_seen": 12175136, "step": 19965 }, { "epoch": 6.196090598820974, "grad_norm": 0.2114861160516739, "learning_rate": 8.71818709772813e-06, "loss": 0.231, "num_input_tokens_seen": 12178464, "step": 19970 }, { "epoch": 6.197641948495191, "grad_norm": 0.11518679559230804, "learning_rate": 8.7172818282804e-06, "loss": 0.2268, "num_input_tokens_seen": 12180768, "step": 19975 }, { "epoch": 6.199193298169408, "grad_norm": 0.13899539411067963, "learning_rate": 8.716376286311894e-06, "loss": 0.233, "num_input_tokens_seen": 12183648, "step": 19980 }, { "epoch": 6.200744647843624, "grad_norm": 0.10567127913236618, "learning_rate": 8.715470471888997e-06, "loss": 0.2293, "num_input_tokens_seen": 12187072, "step": 19985 }, { "epoch": 6.20229599751784, "grad_norm": 0.08603249490261078, "learning_rate": 8.714564385078114e-06, "loss": 0.2321, "num_input_tokens_seen": 12189664, "step": 19990 }, { "epoch": 6.203847347192057, "grad_norm": 0.14160451292991638, "learning_rate": 8.713658025945677e-06, "loss": 0.2326, "num_input_tokens_seen": 12192288, "step": 19995 }, { "epoch": 6.205398696866274, "grad_norm": 0.12989699840545654, "learning_rate": 8.71275139455813e-06, "loss": 0.229, "num_input_tokens_seen": 12195200, "step": 20000 }, { "epoch": 6.2069500465404905, "grad_norm": 0.1913338601589203, "learning_rate": 8.71184449098194e-06, "loss": 0.2301, "num_input_tokens_seen": 12198720, "step": 20005 }, { "epoch": 6.208501396214706, "grad_norm": 0.1152951568365097, "learning_rate": 8.710937315283594e-06, "loss": 0.2279, "num_input_tokens_seen": 12201280, "step": 20010 }, { "epoch": 6.210052745888923, "grad_norm": 0.061920542269945145, "learning_rate": 8.710029867529599e-06, "loss": 0.2306, "num_input_tokens_seen": 12203936, "step": 20015 }, { "epoch": 6.21160409556314, "grad_norm": 0.1673251837491989, "learning_rate": 8.709122147786481e-06, "loss": 0.2388, "num_input_tokens_seen": 12206496, "step": 20020 }, { "epoch": 6.213155445237357, "grad_norm": 0.17249783873558044, "learning_rate": 8.708214156120789e-06, "loss": 0.2304, "num_input_tokens_seen": 12209984, "step": 20025 }, { "epoch": 6.214706794911573, "grad_norm": 0.1562661975622177, "learning_rate": 8.707305892599085e-06, "loss": 0.2275, "num_input_tokens_seen": 12213280, "step": 20030 }, { "epoch": 6.216258144585789, "grad_norm": 0.15578578412532806, "learning_rate": 8.706397357287959e-06, "loss": 0.2306, "num_input_tokens_seen": 12217152, "step": 20035 }, { "epoch": 6.217809494260006, "grad_norm": 0.14267149567604065, "learning_rate": 8.705488550254016e-06, "loss": 0.2261, "num_input_tokens_seen": 12220224, "step": 20040 }, { "epoch": 6.219360843934223, "grad_norm": 0.17105956375598907, "learning_rate": 8.704579471563883e-06, "loss": 0.2314, "num_input_tokens_seen": 12224320, "step": 20045 }, { "epoch": 6.22091219360844, "grad_norm": 0.13908067345619202, "learning_rate": 8.703670121284206e-06, "loss": 0.2297, "num_input_tokens_seen": 12227136, "step": 20050 }, { "epoch": 6.222463543282656, "grad_norm": 0.1281185895204544, "learning_rate": 8.70276049948165e-06, "loss": 0.2378, "num_input_tokens_seen": 12229888, "step": 20055 }, { "epoch": 6.224014892956872, "grad_norm": 0.2282012701034546, "learning_rate": 8.701850606222904e-06, "loss": 0.2326, "num_input_tokens_seen": 12232544, "step": 20060 }, { "epoch": 6.225566242631089, "grad_norm": 0.14661459624767303, "learning_rate": 8.70094044157467e-06, "loss": 0.2327, "num_input_tokens_seen": 12234976, "step": 20065 }, { "epoch": 6.227117592305306, "grad_norm": 0.0992877259850502, "learning_rate": 8.700030005603679e-06, "loss": 0.2292, "num_input_tokens_seen": 12237664, "step": 20070 }, { "epoch": 6.228668941979522, "grad_norm": 0.0894893929362297, "learning_rate": 8.699119298376671e-06, "loss": 0.2413, "num_input_tokens_seen": 12240736, "step": 20075 }, { "epoch": 6.230220291653739, "grad_norm": 0.1464666873216629, "learning_rate": 8.698208319960415e-06, "loss": 0.228, "num_input_tokens_seen": 12243776, "step": 20080 }, { "epoch": 6.231771641327955, "grad_norm": 0.12944458425045013, "learning_rate": 8.697297070421697e-06, "loss": 0.231, "num_input_tokens_seen": 12246560, "step": 20085 }, { "epoch": 6.233322991002172, "grad_norm": 0.188069149851799, "learning_rate": 8.69638554982732e-06, "loss": 0.2327, "num_input_tokens_seen": 12249472, "step": 20090 }, { "epoch": 6.234874340676388, "grad_norm": 0.06935783475637436, "learning_rate": 8.695473758244109e-06, "loss": 0.2339, "num_input_tokens_seen": 12253408, "step": 20095 }, { "epoch": 6.236425690350605, "grad_norm": 0.21071475744247437, "learning_rate": 8.694561695738912e-06, "loss": 0.2321, "num_input_tokens_seen": 12255968, "step": 20100 }, { "epoch": 6.237977040024822, "grad_norm": 0.05937771871685982, "learning_rate": 8.693649362378593e-06, "loss": 0.2336, "num_input_tokens_seen": 12258496, "step": 20105 }, { "epoch": 6.239528389699038, "grad_norm": 0.1324879229068756, "learning_rate": 8.692736758230038e-06, "loss": 0.2287, "num_input_tokens_seen": 12262528, "step": 20110 }, { "epoch": 6.241079739373255, "grad_norm": 0.1088290736079216, "learning_rate": 8.691823883360147e-06, "loss": 0.2273, "num_input_tokens_seen": 12265408, "step": 20115 }, { "epoch": 6.242631089047471, "grad_norm": 0.10397873818874359, "learning_rate": 8.690910737835849e-06, "loss": 0.2321, "num_input_tokens_seen": 12268288, "step": 20120 }, { "epoch": 6.244182438721688, "grad_norm": 0.10820268094539642, "learning_rate": 8.689997321724086e-06, "loss": 0.2306, "num_input_tokens_seen": 12272352, "step": 20125 }, { "epoch": 6.2457337883959045, "grad_norm": 0.12107755988836288, "learning_rate": 8.689083635091824e-06, "loss": 0.2268, "num_input_tokens_seen": 12274976, "step": 20130 }, { "epoch": 6.247285138070121, "grad_norm": 0.11860833317041397, "learning_rate": 8.688169678006047e-06, "loss": 0.233, "num_input_tokens_seen": 12277472, "step": 20135 }, { "epoch": 6.248836487744337, "grad_norm": 0.13458777964115143, "learning_rate": 8.687255450533758e-06, "loss": 0.2289, "num_input_tokens_seen": 12280768, "step": 20140 }, { "epoch": 6.250387837418554, "grad_norm": 0.20446619391441345, "learning_rate": 8.686340952741978e-06, "loss": 0.2277, "num_input_tokens_seen": 12283584, "step": 20145 }, { "epoch": 6.251939187092771, "grad_norm": 0.07752136141061783, "learning_rate": 8.685426184697756e-06, "loss": 0.231, "num_input_tokens_seen": 12286976, "step": 20150 }, { "epoch": 6.2534905367669875, "grad_norm": 0.21599549055099487, "learning_rate": 8.684511146468153e-06, "loss": 0.2268, "num_input_tokens_seen": 12290528, "step": 20155 }, { "epoch": 6.255041886441203, "grad_norm": 0.09695777297019958, "learning_rate": 8.68359583812025e-06, "loss": 0.224, "num_input_tokens_seen": 12293632, "step": 20160 }, { "epoch": 6.25659323611542, "grad_norm": 0.07914122194051743, "learning_rate": 8.682680259721155e-06, "loss": 0.2364, "num_input_tokens_seen": 12295744, "step": 20165 }, { "epoch": 6.258144585789637, "grad_norm": 0.06541362404823303, "learning_rate": 8.681764411337983e-06, "loss": 0.2369, "num_input_tokens_seen": 12299648, "step": 20170 }, { "epoch": 6.259695935463854, "grad_norm": 0.12701749801635742, "learning_rate": 8.680848293037885e-06, "loss": 0.2292, "num_input_tokens_seen": 12302368, "step": 20175 }, { "epoch": 6.2612472851380705, "grad_norm": 0.0704609602689743, "learning_rate": 8.679931904888018e-06, "loss": 0.233, "num_input_tokens_seen": 12305440, "step": 20180 }, { "epoch": 6.262798634812286, "grad_norm": 0.06973182410001755, "learning_rate": 8.679015246955565e-06, "loss": 0.2354, "num_input_tokens_seen": 12307968, "step": 20185 }, { "epoch": 6.264349984486503, "grad_norm": 0.11446983367204666, "learning_rate": 8.678098319307728e-06, "loss": 0.2414, "num_input_tokens_seen": 12311776, "step": 20190 }, { "epoch": 6.26590133416072, "grad_norm": 0.20131677389144897, "learning_rate": 8.67718112201173e-06, "loss": 0.2362, "num_input_tokens_seen": 12314336, "step": 20195 }, { "epoch": 6.267452683834937, "grad_norm": 0.07727029919624329, "learning_rate": 8.676263655134811e-06, "loss": 0.2279, "num_input_tokens_seen": 12316704, "step": 20200 }, { "epoch": 6.269004033509153, "grad_norm": 0.07374610751867294, "learning_rate": 8.675345918744232e-06, "loss": 0.2309, "num_input_tokens_seen": 12320320, "step": 20205 }, { "epoch": 6.270555383183369, "grad_norm": 0.13367021083831787, "learning_rate": 8.674427912907276e-06, "loss": 0.2338, "num_input_tokens_seen": 12324224, "step": 20210 }, { "epoch": 6.272106732857586, "grad_norm": 0.0786169096827507, "learning_rate": 8.67350963769124e-06, "loss": 0.2276, "num_input_tokens_seen": 12326304, "step": 20215 }, { "epoch": 6.273658082531803, "grad_norm": 0.09348693490028381, "learning_rate": 8.672591093163449e-06, "loss": 0.2393, "num_input_tokens_seen": 12329920, "step": 20220 }, { "epoch": 6.275209432206019, "grad_norm": 0.06165560334920883, "learning_rate": 8.67167227939124e-06, "loss": 0.2407, "num_input_tokens_seen": 12332736, "step": 20225 }, { "epoch": 6.276760781880236, "grad_norm": 0.12282713502645493, "learning_rate": 8.670753196441973e-06, "loss": 0.2362, "num_input_tokens_seen": 12335040, "step": 20230 }, { "epoch": 6.278312131554452, "grad_norm": 0.08131079375743866, "learning_rate": 8.66983384438303e-06, "loss": 0.2268, "num_input_tokens_seen": 12338112, "step": 20235 }, { "epoch": 6.279863481228669, "grad_norm": 0.14017248153686523, "learning_rate": 8.668914223281808e-06, "loss": 0.2278, "num_input_tokens_seen": 12341760, "step": 20240 }, { "epoch": 6.281414830902886, "grad_norm": 0.15870089828968048, "learning_rate": 8.667994333205727e-06, "loss": 0.2316, "num_input_tokens_seen": 12345952, "step": 20245 }, { "epoch": 6.282966180577102, "grad_norm": 0.13895681500434875, "learning_rate": 8.667074174222225e-06, "loss": 0.2319, "num_input_tokens_seen": 12348480, "step": 20250 }, { "epoch": 6.2845175302513185, "grad_norm": 0.22419381141662598, "learning_rate": 8.666153746398763e-06, "loss": 0.2288, "num_input_tokens_seen": 12351424, "step": 20255 }, { "epoch": 6.286068879925535, "grad_norm": 0.07078095525503159, "learning_rate": 8.665233049802817e-06, "loss": 0.2323, "num_input_tokens_seen": 12354336, "step": 20260 }, { "epoch": 6.287620229599752, "grad_norm": 0.0639493465423584, "learning_rate": 8.664312084501886e-06, "loss": 0.2291, "num_input_tokens_seen": 12357504, "step": 20265 }, { "epoch": 6.289171579273968, "grad_norm": 0.07553868740797043, "learning_rate": 8.663390850563487e-06, "loss": 0.2348, "num_input_tokens_seen": 12359840, "step": 20270 }, { "epoch": 6.290722928948185, "grad_norm": 0.1139105036854744, "learning_rate": 8.662469348055158e-06, "loss": 0.2296, "num_input_tokens_seen": 12363232, "step": 20275 }, { "epoch": 6.2922742786224015, "grad_norm": 0.11588940024375916, "learning_rate": 8.661547577044453e-06, "loss": 0.2307, "num_input_tokens_seen": 12365760, "step": 20280 }, { "epoch": 6.293825628296618, "grad_norm": 0.11804601550102234, "learning_rate": 8.660625537598955e-06, "loss": 0.2302, "num_input_tokens_seen": 12368416, "step": 20285 }, { "epoch": 6.295376977970834, "grad_norm": 0.11675556749105453, "learning_rate": 8.659703229786253e-06, "loss": 0.2292, "num_input_tokens_seen": 12371232, "step": 20290 }, { "epoch": 6.296928327645051, "grad_norm": 0.15376242995262146, "learning_rate": 8.65878065367397e-06, "loss": 0.2308, "num_input_tokens_seen": 12374752, "step": 20295 }, { "epoch": 6.298479677319268, "grad_norm": 0.1435045599937439, "learning_rate": 8.657857809329738e-06, "loss": 0.2285, "num_input_tokens_seen": 12377440, "step": 20300 }, { "epoch": 6.3000310269934845, "grad_norm": 0.1347886621952057, "learning_rate": 8.656934696821213e-06, "loss": 0.2303, "num_input_tokens_seen": 12380064, "step": 20305 }, { "epoch": 6.301582376667701, "grad_norm": 0.09856870025396347, "learning_rate": 8.656011316216071e-06, "loss": 0.2367, "num_input_tokens_seen": 12382784, "step": 20310 }, { "epoch": 6.303133726341917, "grad_norm": 0.08523910492658615, "learning_rate": 8.655087667582005e-06, "loss": 0.2288, "num_input_tokens_seen": 12386208, "step": 20315 }, { "epoch": 6.304685076016134, "grad_norm": 0.11239271610975266, "learning_rate": 8.654163750986729e-06, "loss": 0.2288, "num_input_tokens_seen": 12388576, "step": 20320 }, { "epoch": 6.306236425690351, "grad_norm": 0.1338297575712204, "learning_rate": 8.65323956649798e-06, "loss": 0.2319, "num_input_tokens_seen": 12392256, "step": 20325 }, { "epoch": 6.3077877753645675, "grad_norm": 0.1620478332042694, "learning_rate": 8.652315114183509e-06, "loss": 0.2349, "num_input_tokens_seen": 12395072, "step": 20330 }, { "epoch": 6.309339125038783, "grad_norm": 0.12781810760498047, "learning_rate": 8.65139039411109e-06, "loss": 0.2359, "num_input_tokens_seen": 12398208, "step": 20335 }, { "epoch": 6.310890474713, "grad_norm": 0.11356852948665619, "learning_rate": 8.650465406348517e-06, "loss": 0.2265, "num_input_tokens_seen": 12400448, "step": 20340 }, { "epoch": 6.312441824387217, "grad_norm": 0.24945811927318573, "learning_rate": 8.649540150963603e-06, "loss": 0.2336, "num_input_tokens_seen": 12403488, "step": 20345 }, { "epoch": 6.313993174061434, "grad_norm": 0.08419406414031982, "learning_rate": 8.648614628024177e-06, "loss": 0.2284, "num_input_tokens_seen": 12405952, "step": 20350 }, { "epoch": 6.31554452373565, "grad_norm": 0.12147220969200134, "learning_rate": 8.647688837598092e-06, "loss": 0.23, "num_input_tokens_seen": 12410304, "step": 20355 }, { "epoch": 6.317095873409866, "grad_norm": 0.11596491932868958, "learning_rate": 8.646762779753221e-06, "loss": 0.2338, "num_input_tokens_seen": 12412896, "step": 20360 }, { "epoch": 6.318647223084083, "grad_norm": 0.15546587109565735, "learning_rate": 8.645836454557454e-06, "loss": 0.2255, "num_input_tokens_seen": 12416864, "step": 20365 }, { "epoch": 6.3201985727583, "grad_norm": 0.11895912140607834, "learning_rate": 8.644909862078702e-06, "loss": 0.2336, "num_input_tokens_seen": 12419776, "step": 20370 }, { "epoch": 6.321749922432517, "grad_norm": 0.18934448063373566, "learning_rate": 8.643983002384897e-06, "loss": 0.2279, "num_input_tokens_seen": 12422784, "step": 20375 }, { "epoch": 6.3233012721067325, "grad_norm": 0.14535094797611237, "learning_rate": 8.643055875543984e-06, "loss": 0.2276, "num_input_tokens_seen": 12425824, "step": 20380 }, { "epoch": 6.324852621780949, "grad_norm": 0.15299440920352936, "learning_rate": 8.642128481623935e-06, "loss": 0.2327, "num_input_tokens_seen": 12428384, "step": 20385 }, { "epoch": 6.326403971455166, "grad_norm": 0.11167924851179123, "learning_rate": 8.641200820692741e-06, "loss": 0.2276, "num_input_tokens_seen": 12430432, "step": 20390 }, { "epoch": 6.327955321129383, "grad_norm": 0.20618891716003418, "learning_rate": 8.640272892818406e-06, "loss": 0.2349, "num_input_tokens_seen": 12433280, "step": 20395 }, { "epoch": 6.329506670803599, "grad_norm": 0.12048964202404022, "learning_rate": 8.639344698068963e-06, "loss": 0.2301, "num_input_tokens_seen": 12435872, "step": 20400 }, { "epoch": 6.3310580204778155, "grad_norm": 0.09098159521818161, "learning_rate": 8.638416236512457e-06, "loss": 0.2254, "num_input_tokens_seen": 12438432, "step": 20405 }, { "epoch": 6.332609370152032, "grad_norm": 0.17345143854618073, "learning_rate": 8.637487508216954e-06, "loss": 0.2348, "num_input_tokens_seen": 12440960, "step": 20410 }, { "epoch": 6.334160719826249, "grad_norm": 0.06421859562397003, "learning_rate": 8.636558513250544e-06, "loss": 0.2299, "num_input_tokens_seen": 12443616, "step": 20415 }, { "epoch": 6.335712069500465, "grad_norm": 0.13998019695281982, "learning_rate": 8.63562925168133e-06, "loss": 0.2233, "num_input_tokens_seen": 12446752, "step": 20420 }, { "epoch": 6.337263419174682, "grad_norm": 0.1482425034046173, "learning_rate": 8.63469972357744e-06, "loss": 0.2344, "num_input_tokens_seen": 12449696, "step": 20425 }, { "epoch": 6.3388147688488985, "grad_norm": 0.06423842161893845, "learning_rate": 8.633769929007021e-06, "loss": 0.2304, "num_input_tokens_seen": 12451872, "step": 20430 }, { "epoch": 6.340366118523115, "grad_norm": 0.1475224643945694, "learning_rate": 8.632839868038234e-06, "loss": 0.2299, "num_input_tokens_seen": 12454048, "step": 20435 }, { "epoch": 6.341917468197332, "grad_norm": 0.18855595588684082, "learning_rate": 8.631909540739266e-06, "loss": 0.2279, "num_input_tokens_seen": 12456896, "step": 20440 }, { "epoch": 6.343468817871548, "grad_norm": 0.1958846151828766, "learning_rate": 8.63097894717832e-06, "loss": 0.2296, "num_input_tokens_seen": 12459488, "step": 20445 }, { "epoch": 6.345020167545765, "grad_norm": 0.10393811762332916, "learning_rate": 8.63004808742362e-06, "loss": 0.2318, "num_input_tokens_seen": 12463008, "step": 20450 }, { "epoch": 6.3465715172199815, "grad_norm": 0.22085225582122803, "learning_rate": 8.629116961543408e-06, "loss": 0.229, "num_input_tokens_seen": 12465696, "step": 20455 }, { "epoch": 6.348122866894198, "grad_norm": 0.1736333817243576, "learning_rate": 8.62818556960595e-06, "loss": 0.2248, "num_input_tokens_seen": 12470016, "step": 20460 }, { "epoch": 6.349674216568414, "grad_norm": 0.22691556811332703, "learning_rate": 8.627253911679524e-06, "loss": 0.2316, "num_input_tokens_seen": 12472992, "step": 20465 }, { "epoch": 6.351225566242631, "grad_norm": 0.30948224663734436, "learning_rate": 8.626321987832434e-06, "loss": 0.2325, "num_input_tokens_seen": 12476448, "step": 20470 }, { "epoch": 6.352776915916848, "grad_norm": 0.305686354637146, "learning_rate": 8.625389798133e-06, "loss": 0.2351, "num_input_tokens_seen": 12479456, "step": 20475 }, { "epoch": 6.3543282655910645, "grad_norm": 0.3417983055114746, "learning_rate": 8.624457342649563e-06, "loss": 0.2276, "num_input_tokens_seen": 12482048, "step": 20480 }, { "epoch": 6.35587961526528, "grad_norm": 0.37278419733047485, "learning_rate": 8.623524621450483e-06, "loss": 0.2334, "num_input_tokens_seen": 12484608, "step": 20485 }, { "epoch": 6.357430964939497, "grad_norm": 0.1584857553243637, "learning_rate": 8.622591634604138e-06, "loss": 0.2404, "num_input_tokens_seen": 12487488, "step": 20490 }, { "epoch": 6.358982314613714, "grad_norm": 0.11154130101203918, "learning_rate": 8.621658382178929e-06, "loss": 0.2296, "num_input_tokens_seen": 12489792, "step": 20495 }, { "epoch": 6.360533664287931, "grad_norm": 0.16265055537223816, "learning_rate": 8.620724864243272e-06, "loss": 0.2255, "num_input_tokens_seen": 12492672, "step": 20500 }, { "epoch": 6.362085013962147, "grad_norm": 0.11242823302745819, "learning_rate": 8.619791080865609e-06, "loss": 0.2317, "num_input_tokens_seen": 12495328, "step": 20505 }, { "epoch": 6.363636363636363, "grad_norm": 0.2316349744796753, "learning_rate": 8.618857032114396e-06, "loss": 0.2369, "num_input_tokens_seen": 12498752, "step": 20510 }, { "epoch": 6.36518771331058, "grad_norm": 0.13046905398368835, "learning_rate": 8.617922718058106e-06, "loss": 0.2327, "num_input_tokens_seen": 12501408, "step": 20515 }, { "epoch": 6.366739062984797, "grad_norm": 0.19366922974586487, "learning_rate": 8.61698813876524e-06, "loss": 0.2321, "num_input_tokens_seen": 12503744, "step": 20520 }, { "epoch": 6.368290412659014, "grad_norm": 0.15472224354743958, "learning_rate": 8.61605329430431e-06, "loss": 0.2316, "num_input_tokens_seen": 12506656, "step": 20525 }, { "epoch": 6.3698417623332295, "grad_norm": 0.12681308388710022, "learning_rate": 8.615118184743857e-06, "loss": 0.2382, "num_input_tokens_seen": 12508992, "step": 20530 }, { "epoch": 6.371393112007446, "grad_norm": 0.12022635340690613, "learning_rate": 8.614182810152427e-06, "loss": 0.2302, "num_input_tokens_seen": 12512192, "step": 20535 }, { "epoch": 6.372944461681663, "grad_norm": 0.17381738126277924, "learning_rate": 8.6132471705986e-06, "loss": 0.2318, "num_input_tokens_seen": 12515424, "step": 20540 }, { "epoch": 6.37449581135588, "grad_norm": 0.06408600509166718, "learning_rate": 8.61231126615097e-06, "loss": 0.2307, "num_input_tokens_seen": 12518784, "step": 20545 }, { "epoch": 6.376047161030096, "grad_norm": 0.10718291997909546, "learning_rate": 8.611375096878147e-06, "loss": 0.2334, "num_input_tokens_seen": 12522112, "step": 20550 }, { "epoch": 6.3775985107043125, "grad_norm": 0.13149015605449677, "learning_rate": 8.610438662848764e-06, "loss": 0.237, "num_input_tokens_seen": 12524928, "step": 20555 }, { "epoch": 6.379149860378529, "grad_norm": 0.13937495648860931, "learning_rate": 8.609501964131473e-06, "loss": 0.2321, "num_input_tokens_seen": 12527456, "step": 20560 }, { "epoch": 6.380701210052746, "grad_norm": 0.09788267314434052, "learning_rate": 8.608565000794944e-06, "loss": 0.2266, "num_input_tokens_seen": 12530752, "step": 20565 }, { "epoch": 6.382252559726963, "grad_norm": 0.19851142168045044, "learning_rate": 8.60762777290787e-06, "loss": 0.2312, "num_input_tokens_seen": 12533728, "step": 20570 }, { "epoch": 6.383803909401179, "grad_norm": 0.15575772523880005, "learning_rate": 8.60669028053896e-06, "loss": 0.2354, "num_input_tokens_seen": 12536416, "step": 20575 }, { "epoch": 6.3853552590753955, "grad_norm": 0.14237377047538757, "learning_rate": 8.605752523756943e-06, "loss": 0.2305, "num_input_tokens_seen": 12541248, "step": 20580 }, { "epoch": 6.386906608749612, "grad_norm": 0.19723278284072876, "learning_rate": 8.604814502630567e-06, "loss": 0.2353, "num_input_tokens_seen": 12544096, "step": 20585 }, { "epoch": 6.388457958423829, "grad_norm": 0.14440718293190002, "learning_rate": 8.6038762172286e-06, "loss": 0.2311, "num_input_tokens_seen": 12547072, "step": 20590 }, { "epoch": 6.390009308098045, "grad_norm": 0.1259133517742157, "learning_rate": 8.60293766761983e-06, "loss": 0.2307, "num_input_tokens_seen": 12550176, "step": 20595 }, { "epoch": 6.391560657772262, "grad_norm": 0.1259555220603943, "learning_rate": 8.601998853873063e-06, "loss": 0.2381, "num_input_tokens_seen": 12553952, "step": 20600 }, { "epoch": 6.3931120074464785, "grad_norm": 0.14766208827495575, "learning_rate": 8.601059776057128e-06, "loss": 0.2281, "num_input_tokens_seen": 12556832, "step": 20605 }, { "epoch": 6.394663357120695, "grad_norm": 0.1015794426202774, "learning_rate": 8.600120434240868e-06, "loss": 0.2255, "num_input_tokens_seen": 12559808, "step": 20610 }, { "epoch": 6.396214706794911, "grad_norm": 0.2132367342710495, "learning_rate": 8.599180828493147e-06, "loss": 0.2337, "num_input_tokens_seen": 12562624, "step": 20615 }, { "epoch": 6.397766056469128, "grad_norm": 0.07744341343641281, "learning_rate": 8.598240958882853e-06, "loss": 0.2388, "num_input_tokens_seen": 12565376, "step": 20620 }, { "epoch": 6.399317406143345, "grad_norm": 0.06707760691642761, "learning_rate": 8.597300825478884e-06, "loss": 0.2369, "num_input_tokens_seen": 12568416, "step": 20625 }, { "epoch": 6.400868755817561, "grad_norm": 0.2090604156255722, "learning_rate": 8.596360428350169e-06, "loss": 0.2329, "num_input_tokens_seen": 12572096, "step": 20630 }, { "epoch": 6.402420105491778, "grad_norm": 0.09448376297950745, "learning_rate": 8.595419767565645e-06, "loss": 0.2256, "num_input_tokens_seen": 12576224, "step": 20635 }, { "epoch": 6.403971455165994, "grad_norm": 0.13651180267333984, "learning_rate": 8.594478843194279e-06, "loss": 0.2301, "num_input_tokens_seen": 12579328, "step": 20640 }, { "epoch": 6.405522804840211, "grad_norm": 0.13989892601966858, "learning_rate": 8.593537655305047e-06, "loss": 0.2337, "num_input_tokens_seen": 12582176, "step": 20645 }, { "epoch": 6.407074154514428, "grad_norm": 0.1118181124329567, "learning_rate": 8.59259620396695e-06, "loss": 0.23, "num_input_tokens_seen": 12584864, "step": 20650 }, { "epoch": 6.408625504188644, "grad_norm": 0.06867653131484985, "learning_rate": 8.591654489249009e-06, "loss": 0.2289, "num_input_tokens_seen": 12588096, "step": 20655 }, { "epoch": 6.41017685386286, "grad_norm": 0.11392126977443695, "learning_rate": 8.590712511220262e-06, "loss": 0.2265, "num_input_tokens_seen": 12591040, "step": 20660 }, { "epoch": 6.411728203537077, "grad_norm": 0.11269071698188782, "learning_rate": 8.58977026994977e-06, "loss": 0.2393, "num_input_tokens_seen": 12593440, "step": 20665 }, { "epoch": 6.413279553211294, "grad_norm": 0.1231055036187172, "learning_rate": 8.588827765506606e-06, "loss": 0.2356, "num_input_tokens_seen": 12595872, "step": 20670 }, { "epoch": 6.414830902885511, "grad_norm": 0.08543483912944794, "learning_rate": 8.587884997959867e-06, "loss": 0.2322, "num_input_tokens_seen": 12598208, "step": 20675 }, { "epoch": 6.4163822525597265, "grad_norm": 0.1250380575656891, "learning_rate": 8.586941967378675e-06, "loss": 0.2332, "num_input_tokens_seen": 12601632, "step": 20680 }, { "epoch": 6.417933602233943, "grad_norm": 0.049009934067726135, "learning_rate": 8.585998673832158e-06, "loss": 0.2358, "num_input_tokens_seen": 12603904, "step": 20685 }, { "epoch": 6.41948495190816, "grad_norm": 0.07263470441102982, "learning_rate": 8.585055117389475e-06, "loss": 0.2326, "num_input_tokens_seen": 12607168, "step": 20690 }, { "epoch": 6.421036301582377, "grad_norm": 0.05918840691447258, "learning_rate": 8.584111298119797e-06, "loss": 0.2316, "num_input_tokens_seen": 12610464, "step": 20695 }, { "epoch": 6.422587651256594, "grad_norm": 0.12132926285266876, "learning_rate": 8.583167216092319e-06, "loss": 0.2331, "num_input_tokens_seen": 12613792, "step": 20700 }, { "epoch": 6.4241390009308095, "grad_norm": 0.1940351128578186, "learning_rate": 8.582222871376253e-06, "loss": 0.2305, "num_input_tokens_seen": 12618528, "step": 20705 }, { "epoch": 6.425690350605026, "grad_norm": 0.055330291390419006, "learning_rate": 8.581278264040832e-06, "loss": 0.2335, "num_input_tokens_seen": 12621600, "step": 20710 }, { "epoch": 6.427241700279243, "grad_norm": 0.09700068086385727, "learning_rate": 8.580333394155305e-06, "loss": 0.2284, "num_input_tokens_seen": 12624832, "step": 20715 }, { "epoch": 6.42879304995346, "grad_norm": 0.10293230414390564, "learning_rate": 8.579388261788943e-06, "loss": 0.2269, "num_input_tokens_seen": 12628000, "step": 20720 }, { "epoch": 6.430344399627676, "grad_norm": 0.11399324983358383, "learning_rate": 8.578442867011035e-06, "loss": 0.2269, "num_input_tokens_seen": 12631008, "step": 20725 }, { "epoch": 6.4318957493018925, "grad_norm": 0.07557756453752518, "learning_rate": 8.577497209890889e-06, "loss": 0.233, "num_input_tokens_seen": 12633632, "step": 20730 }, { "epoch": 6.433447098976109, "grad_norm": 0.09268134087324142, "learning_rate": 8.576551290497833e-06, "loss": 0.2341, "num_input_tokens_seen": 12636768, "step": 20735 }, { "epoch": 6.434998448650326, "grad_norm": 0.13092219829559326, "learning_rate": 8.575605108901217e-06, "loss": 0.2268, "num_input_tokens_seen": 12640256, "step": 20740 }, { "epoch": 6.436549798324542, "grad_norm": 0.09588067978620529, "learning_rate": 8.574658665170404e-06, "loss": 0.2349, "num_input_tokens_seen": 12642528, "step": 20745 }, { "epoch": 6.438101147998759, "grad_norm": 0.1221400797367096, "learning_rate": 8.57371195937478e-06, "loss": 0.2323, "num_input_tokens_seen": 12645408, "step": 20750 }, { "epoch": 6.439652497672975, "grad_norm": 0.10689650475978851, "learning_rate": 8.57276499158375e-06, "loss": 0.2304, "num_input_tokens_seen": 12648320, "step": 20755 }, { "epoch": 6.441203847347192, "grad_norm": 0.055814478546381, "learning_rate": 8.571817761866739e-06, "loss": 0.2242, "num_input_tokens_seen": 12651008, "step": 20760 }, { "epoch": 6.442755197021409, "grad_norm": 0.10601656138896942, "learning_rate": 8.570870270293188e-06, "loss": 0.2218, "num_input_tokens_seen": 12654144, "step": 20765 }, { "epoch": 6.444306546695625, "grad_norm": 0.14636296033859253, "learning_rate": 8.569922516932564e-06, "loss": 0.2271, "num_input_tokens_seen": 12656928, "step": 20770 }, { "epoch": 6.445857896369842, "grad_norm": 0.0662362203001976, "learning_rate": 8.56897450185434e-06, "loss": 0.2387, "num_input_tokens_seen": 12660384, "step": 20775 }, { "epoch": 6.447409246044058, "grad_norm": 0.11877693980932236, "learning_rate": 8.568026225128028e-06, "loss": 0.2193, "num_input_tokens_seen": 12663200, "step": 20780 }, { "epoch": 6.448960595718275, "grad_norm": 0.10289021581411362, "learning_rate": 8.567077686823137e-06, "loss": 0.2467, "num_input_tokens_seen": 12666272, "step": 20785 }, { "epoch": 6.450511945392491, "grad_norm": 0.07194508612155914, "learning_rate": 8.566128887009213e-06, "loss": 0.2422, "num_input_tokens_seen": 12668672, "step": 20790 }, { "epoch": 6.452063295066708, "grad_norm": 0.2142639458179474, "learning_rate": 8.565179825755813e-06, "loss": 0.2379, "num_input_tokens_seen": 12672736, "step": 20795 }, { "epoch": 6.453614644740925, "grad_norm": 0.08079002797603607, "learning_rate": 8.564230503132513e-06, "loss": 0.2291, "num_input_tokens_seen": 12675936, "step": 20800 }, { "epoch": 6.455165994415141, "grad_norm": 0.12165579199790955, "learning_rate": 8.563280919208911e-06, "loss": 0.2392, "num_input_tokens_seen": 12679296, "step": 20805 }, { "epoch": 6.456717344089358, "grad_norm": 0.06223325431346893, "learning_rate": 8.562331074054621e-06, "loss": 0.2302, "num_input_tokens_seen": 12683232, "step": 20810 }, { "epoch": 6.458268693763574, "grad_norm": 0.13037727773189545, "learning_rate": 8.561380967739282e-06, "loss": 0.23, "num_input_tokens_seen": 12686176, "step": 20815 }, { "epoch": 6.459820043437791, "grad_norm": 0.1068524494767189, "learning_rate": 8.56043060033254e-06, "loss": 0.2315, "num_input_tokens_seen": 12689216, "step": 20820 }, { "epoch": 6.461371393112008, "grad_norm": 0.13871093094348907, "learning_rate": 8.559479971904077e-06, "loss": 0.2288, "num_input_tokens_seen": 12692608, "step": 20825 }, { "epoch": 6.462922742786224, "grad_norm": 0.10165101289749146, "learning_rate": 8.558529082523581e-06, "loss": 0.2293, "num_input_tokens_seen": 12694688, "step": 20830 }, { "epoch": 6.46447409246044, "grad_norm": 0.132810577750206, "learning_rate": 8.557577932260763e-06, "loss": 0.231, "num_input_tokens_seen": 12697344, "step": 20835 }, { "epoch": 6.466025442134657, "grad_norm": 0.1056811586022377, "learning_rate": 8.556626521185356e-06, "loss": 0.2314, "num_input_tokens_seen": 12700832, "step": 20840 }, { "epoch": 6.467576791808874, "grad_norm": 0.1128753125667572, "learning_rate": 8.555674849367106e-06, "loss": 0.2269, "num_input_tokens_seen": 12704096, "step": 20845 }, { "epoch": 6.469128141483091, "grad_norm": 0.0572037398815155, "learning_rate": 8.554722916875786e-06, "loss": 0.2335, "num_input_tokens_seen": 12707712, "step": 20850 }, { "epoch": 6.4706794911573065, "grad_norm": 0.10711953043937683, "learning_rate": 8.55377072378118e-06, "loss": 0.2331, "num_input_tokens_seen": 12711264, "step": 20855 }, { "epoch": 6.472230840831523, "grad_norm": 0.11768289655447006, "learning_rate": 8.552818270153098e-06, "loss": 0.2279, "num_input_tokens_seen": 12714016, "step": 20860 }, { "epoch": 6.47378219050574, "grad_norm": 0.05827533081173897, "learning_rate": 8.551865556061364e-06, "loss": 0.2301, "num_input_tokens_seen": 12717408, "step": 20865 }, { "epoch": 6.475333540179957, "grad_norm": 0.1921873688697815, "learning_rate": 8.550912581575828e-06, "loss": 0.2389, "num_input_tokens_seen": 12720352, "step": 20870 }, { "epoch": 6.4768848898541735, "grad_norm": 0.06972099095582962, "learning_rate": 8.549959346766348e-06, "loss": 0.2278, "num_input_tokens_seen": 12723008, "step": 20875 }, { "epoch": 6.478436239528389, "grad_norm": 0.09566427022218704, "learning_rate": 8.54900585170281e-06, "loss": 0.2339, "num_input_tokens_seen": 12727552, "step": 20880 }, { "epoch": 6.479987589202606, "grad_norm": 0.13195940852165222, "learning_rate": 8.548052096455117e-06, "loss": 0.2332, "num_input_tokens_seen": 12729760, "step": 20885 }, { "epoch": 6.481538938876823, "grad_norm": 0.10481059551239014, "learning_rate": 8.547098081093189e-06, "loss": 0.23, "num_input_tokens_seen": 12732320, "step": 20890 }, { "epoch": 6.48309028855104, "grad_norm": 0.1864936798810959, "learning_rate": 8.546143805686969e-06, "loss": 0.2306, "num_input_tokens_seen": 12734944, "step": 20895 }, { "epoch": 6.484641638225256, "grad_norm": 0.07906325906515121, "learning_rate": 8.545189270306415e-06, "loss": 0.232, "num_input_tokens_seen": 12737536, "step": 20900 }, { "epoch": 6.486192987899472, "grad_norm": 0.07702463120222092, "learning_rate": 8.544234475021506e-06, "loss": 0.233, "num_input_tokens_seen": 12740320, "step": 20905 }, { "epoch": 6.487744337573689, "grad_norm": 0.10539670288562775, "learning_rate": 8.543279419902243e-06, "loss": 0.2289, "num_input_tokens_seen": 12742400, "step": 20910 }, { "epoch": 6.489295687247906, "grad_norm": 0.08379487693309784, "learning_rate": 8.542324105018636e-06, "loss": 0.2325, "num_input_tokens_seen": 12745728, "step": 20915 }, { "epoch": 6.490847036922122, "grad_norm": 0.10334441065788269, "learning_rate": 8.541368530440724e-06, "loss": 0.2311, "num_input_tokens_seen": 12749344, "step": 20920 }, { "epoch": 6.492398386596339, "grad_norm": 0.18904078006744385, "learning_rate": 8.540412696238565e-06, "loss": 0.2316, "num_input_tokens_seen": 12751840, "step": 20925 }, { "epoch": 6.493949736270555, "grad_norm": 0.19085386395454407, "learning_rate": 8.53945660248223e-06, "loss": 0.23, "num_input_tokens_seen": 12755136, "step": 20930 }, { "epoch": 6.495501085944772, "grad_norm": 0.19727453589439392, "learning_rate": 8.538500249241813e-06, "loss": 0.2273, "num_input_tokens_seen": 12757600, "step": 20935 }, { "epoch": 6.497052435618989, "grad_norm": 0.11162614077329636, "learning_rate": 8.537543636587425e-06, "loss": 0.2321, "num_input_tokens_seen": 12760736, "step": 20940 }, { "epoch": 6.498603785293205, "grad_norm": 0.11963723599910736, "learning_rate": 8.536586764589198e-06, "loss": 0.2284, "num_input_tokens_seen": 12763424, "step": 20945 }, { "epoch": 6.500155134967422, "grad_norm": 0.045906003564596176, "learning_rate": 8.535629633317281e-06, "loss": 0.2265, "num_input_tokens_seen": 12766784, "step": 20950 }, { "epoch": 6.501706484641638, "grad_norm": 0.13199464976787567, "learning_rate": 8.534672242841844e-06, "loss": 0.2361, "num_input_tokens_seen": 12769504, "step": 20955 }, { "epoch": 6.503257834315855, "grad_norm": 0.13841183483600616, "learning_rate": 8.533714593233076e-06, "loss": 0.2301, "num_input_tokens_seen": 12773024, "step": 20960 }, { "epoch": 6.504809183990071, "grad_norm": 0.10523181408643723, "learning_rate": 8.532756684561181e-06, "loss": 0.2331, "num_input_tokens_seen": 12776064, "step": 20965 }, { "epoch": 6.506360533664288, "grad_norm": 0.10964202880859375, "learning_rate": 8.531798516896389e-06, "loss": 0.228, "num_input_tokens_seen": 12778592, "step": 20970 }, { "epoch": 6.507911883338505, "grad_norm": 0.06032541021704674, "learning_rate": 8.530840090308943e-06, "loss": 0.2406, "num_input_tokens_seen": 12780928, "step": 20975 }, { "epoch": 6.509463233012721, "grad_norm": 0.17998303472995758, "learning_rate": 8.529881404869105e-06, "loss": 0.2312, "num_input_tokens_seen": 12784864, "step": 20980 }, { "epoch": 6.511014582686938, "grad_norm": 0.1535395383834839, "learning_rate": 8.528922460647161e-06, "loss": 0.2363, "num_input_tokens_seen": 12788224, "step": 20985 }, { "epoch": 6.512565932361154, "grad_norm": 0.2015627771615982, "learning_rate": 8.527963257713412e-06, "loss": 0.2361, "num_input_tokens_seen": 12790400, "step": 20990 }, { "epoch": 6.514117282035371, "grad_norm": 0.09191044420003891, "learning_rate": 8.527003796138178e-06, "loss": 0.2246, "num_input_tokens_seen": 12793216, "step": 20995 }, { "epoch": 6.5156686317095875, "grad_norm": 0.08994551748037338, "learning_rate": 8.526044075991801e-06, "loss": 0.2348, "num_input_tokens_seen": 12795552, "step": 21000 }, { "epoch": 6.517219981383803, "grad_norm": 0.09980448335409164, "learning_rate": 8.525084097344638e-06, "loss": 0.2321, "num_input_tokens_seen": 12798784, "step": 21005 }, { "epoch": 6.51877133105802, "grad_norm": 0.1872641146183014, "learning_rate": 8.524123860267068e-06, "loss": 0.2342, "num_input_tokens_seen": 12801504, "step": 21010 }, { "epoch": 6.520322680732237, "grad_norm": 0.050374455749988556, "learning_rate": 8.523163364829486e-06, "loss": 0.234, "num_input_tokens_seen": 12804288, "step": 21015 }, { "epoch": 6.521874030406454, "grad_norm": 0.0783928781747818, "learning_rate": 8.522202611102311e-06, "loss": 0.2367, "num_input_tokens_seen": 12806784, "step": 21020 }, { "epoch": 6.5234253800806705, "grad_norm": 0.10893610864877701, "learning_rate": 8.521241599155973e-06, "loss": 0.2301, "num_input_tokens_seen": 12810720, "step": 21025 }, { "epoch": 6.524976729754886, "grad_norm": 0.09953738003969193, "learning_rate": 8.520280329060928e-06, "loss": 0.23, "num_input_tokens_seen": 12813440, "step": 21030 }, { "epoch": 6.526528079429103, "grad_norm": 0.1023942083120346, "learning_rate": 8.51931880088765e-06, "loss": 0.2336, "num_input_tokens_seen": 12817856, "step": 21035 }, { "epoch": 6.52807942910332, "grad_norm": 0.055431757122278214, "learning_rate": 8.51835701470663e-06, "loss": 0.2289, "num_input_tokens_seen": 12820192, "step": 21040 }, { "epoch": 6.529630778777537, "grad_norm": 0.06735383719205856, "learning_rate": 8.517394970588375e-06, "loss": 0.2304, "num_input_tokens_seen": 12822944, "step": 21045 }, { "epoch": 6.5311821284517535, "grad_norm": 0.07241903245449066, "learning_rate": 8.516432668603417e-06, "loss": 0.2316, "num_input_tokens_seen": 12827680, "step": 21050 }, { "epoch": 6.532733478125969, "grad_norm": 0.20098985731601715, "learning_rate": 8.515470108822303e-06, "loss": 0.2341, "num_input_tokens_seen": 12831040, "step": 21055 }, { "epoch": 6.534284827800186, "grad_norm": 0.10203289985656738, "learning_rate": 8.514507291315603e-06, "loss": 0.2271, "num_input_tokens_seen": 12833856, "step": 21060 }, { "epoch": 6.535836177474403, "grad_norm": 0.10324122756719589, "learning_rate": 8.5135442161539e-06, "loss": 0.2249, "num_input_tokens_seen": 12837216, "step": 21065 }, { "epoch": 6.537387527148619, "grad_norm": 0.10290945321321487, "learning_rate": 8.512580883407797e-06, "loss": 0.2387, "num_input_tokens_seen": 12839840, "step": 21070 }, { "epoch": 6.538938876822836, "grad_norm": 0.08155222237110138, "learning_rate": 8.511617293147923e-06, "loss": 0.2251, "num_input_tokens_seen": 12842240, "step": 21075 }, { "epoch": 6.540490226497052, "grad_norm": 0.18113955855369568, "learning_rate": 8.510653445444914e-06, "loss": 0.2236, "num_input_tokens_seen": 12844800, "step": 21080 }, { "epoch": 6.542041576171269, "grad_norm": 0.20098452270030975, "learning_rate": 8.509689340369438e-06, "loss": 0.2457, "num_input_tokens_seen": 12847520, "step": 21085 }, { "epoch": 6.543592925845486, "grad_norm": 0.08219768106937408, "learning_rate": 8.50872497799217e-06, "loss": 0.2234, "num_input_tokens_seen": 12849888, "step": 21090 }, { "epoch": 6.545144275519702, "grad_norm": 0.09968625009059906, "learning_rate": 8.507760358383815e-06, "loss": 0.2339, "num_input_tokens_seen": 12854816, "step": 21095 }, { "epoch": 6.546695625193919, "grad_norm": 0.09698569774627686, "learning_rate": 8.506795481615086e-06, "loss": 0.2348, "num_input_tokens_seen": 12857568, "step": 21100 }, { "epoch": 6.548246974868135, "grad_norm": 0.1693517416715622, "learning_rate": 8.50583034775672e-06, "loss": 0.226, "num_input_tokens_seen": 12860512, "step": 21105 }, { "epoch": 6.549798324542352, "grad_norm": 0.0428914949297905, "learning_rate": 8.504864956879477e-06, "loss": 0.2332, "num_input_tokens_seen": 12864960, "step": 21110 }, { "epoch": 6.551349674216569, "grad_norm": 0.06701421737670898, "learning_rate": 8.503899309054127e-06, "loss": 0.2406, "num_input_tokens_seen": 12867808, "step": 21115 }, { "epoch": 6.552901023890785, "grad_norm": 0.1245480626821518, "learning_rate": 8.502933404351465e-06, "loss": 0.2299, "num_input_tokens_seen": 12871072, "step": 21120 }, { "epoch": 6.5544523735650015, "grad_norm": 0.19463323056697845, "learning_rate": 8.501967242842305e-06, "loss": 0.2367, "num_input_tokens_seen": 12874720, "step": 21125 }, { "epoch": 6.556003723239218, "grad_norm": 0.10775071382522583, "learning_rate": 8.501000824597476e-06, "loss": 0.2297, "num_input_tokens_seen": 12877728, "step": 21130 }, { "epoch": 6.557555072913434, "grad_norm": 0.08812880516052246, "learning_rate": 8.500034149687828e-06, "loss": 0.2298, "num_input_tokens_seen": 12880576, "step": 21135 }, { "epoch": 6.559106422587651, "grad_norm": 0.0640738382935524, "learning_rate": 8.499067218184231e-06, "loss": 0.229, "num_input_tokens_seen": 12885280, "step": 21140 }, { "epoch": 6.560657772261868, "grad_norm": 0.19913849234580994, "learning_rate": 8.498100030157572e-06, "loss": 0.2355, "num_input_tokens_seen": 12888224, "step": 21145 }, { "epoch": 6.5622091219360845, "grad_norm": 0.11582595854997635, "learning_rate": 8.497132585678756e-06, "loss": 0.2275, "num_input_tokens_seen": 12890976, "step": 21150 }, { "epoch": 6.563760471610301, "grad_norm": 0.10741483420133591, "learning_rate": 8.49616488481871e-06, "loss": 0.2358, "num_input_tokens_seen": 12893856, "step": 21155 }, { "epoch": 6.565311821284517, "grad_norm": 0.05283532664179802, "learning_rate": 8.495196927648377e-06, "loss": 0.2284, "num_input_tokens_seen": 12896768, "step": 21160 }, { "epoch": 6.566863170958734, "grad_norm": 0.10221581161022186, "learning_rate": 8.494228714238719e-06, "loss": 0.2278, "num_input_tokens_seen": 12899200, "step": 21165 }, { "epoch": 6.568414520632951, "grad_norm": 0.1211530938744545, "learning_rate": 8.49326024466072e-06, "loss": 0.2341, "num_input_tokens_seen": 12902048, "step": 21170 }, { "epoch": 6.5699658703071675, "grad_norm": 0.10384906083345413, "learning_rate": 8.492291518985378e-06, "loss": 0.2284, "num_input_tokens_seen": 12905440, "step": 21175 }, { "epoch": 6.571517219981384, "grad_norm": 0.09942704439163208, "learning_rate": 8.491322537283713e-06, "loss": 0.2341, "num_input_tokens_seen": 12908416, "step": 21180 }, { "epoch": 6.5730685696556, "grad_norm": 0.08236236870288849, "learning_rate": 8.49035329962676e-06, "loss": 0.232, "num_input_tokens_seen": 12911712, "step": 21185 }, { "epoch": 6.574619919329817, "grad_norm": 0.11664515733718872, "learning_rate": 8.489383806085581e-06, "loss": 0.229, "num_input_tokens_seen": 12916384, "step": 21190 }, { "epoch": 6.576171269004034, "grad_norm": 0.11655844748020172, "learning_rate": 8.488414056731248e-06, "loss": 0.2434, "num_input_tokens_seen": 12919680, "step": 21195 }, { "epoch": 6.57772261867825, "grad_norm": 0.08091272413730621, "learning_rate": 8.487444051634855e-06, "loss": 0.2287, "num_input_tokens_seen": 12924384, "step": 21200 }, { "epoch": 6.579273968352466, "grad_norm": 0.12133632600307465, "learning_rate": 8.486473790867516e-06, "loss": 0.2359, "num_input_tokens_seen": 12928384, "step": 21205 }, { "epoch": 6.580825318026683, "grad_norm": 0.10060050338506699, "learning_rate": 8.485503274500363e-06, "loss": 0.232, "num_input_tokens_seen": 12932160, "step": 21210 }, { "epoch": 6.5823766677009, "grad_norm": 0.04784083738923073, "learning_rate": 8.484532502604544e-06, "loss": 0.2285, "num_input_tokens_seen": 12935168, "step": 21215 }, { "epoch": 6.583928017375117, "grad_norm": 0.17297007143497467, "learning_rate": 8.48356147525123e-06, "loss": 0.2234, "num_input_tokens_seen": 12938592, "step": 21220 }, { "epoch": 6.585479367049333, "grad_norm": 0.08406655490398407, "learning_rate": 8.482590192511609e-06, "loss": 0.2298, "num_input_tokens_seen": 12941376, "step": 21225 }, { "epoch": 6.587030716723549, "grad_norm": 0.1242983266711235, "learning_rate": 8.481618654456884e-06, "loss": 0.2272, "num_input_tokens_seen": 12946080, "step": 21230 }, { "epoch": 6.588582066397766, "grad_norm": 0.08493321388959885, "learning_rate": 8.480646861158286e-06, "loss": 0.2358, "num_input_tokens_seen": 12948928, "step": 21235 }, { "epoch": 6.590133416071983, "grad_norm": 0.05871553346514702, "learning_rate": 8.479674812687056e-06, "loss": 0.2377, "num_input_tokens_seen": 12953120, "step": 21240 }, { "epoch": 6.5916847657462, "grad_norm": 0.06167585775256157, "learning_rate": 8.478702509114457e-06, "loss": 0.2319, "num_input_tokens_seen": 12957344, "step": 21245 }, { "epoch": 6.5932361154204155, "grad_norm": 0.1137489452958107, "learning_rate": 8.477729950511769e-06, "loss": 0.2357, "num_input_tokens_seen": 12959936, "step": 21250 }, { "epoch": 6.594787465094632, "grad_norm": 0.10492070019245148, "learning_rate": 8.476757136950292e-06, "loss": 0.2327, "num_input_tokens_seen": 12963072, "step": 21255 }, { "epoch": 6.596338814768849, "grad_norm": 0.18076196312904358, "learning_rate": 8.475784068501347e-06, "loss": 0.2316, "num_input_tokens_seen": 12966656, "step": 21260 }, { "epoch": 6.597890164443065, "grad_norm": 0.09627198427915573, "learning_rate": 8.47481074523627e-06, "loss": 0.2305, "num_input_tokens_seen": 12969088, "step": 21265 }, { "epoch": 6.599441514117282, "grad_norm": 0.10593133419752121, "learning_rate": 8.473837167226418e-06, "loss": 0.2259, "num_input_tokens_seen": 12972256, "step": 21270 }, { "epoch": 6.6009928637914985, "grad_norm": 0.08950356394052505, "learning_rate": 8.472863334543166e-06, "loss": 0.2294, "num_input_tokens_seen": 12974912, "step": 21275 }, { "epoch": 6.602544213465715, "grad_norm": 0.10455005615949631, "learning_rate": 8.471889247257907e-06, "loss": 0.2349, "num_input_tokens_seen": 12978304, "step": 21280 }, { "epoch": 6.604095563139932, "grad_norm": 0.05935145169496536, "learning_rate": 8.470914905442051e-06, "loss": 0.2358, "num_input_tokens_seen": 12981856, "step": 21285 }, { "epoch": 6.605646912814148, "grad_norm": 0.09789545834064484, "learning_rate": 8.469940309167032e-06, "loss": 0.2348, "num_input_tokens_seen": 12984768, "step": 21290 }, { "epoch": 6.607198262488365, "grad_norm": 0.051244527101516724, "learning_rate": 8.468965458504297e-06, "loss": 0.2305, "num_input_tokens_seen": 12987328, "step": 21295 }, { "epoch": 6.6087496121625815, "grad_norm": 0.10171055793762207, "learning_rate": 8.467990353525315e-06, "loss": 0.2268, "num_input_tokens_seen": 12993984, "step": 21300 }, { "epoch": 6.610300961836798, "grad_norm": 0.09643913805484772, "learning_rate": 8.467014994301574e-06, "loss": 0.2278, "num_input_tokens_seen": 12996064, "step": 21305 }, { "epoch": 6.611852311511015, "grad_norm": 0.10402471572160721, "learning_rate": 8.466039380904579e-06, "loss": 0.2299, "num_input_tokens_seen": 12999040, "step": 21310 }, { "epoch": 6.613403661185231, "grad_norm": 0.08576113730669022, "learning_rate": 8.465063513405851e-06, "loss": 0.23, "num_input_tokens_seen": 13001760, "step": 21315 }, { "epoch": 6.614955010859448, "grad_norm": 0.1300719678401947, "learning_rate": 8.464087391876937e-06, "loss": 0.2304, "num_input_tokens_seen": 13005056, "step": 21320 }, { "epoch": 6.6165063605336645, "grad_norm": 0.0900978073477745, "learning_rate": 8.463111016389395e-06, "loss": 0.2296, "num_input_tokens_seen": 13008064, "step": 21325 }, { "epoch": 6.61805771020788, "grad_norm": 0.10624489188194275, "learning_rate": 8.462134387014806e-06, "loss": 0.2303, "num_input_tokens_seen": 13011776, "step": 21330 }, { "epoch": 6.619609059882097, "grad_norm": 0.09905573725700378, "learning_rate": 8.46115750382477e-06, "loss": 0.235, "num_input_tokens_seen": 13015008, "step": 21335 }, { "epoch": 6.621160409556314, "grad_norm": 0.10164546966552734, "learning_rate": 8.460180366890903e-06, "loss": 0.2333, "num_input_tokens_seen": 13018720, "step": 21340 }, { "epoch": 6.622711759230531, "grad_norm": 0.060532692819833755, "learning_rate": 8.45920297628484e-06, "loss": 0.2332, "num_input_tokens_seen": 13021280, "step": 21345 }, { "epoch": 6.6242631089047475, "grad_norm": 0.05974087491631508, "learning_rate": 8.458225332078235e-06, "loss": 0.2296, "num_input_tokens_seen": 13023936, "step": 21350 }, { "epoch": 6.625814458578963, "grad_norm": 0.10097659379243851, "learning_rate": 8.457247434342762e-06, "loss": 0.2286, "num_input_tokens_seen": 13026240, "step": 21355 }, { "epoch": 6.62736580825318, "grad_norm": 0.18562501668930054, "learning_rate": 8.456269283150111e-06, "loss": 0.2306, "num_input_tokens_seen": 13029120, "step": 21360 }, { "epoch": 6.628917157927397, "grad_norm": 0.11859048902988434, "learning_rate": 8.455290878571995e-06, "loss": 0.2279, "num_input_tokens_seen": 13033152, "step": 21365 }, { "epoch": 6.630468507601614, "grad_norm": 0.10596724599599838, "learning_rate": 8.454312220680139e-06, "loss": 0.2335, "num_input_tokens_seen": 13035360, "step": 21370 }, { "epoch": 6.63201985727583, "grad_norm": 0.11138101667165756, "learning_rate": 8.453333309546294e-06, "loss": 0.2304, "num_input_tokens_seen": 13037760, "step": 21375 }, { "epoch": 6.633571206950046, "grad_norm": 0.09863775223493576, "learning_rate": 8.452354145242223e-06, "loss": 0.2286, "num_input_tokens_seen": 13040224, "step": 21380 }, { "epoch": 6.635122556624263, "grad_norm": 0.12464582920074463, "learning_rate": 8.45137472783971e-06, "loss": 0.2224, "num_input_tokens_seen": 13042752, "step": 21385 }, { "epoch": 6.63667390629848, "grad_norm": 0.08196809887886047, "learning_rate": 8.450395057410561e-06, "loss": 0.2297, "num_input_tokens_seen": 13045376, "step": 21390 }, { "epoch": 6.638225255972696, "grad_norm": 0.11874589323997498, "learning_rate": 8.449415134026594e-06, "loss": 0.2267, "num_input_tokens_seen": 13048960, "step": 21395 }, { "epoch": 6.6397766056469125, "grad_norm": 0.09808101505041122, "learning_rate": 8.448434957759652e-06, "loss": 0.2283, "num_input_tokens_seen": 13051552, "step": 21400 }, { "epoch": 6.641327955321129, "grad_norm": 0.1247810423374176, "learning_rate": 8.447454528681592e-06, "loss": 0.2323, "num_input_tokens_seen": 13054336, "step": 21405 }, { "epoch": 6.642879304995346, "grad_norm": 0.050691138952970505, "learning_rate": 8.44647384686429e-06, "loss": 0.2289, "num_input_tokens_seen": 13058304, "step": 21410 }, { "epoch": 6.644430654669563, "grad_norm": 0.09543251246213913, "learning_rate": 8.445492912379642e-06, "loss": 0.223, "num_input_tokens_seen": 13061728, "step": 21415 }, { "epoch": 6.645982004343779, "grad_norm": 0.13770700991153717, "learning_rate": 8.444511725299563e-06, "loss": 0.2324, "num_input_tokens_seen": 13064608, "step": 21420 }, { "epoch": 6.6475333540179955, "grad_norm": 0.10768990218639374, "learning_rate": 8.443530285695987e-06, "loss": 0.2258, "num_input_tokens_seen": 13067104, "step": 21425 }, { "epoch": 6.649084703692212, "grad_norm": 0.07108980417251587, "learning_rate": 8.442548593640863e-06, "loss": 0.2316, "num_input_tokens_seen": 13069728, "step": 21430 }, { "epoch": 6.650636053366429, "grad_norm": 0.11116702109575272, "learning_rate": 8.44156664920616e-06, "loss": 0.217, "num_input_tokens_seen": 13072256, "step": 21435 }, { "epoch": 6.652187403040646, "grad_norm": 0.11572002619504929, "learning_rate": 8.440584452463868e-06, "loss": 0.2244, "num_input_tokens_seen": 13075584, "step": 21440 }, { "epoch": 6.653738752714862, "grad_norm": 0.1611158698797226, "learning_rate": 8.439602003485993e-06, "loss": 0.2376, "num_input_tokens_seen": 13078592, "step": 21445 }, { "epoch": 6.6552901023890785, "grad_norm": 0.09145054221153259, "learning_rate": 8.438619302344561e-06, "loss": 0.237, "num_input_tokens_seen": 13081088, "step": 21450 }, { "epoch": 6.656841452063295, "grad_norm": 0.10712552070617676, "learning_rate": 8.437636349111614e-06, "loss": 0.2493, "num_input_tokens_seen": 13083200, "step": 21455 }, { "epoch": 6.658392801737512, "grad_norm": 0.11511768400669098, "learning_rate": 8.436653143859215e-06, "loss": 0.2367, "num_input_tokens_seen": 13087168, "step": 21460 }, { "epoch": 6.659944151411728, "grad_norm": 0.05910135805606842, "learning_rate": 8.435669686659444e-06, "loss": 0.2363, "num_input_tokens_seen": 13089472, "step": 21465 }, { "epoch": 6.661495501085945, "grad_norm": 0.18015874922275543, "learning_rate": 8.434685977584402e-06, "loss": 0.2191, "num_input_tokens_seen": 13092064, "step": 21470 }, { "epoch": 6.6630468507601615, "grad_norm": 0.06760507822036743, "learning_rate": 8.433702016706204e-06, "loss": 0.233, "num_input_tokens_seen": 13095296, "step": 21475 }, { "epoch": 6.664598200434378, "grad_norm": 0.0814320296049118, "learning_rate": 8.432717804096987e-06, "loss": 0.2265, "num_input_tokens_seen": 13097920, "step": 21480 }, { "epoch": 6.666149550108594, "grad_norm": 0.13197237253189087, "learning_rate": 8.431733339828906e-06, "loss": 0.2387, "num_input_tokens_seen": 13100864, "step": 21485 }, { "epoch": 6.667700899782811, "grad_norm": 0.12394794076681137, "learning_rate": 8.430748623974134e-06, "loss": 0.2382, "num_input_tokens_seen": 13103520, "step": 21490 }, { "epoch": 6.669252249457028, "grad_norm": 0.13289423286914825, "learning_rate": 8.42976365660486e-06, "loss": 0.2302, "num_input_tokens_seen": 13105952, "step": 21495 }, { "epoch": 6.670803599131244, "grad_norm": 0.12142914533615112, "learning_rate": 8.428778437793296e-06, "loss": 0.2431, "num_input_tokens_seen": 13109152, "step": 21500 }, { "epoch": 6.672354948805461, "grad_norm": 0.07557110488414764, "learning_rate": 8.42779296761167e-06, "loss": 0.2272, "num_input_tokens_seen": 13112544, "step": 21505 }, { "epoch": 6.673906298479677, "grad_norm": 0.11633533984422684, "learning_rate": 8.42680724613223e-06, "loss": 0.225, "num_input_tokens_seen": 13115136, "step": 21510 }, { "epoch": 6.675457648153894, "grad_norm": 0.07035145908594131, "learning_rate": 8.425821273427237e-06, "loss": 0.2257, "num_input_tokens_seen": 13118560, "step": 21515 }, { "epoch": 6.677008997828111, "grad_norm": 0.049528706818819046, "learning_rate": 8.424835049568978e-06, "loss": 0.242, "num_input_tokens_seen": 13121792, "step": 21520 }, { "epoch": 6.678560347502327, "grad_norm": 0.1790798455476761, "learning_rate": 8.423848574629752e-06, "loss": 0.2332, "num_input_tokens_seen": 13124704, "step": 21525 }, { "epoch": 6.680111697176543, "grad_norm": 0.060807809233665466, "learning_rate": 8.422861848681882e-06, "loss": 0.2333, "num_input_tokens_seen": 13127936, "step": 21530 }, { "epoch": 6.68166304685076, "grad_norm": 0.10160978883504868, "learning_rate": 8.421874871797707e-06, "loss": 0.2316, "num_input_tokens_seen": 13131072, "step": 21535 }, { "epoch": 6.683214396524977, "grad_norm": 0.04995379596948624, "learning_rate": 8.420887644049583e-06, "loss": 0.2276, "num_input_tokens_seen": 13135168, "step": 21540 }, { "epoch": 6.684765746199194, "grad_norm": 0.0688939094543457, "learning_rate": 8.419900165509884e-06, "loss": 0.2362, "num_input_tokens_seen": 13137952, "step": 21545 }, { "epoch": 6.6863170958734095, "grad_norm": 0.09036823362112045, "learning_rate": 8.418912436251005e-06, "loss": 0.2337, "num_input_tokens_seen": 13140384, "step": 21550 }, { "epoch": 6.687868445547626, "grad_norm": 0.09833067655563354, "learning_rate": 8.417924456345358e-06, "loss": 0.231, "num_input_tokens_seen": 13142624, "step": 21555 }, { "epoch": 6.689419795221843, "grad_norm": 0.10441354662179947, "learning_rate": 8.416936225865377e-06, "loss": 0.2316, "num_input_tokens_seen": 13144896, "step": 21560 }, { "epoch": 6.69097114489606, "grad_norm": 0.1110130250453949, "learning_rate": 8.415947744883505e-06, "loss": 0.2284, "num_input_tokens_seen": 13148160, "step": 21565 }, { "epoch": 6.692522494570277, "grad_norm": 0.18521147966384888, "learning_rate": 8.414959013472214e-06, "loss": 0.2237, "num_input_tokens_seen": 13150656, "step": 21570 }, { "epoch": 6.6940738442444925, "grad_norm": 0.11423685401678085, "learning_rate": 8.413970031703988e-06, "loss": 0.2315, "num_input_tokens_seen": 13152736, "step": 21575 }, { "epoch": 6.695625193918709, "grad_norm": 0.10273325443267822, "learning_rate": 8.412980799651331e-06, "loss": 0.2296, "num_input_tokens_seen": 13155872, "step": 21580 }, { "epoch": 6.697176543592926, "grad_norm": 0.052304740995168686, "learning_rate": 8.411991317386764e-06, "loss": 0.2373, "num_input_tokens_seen": 13158464, "step": 21585 }, { "epoch": 6.698727893267143, "grad_norm": 0.10622920095920563, "learning_rate": 8.411001584982831e-06, "loss": 0.2284, "num_input_tokens_seen": 13161568, "step": 21590 }, { "epoch": 6.700279242941359, "grad_norm": 0.07454721629619598, "learning_rate": 8.410011602512092e-06, "loss": 0.2238, "num_input_tokens_seen": 13164480, "step": 21595 }, { "epoch": 6.7018305926155755, "grad_norm": 0.17347602546215057, "learning_rate": 8.409021370047118e-06, "loss": 0.2255, "num_input_tokens_seen": 13167072, "step": 21600 }, { "epoch": 6.703381942289792, "grad_norm": 0.05125231295824051, "learning_rate": 8.408030887660512e-06, "loss": 0.2335, "num_input_tokens_seen": 13169536, "step": 21605 }, { "epoch": 6.704933291964009, "grad_norm": 0.0690898671746254, "learning_rate": 8.407040155424881e-06, "loss": 0.2317, "num_input_tokens_seen": 13172480, "step": 21610 }, { "epoch": 6.706484641638225, "grad_norm": 0.10048367828130722, "learning_rate": 8.406049173412865e-06, "loss": 0.2303, "num_input_tokens_seen": 13175360, "step": 21615 }, { "epoch": 6.708035991312442, "grad_norm": 0.06087267026305199, "learning_rate": 8.405057941697108e-06, "loss": 0.2357, "num_input_tokens_seen": 13178336, "step": 21620 }, { "epoch": 6.709587340986658, "grad_norm": 0.12289946526288986, "learning_rate": 8.404066460350282e-06, "loss": 0.232, "num_input_tokens_seen": 13181184, "step": 21625 }, { "epoch": 6.711138690660875, "grad_norm": 0.05686952918767929, "learning_rate": 8.403074729445077e-06, "loss": 0.2356, "num_input_tokens_seen": 13183392, "step": 21630 }, { "epoch": 6.712690040335092, "grad_norm": 0.08084303140640259, "learning_rate": 8.402082749054194e-06, "loss": 0.2331, "num_input_tokens_seen": 13186432, "step": 21635 }, { "epoch": 6.714241390009308, "grad_norm": 0.055280666798353195, "learning_rate": 8.40109051925036e-06, "loss": 0.2326, "num_input_tokens_seen": 13190336, "step": 21640 }, { "epoch": 6.715792739683525, "grad_norm": 0.1012946143746376, "learning_rate": 8.400098040106314e-06, "loss": 0.2332, "num_input_tokens_seen": 13193120, "step": 21645 }, { "epoch": 6.717344089357741, "grad_norm": 0.11640162765979767, "learning_rate": 8.39910531169482e-06, "loss": 0.227, "num_input_tokens_seen": 13195520, "step": 21650 }, { "epoch": 6.718895439031958, "grad_norm": 0.16358216106891632, "learning_rate": 8.398112334088656e-06, "loss": 0.2275, "num_input_tokens_seen": 13200896, "step": 21655 }, { "epoch": 6.720446788706174, "grad_norm": 0.14390404522418976, "learning_rate": 8.397119107360616e-06, "loss": 0.2284, "num_input_tokens_seen": 13203104, "step": 21660 }, { "epoch": 6.721998138380391, "grad_norm": 0.06168379634618759, "learning_rate": 8.396125631583518e-06, "loss": 0.234, "num_input_tokens_seen": 13206208, "step": 21665 }, { "epoch": 6.723549488054608, "grad_norm": 0.07821465283632278, "learning_rate": 8.395131906830195e-06, "loss": 0.2287, "num_input_tokens_seen": 13208992, "step": 21670 }, { "epoch": 6.725100837728824, "grad_norm": 0.21872732043266296, "learning_rate": 8.3941379331735e-06, "loss": 0.2241, "num_input_tokens_seen": 13212256, "step": 21675 }, { "epoch": 6.72665218740304, "grad_norm": 0.08255937695503235, "learning_rate": 8.393143710686303e-06, "loss": 0.2122, "num_input_tokens_seen": 13214912, "step": 21680 }, { "epoch": 6.728203537077257, "grad_norm": 0.12896689772605896, "learning_rate": 8.392149239441489e-06, "loss": 0.2339, "num_input_tokens_seen": 13219040, "step": 21685 }, { "epoch": 6.729754886751474, "grad_norm": 0.14143966138362885, "learning_rate": 8.391154519511965e-06, "loss": 0.2549, "num_input_tokens_seen": 13221440, "step": 21690 }, { "epoch": 6.731306236425691, "grad_norm": 0.07541459053754807, "learning_rate": 8.39015955097066e-06, "loss": 0.2279, "num_input_tokens_seen": 13224096, "step": 21695 }, { "epoch": 6.732857586099907, "grad_norm": 0.16266773641109467, "learning_rate": 8.389164333890516e-06, "loss": 0.2261, "num_input_tokens_seen": 13227776, "step": 21700 }, { "epoch": 6.734408935774123, "grad_norm": 0.11031558364629745, "learning_rate": 8.38816886834449e-06, "loss": 0.2234, "num_input_tokens_seen": 13231040, "step": 21705 }, { "epoch": 6.73596028544834, "grad_norm": 0.10580185055732727, "learning_rate": 8.387173154405564e-06, "loss": 0.2295, "num_input_tokens_seen": 13233728, "step": 21710 }, { "epoch": 6.737511635122557, "grad_norm": 0.11665305495262146, "learning_rate": 8.386177192146737e-06, "loss": 0.2242, "num_input_tokens_seen": 13236192, "step": 21715 }, { "epoch": 6.739062984796774, "grad_norm": 0.15177442133426666, "learning_rate": 8.385180981641024e-06, "loss": 0.2349, "num_input_tokens_seen": 13239392, "step": 21720 }, { "epoch": 6.7406143344709895, "grad_norm": 0.1387421190738678, "learning_rate": 8.384184522961457e-06, "loss": 0.2396, "num_input_tokens_seen": 13244032, "step": 21725 }, { "epoch": 6.742165684145206, "grad_norm": 0.16933061182498932, "learning_rate": 8.38318781618109e-06, "loss": 0.2374, "num_input_tokens_seen": 13247040, "step": 21730 }, { "epoch": 6.743717033819423, "grad_norm": 0.1097886711359024, "learning_rate": 8.382190861372992e-06, "loss": 0.2321, "num_input_tokens_seen": 13249792, "step": 21735 }, { "epoch": 6.74526838349364, "grad_norm": 0.19769588112831116, "learning_rate": 8.381193658610254e-06, "loss": 0.2315, "num_input_tokens_seen": 13252192, "step": 21740 }, { "epoch": 6.746819733167856, "grad_norm": 0.08594612777233124, "learning_rate": 8.380196207965983e-06, "loss": 0.2231, "num_input_tokens_seen": 13256224, "step": 21745 }, { "epoch": 6.748371082842072, "grad_norm": 0.17931273579597473, "learning_rate": 8.3791985095133e-06, "loss": 0.2283, "num_input_tokens_seen": 13259104, "step": 21750 }, { "epoch": 6.749922432516289, "grad_norm": 0.08198849111795425, "learning_rate": 8.378200563325352e-06, "loss": 0.2327, "num_input_tokens_seen": 13262720, "step": 21755 }, { "epoch": 6.751473782190506, "grad_norm": 0.12854571640491486, "learning_rate": 8.377202369475298e-06, "loss": 0.2378, "num_input_tokens_seen": 13265024, "step": 21760 }, { "epoch": 6.753025131864723, "grad_norm": 0.054634612053632736, "learning_rate": 8.376203928036318e-06, "loss": 0.2329, "num_input_tokens_seen": 13267520, "step": 21765 }, { "epoch": 6.754576481538939, "grad_norm": 0.12026183307170868, "learning_rate": 8.375205239081612e-06, "loss": 0.2337, "num_input_tokens_seen": 13271200, "step": 21770 }, { "epoch": 6.756127831213155, "grad_norm": 0.12019935995340347, "learning_rate": 8.374206302684391e-06, "loss": 0.231, "num_input_tokens_seen": 13273984, "step": 21775 }, { "epoch": 6.757679180887372, "grad_norm": 0.18629729747772217, "learning_rate": 8.373207118917892e-06, "loss": 0.2357, "num_input_tokens_seen": 13276160, "step": 21780 }, { "epoch": 6.759230530561589, "grad_norm": 0.12970572710037231, "learning_rate": 8.372207687855367e-06, "loss": 0.2299, "num_input_tokens_seen": 13278080, "step": 21785 }, { "epoch": 6.760781880235805, "grad_norm": 0.11645407229661942, "learning_rate": 8.371208009570084e-06, "loss": 0.2293, "num_input_tokens_seen": 13280768, "step": 21790 }, { "epoch": 6.762333229910022, "grad_norm": 0.13873103260993958, "learning_rate": 8.370208084135336e-06, "loss": 0.2309, "num_input_tokens_seen": 13283712, "step": 21795 }, { "epoch": 6.763884579584238, "grad_norm": 0.06119954213500023, "learning_rate": 8.369207911624424e-06, "loss": 0.2304, "num_input_tokens_seen": 13286304, "step": 21800 }, { "epoch": 6.765435929258455, "grad_norm": 0.05956227704882622, "learning_rate": 8.368207492110674e-06, "loss": 0.2337, "num_input_tokens_seen": 13289024, "step": 21805 }, { "epoch": 6.766987278932671, "grad_norm": 0.1995398998260498, "learning_rate": 8.36720682566743e-06, "loss": 0.2368, "num_input_tokens_seen": 13291680, "step": 21810 }, { "epoch": 6.768538628606888, "grad_norm": 0.06887682527303696, "learning_rate": 8.366205912368053e-06, "loss": 0.2329, "num_input_tokens_seen": 13294144, "step": 21815 }, { "epoch": 6.770089978281105, "grad_norm": 0.08014547824859619, "learning_rate": 8.36520475228592e-06, "loss": 0.2282, "num_input_tokens_seen": 13296640, "step": 21820 }, { "epoch": 6.771641327955321, "grad_norm": 0.08304157853126526, "learning_rate": 8.36420334549443e-06, "loss": 0.2268, "num_input_tokens_seen": 13299232, "step": 21825 }, { "epoch": 6.773192677629538, "grad_norm": 0.1870688796043396, "learning_rate": 8.363201692066995e-06, "loss": 0.2351, "num_input_tokens_seen": 13301728, "step": 21830 }, { "epoch": 6.774744027303754, "grad_norm": 0.15248048305511475, "learning_rate": 8.36219979207705e-06, "loss": 0.2388, "num_input_tokens_seen": 13305760, "step": 21835 }, { "epoch": 6.776295376977971, "grad_norm": 0.05867208540439606, "learning_rate": 8.361197645598045e-06, "loss": 0.2239, "num_input_tokens_seen": 13309056, "step": 21840 }, { "epoch": 6.777846726652188, "grad_norm": 0.1820845901966095, "learning_rate": 8.360195252703452e-06, "loss": 0.2309, "num_input_tokens_seen": 13312064, "step": 21845 }, { "epoch": 6.779398076326404, "grad_norm": 0.06800901144742966, "learning_rate": 8.359192613466756e-06, "loss": 0.2326, "num_input_tokens_seen": 13314656, "step": 21850 }, { "epoch": 6.78094942600062, "grad_norm": 0.11213410645723343, "learning_rate": 8.35818972796146e-06, "loss": 0.2383, "num_input_tokens_seen": 13317280, "step": 21855 }, { "epoch": 6.782500775674837, "grad_norm": 0.11599040031433105, "learning_rate": 8.357186596261093e-06, "loss": 0.2255, "num_input_tokens_seen": 13320320, "step": 21860 }, { "epoch": 6.784052125349054, "grad_norm": 0.1371438056230545, "learning_rate": 8.356183218439194e-06, "loss": 0.2315, "num_input_tokens_seen": 13322912, "step": 21865 }, { "epoch": 6.7856034750232705, "grad_norm": 0.07354474067687988, "learning_rate": 8.35517959456932e-06, "loss": 0.2331, "num_input_tokens_seen": 13325216, "step": 21870 }, { "epoch": 6.787154824697486, "grad_norm": 0.09289968013763428, "learning_rate": 8.354175724725051e-06, "loss": 0.2346, "num_input_tokens_seen": 13327968, "step": 21875 }, { "epoch": 6.788706174371703, "grad_norm": 0.05372505635023117, "learning_rate": 8.353171608979983e-06, "loss": 0.2299, "num_input_tokens_seen": 13330688, "step": 21880 }, { "epoch": 6.79025752404592, "grad_norm": 0.06379549205303192, "learning_rate": 8.352167247407725e-06, "loss": 0.2281, "num_input_tokens_seen": 13333888, "step": 21885 }, { "epoch": 6.791808873720137, "grad_norm": 0.06649917364120483, "learning_rate": 8.351162640081915e-06, "loss": 0.2364, "num_input_tokens_seen": 13336736, "step": 21890 }, { "epoch": 6.7933602233943535, "grad_norm": 0.1803397685289383, "learning_rate": 8.3501577870762e-06, "loss": 0.2296, "num_input_tokens_seen": 13340000, "step": 21895 }, { "epoch": 6.794911573068569, "grad_norm": 0.04927438870072365, "learning_rate": 8.349152688464246e-06, "loss": 0.2277, "num_input_tokens_seen": 13343168, "step": 21900 }, { "epoch": 6.796462922742786, "grad_norm": 0.11777555197477341, "learning_rate": 8.348147344319741e-06, "loss": 0.2305, "num_input_tokens_seen": 13346816, "step": 21905 }, { "epoch": 6.798014272417003, "grad_norm": 0.13006073236465454, "learning_rate": 8.347141754716387e-06, "loss": 0.2329, "num_input_tokens_seen": 13349536, "step": 21910 }, { "epoch": 6.79956562209122, "grad_norm": 0.047407735139131546, "learning_rate": 8.346135919727909e-06, "loss": 0.237, "num_input_tokens_seen": 13351904, "step": 21915 }, { "epoch": 6.801116971765436, "grad_norm": 0.05938941240310669, "learning_rate": 8.345129839428042e-06, "loss": 0.2337, "num_input_tokens_seen": 13354944, "step": 21920 }, { "epoch": 6.802668321439652, "grad_norm": 0.10709451138973236, "learning_rate": 8.344123513890544e-06, "loss": 0.2311, "num_input_tokens_seen": 13357472, "step": 21925 }, { "epoch": 6.804219671113869, "grad_norm": 0.09692110121250153, "learning_rate": 8.343116943189193e-06, "loss": 0.2336, "num_input_tokens_seen": 13359968, "step": 21930 }, { "epoch": 6.805771020788086, "grad_norm": 0.10683682560920715, "learning_rate": 8.342110127397782e-06, "loss": 0.231, "num_input_tokens_seen": 13363360, "step": 21935 }, { "epoch": 6.807322370462302, "grad_norm": 0.10183651745319366, "learning_rate": 8.341103066590122e-06, "loss": 0.2314, "num_input_tokens_seen": 13366976, "step": 21940 }, { "epoch": 6.808873720136519, "grad_norm": 0.04871036484837532, "learning_rate": 8.340095760840043e-06, "loss": 0.2362, "num_input_tokens_seen": 13370432, "step": 21945 }, { "epoch": 6.810425069810735, "grad_norm": 0.10978272557258606, "learning_rate": 8.33908821022139e-06, "loss": 0.2279, "num_input_tokens_seen": 13372576, "step": 21950 }, { "epoch": 6.811976419484952, "grad_norm": 0.19672027230262756, "learning_rate": 8.338080414808034e-06, "loss": 0.2286, "num_input_tokens_seen": 13375584, "step": 21955 }, { "epoch": 6.813527769159169, "grad_norm": 0.10272826254367828, "learning_rate": 8.337072374673852e-06, "loss": 0.2309, "num_input_tokens_seen": 13378432, "step": 21960 }, { "epoch": 6.815079118833385, "grad_norm": 0.19594669342041016, "learning_rate": 8.33606408989275e-06, "loss": 0.2317, "num_input_tokens_seen": 13380896, "step": 21965 }, { "epoch": 6.816630468507602, "grad_norm": 0.10598994046449661, "learning_rate": 8.335055560538645e-06, "loss": 0.2321, "num_input_tokens_seen": 13383424, "step": 21970 }, { "epoch": 6.818181818181818, "grad_norm": 0.10358614474534988, "learning_rate": 8.334046786685473e-06, "loss": 0.2306, "num_input_tokens_seen": 13385856, "step": 21975 }, { "epoch": 6.819733167856035, "grad_norm": 0.10857242345809937, "learning_rate": 8.333037768407191e-06, "loss": 0.2291, "num_input_tokens_seen": 13389280, "step": 21980 }, { "epoch": 6.821284517530251, "grad_norm": 0.10004650056362152, "learning_rate": 8.332028505777773e-06, "loss": 0.2289, "num_input_tokens_seen": 13392352, "step": 21985 }, { "epoch": 6.822835867204468, "grad_norm": 0.08194849640130997, "learning_rate": 8.331018998871207e-06, "loss": 0.2322, "num_input_tokens_seen": 13395648, "step": 21990 }, { "epoch": 6.8243872168786845, "grad_norm": 0.11560280621051788, "learning_rate": 8.330009247761504e-06, "loss": 0.2338, "num_input_tokens_seen": 13398656, "step": 21995 }, { "epoch": 6.825938566552901, "grad_norm": 0.13131429255008698, "learning_rate": 8.32899925252269e-06, "loss": 0.2279, "num_input_tokens_seen": 13402784, "step": 22000 }, { "epoch": 6.827489916227117, "grad_norm": 0.1151236817240715, "learning_rate": 8.327989013228807e-06, "loss": 0.2336, "num_input_tokens_seen": 13405920, "step": 22005 }, { "epoch": 6.829041265901334, "grad_norm": 0.06085268035531044, "learning_rate": 8.326978529953924e-06, "loss": 0.2361, "num_input_tokens_seen": 13408544, "step": 22010 }, { "epoch": 6.830592615575551, "grad_norm": 0.11653976887464523, "learning_rate": 8.325967802772114e-06, "loss": 0.2311, "num_input_tokens_seen": 13411200, "step": 22015 }, { "epoch": 6.8321439652497675, "grad_norm": 0.10768170654773712, "learning_rate": 8.324956831757481e-06, "loss": 0.2321, "num_input_tokens_seen": 13414176, "step": 22020 }, { "epoch": 6.833695314923984, "grad_norm": 0.08443307876586914, "learning_rate": 8.323945616984138e-06, "loss": 0.2321, "num_input_tokens_seen": 13416832, "step": 22025 }, { "epoch": 6.8352466645982, "grad_norm": 0.12572187185287476, "learning_rate": 8.32293415852622e-06, "loss": 0.2274, "num_input_tokens_seen": 13420608, "step": 22030 }, { "epoch": 6.836798014272417, "grad_norm": 0.09782375395298004, "learning_rate": 8.321922456457879e-06, "loss": 0.2316, "num_input_tokens_seen": 13423616, "step": 22035 }, { "epoch": 6.838349363946634, "grad_norm": 0.09361493587493896, "learning_rate": 8.320910510853285e-06, "loss": 0.2311, "num_input_tokens_seen": 13426208, "step": 22040 }, { "epoch": 6.8399007136208505, "grad_norm": 0.06822200119495392, "learning_rate": 8.319898321786623e-06, "loss": 0.2333, "num_input_tokens_seen": 13429280, "step": 22045 }, { "epoch": 6.841452063295066, "grad_norm": 0.052119895815849304, "learning_rate": 8.318885889332102e-06, "loss": 0.2301, "num_input_tokens_seen": 13431488, "step": 22050 }, { "epoch": 6.843003412969283, "grad_norm": 0.10417529195547104, "learning_rate": 8.317873213563943e-06, "loss": 0.2375, "num_input_tokens_seen": 13435008, "step": 22055 }, { "epoch": 6.8445547626435, "grad_norm": 0.107091024518013, "learning_rate": 8.316860294556389e-06, "loss": 0.2337, "num_input_tokens_seen": 13440448, "step": 22060 }, { "epoch": 6.846106112317717, "grad_norm": 0.06790994107723236, "learning_rate": 8.315847132383697e-06, "loss": 0.2322, "num_input_tokens_seen": 13442944, "step": 22065 }, { "epoch": 6.847657461991933, "grad_norm": 0.06958653032779694, "learning_rate": 8.314833727120147e-06, "loss": 0.2295, "num_input_tokens_seen": 13445984, "step": 22070 }, { "epoch": 6.849208811666149, "grad_norm": 0.10636153817176819, "learning_rate": 8.313820078840029e-06, "loss": 0.2326, "num_input_tokens_seen": 13448480, "step": 22075 }, { "epoch": 6.850760161340366, "grad_norm": 0.11872536689043045, "learning_rate": 8.312806187617656e-06, "loss": 0.2321, "num_input_tokens_seen": 13454720, "step": 22080 }, { "epoch": 6.852311511014583, "grad_norm": 0.10613558441400528, "learning_rate": 8.311792053527363e-06, "loss": 0.2311, "num_input_tokens_seen": 13457632, "step": 22085 }, { "epoch": 6.8538628606888, "grad_norm": 0.10911788791418076, "learning_rate": 8.310777676643494e-06, "loss": 0.229, "num_input_tokens_seen": 13460160, "step": 22090 }, { "epoch": 6.855414210363016, "grad_norm": 0.18353883922100067, "learning_rate": 8.309763057040417e-06, "loss": 0.2306, "num_input_tokens_seen": 13463296, "step": 22095 }, { "epoch": 6.856965560037232, "grad_norm": 0.05083546042442322, "learning_rate": 8.308748194792513e-06, "loss": 0.2326, "num_input_tokens_seen": 13466976, "step": 22100 }, { "epoch": 6.858516909711449, "grad_norm": 0.10853293538093567, "learning_rate": 8.307733089974185e-06, "loss": 0.2292, "num_input_tokens_seen": 13469888, "step": 22105 }, { "epoch": 6.860068259385666, "grad_norm": 0.10911431163549423, "learning_rate": 8.306717742659853e-06, "loss": 0.23, "num_input_tokens_seen": 13471936, "step": 22110 }, { "epoch": 6.861619609059882, "grad_norm": 0.05185546725988388, "learning_rate": 8.305702152923951e-06, "loss": 0.2279, "num_input_tokens_seen": 13474976, "step": 22115 }, { "epoch": 6.8631709587340985, "grad_norm": 0.10655742883682251, "learning_rate": 8.304686320840937e-06, "loss": 0.2289, "num_input_tokens_seen": 13478080, "step": 22120 }, { "epoch": 6.864722308408315, "grad_norm": 0.06154927611351013, "learning_rate": 8.303670246485284e-06, "loss": 0.2331, "num_input_tokens_seen": 13480960, "step": 22125 }, { "epoch": 6.866273658082532, "grad_norm": 0.09210566431283951, "learning_rate": 8.30265392993148e-06, "loss": 0.2281, "num_input_tokens_seen": 13483744, "step": 22130 }, { "epoch": 6.867825007756748, "grad_norm": 0.0708383098244667, "learning_rate": 8.301637371254032e-06, "loss": 0.23, "num_input_tokens_seen": 13486912, "step": 22135 }, { "epoch": 6.869376357430965, "grad_norm": 0.22143001854419708, "learning_rate": 8.300620570527469e-06, "loss": 0.2354, "num_input_tokens_seen": 13489984, "step": 22140 }, { "epoch": 6.8709277071051815, "grad_norm": 0.12554453313350677, "learning_rate": 8.299603527826332e-06, "loss": 0.2349, "num_input_tokens_seen": 13491872, "step": 22145 }, { "epoch": 6.872479056779398, "grad_norm": 0.12287849932909012, "learning_rate": 8.298586243225183e-06, "loss": 0.2276, "num_input_tokens_seen": 13494208, "step": 22150 }, { "epoch": 6.874030406453615, "grad_norm": 0.12557892501354218, "learning_rate": 8.297568716798602e-06, "loss": 0.2386, "num_input_tokens_seen": 13497152, "step": 22155 }, { "epoch": 6.875581756127831, "grad_norm": 0.05436920374631882, "learning_rate": 8.296550948621184e-06, "loss": 0.2277, "num_input_tokens_seen": 13499904, "step": 22160 }, { "epoch": 6.877133105802048, "grad_norm": 0.07339125126600266, "learning_rate": 8.295532938767547e-06, "loss": 0.2371, "num_input_tokens_seen": 13504096, "step": 22165 }, { "epoch": 6.8786844554762645, "grad_norm": 0.12580250203609467, "learning_rate": 8.294514687312318e-06, "loss": 0.2358, "num_input_tokens_seen": 13506784, "step": 22170 }, { "epoch": 6.880235805150481, "grad_norm": 0.0650414302945137, "learning_rate": 8.293496194330151e-06, "loss": 0.2284, "num_input_tokens_seen": 13510688, "step": 22175 }, { "epoch": 6.881787154824697, "grad_norm": 0.08532337844371796, "learning_rate": 8.292477459895711e-06, "loss": 0.2369, "num_input_tokens_seen": 13513984, "step": 22180 }, { "epoch": 6.883338504498914, "grad_norm": 0.073129802942276, "learning_rate": 8.291458484083685e-06, "loss": 0.2321, "num_input_tokens_seen": 13516960, "step": 22185 }, { "epoch": 6.884889854173131, "grad_norm": 0.13650235533714294, "learning_rate": 8.290439266968776e-06, "loss": 0.2337, "num_input_tokens_seen": 13520576, "step": 22190 }, { "epoch": 6.8864412038473475, "grad_norm": 0.13454321026802063, "learning_rate": 8.289419808625705e-06, "loss": 0.2345, "num_input_tokens_seen": 13524000, "step": 22195 }, { "epoch": 6.887992553521563, "grad_norm": 0.11901681870222092, "learning_rate": 8.288400109129206e-06, "loss": 0.2348, "num_input_tokens_seen": 13527392, "step": 22200 }, { "epoch": 6.88954390319578, "grad_norm": 0.09652326256036758, "learning_rate": 8.28738016855404e-06, "loss": 0.2288, "num_input_tokens_seen": 13530016, "step": 22205 }, { "epoch": 6.891095252869997, "grad_norm": 0.050693951547145844, "learning_rate": 8.286359986974981e-06, "loss": 0.2342, "num_input_tokens_seen": 13533376, "step": 22210 }, { "epoch": 6.892646602544214, "grad_norm": 0.07079517841339111, "learning_rate": 8.285339564466817e-06, "loss": 0.2363, "num_input_tokens_seen": 13536992, "step": 22215 }, { "epoch": 6.8941979522184305, "grad_norm": 0.07229399681091309, "learning_rate": 8.28431890110436e-06, "loss": 0.233, "num_input_tokens_seen": 13539968, "step": 22220 }, { "epoch": 6.895749301892646, "grad_norm": 0.07038450986146927, "learning_rate": 8.283297996962433e-06, "loss": 0.2315, "num_input_tokens_seen": 13544160, "step": 22225 }, { "epoch": 6.897300651566863, "grad_norm": 0.19768761098384857, "learning_rate": 8.282276852115885e-06, "loss": 0.231, "num_input_tokens_seen": 13548096, "step": 22230 }, { "epoch": 6.89885200124108, "grad_norm": 0.11899107694625854, "learning_rate": 8.281255466639575e-06, "loss": 0.2315, "num_input_tokens_seen": 13551136, "step": 22235 }, { "epoch": 6.900403350915297, "grad_norm": 0.11056163907051086, "learning_rate": 8.280233840608383e-06, "loss": 0.2336, "num_input_tokens_seen": 13555072, "step": 22240 }, { "epoch": 6.9019547005895125, "grad_norm": 0.046447791159152985, "learning_rate": 8.279211974097207e-06, "loss": 0.23, "num_input_tokens_seen": 13558880, "step": 22245 }, { "epoch": 6.903506050263729, "grad_norm": 0.08264566212892532, "learning_rate": 8.278189867180964e-06, "loss": 0.2311, "num_input_tokens_seen": 13561856, "step": 22250 }, { "epoch": 6.905057399937946, "grad_norm": 0.04612657427787781, "learning_rate": 8.277167519934582e-06, "loss": 0.2354, "num_input_tokens_seen": 13564320, "step": 22255 }, { "epoch": 6.906608749612163, "grad_norm": 0.08037065714597702, "learning_rate": 8.276144932433012e-06, "loss": 0.2296, "num_input_tokens_seen": 13567264, "step": 22260 }, { "epoch": 6.908160099286379, "grad_norm": 0.09164397418498993, "learning_rate": 8.275122104751225e-06, "loss": 0.2288, "num_input_tokens_seen": 13571648, "step": 22265 }, { "epoch": 6.9097114489605955, "grad_norm": 0.10247214138507843, "learning_rate": 8.274099036964203e-06, "loss": 0.2359, "num_input_tokens_seen": 13575744, "step": 22270 }, { "epoch": 6.911262798634812, "grad_norm": 0.04430205374956131, "learning_rate": 8.273075729146951e-06, "loss": 0.2275, "num_input_tokens_seen": 13578560, "step": 22275 }, { "epoch": 6.912814148309029, "grad_norm": 0.11364178359508514, "learning_rate": 8.272052181374491e-06, "loss": 0.2298, "num_input_tokens_seen": 13582176, "step": 22280 }, { "epoch": 6.914365497983246, "grad_norm": 0.09020908921957016, "learning_rate": 8.271028393721857e-06, "loss": 0.2333, "num_input_tokens_seen": 13585664, "step": 22285 }, { "epoch": 6.915916847657462, "grad_norm": 0.10740962624549866, "learning_rate": 8.270004366264107e-06, "loss": 0.2318, "num_input_tokens_seen": 13588320, "step": 22290 }, { "epoch": 6.9174681973316785, "grad_norm": 0.08701559156179428, "learning_rate": 8.268980099076314e-06, "loss": 0.2184, "num_input_tokens_seen": 13591904, "step": 22295 }, { "epoch": 6.919019547005895, "grad_norm": 0.09874583780765533, "learning_rate": 8.26795559223357e-06, "loss": 0.2358, "num_input_tokens_seen": 13596960, "step": 22300 }, { "epoch": 6.920570896680112, "grad_norm": 0.21543800830841064, "learning_rate": 8.266930845810983e-06, "loss": 0.239, "num_input_tokens_seen": 13600928, "step": 22305 }, { "epoch": 6.922122246354328, "grad_norm": 0.05909698083996773, "learning_rate": 8.265905859883679e-06, "loss": 0.2315, "num_input_tokens_seen": 13603808, "step": 22310 }, { "epoch": 6.923673596028545, "grad_norm": 0.09225010871887207, "learning_rate": 8.2648806345268e-06, "loss": 0.228, "num_input_tokens_seen": 13607072, "step": 22315 }, { "epoch": 6.9252249457027615, "grad_norm": 0.050273336470127106, "learning_rate": 8.26385516981551e-06, "loss": 0.2324, "num_input_tokens_seen": 13609504, "step": 22320 }, { "epoch": 6.926776295376978, "grad_norm": 0.10750149190425873, "learning_rate": 8.262829465824986e-06, "loss": 0.2331, "num_input_tokens_seen": 13612608, "step": 22325 }, { "epoch": 6.928327645051194, "grad_norm": 0.17894347012043, "learning_rate": 8.261803522630424e-06, "loss": 0.2255, "num_input_tokens_seen": 13615136, "step": 22330 }, { "epoch": 6.929878994725411, "grad_norm": 0.199244424700737, "learning_rate": 8.260777340307038e-06, "loss": 0.243, "num_input_tokens_seen": 13617792, "step": 22335 }, { "epoch": 6.931430344399628, "grad_norm": 0.048404861241579056, "learning_rate": 8.259750918930061e-06, "loss": 0.2239, "num_input_tokens_seen": 13620640, "step": 22340 }, { "epoch": 6.9329816940738445, "grad_norm": 0.16847175359725952, "learning_rate": 8.258724258574739e-06, "loss": 0.2324, "num_input_tokens_seen": 13623328, "step": 22345 }, { "epoch": 6.934533043748061, "grad_norm": 0.10185298323631287, "learning_rate": 8.257697359316339e-06, "loss": 0.2303, "num_input_tokens_seen": 13627648, "step": 22350 }, { "epoch": 6.936084393422277, "grad_norm": 0.05046912655234337, "learning_rate": 8.256670221230147e-06, "loss": 0.2346, "num_input_tokens_seen": 13630496, "step": 22355 }, { "epoch": 6.937635743096494, "grad_norm": 0.18533729016780853, "learning_rate": 8.255642844391462e-06, "loss": 0.2362, "num_input_tokens_seen": 13633664, "step": 22360 }, { "epoch": 6.939187092770711, "grad_norm": 0.10686282068490982, "learning_rate": 8.254615228875605e-06, "loss": 0.2313, "num_input_tokens_seen": 13636832, "step": 22365 }, { "epoch": 6.940738442444927, "grad_norm": 0.04144636169075966, "learning_rate": 8.253587374757913e-06, "loss": 0.229, "num_input_tokens_seen": 13639200, "step": 22370 }, { "epoch": 6.942289792119143, "grad_norm": 0.10556244850158691, "learning_rate": 8.252559282113734e-06, "loss": 0.2311, "num_input_tokens_seen": 13641312, "step": 22375 }, { "epoch": 6.94384114179336, "grad_norm": 0.0521392896771431, "learning_rate": 8.251530951018447e-06, "loss": 0.2291, "num_input_tokens_seen": 13643840, "step": 22380 }, { "epoch": 6.945392491467577, "grad_norm": 0.09224564582109451, "learning_rate": 8.250502381547437e-06, "loss": 0.2336, "num_input_tokens_seen": 13646656, "step": 22385 }, { "epoch": 6.946943841141794, "grad_norm": 0.045493509620428085, "learning_rate": 8.249473573776108e-06, "loss": 0.2333, "num_input_tokens_seen": 13650464, "step": 22390 }, { "epoch": 6.9484951908160095, "grad_norm": 0.053550835698843, "learning_rate": 8.248444527779888e-06, "loss": 0.2319, "num_input_tokens_seen": 13653280, "step": 22395 }, { "epoch": 6.950046540490226, "grad_norm": 0.09758757054805756, "learning_rate": 8.247415243634219e-06, "loss": 0.2332, "num_input_tokens_seen": 13655488, "step": 22400 }, { "epoch": 6.951597890164443, "grad_norm": 0.04875803738832474, "learning_rate": 8.246385721414555e-06, "loss": 0.2304, "num_input_tokens_seen": 13657920, "step": 22405 }, { "epoch": 6.95314923983866, "grad_norm": 0.11023098975419998, "learning_rate": 8.245355961196376e-06, "loss": 0.2315, "num_input_tokens_seen": 13661216, "step": 22410 }, { "epoch": 6.954700589512877, "grad_norm": 0.09199736267328262, "learning_rate": 8.244325963055173e-06, "loss": 0.2253, "num_input_tokens_seen": 13663552, "step": 22415 }, { "epoch": 6.9562519391870925, "grad_norm": 0.10396379232406616, "learning_rate": 8.243295727066459e-06, "loss": 0.2351, "num_input_tokens_seen": 13667072, "step": 22420 }, { "epoch": 6.957803288861309, "grad_norm": 0.131223663687706, "learning_rate": 8.242265253305762e-06, "loss": 0.2305, "num_input_tokens_seen": 13670080, "step": 22425 }, { "epoch": 6.959354638535526, "grad_norm": 0.044596560299396515, "learning_rate": 8.241234541848628e-06, "loss": 0.2325, "num_input_tokens_seen": 13673152, "step": 22430 }, { "epoch": 6.960905988209743, "grad_norm": 0.11362894624471664, "learning_rate": 8.240203592770619e-06, "loss": 0.2335, "num_input_tokens_seen": 13675968, "step": 22435 }, { "epoch": 6.962457337883959, "grad_norm": 0.09537527710199356, "learning_rate": 8.239172406147318e-06, "loss": 0.2263, "num_input_tokens_seen": 13679584, "step": 22440 }, { "epoch": 6.9640086875581755, "grad_norm": 0.11023809015750885, "learning_rate": 8.238140982054322e-06, "loss": 0.2361, "num_input_tokens_seen": 13682880, "step": 22445 }, { "epoch": 6.965560037232392, "grad_norm": 0.09573293477296829, "learning_rate": 8.237109320567245e-06, "loss": 0.2298, "num_input_tokens_seen": 13685024, "step": 22450 }, { "epoch": 6.967111386906609, "grad_norm": 0.05881189554929733, "learning_rate": 8.236077421761724e-06, "loss": 0.2268, "num_input_tokens_seen": 13688544, "step": 22455 }, { "epoch": 6.968662736580825, "grad_norm": 0.0944114625453949, "learning_rate": 8.235045285713405e-06, "loss": 0.2253, "num_input_tokens_seen": 13690816, "step": 22460 }, { "epoch": 6.970214086255042, "grad_norm": 0.03386129438877106, "learning_rate": 8.23401291249796e-06, "loss": 0.225, "num_input_tokens_seen": 13693696, "step": 22465 }, { "epoch": 6.9717654359292585, "grad_norm": 0.18328985571861267, "learning_rate": 8.232980302191072e-06, "loss": 0.2347, "num_input_tokens_seen": 13698848, "step": 22470 }, { "epoch": 6.973316785603475, "grad_norm": 0.11707919090986252, "learning_rate": 8.231947454868441e-06, "loss": 0.2377, "num_input_tokens_seen": 13701152, "step": 22475 }, { "epoch": 6.974868135277692, "grad_norm": 0.06485028564929962, "learning_rate": 8.23091437060579e-06, "loss": 0.2326, "num_input_tokens_seen": 13703328, "step": 22480 }, { "epoch": 6.976419484951908, "grad_norm": 0.1984807401895523, "learning_rate": 8.229881049478859e-06, "loss": 0.2308, "num_input_tokens_seen": 13707104, "step": 22485 }, { "epoch": 6.977970834626125, "grad_norm": 0.06877149641513824, "learning_rate": 8.2288474915634e-06, "loss": 0.233, "num_input_tokens_seen": 13710368, "step": 22490 }, { "epoch": 6.979522184300341, "grad_norm": 0.11976210027933121, "learning_rate": 8.22781369693518e-06, "loss": 0.2341, "num_input_tokens_seen": 13714592, "step": 22495 }, { "epoch": 6.981073533974558, "grad_norm": 0.11479309946298599, "learning_rate": 8.226779665669995e-06, "loss": 0.2316, "num_input_tokens_seen": 13716704, "step": 22500 }, { "epoch": 6.982624883648774, "grad_norm": 0.19172732532024384, "learning_rate": 8.22574539784365e-06, "loss": 0.2248, "num_input_tokens_seen": 13720000, "step": 22505 }, { "epoch": 6.984176233322991, "grad_norm": 0.09830715507268906, "learning_rate": 8.22471089353197e-06, "loss": 0.2268, "num_input_tokens_seen": 13722592, "step": 22510 }, { "epoch": 6.985727582997208, "grad_norm": 0.11004286259412766, "learning_rate": 8.223676152810793e-06, "loss": 0.226, "num_input_tokens_seen": 13725504, "step": 22515 }, { "epoch": 6.987278932671424, "grad_norm": 0.14353665709495544, "learning_rate": 8.222641175755983e-06, "loss": 0.2346, "num_input_tokens_seen": 13728224, "step": 22520 }, { "epoch": 6.98883028234564, "grad_norm": 0.1010361760854721, "learning_rate": 8.22160596244341e-06, "loss": 0.23, "num_input_tokens_seen": 13730752, "step": 22525 }, { "epoch": 6.990381632019857, "grad_norm": 0.11369616538286209, "learning_rate": 8.22057051294897e-06, "loss": 0.2307, "num_input_tokens_seen": 13734272, "step": 22530 }, { "epoch": 6.991932981694074, "grad_norm": 0.10368967056274414, "learning_rate": 8.219534827348577e-06, "loss": 0.2338, "num_input_tokens_seen": 13737440, "step": 22535 }, { "epoch": 6.993484331368291, "grad_norm": 0.08868639171123505, "learning_rate": 8.218498905718155e-06, "loss": 0.2308, "num_input_tokens_seen": 13740256, "step": 22540 }, { "epoch": 6.995035681042507, "grad_norm": 0.1367989480495453, "learning_rate": 8.217462748133651e-06, "loss": 0.2269, "num_input_tokens_seen": 13744224, "step": 22545 }, { "epoch": 6.996587030716723, "grad_norm": 0.05850599706172943, "learning_rate": 8.216426354671026e-06, "loss": 0.2358, "num_input_tokens_seen": 13748448, "step": 22550 }, { "epoch": 6.99813838039094, "grad_norm": 0.0687355101108551, "learning_rate": 8.21538972540626e-06, "loss": 0.2253, "num_input_tokens_seen": 13750912, "step": 22555 }, { "epoch": 6.999689730065157, "grad_norm": 0.08411046117544174, "learning_rate": 8.214352860415354e-06, "loss": 0.2286, "num_input_tokens_seen": 13754240, "step": 22560 }, { "epoch": 7.001241079739374, "grad_norm": 0.13502196967601776, "learning_rate": 8.21331575977432e-06, "loss": 0.238, "num_input_tokens_seen": 13757088, "step": 22565 }, { "epoch": 7.0027924294135895, "grad_norm": 0.05594172701239586, "learning_rate": 8.212278423559185e-06, "loss": 0.2264, "num_input_tokens_seen": 13761792, "step": 22570 }, { "epoch": 7.004343779087806, "grad_norm": 0.14254851639270782, "learning_rate": 8.211240851846007e-06, "loss": 0.2295, "num_input_tokens_seen": 13764512, "step": 22575 }, { "epoch": 7.005895128762023, "grad_norm": 0.14219215512275696, "learning_rate": 8.210203044710846e-06, "loss": 0.2294, "num_input_tokens_seen": 13767552, "step": 22580 }, { "epoch": 7.00744647843624, "grad_norm": 0.1066107451915741, "learning_rate": 8.209165002229786e-06, "loss": 0.2258, "num_input_tokens_seen": 13770240, "step": 22585 }, { "epoch": 7.008997828110456, "grad_norm": 0.12412611395120621, "learning_rate": 8.208126724478931e-06, "loss": 0.2328, "num_input_tokens_seen": 13773824, "step": 22590 }, { "epoch": 7.0105491777846725, "grad_norm": 0.189190074801445, "learning_rate": 8.207088211534397e-06, "loss": 0.2269, "num_input_tokens_seen": 13777568, "step": 22595 }, { "epoch": 7.012100527458889, "grad_norm": 0.1995369791984558, "learning_rate": 8.206049463472319e-06, "loss": 0.2251, "num_input_tokens_seen": 13780608, "step": 22600 }, { "epoch": 7.013651877133106, "grad_norm": 0.13965961337089539, "learning_rate": 8.205010480368852e-06, "loss": 0.2357, "num_input_tokens_seen": 13784064, "step": 22605 }, { "epoch": 7.015203226807323, "grad_norm": 0.1558452844619751, "learning_rate": 8.203971262300161e-06, "loss": 0.2371, "num_input_tokens_seen": 13788320, "step": 22610 }, { "epoch": 7.016754576481539, "grad_norm": 0.07656777650117874, "learning_rate": 8.202931809342436e-06, "loss": 0.2252, "num_input_tokens_seen": 13791328, "step": 22615 }, { "epoch": 7.018305926155755, "grad_norm": 0.11944307386875153, "learning_rate": 8.201892121571881e-06, "loss": 0.2297, "num_input_tokens_seen": 13793856, "step": 22620 }, { "epoch": 7.019857275829972, "grad_norm": 0.10965078324079514, "learning_rate": 8.20085219906472e-06, "loss": 0.2285, "num_input_tokens_seen": 13796960, "step": 22625 }, { "epoch": 7.021408625504189, "grad_norm": 0.28119152784347534, "learning_rate": 8.199812041897186e-06, "loss": 0.2324, "num_input_tokens_seen": 13800000, "step": 22630 }, { "epoch": 7.022959975178405, "grad_norm": 0.08472487330436707, "learning_rate": 8.19877165014554e-06, "loss": 0.2337, "num_input_tokens_seen": 13803168, "step": 22635 }, { "epoch": 7.024511324852622, "grad_norm": 0.16612766683101654, "learning_rate": 8.197731023886052e-06, "loss": 0.2331, "num_input_tokens_seen": 13806048, "step": 22640 }, { "epoch": 7.026062674526838, "grad_norm": 0.2819008529186249, "learning_rate": 8.196690163195014e-06, "loss": 0.2276, "num_input_tokens_seen": 13809280, "step": 22645 }, { "epoch": 7.027614024201055, "grad_norm": 0.18680135905742645, "learning_rate": 8.19564906814873e-06, "loss": 0.2328, "num_input_tokens_seen": 13813376, "step": 22650 }, { "epoch": 7.029165373875271, "grad_norm": 0.2260255366563797, "learning_rate": 8.19460773882353e-06, "loss": 0.2383, "num_input_tokens_seen": 13816640, "step": 22655 }, { "epoch": 7.030716723549488, "grad_norm": 0.15803605318069458, "learning_rate": 8.193566175295751e-06, "loss": 0.2227, "num_input_tokens_seen": 13819136, "step": 22660 }, { "epoch": 7.032268073223705, "grad_norm": 0.17893999814987183, "learning_rate": 8.192524377641756e-06, "loss": 0.2414, "num_input_tokens_seen": 13822240, "step": 22665 }, { "epoch": 7.033819422897921, "grad_norm": 0.1608155369758606, "learning_rate": 8.191482345937915e-06, "loss": 0.2209, "num_input_tokens_seen": 13824960, "step": 22670 }, { "epoch": 7.035370772572138, "grad_norm": 0.13488145172595978, "learning_rate": 8.190440080260629e-06, "loss": 0.2462, "num_input_tokens_seen": 13827904, "step": 22675 }, { "epoch": 7.036922122246354, "grad_norm": 0.1320662945508957, "learning_rate": 8.189397580686305e-06, "loss": 0.2336, "num_input_tokens_seen": 13831200, "step": 22680 }, { "epoch": 7.038473471920571, "grad_norm": 0.1545814722776413, "learning_rate": 8.188354847291367e-06, "loss": 0.229, "num_input_tokens_seen": 13834880, "step": 22685 }, { "epoch": 7.040024821594788, "grad_norm": 0.14471600949764252, "learning_rate": 8.187311880152265e-06, "loss": 0.2269, "num_input_tokens_seen": 13838784, "step": 22690 }, { "epoch": 7.041576171269004, "grad_norm": 0.14345411956310272, "learning_rate": 8.18626867934546e-06, "loss": 0.2302, "num_input_tokens_seen": 13843584, "step": 22695 }, { "epoch": 7.04312752094322, "grad_norm": 0.1471157968044281, "learning_rate": 8.185225244947428e-06, "loss": 0.2289, "num_input_tokens_seen": 13846496, "step": 22700 }, { "epoch": 7.044678870617437, "grad_norm": 0.2462843954563141, "learning_rate": 8.184181577034666e-06, "loss": 0.2382, "num_input_tokens_seen": 13849504, "step": 22705 }, { "epoch": 7.046230220291654, "grad_norm": 0.10397522896528244, "learning_rate": 8.18313767568369e-06, "loss": 0.2271, "num_input_tokens_seen": 13852224, "step": 22710 }, { "epoch": 7.047781569965871, "grad_norm": 0.12991668283939362, "learning_rate": 8.182093540971027e-06, "loss": 0.2246, "num_input_tokens_seen": 13854848, "step": 22715 }, { "epoch": 7.0493329196400865, "grad_norm": 0.1671011745929718, "learning_rate": 8.181049172973226e-06, "loss": 0.2281, "num_input_tokens_seen": 13857856, "step": 22720 }, { "epoch": 7.050884269314303, "grad_norm": 0.15865248441696167, "learning_rate": 8.180004571766852e-06, "loss": 0.233, "num_input_tokens_seen": 13860800, "step": 22725 }, { "epoch": 7.05243561898852, "grad_norm": 0.09498219192028046, "learning_rate": 8.178959737428485e-06, "loss": 0.2246, "num_input_tokens_seen": 13864288, "step": 22730 }, { "epoch": 7.053986968662737, "grad_norm": 0.13971839845180511, "learning_rate": 8.177914670034725e-06, "loss": 0.2281, "num_input_tokens_seen": 13867424, "step": 22735 }, { "epoch": 7.0555383183369536, "grad_norm": 0.1744001805782318, "learning_rate": 8.17686936966219e-06, "loss": 0.2342, "num_input_tokens_seen": 13870848, "step": 22740 }, { "epoch": 7.057089668011169, "grad_norm": 0.11455822736024857, "learning_rate": 8.175823836387507e-06, "loss": 0.2282, "num_input_tokens_seen": 13874816, "step": 22745 }, { "epoch": 7.058641017685386, "grad_norm": 0.08291886001825333, "learning_rate": 8.174778070287331e-06, "loss": 0.236, "num_input_tokens_seen": 13877696, "step": 22750 }, { "epoch": 7.060192367359603, "grad_norm": 0.15327440202236176, "learning_rate": 8.173732071438327e-06, "loss": 0.227, "num_input_tokens_seen": 13880640, "step": 22755 }, { "epoch": 7.06174371703382, "grad_norm": 0.1593962162733078, "learning_rate": 8.17268583991718e-06, "loss": 0.2329, "num_input_tokens_seen": 13882912, "step": 22760 }, { "epoch": 7.063295066708036, "grad_norm": 0.22917293012142181, "learning_rate": 8.17163937580059e-06, "loss": 0.2273, "num_input_tokens_seen": 13885696, "step": 22765 }, { "epoch": 7.064846416382252, "grad_norm": 0.19591741263866425, "learning_rate": 8.170592679165277e-06, "loss": 0.2316, "num_input_tokens_seen": 13890272, "step": 22770 }, { "epoch": 7.066397766056469, "grad_norm": 0.14208556711673737, "learning_rate": 8.169545750087975e-06, "loss": 0.2234, "num_input_tokens_seen": 13892768, "step": 22775 }, { "epoch": 7.067949115730686, "grad_norm": 0.21244078874588013, "learning_rate": 8.168498588645436e-06, "loss": 0.2136, "num_input_tokens_seen": 13895712, "step": 22780 }, { "epoch": 7.069500465404902, "grad_norm": 0.27652233839035034, "learning_rate": 8.16745119491443e-06, "loss": 0.2264, "num_input_tokens_seen": 13898624, "step": 22785 }, { "epoch": 7.071051815079119, "grad_norm": 0.8678886890411377, "learning_rate": 8.166403568971743e-06, "loss": 0.232, "num_input_tokens_seen": 13902336, "step": 22790 }, { "epoch": 7.072603164753335, "grad_norm": 0.45588624477386475, "learning_rate": 8.165355710894178e-06, "loss": 0.2367, "num_input_tokens_seen": 13905280, "step": 22795 }, { "epoch": 7.074154514427552, "grad_norm": 0.34615516662597656, "learning_rate": 8.164307620758558e-06, "loss": 0.2449, "num_input_tokens_seen": 13908288, "step": 22800 }, { "epoch": 7.075705864101769, "grad_norm": 0.16550014913082123, "learning_rate": 8.163259298641716e-06, "loss": 0.2306, "num_input_tokens_seen": 13910752, "step": 22805 }, { "epoch": 7.077257213775985, "grad_norm": 0.09412778168916702, "learning_rate": 8.16221074462051e-06, "loss": 0.2305, "num_input_tokens_seen": 13913024, "step": 22810 }, { "epoch": 7.078808563450202, "grad_norm": 0.33017194271087646, "learning_rate": 8.16116195877181e-06, "loss": 0.2331, "num_input_tokens_seen": 13915776, "step": 22815 }, { "epoch": 7.080359913124418, "grad_norm": 0.1770942658185959, "learning_rate": 8.160112941172505e-06, "loss": 0.2257, "num_input_tokens_seen": 13918848, "step": 22820 }, { "epoch": 7.081911262798635, "grad_norm": 0.2526736557483673, "learning_rate": 8.1590636918995e-06, "loss": 0.2353, "num_input_tokens_seen": 13922464, "step": 22825 }, { "epoch": 7.083462612472851, "grad_norm": 0.2668720483779907, "learning_rate": 8.158014211029718e-06, "loss": 0.2333, "num_input_tokens_seen": 13925408, "step": 22830 }, { "epoch": 7.085013962147068, "grad_norm": 0.1601417362689972, "learning_rate": 8.156964498640097e-06, "loss": 0.235, "num_input_tokens_seen": 13928032, "step": 22835 }, { "epoch": 7.086565311821285, "grad_norm": 0.17697958648204803, "learning_rate": 8.155914554807593e-06, "loss": 0.231, "num_input_tokens_seen": 13930176, "step": 22840 }, { "epoch": 7.088116661495501, "grad_norm": 0.20031088590621948, "learning_rate": 8.154864379609183e-06, "loss": 0.2361, "num_input_tokens_seen": 13933312, "step": 22845 }, { "epoch": 7.089668011169717, "grad_norm": 0.1676788330078125, "learning_rate": 8.153813973121854e-06, "loss": 0.2274, "num_input_tokens_seen": 13935392, "step": 22850 }, { "epoch": 7.091219360843934, "grad_norm": 0.07898610830307007, "learning_rate": 8.152763335422612e-06, "loss": 0.228, "num_input_tokens_seen": 13937568, "step": 22855 }, { "epoch": 7.092770710518151, "grad_norm": 0.0755414217710495, "learning_rate": 8.151712466588487e-06, "loss": 0.2343, "num_input_tokens_seen": 13940704, "step": 22860 }, { "epoch": 7.0943220601923676, "grad_norm": 0.16598820686340332, "learning_rate": 8.150661366696514e-06, "loss": 0.2339, "num_input_tokens_seen": 13943712, "step": 22865 }, { "epoch": 7.095873409866584, "grad_norm": 0.14175231754779816, "learning_rate": 8.149610035823752e-06, "loss": 0.2313, "num_input_tokens_seen": 13946752, "step": 22870 }, { "epoch": 7.0974247595408, "grad_norm": 0.18414084613323212, "learning_rate": 8.14855847404728e-06, "loss": 0.235, "num_input_tokens_seen": 13949216, "step": 22875 }, { "epoch": 7.098976109215017, "grad_norm": 0.10468416661024094, "learning_rate": 8.147506681444187e-06, "loss": 0.232, "num_input_tokens_seen": 13952928, "step": 22880 }, { "epoch": 7.100527458889234, "grad_norm": 0.09431234002113342, "learning_rate": 8.146454658091582e-06, "loss": 0.2223, "num_input_tokens_seen": 13955072, "step": 22885 }, { "epoch": 7.1020788085634505, "grad_norm": 0.12005318701267242, "learning_rate": 8.14540240406659e-06, "loss": 0.2326, "num_input_tokens_seen": 13957344, "step": 22890 }, { "epoch": 7.103630158237666, "grad_norm": 0.1293933391571045, "learning_rate": 8.144349919446355e-06, "loss": 0.23, "num_input_tokens_seen": 13960544, "step": 22895 }, { "epoch": 7.105181507911883, "grad_norm": 0.11571291834115982, "learning_rate": 8.143297204308035e-06, "loss": 0.225, "num_input_tokens_seen": 13963840, "step": 22900 }, { "epoch": 7.1067328575861, "grad_norm": 0.1306602507829666, "learning_rate": 8.14224425872881e-06, "loss": 0.2293, "num_input_tokens_seen": 13966528, "step": 22905 }, { "epoch": 7.108284207260317, "grad_norm": 0.209434375166893, "learning_rate": 8.141191082785871e-06, "loss": 0.2268, "num_input_tokens_seen": 13969280, "step": 22910 }, { "epoch": 7.109835556934533, "grad_norm": 0.13142330944538116, "learning_rate": 8.140137676556428e-06, "loss": 0.2352, "num_input_tokens_seen": 13972288, "step": 22915 }, { "epoch": 7.111386906608749, "grad_norm": 0.20672550797462463, "learning_rate": 8.139084040117709e-06, "loss": 0.2315, "num_input_tokens_seen": 13974496, "step": 22920 }, { "epoch": 7.112938256282966, "grad_norm": 0.12362075597047806, "learning_rate": 8.138030173546955e-06, "loss": 0.2253, "num_input_tokens_seen": 13977792, "step": 22925 }, { "epoch": 7.114489605957183, "grad_norm": 0.21183748543262482, "learning_rate": 8.136976076921433e-06, "loss": 0.2399, "num_input_tokens_seen": 13980704, "step": 22930 }, { "epoch": 7.1160409556314, "grad_norm": 0.14014583826065063, "learning_rate": 8.135921750318414e-06, "loss": 0.2249, "num_input_tokens_seen": 13983648, "step": 22935 }, { "epoch": 7.117592305305616, "grad_norm": 0.21554304659366608, "learning_rate": 8.134867193815196e-06, "loss": 0.2331, "num_input_tokens_seen": 13986016, "step": 22940 }, { "epoch": 7.119143654979832, "grad_norm": 0.08344856649637222, "learning_rate": 8.133812407489092e-06, "loss": 0.2302, "num_input_tokens_seen": 13988928, "step": 22945 }, { "epoch": 7.120695004654049, "grad_norm": 0.0810500979423523, "learning_rate": 8.132757391417427e-06, "loss": 0.2315, "num_input_tokens_seen": 13992160, "step": 22950 }, { "epoch": 7.122246354328266, "grad_norm": 0.1010790541768074, "learning_rate": 8.13170214567755e-06, "loss": 0.2243, "num_input_tokens_seen": 13995008, "step": 22955 }, { "epoch": 7.123797704002482, "grad_norm": 0.08091811835765839, "learning_rate": 8.13064667034682e-06, "loss": 0.2339, "num_input_tokens_seen": 13998304, "step": 22960 }, { "epoch": 7.125349053676699, "grad_norm": 0.12274906039237976, "learning_rate": 8.129590965502616e-06, "loss": 0.2262, "num_input_tokens_seen": 14001536, "step": 22965 }, { "epoch": 7.126900403350915, "grad_norm": 0.12585623562335968, "learning_rate": 8.128535031222335e-06, "loss": 0.2316, "num_input_tokens_seen": 14003936, "step": 22970 }, { "epoch": 7.128451753025132, "grad_norm": 0.09223800897598267, "learning_rate": 8.127478867583387e-06, "loss": 0.2272, "num_input_tokens_seen": 14007104, "step": 22975 }, { "epoch": 7.130003102699348, "grad_norm": 0.23012128472328186, "learning_rate": 8.126422474663205e-06, "loss": 0.2403, "num_input_tokens_seen": 14009920, "step": 22980 }, { "epoch": 7.131554452373565, "grad_norm": 0.06691276282072067, "learning_rate": 8.125365852539235e-06, "loss": 0.228, "num_input_tokens_seen": 14012224, "step": 22985 }, { "epoch": 7.1331058020477816, "grad_norm": 0.13954389095306396, "learning_rate": 8.124309001288934e-06, "loss": 0.2339, "num_input_tokens_seen": 14016736, "step": 22990 }, { "epoch": 7.134657151721998, "grad_norm": 0.1598833203315735, "learning_rate": 8.12325192098979e-06, "loss": 0.2351, "num_input_tokens_seen": 14019808, "step": 22995 }, { "epoch": 7.136208501396215, "grad_norm": 0.13477884232997894, "learning_rate": 8.122194611719292e-06, "loss": 0.2239, "num_input_tokens_seen": 14023424, "step": 23000 }, { "epoch": 7.137759851070431, "grad_norm": 0.12929587066173553, "learning_rate": 8.12113707355496e-06, "loss": 0.231, "num_input_tokens_seen": 14027328, "step": 23005 }, { "epoch": 7.139311200744648, "grad_norm": 0.10004126280546188, "learning_rate": 8.120079306574317e-06, "loss": 0.2368, "num_input_tokens_seen": 14030208, "step": 23010 }, { "epoch": 7.1408625504188645, "grad_norm": 0.13711322844028473, "learning_rate": 8.119021310854916e-06, "loss": 0.2339, "num_input_tokens_seen": 14032768, "step": 23015 }, { "epoch": 7.142413900093081, "grad_norm": 0.11857391148805618, "learning_rate": 8.117963086474317e-06, "loss": 0.2347, "num_input_tokens_seen": 14035744, "step": 23020 }, { "epoch": 7.143965249767297, "grad_norm": 0.2150442898273468, "learning_rate": 8.116904633510101e-06, "loss": 0.2296, "num_input_tokens_seen": 14038464, "step": 23025 }, { "epoch": 7.145516599441514, "grad_norm": 0.12187272310256958, "learning_rate": 8.115845952039867e-06, "loss": 0.2342, "num_input_tokens_seen": 14040800, "step": 23030 }, { "epoch": 7.147067949115731, "grad_norm": 0.07553023099899292, "learning_rate": 8.114787042141228e-06, "loss": 0.2284, "num_input_tokens_seen": 14044160, "step": 23035 }, { "epoch": 7.1486192987899475, "grad_norm": 0.04817111790180206, "learning_rate": 8.113727903891813e-06, "loss": 0.2336, "num_input_tokens_seen": 14047936, "step": 23040 }, { "epoch": 7.150170648464163, "grad_norm": 0.07552997767925262, "learning_rate": 8.11266853736927e-06, "loss": 0.2283, "num_input_tokens_seen": 14050240, "step": 23045 }, { "epoch": 7.15172199813838, "grad_norm": 0.11886508017778397, "learning_rate": 8.111608942651265e-06, "loss": 0.2374, "num_input_tokens_seen": 14052736, "step": 23050 }, { "epoch": 7.153273347812597, "grad_norm": 0.10995697975158691, "learning_rate": 8.110549119815477e-06, "loss": 0.2284, "num_input_tokens_seen": 14055136, "step": 23055 }, { "epoch": 7.154824697486814, "grad_norm": 0.1113600954413414, "learning_rate": 8.109489068939604e-06, "loss": 0.2304, "num_input_tokens_seen": 14057664, "step": 23060 }, { "epoch": 7.1563760471610305, "grad_norm": 0.2001040130853653, "learning_rate": 8.10842879010136e-06, "loss": 0.2333, "num_input_tokens_seen": 14060352, "step": 23065 }, { "epoch": 7.157927396835246, "grad_norm": 0.08262412250041962, "learning_rate": 8.107368283378478e-06, "loss": 0.2275, "num_input_tokens_seen": 14065184, "step": 23070 }, { "epoch": 7.159478746509463, "grad_norm": 0.10563035309314728, "learning_rate": 8.106307548848705e-06, "loss": 0.2316, "num_input_tokens_seen": 14067840, "step": 23075 }, { "epoch": 7.16103009618368, "grad_norm": 0.059976976364851, "learning_rate": 8.105246586589802e-06, "loss": 0.2306, "num_input_tokens_seen": 14070368, "step": 23080 }, { "epoch": 7.162581445857897, "grad_norm": 0.1140320673584938, "learning_rate": 8.104185396679553e-06, "loss": 0.2363, "num_input_tokens_seen": 14072928, "step": 23085 }, { "epoch": 7.164132795532113, "grad_norm": 0.12366786599159241, "learning_rate": 8.103123979195755e-06, "loss": 0.2346, "num_input_tokens_seen": 14076096, "step": 23090 }, { "epoch": 7.165684145206329, "grad_norm": 0.20803025364875793, "learning_rate": 8.102062334216224e-06, "loss": 0.2362, "num_input_tokens_seen": 14079904, "step": 23095 }, { "epoch": 7.167235494880546, "grad_norm": 0.07931218296289444, "learning_rate": 8.10100046181879e-06, "loss": 0.2304, "num_input_tokens_seen": 14083296, "step": 23100 }, { "epoch": 7.168786844554763, "grad_norm": 0.11847881227731705, "learning_rate": 8.099938362081298e-06, "loss": 0.2368, "num_input_tokens_seen": 14086144, "step": 23105 }, { "epoch": 7.170338194228979, "grad_norm": 0.10630019754171371, "learning_rate": 8.098876035081618e-06, "loss": 0.2325, "num_input_tokens_seen": 14088384, "step": 23110 }, { "epoch": 7.1718895439031956, "grad_norm": 0.12094983458518982, "learning_rate": 8.09781348089763e-06, "loss": 0.2269, "num_input_tokens_seen": 14091200, "step": 23115 }, { "epoch": 7.173440893577412, "grad_norm": 0.10024885833263397, "learning_rate": 8.096750699607227e-06, "loss": 0.2336, "num_input_tokens_seen": 14094240, "step": 23120 }, { "epoch": 7.174992243251629, "grad_norm": 0.07721079885959625, "learning_rate": 8.095687691288327e-06, "loss": 0.2351, "num_input_tokens_seen": 14097504, "step": 23125 }, { "epoch": 7.176543592925846, "grad_norm": 0.11235567927360535, "learning_rate": 8.094624456018862e-06, "loss": 0.2273, "num_input_tokens_seen": 14100352, "step": 23130 }, { "epoch": 7.178094942600062, "grad_norm": 0.04998767003417015, "learning_rate": 8.093560993876778e-06, "loss": 0.2309, "num_input_tokens_seen": 14102976, "step": 23135 }, { "epoch": 7.1796462922742785, "grad_norm": 0.0670175552368164, "learning_rate": 8.092497304940041e-06, "loss": 0.2283, "num_input_tokens_seen": 14106176, "step": 23140 }, { "epoch": 7.181197641948495, "grad_norm": 0.21284224092960358, "learning_rate": 8.091433389286631e-06, "loss": 0.2203, "num_input_tokens_seen": 14109280, "step": 23145 }, { "epoch": 7.182748991622712, "grad_norm": 0.11417427659034729, "learning_rate": 8.090369246994545e-06, "loss": 0.226, "num_input_tokens_seen": 14111584, "step": 23150 }, { "epoch": 7.184300341296928, "grad_norm": 0.08897099643945694, "learning_rate": 8.089304878141796e-06, "loss": 0.2401, "num_input_tokens_seen": 14113856, "step": 23155 }, { "epoch": 7.185851690971145, "grad_norm": 0.11129165440797806, "learning_rate": 8.08824028280642e-06, "loss": 0.2289, "num_input_tokens_seen": 14116288, "step": 23160 }, { "epoch": 7.1874030406453615, "grad_norm": 0.1073230430483818, "learning_rate": 8.087175461066457e-06, "loss": 0.2324, "num_input_tokens_seen": 14118976, "step": 23165 }, { "epoch": 7.188954390319578, "grad_norm": 0.14099831879138947, "learning_rate": 8.086110412999976e-06, "loss": 0.2305, "num_input_tokens_seen": 14122784, "step": 23170 }, { "epoch": 7.190505739993794, "grad_norm": 0.09975426644086838, "learning_rate": 8.08504513868506e-06, "loss": 0.2302, "num_input_tokens_seen": 14125920, "step": 23175 }, { "epoch": 7.192057089668011, "grad_norm": 0.22998978197574615, "learning_rate": 8.0839796381998e-06, "loss": 0.2302, "num_input_tokens_seen": 14128704, "step": 23180 }, { "epoch": 7.193608439342228, "grad_norm": 0.2009664624929428, "learning_rate": 8.082913911622314e-06, "loss": 0.2264, "num_input_tokens_seen": 14131872, "step": 23185 }, { "epoch": 7.1951597890164445, "grad_norm": 0.07885929942131042, "learning_rate": 8.08184795903073e-06, "loss": 0.2372, "num_input_tokens_seen": 14134464, "step": 23190 }, { "epoch": 7.196711138690661, "grad_norm": 0.05439499393105507, "learning_rate": 8.080781780503197e-06, "loss": 0.2326, "num_input_tokens_seen": 14137472, "step": 23195 }, { "epoch": 7.198262488364877, "grad_norm": 0.05908733978867531, "learning_rate": 8.079715376117876e-06, "loss": 0.2323, "num_input_tokens_seen": 14139456, "step": 23200 }, { "epoch": 7.199813838039094, "grad_norm": 0.09629550576210022, "learning_rate": 8.07864874595295e-06, "loss": 0.2294, "num_input_tokens_seen": 14142144, "step": 23205 }, { "epoch": 7.201365187713311, "grad_norm": 0.20290420949459076, "learning_rate": 8.077581890086614e-06, "loss": 0.2445, "num_input_tokens_seen": 14144896, "step": 23210 }, { "epoch": 7.2029165373875275, "grad_norm": 0.10307296365499496, "learning_rate": 8.076514808597082e-06, "loss": 0.229, "num_input_tokens_seen": 14147488, "step": 23215 }, { "epoch": 7.204467887061743, "grad_norm": 0.1949414759874344, "learning_rate": 8.075447501562583e-06, "loss": 0.232, "num_input_tokens_seen": 14150016, "step": 23220 }, { "epoch": 7.20601923673596, "grad_norm": 0.06341106444597244, "learning_rate": 8.074379969061363e-06, "loss": 0.228, "num_input_tokens_seen": 14152640, "step": 23225 }, { "epoch": 7.207570586410177, "grad_norm": 0.10135286301374435, "learning_rate": 8.073312211171684e-06, "loss": 0.2296, "num_input_tokens_seen": 14156384, "step": 23230 }, { "epoch": 7.209121936084394, "grad_norm": 0.12881359457969666, "learning_rate": 8.072244227971829e-06, "loss": 0.2273, "num_input_tokens_seen": 14159456, "step": 23235 }, { "epoch": 7.2106732857586096, "grad_norm": 0.11086234450340271, "learning_rate": 8.071176019540089e-06, "loss": 0.2311, "num_input_tokens_seen": 14162496, "step": 23240 }, { "epoch": 7.212224635432826, "grad_norm": 0.08702702820301056, "learning_rate": 8.07010758595478e-06, "loss": 0.2364, "num_input_tokens_seen": 14165184, "step": 23245 }, { "epoch": 7.213775985107043, "grad_norm": 0.11853865534067154, "learning_rate": 8.069038927294228e-06, "loss": 0.235, "num_input_tokens_seen": 14167424, "step": 23250 }, { "epoch": 7.21532733478126, "grad_norm": 0.09211153537034988, "learning_rate": 8.067970043636782e-06, "loss": 0.2388, "num_input_tokens_seen": 14170272, "step": 23255 }, { "epoch": 7.216878684455477, "grad_norm": 0.0567324236035347, "learning_rate": 8.0669009350608e-06, "loss": 0.2364, "num_input_tokens_seen": 14172896, "step": 23260 }, { "epoch": 7.2184300341296925, "grad_norm": 0.09997665137052536, "learning_rate": 8.065831601644663e-06, "loss": 0.2319, "num_input_tokens_seen": 14176416, "step": 23265 }, { "epoch": 7.219981383803909, "grad_norm": 0.06116407364606857, "learning_rate": 8.064762043466763e-06, "loss": 0.23, "num_input_tokens_seen": 14179136, "step": 23270 }, { "epoch": 7.221532733478126, "grad_norm": 0.08301162719726562, "learning_rate": 8.063692260605514e-06, "loss": 0.2259, "num_input_tokens_seen": 14183040, "step": 23275 }, { "epoch": 7.223084083152343, "grad_norm": 0.06742019951343536, "learning_rate": 8.062622253139344e-06, "loss": 0.2322, "num_input_tokens_seen": 14185632, "step": 23280 }, { "epoch": 7.224635432826559, "grad_norm": 0.19604648649692535, "learning_rate": 8.061552021146694e-06, "loss": 0.2296, "num_input_tokens_seen": 14188576, "step": 23285 }, { "epoch": 7.2261867825007755, "grad_norm": 0.06623971462249756, "learning_rate": 8.060481564706027e-06, "loss": 0.2325, "num_input_tokens_seen": 14190720, "step": 23290 }, { "epoch": 7.227738132174992, "grad_norm": 0.21362487971782684, "learning_rate": 8.05941088389582e-06, "loss": 0.228, "num_input_tokens_seen": 14193600, "step": 23295 }, { "epoch": 7.229289481849209, "grad_norm": 0.060238663107156754, "learning_rate": 8.058339978794567e-06, "loss": 0.2343, "num_input_tokens_seen": 14195680, "step": 23300 }, { "epoch": 7.230840831523425, "grad_norm": 0.22632570564746857, "learning_rate": 8.057268849480777e-06, "loss": 0.2235, "num_input_tokens_seen": 14198816, "step": 23305 }, { "epoch": 7.232392181197642, "grad_norm": 0.13218605518341064, "learning_rate": 8.056197496032976e-06, "loss": 0.2353, "num_input_tokens_seen": 14201440, "step": 23310 }, { "epoch": 7.2339435308718585, "grad_norm": 0.11419420689344406, "learning_rate": 8.05512591852971e-06, "loss": 0.2383, "num_input_tokens_seen": 14204288, "step": 23315 }, { "epoch": 7.235494880546075, "grad_norm": 0.18319135904312134, "learning_rate": 8.054054117049535e-06, "loss": 0.2219, "num_input_tokens_seen": 14206848, "step": 23320 }, { "epoch": 7.237046230220292, "grad_norm": 0.0692412406206131, "learning_rate": 8.052982091671026e-06, "loss": 0.2359, "num_input_tokens_seen": 14210400, "step": 23325 }, { "epoch": 7.238597579894508, "grad_norm": 0.12239526212215424, "learning_rate": 8.051909842472779e-06, "loss": 0.2329, "num_input_tokens_seen": 14212480, "step": 23330 }, { "epoch": 7.240148929568725, "grad_norm": 0.09095676988363266, "learning_rate": 8.050837369533399e-06, "loss": 0.2259, "num_input_tokens_seen": 14215136, "step": 23335 }, { "epoch": 7.2417002792429415, "grad_norm": 0.11426683515310287, "learning_rate": 8.049764672931515e-06, "loss": 0.2278, "num_input_tokens_seen": 14217984, "step": 23340 }, { "epoch": 7.243251628917158, "grad_norm": 0.12255580723285675, "learning_rate": 8.048691752745763e-06, "loss": 0.2315, "num_input_tokens_seen": 14221632, "step": 23345 }, { "epoch": 7.244802978591374, "grad_norm": 0.15037132799625397, "learning_rate": 8.047618609054805e-06, "loss": 0.2364, "num_input_tokens_seen": 14225632, "step": 23350 }, { "epoch": 7.246354328265591, "grad_norm": 0.1059921383857727, "learning_rate": 8.046545241937314e-06, "loss": 0.2308, "num_input_tokens_seen": 14228128, "step": 23355 }, { "epoch": 7.247905677939808, "grad_norm": 0.12259232252836227, "learning_rate": 8.04547165147198e-06, "loss": 0.2289, "num_input_tokens_seen": 14230944, "step": 23360 }, { "epoch": 7.249457027614024, "grad_norm": 0.08135341107845306, "learning_rate": 8.04439783773751e-06, "loss": 0.232, "num_input_tokens_seen": 14234720, "step": 23365 }, { "epoch": 7.25100837728824, "grad_norm": 0.06193417310714722, "learning_rate": 8.043323800812629e-06, "loss": 0.2342, "num_input_tokens_seen": 14239136, "step": 23370 }, { "epoch": 7.252559726962457, "grad_norm": 0.19633179903030396, "learning_rate": 8.04224954077607e-06, "loss": 0.2315, "num_input_tokens_seen": 14241952, "step": 23375 }, { "epoch": 7.254111076636674, "grad_norm": 0.1879836767911911, "learning_rate": 8.0411750577066e-06, "loss": 0.2247, "num_input_tokens_seen": 14245408, "step": 23380 }, { "epoch": 7.255662426310891, "grad_norm": 0.16501405835151672, "learning_rate": 8.040100351682982e-06, "loss": 0.2378, "num_input_tokens_seen": 14248448, "step": 23385 }, { "epoch": 7.257213775985107, "grad_norm": 0.12566891312599182, "learning_rate": 8.03902542278401e-06, "loss": 0.2347, "num_input_tokens_seen": 14251840, "step": 23390 }, { "epoch": 7.258765125659323, "grad_norm": 0.1264614462852478, "learning_rate": 8.037950271088487e-06, "loss": 0.2356, "num_input_tokens_seen": 14256064, "step": 23395 }, { "epoch": 7.26031647533354, "grad_norm": 0.14089137315750122, "learning_rate": 8.036874896675232e-06, "loss": 0.2332, "num_input_tokens_seen": 14258976, "step": 23400 }, { "epoch": 7.261867825007757, "grad_norm": 0.2209395170211792, "learning_rate": 8.035799299623086e-06, "loss": 0.2307, "num_input_tokens_seen": 14262176, "step": 23405 }, { "epoch": 7.263419174681974, "grad_norm": 0.08498799800872803, "learning_rate": 8.034723480010904e-06, "loss": 0.2319, "num_input_tokens_seen": 14264768, "step": 23410 }, { "epoch": 7.2649705243561895, "grad_norm": 0.10076507925987244, "learning_rate": 8.033647437917552e-06, "loss": 0.2276, "num_input_tokens_seen": 14266976, "step": 23415 }, { "epoch": 7.266521874030406, "grad_norm": 0.1364123374223709, "learning_rate": 8.032571173421921e-06, "loss": 0.2274, "num_input_tokens_seen": 14270912, "step": 23420 }, { "epoch": 7.268073223704623, "grad_norm": 0.12445439398288727, "learning_rate": 8.031494686602911e-06, "loss": 0.2248, "num_input_tokens_seen": 14274240, "step": 23425 }, { "epoch": 7.26962457337884, "grad_norm": 0.06711357086896896, "learning_rate": 8.030417977539442e-06, "loss": 0.2356, "num_input_tokens_seen": 14276928, "step": 23430 }, { "epoch": 7.271175923053056, "grad_norm": 0.09330102801322937, "learning_rate": 8.02934104631045e-06, "loss": 0.2295, "num_input_tokens_seen": 14279392, "step": 23435 }, { "epoch": 7.2727272727272725, "grad_norm": 0.20380643010139465, "learning_rate": 8.028263892994886e-06, "loss": 0.234, "num_input_tokens_seen": 14282464, "step": 23440 }, { "epoch": 7.274278622401489, "grad_norm": 0.13552388548851013, "learning_rate": 8.02718651767172e-06, "loss": 0.2301, "num_input_tokens_seen": 14285312, "step": 23445 }, { "epoch": 7.275829972075706, "grad_norm": 0.1666458249092102, "learning_rate": 8.026108920419934e-06, "loss": 0.2276, "num_input_tokens_seen": 14288192, "step": 23450 }, { "epoch": 7.277381321749923, "grad_norm": 0.15678709745407104, "learning_rate": 8.02503110131853e-06, "loss": 0.2326, "num_input_tokens_seen": 14290784, "step": 23455 }, { "epoch": 7.278932671424139, "grad_norm": 0.12009971588850021, "learning_rate": 8.023953060446524e-06, "loss": 0.2294, "num_input_tokens_seen": 14295776, "step": 23460 }, { "epoch": 7.2804840210983555, "grad_norm": 0.09498246759176254, "learning_rate": 8.02287479788295e-06, "loss": 0.2389, "num_input_tokens_seen": 14298432, "step": 23465 }, { "epoch": 7.282035370772572, "grad_norm": 0.13720539212226868, "learning_rate": 8.021796313706857e-06, "loss": 0.2326, "num_input_tokens_seen": 14300608, "step": 23470 }, { "epoch": 7.283586720446789, "grad_norm": 0.05383804440498352, "learning_rate": 8.020717607997311e-06, "loss": 0.2276, "num_input_tokens_seen": 14302816, "step": 23475 }, { "epoch": 7.285138070121005, "grad_norm": 0.15024754405021667, "learning_rate": 8.019638680833395e-06, "loss": 0.2301, "num_input_tokens_seen": 14306560, "step": 23480 }, { "epoch": 7.286689419795222, "grad_norm": 0.13030698895454407, "learning_rate": 8.018559532294204e-06, "loss": 0.2391, "num_input_tokens_seen": 14309312, "step": 23485 }, { "epoch": 7.288240769469438, "grad_norm": 0.12947434186935425, "learning_rate": 8.017480162458855e-06, "loss": 0.2312, "num_input_tokens_seen": 14311744, "step": 23490 }, { "epoch": 7.289792119143655, "grad_norm": 0.10620725154876709, "learning_rate": 8.016400571406478e-06, "loss": 0.2275, "num_input_tokens_seen": 14315968, "step": 23495 }, { "epoch": 7.291343468817871, "grad_norm": 0.22529374063014984, "learning_rate": 8.01532075921622e-06, "loss": 0.2373, "num_input_tokens_seen": 14318688, "step": 23500 }, { "epoch": 7.292894818492088, "grad_norm": 0.07725540548563004, "learning_rate": 8.014240725967241e-06, "loss": 0.2337, "num_input_tokens_seen": 14322144, "step": 23505 }, { "epoch": 7.294446168166305, "grad_norm": 0.20729754865169525, "learning_rate": 8.013160471738724e-06, "loss": 0.2332, "num_input_tokens_seen": 14325632, "step": 23510 }, { "epoch": 7.295997517840521, "grad_norm": 0.10523385554552078, "learning_rate": 8.012079996609865e-06, "loss": 0.2306, "num_input_tokens_seen": 14328384, "step": 23515 }, { "epoch": 7.297548867514738, "grad_norm": 0.1856534332036972, "learning_rate": 8.010999300659871e-06, "loss": 0.2321, "num_input_tokens_seen": 14332288, "step": 23520 }, { "epoch": 7.299100217188954, "grad_norm": 0.1738452911376953, "learning_rate": 8.009918383967975e-06, "loss": 0.233, "num_input_tokens_seen": 14334688, "step": 23525 }, { "epoch": 7.300651566863171, "grad_norm": 0.1268913298845291, "learning_rate": 8.008837246613417e-06, "loss": 0.2345, "num_input_tokens_seen": 14337248, "step": 23530 }, { "epoch": 7.302202916537388, "grad_norm": 0.04728163033723831, "learning_rate": 8.007755888675462e-06, "loss": 0.2311, "num_input_tokens_seen": 14339744, "step": 23535 }, { "epoch": 7.303754266211604, "grad_norm": 0.11489290744066238, "learning_rate": 8.00667431023338e-06, "loss": 0.229, "num_input_tokens_seen": 14343232, "step": 23540 }, { "epoch": 7.30530561588582, "grad_norm": 0.12186402827501297, "learning_rate": 8.005592511366468e-06, "loss": 0.2285, "num_input_tokens_seen": 14346496, "step": 23545 }, { "epoch": 7.306856965560037, "grad_norm": 0.058644235134124756, "learning_rate": 8.004510492154033e-06, "loss": 0.2312, "num_input_tokens_seen": 14350560, "step": 23550 }, { "epoch": 7.308408315234254, "grad_norm": 0.0875440239906311, "learning_rate": 8.0034282526754e-06, "loss": 0.2302, "num_input_tokens_seen": 14353408, "step": 23555 }, { "epoch": 7.309959664908471, "grad_norm": 0.13484779000282288, "learning_rate": 8.002345793009912e-06, "loss": 0.2348, "num_input_tokens_seen": 14356032, "step": 23560 }, { "epoch": 7.3115110145826865, "grad_norm": 0.20400367677211761, "learning_rate": 8.001263113236922e-06, "loss": 0.2385, "num_input_tokens_seen": 14361472, "step": 23565 }, { "epoch": 7.313062364256903, "grad_norm": 0.1258516162633896, "learning_rate": 8.000180213435806e-06, "loss": 0.2286, "num_input_tokens_seen": 14364928, "step": 23570 }, { "epoch": 7.31461371393112, "grad_norm": 0.10460273176431656, "learning_rate": 7.999097093685953e-06, "loss": 0.2315, "num_input_tokens_seen": 14367744, "step": 23575 }, { "epoch": 7.316165063605337, "grad_norm": 0.06413804739713669, "learning_rate": 7.998013754066769e-06, "loss": 0.2275, "num_input_tokens_seen": 14371072, "step": 23580 }, { "epoch": 7.317716413279554, "grad_norm": 0.1408320814371109, "learning_rate": 7.996930194657675e-06, "loss": 0.2307, "num_input_tokens_seen": 14374336, "step": 23585 }, { "epoch": 7.3192677629537695, "grad_norm": 0.10630964487791061, "learning_rate": 7.995846415538109e-06, "loss": 0.2291, "num_input_tokens_seen": 14378624, "step": 23590 }, { "epoch": 7.320819112627986, "grad_norm": 0.10973558574914932, "learning_rate": 7.994762416787523e-06, "loss": 0.2274, "num_input_tokens_seen": 14381760, "step": 23595 }, { "epoch": 7.322370462302203, "grad_norm": 0.17097650468349457, "learning_rate": 7.99367819848539e-06, "loss": 0.2348, "num_input_tokens_seen": 14384512, "step": 23600 }, { "epoch": 7.32392181197642, "grad_norm": 0.10714417695999146, "learning_rate": 7.992593760711194e-06, "loss": 0.2338, "num_input_tokens_seen": 14387264, "step": 23605 }, { "epoch": 7.325473161650636, "grad_norm": 0.20219074189662933, "learning_rate": 7.991509103544439e-06, "loss": 0.2311, "num_input_tokens_seen": 14389600, "step": 23610 }, { "epoch": 7.327024511324852, "grad_norm": 0.09674587845802307, "learning_rate": 7.99042422706464e-06, "loss": 0.2338, "num_input_tokens_seen": 14393536, "step": 23615 }, { "epoch": 7.328575860999069, "grad_norm": 0.11863412708044052, "learning_rate": 7.989339131351335e-06, "loss": 0.2307, "num_input_tokens_seen": 14396000, "step": 23620 }, { "epoch": 7.330127210673286, "grad_norm": 0.10435818880796432, "learning_rate": 7.988253816484071e-06, "loss": 0.2377, "num_input_tokens_seen": 14398528, "step": 23625 }, { "epoch": 7.331678560347502, "grad_norm": 0.06923858076334, "learning_rate": 7.987168282542416e-06, "loss": 0.2394, "num_input_tokens_seen": 14401344, "step": 23630 }, { "epoch": 7.333229910021719, "grad_norm": 0.12533186376094818, "learning_rate": 7.986082529605955e-06, "loss": 0.2345, "num_input_tokens_seen": 14403584, "step": 23635 }, { "epoch": 7.334781259695935, "grad_norm": 0.09761494398117065, "learning_rate": 7.984996557754282e-06, "loss": 0.2312, "num_input_tokens_seen": 14406496, "step": 23640 }, { "epoch": 7.336332609370152, "grad_norm": 0.11360272765159607, "learning_rate": 7.983910367067014e-06, "loss": 0.229, "num_input_tokens_seen": 14409344, "step": 23645 }, { "epoch": 7.337883959044369, "grad_norm": 0.09619425982236862, "learning_rate": 7.982823957623782e-06, "loss": 0.2347, "num_input_tokens_seen": 14412384, "step": 23650 }, { "epoch": 7.339435308718585, "grad_norm": 0.11440909653902054, "learning_rate": 7.98173732950423e-06, "loss": 0.2311, "num_input_tokens_seen": 14415616, "step": 23655 }, { "epoch": 7.340986658392802, "grad_norm": 0.0982242301106453, "learning_rate": 7.980650482788025e-06, "loss": 0.23, "num_input_tokens_seen": 14418560, "step": 23660 }, { "epoch": 7.342538008067018, "grad_norm": 0.1113772839307785, "learning_rate": 7.979563417554843e-06, "loss": 0.2321, "num_input_tokens_seen": 14421248, "step": 23665 }, { "epoch": 7.344089357741235, "grad_norm": 0.12076173722743988, "learning_rate": 7.978476133884378e-06, "loss": 0.2273, "num_input_tokens_seen": 14423936, "step": 23670 }, { "epoch": 7.345640707415451, "grad_norm": 0.10554240643978119, "learning_rate": 7.977388631856343e-06, "loss": 0.2309, "num_input_tokens_seen": 14426432, "step": 23675 }, { "epoch": 7.347192057089668, "grad_norm": 0.09193915873765945, "learning_rate": 7.976300911550463e-06, "loss": 0.233, "num_input_tokens_seen": 14429568, "step": 23680 }, { "epoch": 7.348743406763885, "grad_norm": 0.10711697489023209, "learning_rate": 7.975212973046482e-06, "loss": 0.2275, "num_input_tokens_seen": 14433536, "step": 23685 }, { "epoch": 7.350294756438101, "grad_norm": 0.0649905577301979, "learning_rate": 7.974124816424158e-06, "loss": 0.2313, "num_input_tokens_seen": 14436416, "step": 23690 }, { "epoch": 7.351846106112317, "grad_norm": 0.08067075908184052, "learning_rate": 7.973036441763267e-06, "loss": 0.2274, "num_input_tokens_seen": 14439296, "step": 23695 }, { "epoch": 7.353397455786534, "grad_norm": 0.09821537882089615, "learning_rate": 7.971947849143597e-06, "loss": 0.2268, "num_input_tokens_seen": 14441408, "step": 23700 }, { "epoch": 7.354948805460751, "grad_norm": 0.07542038708925247, "learning_rate": 7.970859038644958e-06, "loss": 0.2249, "num_input_tokens_seen": 14443968, "step": 23705 }, { "epoch": 7.356500155134968, "grad_norm": 0.1423170119524002, "learning_rate": 7.96977001034717e-06, "loss": 0.2358, "num_input_tokens_seen": 14447168, "step": 23710 }, { "epoch": 7.358051504809184, "grad_norm": 0.09510799497365952, "learning_rate": 7.968680764330074e-06, "loss": 0.2295, "num_input_tokens_seen": 14450144, "step": 23715 }, { "epoch": 7.3596028544834, "grad_norm": 0.1534397453069687, "learning_rate": 7.967591300673523e-06, "loss": 0.2334, "num_input_tokens_seen": 14453280, "step": 23720 }, { "epoch": 7.361154204157617, "grad_norm": 0.06622254848480225, "learning_rate": 7.96650161945739e-06, "loss": 0.2352, "num_input_tokens_seen": 14456256, "step": 23725 }, { "epoch": 7.362705553831834, "grad_norm": 0.054891571402549744, "learning_rate": 7.965411720761558e-06, "loss": 0.2321, "num_input_tokens_seen": 14459200, "step": 23730 }, { "epoch": 7.3642569035060506, "grad_norm": 0.11287010461091995, "learning_rate": 7.96432160466593e-06, "loss": 0.2323, "num_input_tokens_seen": 14462592, "step": 23735 }, { "epoch": 7.365808253180266, "grad_norm": 0.19111771881580353, "learning_rate": 7.963231271250426e-06, "loss": 0.2309, "num_input_tokens_seen": 14465472, "step": 23740 }, { "epoch": 7.367359602854483, "grad_norm": 0.10524721443653107, "learning_rate": 7.962140720594981e-06, "loss": 0.233, "num_input_tokens_seen": 14469120, "step": 23745 }, { "epoch": 7.3689109525287, "grad_norm": 0.1918734461069107, "learning_rate": 7.961049952779545e-06, "loss": 0.23, "num_input_tokens_seen": 14473280, "step": 23750 }, { "epoch": 7.370462302202917, "grad_norm": 0.07620017975568771, "learning_rate": 7.959958967884081e-06, "loss": 0.2244, "num_input_tokens_seen": 14476064, "step": 23755 }, { "epoch": 7.372013651877133, "grad_norm": 0.11384646594524384, "learning_rate": 7.958867765988575e-06, "loss": 0.2249, "num_input_tokens_seen": 14478784, "step": 23760 }, { "epoch": 7.373565001551349, "grad_norm": 0.11387211084365845, "learning_rate": 7.957776347173023e-06, "loss": 0.2311, "num_input_tokens_seen": 14481216, "step": 23765 }, { "epoch": 7.375116351225566, "grad_norm": 0.13843201100826263, "learning_rate": 7.95668471151744e-06, "loss": 0.2249, "num_input_tokens_seen": 14483968, "step": 23770 }, { "epoch": 7.376667700899783, "grad_norm": 0.22186830639839172, "learning_rate": 7.955592859101854e-06, "loss": 0.2318, "num_input_tokens_seen": 14486816, "step": 23775 }, { "epoch": 7.378219050574, "grad_norm": 0.1437031328678131, "learning_rate": 7.954500790006315e-06, "loss": 0.2347, "num_input_tokens_seen": 14489120, "step": 23780 }, { "epoch": 7.379770400248216, "grad_norm": 0.3012508749961853, "learning_rate": 7.953408504310878e-06, "loss": 0.2302, "num_input_tokens_seen": 14492032, "step": 23785 }, { "epoch": 7.381321749922432, "grad_norm": 0.11950530856847763, "learning_rate": 7.952316002095626e-06, "loss": 0.2368, "num_input_tokens_seen": 14494464, "step": 23790 }, { "epoch": 7.382873099596649, "grad_norm": 0.13135285675525665, "learning_rate": 7.95122328344065e-06, "loss": 0.2379, "num_input_tokens_seen": 14498688, "step": 23795 }, { "epoch": 7.384424449270866, "grad_norm": 0.2381579428911209, "learning_rate": 7.95013034842606e-06, "loss": 0.2237, "num_input_tokens_seen": 14501248, "step": 23800 }, { "epoch": 7.385975798945082, "grad_norm": 0.15705932676792145, "learning_rate": 7.94903719713198e-06, "loss": 0.2332, "num_input_tokens_seen": 14504736, "step": 23805 }, { "epoch": 7.387527148619299, "grad_norm": 0.0878407284617424, "learning_rate": 7.947943829638551e-06, "loss": 0.2365, "num_input_tokens_seen": 14507328, "step": 23810 }, { "epoch": 7.389078498293515, "grad_norm": 0.19324469566345215, "learning_rate": 7.94685024602593e-06, "loss": 0.228, "num_input_tokens_seen": 14511456, "step": 23815 }, { "epoch": 7.390629847967732, "grad_norm": 0.16488295793533325, "learning_rate": 7.945756446374292e-06, "loss": 0.2334, "num_input_tokens_seen": 14515712, "step": 23820 }, { "epoch": 7.392181197641948, "grad_norm": 0.2889026701450348, "learning_rate": 7.944662430763823e-06, "loss": 0.2288, "num_input_tokens_seen": 14518432, "step": 23825 }, { "epoch": 7.393732547316165, "grad_norm": 0.12907110154628754, "learning_rate": 7.943568199274727e-06, "loss": 0.2372, "num_input_tokens_seen": 14521536, "step": 23830 }, { "epoch": 7.395283896990382, "grad_norm": 0.19372586905956268, "learning_rate": 7.942473751987224e-06, "loss": 0.2261, "num_input_tokens_seen": 14524352, "step": 23835 }, { "epoch": 7.396835246664598, "grad_norm": 0.2627437710762024, "learning_rate": 7.941379088981554e-06, "loss": 0.2378, "num_input_tokens_seen": 14527648, "step": 23840 }, { "epoch": 7.398386596338815, "grad_norm": 0.1354556828737259, "learning_rate": 7.940284210337965e-06, "loss": 0.2307, "num_input_tokens_seen": 14530976, "step": 23845 }, { "epoch": 7.399937946013031, "grad_norm": 0.20857317745685577, "learning_rate": 7.939189116136723e-06, "loss": 0.2374, "num_input_tokens_seen": 14533632, "step": 23850 }, { "epoch": 7.401489295687248, "grad_norm": 0.08148501813411713, "learning_rate": 7.938093806458114e-06, "loss": 0.2342, "num_input_tokens_seen": 14536800, "step": 23855 }, { "epoch": 7.4030406453614646, "grad_norm": 0.09692694991827011, "learning_rate": 7.936998281382437e-06, "loss": 0.2299, "num_input_tokens_seen": 14539680, "step": 23860 }, { "epoch": 7.404591995035681, "grad_norm": 0.10026805102825165, "learning_rate": 7.935902540990008e-06, "loss": 0.2315, "num_input_tokens_seen": 14542208, "step": 23865 }, { "epoch": 7.406143344709897, "grad_norm": 0.1956358402967453, "learning_rate": 7.934806585361157e-06, "loss": 0.2326, "num_input_tokens_seen": 14546144, "step": 23870 }, { "epoch": 7.407694694384114, "grad_norm": 0.1336948275566101, "learning_rate": 7.933710414576228e-06, "loss": 0.232, "num_input_tokens_seen": 14550336, "step": 23875 }, { "epoch": 7.409246044058331, "grad_norm": 0.24759332835674286, "learning_rate": 7.932614028715587e-06, "loss": 0.2341, "num_input_tokens_seen": 14553472, "step": 23880 }, { "epoch": 7.4107973937325475, "grad_norm": 0.08458533138036728, "learning_rate": 7.931517427859608e-06, "loss": 0.2303, "num_input_tokens_seen": 14556544, "step": 23885 }, { "epoch": 7.412348743406763, "grad_norm": 0.11671503633260727, "learning_rate": 7.930420612088689e-06, "loss": 0.2357, "num_input_tokens_seen": 14559456, "step": 23890 }, { "epoch": 7.41390009308098, "grad_norm": 0.21670037508010864, "learning_rate": 7.929323581483238e-06, "loss": 0.2313, "num_input_tokens_seen": 14562208, "step": 23895 }, { "epoch": 7.415451442755197, "grad_norm": 0.17410928010940552, "learning_rate": 7.928226336123679e-06, "loss": 0.2295, "num_input_tokens_seen": 14565120, "step": 23900 }, { "epoch": 7.417002792429414, "grad_norm": 0.08823327720165253, "learning_rate": 7.927128876090454e-06, "loss": 0.2299, "num_input_tokens_seen": 14567776, "step": 23905 }, { "epoch": 7.4185541421036305, "grad_norm": 0.15593697130680084, "learning_rate": 7.92603120146402e-06, "loss": 0.2281, "num_input_tokens_seen": 14570528, "step": 23910 }, { "epoch": 7.420105491777846, "grad_norm": 0.1557338535785675, "learning_rate": 7.924933312324849e-06, "loss": 0.2288, "num_input_tokens_seen": 14573408, "step": 23915 }, { "epoch": 7.421656841452063, "grad_norm": 0.07675135135650635, "learning_rate": 7.923835208753432e-06, "loss": 0.2399, "num_input_tokens_seen": 14576032, "step": 23920 }, { "epoch": 7.42320819112628, "grad_norm": 0.13360796868801117, "learning_rate": 7.92273689083027e-06, "loss": 0.2273, "num_input_tokens_seen": 14579552, "step": 23925 }, { "epoch": 7.424759540800497, "grad_norm": 0.09335260093212128, "learning_rate": 7.921638358635884e-06, "loss": 0.2289, "num_input_tokens_seen": 14581824, "step": 23930 }, { "epoch": 7.426310890474713, "grad_norm": 0.175581157207489, "learning_rate": 7.920539612250808e-06, "loss": 0.2373, "num_input_tokens_seen": 14584704, "step": 23935 }, { "epoch": 7.427862240148929, "grad_norm": 0.11962802708148956, "learning_rate": 7.919440651755595e-06, "loss": 0.2337, "num_input_tokens_seen": 14587200, "step": 23940 }, { "epoch": 7.429413589823146, "grad_norm": 0.17142999172210693, "learning_rate": 7.91834147723081e-06, "loss": 0.2241, "num_input_tokens_seen": 14590048, "step": 23945 }, { "epoch": 7.430964939497363, "grad_norm": 0.21389180421829224, "learning_rate": 7.917242088757036e-06, "loss": 0.2391, "num_input_tokens_seen": 14593440, "step": 23950 }, { "epoch": 7.432516289171579, "grad_norm": 0.12200695276260376, "learning_rate": 7.91614248641487e-06, "loss": 0.2337, "num_input_tokens_seen": 14596608, "step": 23955 }, { "epoch": 7.434067638845796, "grad_norm": 0.2632182240486145, "learning_rate": 7.915042670284931e-06, "loss": 0.2263, "num_input_tokens_seen": 14599360, "step": 23960 }, { "epoch": 7.435618988520012, "grad_norm": 0.16520968079566956, "learning_rate": 7.913942640447844e-06, "loss": 0.2261, "num_input_tokens_seen": 14602496, "step": 23965 }, { "epoch": 7.437170338194229, "grad_norm": 0.24346062541007996, "learning_rate": 7.912842396984256e-06, "loss": 0.2426, "num_input_tokens_seen": 14607232, "step": 23970 }, { "epoch": 7.438721687868446, "grad_norm": 0.10611522197723389, "learning_rate": 7.911741939974825e-06, "loss": 0.2297, "num_input_tokens_seen": 14610752, "step": 23975 }, { "epoch": 7.440273037542662, "grad_norm": 0.12680357694625854, "learning_rate": 7.91064126950023e-06, "loss": 0.229, "num_input_tokens_seen": 14613248, "step": 23980 }, { "epoch": 7.4418243872168786, "grad_norm": 0.15015184879302979, "learning_rate": 7.909540385641162e-06, "loss": 0.232, "num_input_tokens_seen": 14617856, "step": 23985 }, { "epoch": 7.443375736891095, "grad_norm": 0.13528287410736084, "learning_rate": 7.908439288478331e-06, "loss": 0.229, "num_input_tokens_seen": 14620864, "step": 23990 }, { "epoch": 7.444927086565312, "grad_norm": 0.18516647815704346, "learning_rate": 7.90733797809246e-06, "loss": 0.2294, "num_input_tokens_seen": 14624096, "step": 23995 }, { "epoch": 7.446478436239528, "grad_norm": 0.15331043303012848, "learning_rate": 7.906236454564286e-06, "loss": 0.2337, "num_input_tokens_seen": 14628864, "step": 24000 }, { "epoch": 7.448029785913745, "grad_norm": 0.20866721868515015, "learning_rate": 7.905134717974563e-06, "loss": 0.2336, "num_input_tokens_seen": 14634496, "step": 24005 }, { "epoch": 7.4495811355879615, "grad_norm": 0.10970375686883926, "learning_rate": 7.904032768404066e-06, "loss": 0.228, "num_input_tokens_seen": 14637152, "step": 24010 }, { "epoch": 7.451132485262178, "grad_norm": 0.1152939647436142, "learning_rate": 7.902930605933578e-06, "loss": 0.2331, "num_input_tokens_seen": 14640000, "step": 24015 }, { "epoch": 7.452683834936394, "grad_norm": 0.09455719590187073, "learning_rate": 7.9018282306439e-06, "loss": 0.2317, "num_input_tokens_seen": 14642144, "step": 24020 }, { "epoch": 7.454235184610611, "grad_norm": 0.1405228078365326, "learning_rate": 7.900725642615849e-06, "loss": 0.2262, "num_input_tokens_seen": 14644960, "step": 24025 }, { "epoch": 7.455786534284828, "grad_norm": 0.11252802610397339, "learning_rate": 7.899622841930261e-06, "loss": 0.2301, "num_input_tokens_seen": 14647392, "step": 24030 }, { "epoch": 7.4573378839590445, "grad_norm": 0.09687314927577972, "learning_rate": 7.898519828667981e-06, "loss": 0.2326, "num_input_tokens_seen": 14649728, "step": 24035 }, { "epoch": 7.458889233633261, "grad_norm": 0.08308948576450348, "learning_rate": 7.897416602909873e-06, "loss": 0.2225, "num_input_tokens_seen": 14651968, "step": 24040 }, { "epoch": 7.460440583307477, "grad_norm": 0.18495234847068787, "learning_rate": 7.896313164736817e-06, "loss": 0.2242, "num_input_tokens_seen": 14654432, "step": 24045 }, { "epoch": 7.461991932981694, "grad_norm": 0.12472261488437653, "learning_rate": 7.895209514229711e-06, "loss": 0.2328, "num_input_tokens_seen": 14657472, "step": 24050 }, { "epoch": 7.463543282655911, "grad_norm": 0.05673626810312271, "learning_rate": 7.894105651469462e-06, "loss": 0.2335, "num_input_tokens_seen": 14660192, "step": 24055 }, { "epoch": 7.4650946323301275, "grad_norm": 0.07232996076345444, "learning_rate": 7.893001576536997e-06, "loss": 0.2363, "num_input_tokens_seen": 14663328, "step": 24060 }, { "epoch": 7.466645982004343, "grad_norm": 0.1209726557135582, "learning_rate": 7.89189728951326e-06, "loss": 0.2239, "num_input_tokens_seen": 14666112, "step": 24065 }, { "epoch": 7.46819733167856, "grad_norm": 0.1281103938817978, "learning_rate": 7.890792790479204e-06, "loss": 0.2425, "num_input_tokens_seen": 14669152, "step": 24070 }, { "epoch": 7.469748681352777, "grad_norm": 0.11561509221792221, "learning_rate": 7.889688079515805e-06, "loss": 0.2213, "num_input_tokens_seen": 14672256, "step": 24075 }, { "epoch": 7.471300031026994, "grad_norm": 0.23830072581768036, "learning_rate": 7.888583156704051e-06, "loss": 0.2283, "num_input_tokens_seen": 14675008, "step": 24080 }, { "epoch": 7.4728513807012105, "grad_norm": 0.22629153728485107, "learning_rate": 7.887478022124946e-06, "loss": 0.2321, "num_input_tokens_seen": 14677920, "step": 24085 }, { "epoch": 7.474402730375426, "grad_norm": 0.14149929583072662, "learning_rate": 7.886372675859508e-06, "loss": 0.2308, "num_input_tokens_seen": 14681056, "step": 24090 }, { "epoch": 7.475954080049643, "grad_norm": 0.11440297216176987, "learning_rate": 7.885267117988774e-06, "loss": 0.2274, "num_input_tokens_seen": 14685376, "step": 24095 }, { "epoch": 7.47750542972386, "grad_norm": 0.10705956071615219, "learning_rate": 7.884161348593794e-06, "loss": 0.2271, "num_input_tokens_seen": 14689376, "step": 24100 }, { "epoch": 7.479056779398077, "grad_norm": 0.16699372231960297, "learning_rate": 7.883055367755632e-06, "loss": 0.2333, "num_input_tokens_seen": 14692256, "step": 24105 }, { "epoch": 7.4806081290722926, "grad_norm": 0.20850770175457, "learning_rate": 7.881949175555372e-06, "loss": 0.2322, "num_input_tokens_seen": 14695200, "step": 24110 }, { "epoch": 7.482159478746509, "grad_norm": 0.15148556232452393, "learning_rate": 7.880842772074112e-06, "loss": 0.2339, "num_input_tokens_seen": 14699456, "step": 24115 }, { "epoch": 7.483710828420726, "grad_norm": 0.16815446317195892, "learning_rate": 7.879736157392959e-06, "loss": 0.2343, "num_input_tokens_seen": 14703360, "step": 24120 }, { "epoch": 7.485262178094943, "grad_norm": 0.24214167892932892, "learning_rate": 7.878629331593047e-06, "loss": 0.2311, "num_input_tokens_seen": 14706176, "step": 24125 }, { "epoch": 7.486813527769159, "grad_norm": 0.23035983741283417, "learning_rate": 7.877522294755515e-06, "loss": 0.228, "num_input_tokens_seen": 14708544, "step": 24130 }, { "epoch": 7.4883648774433755, "grad_norm": 0.21087424457073212, "learning_rate": 7.876415046961525e-06, "loss": 0.2229, "num_input_tokens_seen": 14713536, "step": 24135 }, { "epoch": 7.489916227117592, "grad_norm": 0.15766681730747223, "learning_rate": 7.87530758829225e-06, "loss": 0.2353, "num_input_tokens_seen": 14715488, "step": 24140 }, { "epoch": 7.491467576791809, "grad_norm": 0.12920346856117249, "learning_rate": 7.874199918828882e-06, "loss": 0.2322, "num_input_tokens_seen": 14719424, "step": 24145 }, { "epoch": 7.493018926466026, "grad_norm": 0.12349198013544083, "learning_rate": 7.873092038652621e-06, "loss": 0.2321, "num_input_tokens_seen": 14722240, "step": 24150 }, { "epoch": 7.494570276140242, "grad_norm": 0.17464236915111542, "learning_rate": 7.871983947844693e-06, "loss": 0.23, "num_input_tokens_seen": 14725888, "step": 24155 }, { "epoch": 7.4961216258144585, "grad_norm": 0.16164717078208923, "learning_rate": 7.870875646486333e-06, "loss": 0.2377, "num_input_tokens_seen": 14728032, "step": 24160 }, { "epoch": 7.497672975488675, "grad_norm": 0.2739114761352539, "learning_rate": 7.86976713465879e-06, "loss": 0.2339, "num_input_tokens_seen": 14730848, "step": 24165 }, { "epoch": 7.499224325162892, "grad_norm": 0.12658186256885529, "learning_rate": 7.868658412443334e-06, "loss": 0.2331, "num_input_tokens_seen": 14734656, "step": 24170 }, { "epoch": 7.500775674837108, "grad_norm": 0.12483136355876923, "learning_rate": 7.867549479921246e-06, "loss": 0.231, "num_input_tokens_seen": 14737088, "step": 24175 }, { "epoch": 7.502327024511325, "grad_norm": 0.06252504885196686, "learning_rate": 7.866440337173824e-06, "loss": 0.2315, "num_input_tokens_seen": 14739616, "step": 24180 }, { "epoch": 7.5038783741855415, "grad_norm": 0.09655284881591797, "learning_rate": 7.865330984282383e-06, "loss": 0.2299, "num_input_tokens_seen": 14742912, "step": 24185 }, { "epoch": 7.505429723859758, "grad_norm": 0.07917375862598419, "learning_rate": 7.864221421328248e-06, "loss": 0.23, "num_input_tokens_seen": 14745408, "step": 24190 }, { "epoch": 7.506981073533975, "grad_norm": 0.08526186645030975, "learning_rate": 7.863111648392767e-06, "loss": 0.2357, "num_input_tokens_seen": 14747584, "step": 24195 }, { "epoch": 7.508532423208191, "grad_norm": 0.11873722821474075, "learning_rate": 7.862001665557297e-06, "loss": 0.228, "num_input_tokens_seen": 14751328, "step": 24200 }, { "epoch": 7.510083772882408, "grad_norm": 0.11250452697277069, "learning_rate": 7.860891472903214e-06, "loss": 0.2294, "num_input_tokens_seen": 14754656, "step": 24205 }, { "epoch": 7.5116351225566245, "grad_norm": 0.09566614776849747, "learning_rate": 7.859781070511907e-06, "loss": 0.2285, "num_input_tokens_seen": 14758368, "step": 24210 }, { "epoch": 7.51318647223084, "grad_norm": 0.10687419027090073, "learning_rate": 7.858670458464783e-06, "loss": 0.2279, "num_input_tokens_seen": 14760704, "step": 24215 }, { "epoch": 7.514737821905057, "grad_norm": 0.1092243492603302, "learning_rate": 7.857559636843263e-06, "loss": 0.233, "num_input_tokens_seen": 14762976, "step": 24220 }, { "epoch": 7.516289171579274, "grad_norm": 0.16527414321899414, "learning_rate": 7.856448605728784e-06, "loss": 0.2266, "num_input_tokens_seen": 14766240, "step": 24225 }, { "epoch": 7.517840521253491, "grad_norm": 0.09730122983455658, "learning_rate": 7.855337365202796e-06, "loss": 0.232, "num_input_tokens_seen": 14768160, "step": 24230 }, { "epoch": 7.5193918709277074, "grad_norm": 0.1358160376548767, "learning_rate": 7.854225915346768e-06, "loss": 0.2353, "num_input_tokens_seen": 14771488, "step": 24235 }, { "epoch": 7.520943220601923, "grad_norm": 0.15427833795547485, "learning_rate": 7.85311425624218e-06, "loss": 0.2289, "num_input_tokens_seen": 14774464, "step": 24240 }, { "epoch": 7.52249457027614, "grad_norm": 0.07403616607189178, "learning_rate": 7.852002387970532e-06, "loss": 0.232, "num_input_tokens_seen": 14776672, "step": 24245 }, { "epoch": 7.524045919950357, "grad_norm": 0.11953440308570862, "learning_rate": 7.850890310613335e-06, "loss": 0.2238, "num_input_tokens_seen": 14779680, "step": 24250 }, { "epoch": 7.525597269624574, "grad_norm": 0.14363020658493042, "learning_rate": 7.849778024252119e-06, "loss": 0.2275, "num_input_tokens_seen": 14782976, "step": 24255 }, { "epoch": 7.52714861929879, "grad_norm": 0.19506427645683289, "learning_rate": 7.848665528968429e-06, "loss": 0.2275, "num_input_tokens_seen": 14785984, "step": 24260 }, { "epoch": 7.528699968973006, "grad_norm": 0.08927913755178452, "learning_rate": 7.847552824843821e-06, "loss": 0.2313, "num_input_tokens_seen": 14788576, "step": 24265 }, { "epoch": 7.530251318647223, "grad_norm": 0.17145434021949768, "learning_rate": 7.846439911959871e-06, "loss": 0.2258, "num_input_tokens_seen": 14791040, "step": 24270 }, { "epoch": 7.53180266832144, "grad_norm": 0.1565624326467514, "learning_rate": 7.845326790398168e-06, "loss": 0.2341, "num_input_tokens_seen": 14794336, "step": 24275 }, { "epoch": 7.533354017995656, "grad_norm": 0.166442409157753, "learning_rate": 7.844213460240318e-06, "loss": 0.227, "num_input_tokens_seen": 14798240, "step": 24280 }, { "epoch": 7.5349053676698725, "grad_norm": 0.22646360099315643, "learning_rate": 7.843099921567942e-06, "loss": 0.2302, "num_input_tokens_seen": 14802272, "step": 24285 }, { "epoch": 7.536456717344089, "grad_norm": 0.1693105548620224, "learning_rate": 7.841986174462672e-06, "loss": 0.2363, "num_input_tokens_seen": 14804768, "step": 24290 }, { "epoch": 7.538008067018306, "grad_norm": 0.16658224165439606, "learning_rate": 7.840872219006164e-06, "loss": 0.2379, "num_input_tokens_seen": 14807360, "step": 24295 }, { "epoch": 7.539559416692523, "grad_norm": 0.12441056221723557, "learning_rate": 7.83975805528008e-06, "loss": 0.2309, "num_input_tokens_seen": 14809728, "step": 24300 }, { "epoch": 7.541110766366739, "grad_norm": 0.11237362772226334, "learning_rate": 7.838643683366103e-06, "loss": 0.231, "num_input_tokens_seen": 14812448, "step": 24305 }, { "epoch": 7.5426621160409555, "grad_norm": 0.15032480657100677, "learning_rate": 7.83752910334593e-06, "loss": 0.2301, "num_input_tokens_seen": 14815424, "step": 24310 }, { "epoch": 7.544213465715172, "grad_norm": 0.219939187169075, "learning_rate": 7.83641431530127e-06, "loss": 0.2276, "num_input_tokens_seen": 14819040, "step": 24315 }, { "epoch": 7.545764815389389, "grad_norm": 0.09011220932006836, "learning_rate": 7.835299319313854e-06, "loss": 0.2302, "num_input_tokens_seen": 14822368, "step": 24320 }, { "epoch": 7.547316165063606, "grad_norm": 0.27735933661460876, "learning_rate": 7.834184115465424e-06, "loss": 0.2283, "num_input_tokens_seen": 14825248, "step": 24325 }, { "epoch": 7.548867514737822, "grad_norm": 0.09995992481708527, "learning_rate": 7.833068703837737e-06, "loss": 0.2355, "num_input_tokens_seen": 14827360, "step": 24330 }, { "epoch": 7.5504188644120385, "grad_norm": 0.20223277807235718, "learning_rate": 7.831953084512562e-06, "loss": 0.2313, "num_input_tokens_seen": 14830432, "step": 24335 }, { "epoch": 7.551970214086255, "grad_norm": 0.21525603532791138, "learning_rate": 7.830837257571693e-06, "loss": 0.2345, "num_input_tokens_seen": 14833440, "step": 24340 }, { "epoch": 7.553521563760471, "grad_norm": 0.5061758160591125, "learning_rate": 7.829721223096931e-06, "loss": 0.2272, "num_input_tokens_seen": 14837056, "step": 24345 }, { "epoch": 7.555072913434688, "grad_norm": 0.13308116793632507, "learning_rate": 7.828604981170094e-06, "loss": 0.2393, "num_input_tokens_seen": 14840384, "step": 24350 }, { "epoch": 7.556624263108905, "grad_norm": 0.24438422918319702, "learning_rate": 7.827488531873016e-06, "loss": 0.2339, "num_input_tokens_seen": 14843968, "step": 24355 }, { "epoch": 7.5581756127831214, "grad_norm": 0.1777234822511673, "learning_rate": 7.826371875287546e-06, "loss": 0.2289, "num_input_tokens_seen": 14847328, "step": 24360 }, { "epoch": 7.559726962457338, "grad_norm": 0.18605762720108032, "learning_rate": 7.82525501149555e-06, "loss": 0.2285, "num_input_tokens_seen": 14852032, "step": 24365 }, { "epoch": 7.561278312131554, "grad_norm": 0.1953292042016983, "learning_rate": 7.824137940578905e-06, "loss": 0.2306, "num_input_tokens_seen": 14854496, "step": 24370 }, { "epoch": 7.562829661805771, "grad_norm": 0.09017042070627213, "learning_rate": 7.823020662619503e-06, "loss": 0.2322, "num_input_tokens_seen": 14857440, "step": 24375 }, { "epoch": 7.564381011479988, "grad_norm": 0.23304598033428192, "learning_rate": 7.82190317769926e-06, "loss": 0.2373, "num_input_tokens_seen": 14860288, "step": 24380 }, { "epoch": 7.565932361154204, "grad_norm": 0.20472365617752075, "learning_rate": 7.820785485900098e-06, "loss": 0.2341, "num_input_tokens_seen": 14863200, "step": 24385 }, { "epoch": 7.567483710828421, "grad_norm": 0.2070971131324768, "learning_rate": 7.819667587303954e-06, "loss": 0.2332, "num_input_tokens_seen": 14865312, "step": 24390 }, { "epoch": 7.569035060502637, "grad_norm": 0.13587462902069092, "learning_rate": 7.818549481992788e-06, "loss": 0.2327, "num_input_tokens_seen": 14867872, "step": 24395 }, { "epoch": 7.570586410176854, "grad_norm": 0.11940626055002213, "learning_rate": 7.817431170048568e-06, "loss": 0.2296, "num_input_tokens_seen": 14871040, "step": 24400 }, { "epoch": 7.572137759851071, "grad_norm": 0.0807458683848381, "learning_rate": 7.816312651553279e-06, "loss": 0.2314, "num_input_tokens_seen": 14873888, "step": 24405 }, { "epoch": 7.5736891095252865, "grad_norm": 0.16671185195446014, "learning_rate": 7.815193926588925e-06, "loss": 0.2347, "num_input_tokens_seen": 14877600, "step": 24410 }, { "epoch": 7.575240459199503, "grad_norm": 0.07917770743370056, "learning_rate": 7.814074995237517e-06, "loss": 0.2325, "num_input_tokens_seen": 14879872, "step": 24415 }, { "epoch": 7.57679180887372, "grad_norm": 0.15554802119731903, "learning_rate": 7.812955857581089e-06, "loss": 0.2305, "num_input_tokens_seen": 14883392, "step": 24420 }, { "epoch": 7.578343158547937, "grad_norm": 0.11474140733480453, "learning_rate": 7.811836513701686e-06, "loss": 0.2249, "num_input_tokens_seen": 14886944, "step": 24425 }, { "epoch": 7.579894508222154, "grad_norm": 0.12120798975229263, "learning_rate": 7.810716963681371e-06, "loss": 0.2309, "num_input_tokens_seen": 14889504, "step": 24430 }, { "epoch": 7.5814458578963695, "grad_norm": 0.14827750623226166, "learning_rate": 7.809597207602218e-06, "loss": 0.2329, "num_input_tokens_seen": 14892384, "step": 24435 }, { "epoch": 7.582997207570586, "grad_norm": 0.12297379970550537, "learning_rate": 7.808477245546317e-06, "loss": 0.2325, "num_input_tokens_seen": 14894720, "step": 24440 }, { "epoch": 7.584548557244803, "grad_norm": 0.14305442571640015, "learning_rate": 7.807357077595778e-06, "loss": 0.2346, "num_input_tokens_seen": 14898400, "step": 24445 }, { "epoch": 7.58609990691902, "grad_norm": 0.11264371126890182, "learning_rate": 7.806236703832722e-06, "loss": 0.2275, "num_input_tokens_seen": 14900928, "step": 24450 }, { "epoch": 7.587651256593237, "grad_norm": 0.1827591359615326, "learning_rate": 7.805116124339283e-06, "loss": 0.2321, "num_input_tokens_seen": 14903744, "step": 24455 }, { "epoch": 7.5892026062674525, "grad_norm": 0.08871128410100937, "learning_rate": 7.803995339197616e-06, "loss": 0.2354, "num_input_tokens_seen": 14906656, "step": 24460 }, { "epoch": 7.590753955941669, "grad_norm": 0.11364194005727768, "learning_rate": 7.802874348489887e-06, "loss": 0.23, "num_input_tokens_seen": 14909664, "step": 24465 }, { "epoch": 7.592305305615886, "grad_norm": 0.09675527364015579, "learning_rate": 7.801753152298274e-06, "loss": 0.228, "num_input_tokens_seen": 14912800, "step": 24470 }, { "epoch": 7.593856655290102, "grad_norm": 0.12347063422203064, "learning_rate": 7.800631750704982e-06, "loss": 0.2269, "num_input_tokens_seen": 14915488, "step": 24475 }, { "epoch": 7.595408004964319, "grad_norm": 0.15173904597759247, "learning_rate": 7.799510143792214e-06, "loss": 0.2344, "num_input_tokens_seen": 14919936, "step": 24480 }, { "epoch": 7.5969593546385354, "grad_norm": 0.08431525528430939, "learning_rate": 7.798388331642203e-06, "loss": 0.2337, "num_input_tokens_seen": 14922496, "step": 24485 }, { "epoch": 7.598510704312752, "grad_norm": 0.06787779182195663, "learning_rate": 7.797266314337189e-06, "loss": 0.2349, "num_input_tokens_seen": 14926816, "step": 24490 }, { "epoch": 7.600062053986969, "grad_norm": 0.2568473815917969, "learning_rate": 7.79614409195943e-06, "loss": 0.2316, "num_input_tokens_seen": 14929952, "step": 24495 }, { "epoch": 7.601613403661185, "grad_norm": 0.129347026348114, "learning_rate": 7.795021664591198e-06, "loss": 0.2294, "num_input_tokens_seen": 14932576, "step": 24500 }, { "epoch": 7.603164753335402, "grad_norm": 0.11049266159534454, "learning_rate": 7.79389903231478e-06, "loss": 0.2379, "num_input_tokens_seen": 14936160, "step": 24505 }, { "epoch": 7.604716103009618, "grad_norm": 0.12154219299554825, "learning_rate": 7.792776195212477e-06, "loss": 0.2358, "num_input_tokens_seen": 14940320, "step": 24510 }, { "epoch": 7.606267452683835, "grad_norm": 0.06900766491889954, "learning_rate": 7.791653153366608e-06, "loss": 0.2309, "num_input_tokens_seen": 14943104, "step": 24515 }, { "epoch": 7.607818802358052, "grad_norm": 0.13660436868667603, "learning_rate": 7.790529906859505e-06, "loss": 0.2331, "num_input_tokens_seen": 14945792, "step": 24520 }, { "epoch": 7.609370152032268, "grad_norm": 0.08776117116212845, "learning_rate": 7.789406455773516e-06, "loss": 0.2341, "num_input_tokens_seen": 14948480, "step": 24525 }, { "epoch": 7.610921501706485, "grad_norm": 0.11738752573728561, "learning_rate": 7.788282800191e-06, "loss": 0.2196, "num_input_tokens_seen": 14951264, "step": 24530 }, { "epoch": 7.612472851380701, "grad_norm": 0.18342743813991547, "learning_rate": 7.78715894019434e-06, "loss": 0.2389, "num_input_tokens_seen": 14954368, "step": 24535 }, { "epoch": 7.614024201054917, "grad_norm": 0.13097645342350006, "learning_rate": 7.786034875865921e-06, "loss": 0.2298, "num_input_tokens_seen": 14957344, "step": 24540 }, { "epoch": 7.615575550729134, "grad_norm": 0.07649564743041992, "learning_rate": 7.784910607288157e-06, "loss": 0.2304, "num_input_tokens_seen": 14961152, "step": 24545 }, { "epoch": 7.617126900403351, "grad_norm": 0.21000123023986816, "learning_rate": 7.783786134543465e-06, "loss": 0.2348, "num_input_tokens_seen": 14963776, "step": 24550 }, { "epoch": 7.618678250077568, "grad_norm": 0.11502522230148315, "learning_rate": 7.782661457714285e-06, "loss": 0.2365, "num_input_tokens_seen": 14966752, "step": 24555 }, { "epoch": 7.620229599751784, "grad_norm": 0.12360025942325592, "learning_rate": 7.781536576883068e-06, "loss": 0.2336, "num_input_tokens_seen": 14971328, "step": 24560 }, { "epoch": 7.621780949426, "grad_norm": 0.11699303239583969, "learning_rate": 7.780411492132284e-06, "loss": 0.2279, "num_input_tokens_seen": 14974240, "step": 24565 }, { "epoch": 7.623332299100217, "grad_norm": 0.14104335010051727, "learning_rate": 7.77928620354441e-06, "loss": 0.2343, "num_input_tokens_seen": 14977024, "step": 24570 }, { "epoch": 7.624883648774434, "grad_norm": 0.13316144049167633, "learning_rate": 7.778160711201948e-06, "loss": 0.2334, "num_input_tokens_seen": 14979552, "step": 24575 }, { "epoch": 7.626434998448651, "grad_norm": 0.13982926309108734, "learning_rate": 7.777035015187403e-06, "loss": 0.2278, "num_input_tokens_seen": 14982368, "step": 24580 }, { "epoch": 7.627986348122867, "grad_norm": 0.10706125199794769, "learning_rate": 7.775909115583311e-06, "loss": 0.227, "num_input_tokens_seen": 14985248, "step": 24585 }, { "epoch": 7.629537697797083, "grad_norm": 0.10271871834993362, "learning_rate": 7.774783012472208e-06, "loss": 0.2305, "num_input_tokens_seen": 14988544, "step": 24590 }, { "epoch": 7.6310890474713, "grad_norm": 0.1066489964723587, "learning_rate": 7.773656705936651e-06, "loss": 0.2297, "num_input_tokens_seen": 14990880, "step": 24595 }, { "epoch": 7.632640397145517, "grad_norm": 0.14731217920780182, "learning_rate": 7.772530196059214e-06, "loss": 0.2369, "num_input_tokens_seen": 14993632, "step": 24600 }, { "epoch": 7.634191746819733, "grad_norm": 0.054462336003780365, "learning_rate": 7.771403482922479e-06, "loss": 0.2379, "num_input_tokens_seen": 14995872, "step": 24605 }, { "epoch": 7.6357430964939494, "grad_norm": 0.2353401482105255, "learning_rate": 7.770276566609055e-06, "loss": 0.2375, "num_input_tokens_seen": 14999488, "step": 24610 }, { "epoch": 7.637294446168166, "grad_norm": 0.12386007606983185, "learning_rate": 7.76914944720155e-06, "loss": 0.2263, "num_input_tokens_seen": 15003584, "step": 24615 }, { "epoch": 7.638845795842383, "grad_norm": 0.12260294705629349, "learning_rate": 7.7680221247826e-06, "loss": 0.2377, "num_input_tokens_seen": 15005856, "step": 24620 }, { "epoch": 7.6403971455166, "grad_norm": 0.12264695018529892, "learning_rate": 7.76689459943485e-06, "loss": 0.226, "num_input_tokens_seen": 15008768, "step": 24625 }, { "epoch": 7.641948495190816, "grad_norm": 0.05033121258020401, "learning_rate": 7.76576687124096e-06, "loss": 0.239, "num_input_tokens_seen": 15011488, "step": 24630 }, { "epoch": 7.643499844865032, "grad_norm": 0.09189331531524658, "learning_rate": 7.764638940283606e-06, "loss": 0.2241, "num_input_tokens_seen": 15014752, "step": 24635 }, { "epoch": 7.645051194539249, "grad_norm": 0.12827186286449432, "learning_rate": 7.763510806645483e-06, "loss": 0.2353, "num_input_tokens_seen": 15017504, "step": 24640 }, { "epoch": 7.646602544213466, "grad_norm": 0.13943897187709808, "learning_rate": 7.762382470409291e-06, "loss": 0.2269, "num_input_tokens_seen": 15020320, "step": 24645 }, { "epoch": 7.648153893887683, "grad_norm": 0.07445039600133896, "learning_rate": 7.761253931657752e-06, "loss": 0.2324, "num_input_tokens_seen": 15023936, "step": 24650 }, { "epoch": 7.649705243561899, "grad_norm": 0.13258399069309235, "learning_rate": 7.760125190473602e-06, "loss": 0.2313, "num_input_tokens_seen": 15027776, "step": 24655 }, { "epoch": 7.651256593236115, "grad_norm": 0.12849946320056915, "learning_rate": 7.758996246939592e-06, "loss": 0.2313, "num_input_tokens_seen": 15030720, "step": 24660 }, { "epoch": 7.652807942910332, "grad_norm": 0.1145043820142746, "learning_rate": 7.757867101138484e-06, "loss": 0.2293, "num_input_tokens_seen": 15034176, "step": 24665 }, { "epoch": 7.654359292584548, "grad_norm": 0.06637504696846008, "learning_rate": 7.756737753153058e-06, "loss": 0.2303, "num_input_tokens_seen": 15036448, "step": 24670 }, { "epoch": 7.655910642258765, "grad_norm": 0.11414896696805954, "learning_rate": 7.755608203066113e-06, "loss": 0.233, "num_input_tokens_seen": 15038784, "step": 24675 }, { "epoch": 7.657461991932982, "grad_norm": 0.1437978595495224, "learning_rate": 7.754478450960453e-06, "loss": 0.2269, "num_input_tokens_seen": 15041856, "step": 24680 }, { "epoch": 7.659013341607198, "grad_norm": 0.14857500791549683, "learning_rate": 7.753348496918906e-06, "loss": 0.2324, "num_input_tokens_seen": 15044992, "step": 24685 }, { "epoch": 7.660564691281415, "grad_norm": 0.14110121130943298, "learning_rate": 7.75221834102431e-06, "loss": 0.2335, "num_input_tokens_seen": 15048128, "step": 24690 }, { "epoch": 7.662116040955631, "grad_norm": 0.05348905920982361, "learning_rate": 7.751087983359518e-06, "loss": 0.2315, "num_input_tokens_seen": 15051232, "step": 24695 }, { "epoch": 7.663667390629848, "grad_norm": 0.13422803580760956, "learning_rate": 7.7499574240074e-06, "loss": 0.2301, "num_input_tokens_seen": 15053856, "step": 24700 }, { "epoch": 7.665218740304065, "grad_norm": 0.12705133855342865, "learning_rate": 7.748826663050837e-06, "loss": 0.2364, "num_input_tokens_seen": 15056288, "step": 24705 }, { "epoch": 7.666770089978281, "grad_norm": 0.13712096214294434, "learning_rate": 7.74769570057273e-06, "loss": 0.2336, "num_input_tokens_seen": 15058688, "step": 24710 }, { "epoch": 7.668321439652498, "grad_norm": 0.20637904107570648, "learning_rate": 7.746564536655989e-06, "loss": 0.2279, "num_input_tokens_seen": 15061152, "step": 24715 }, { "epoch": 7.669872789326714, "grad_norm": 0.13608425855636597, "learning_rate": 7.745433171383545e-06, "loss": 0.2293, "num_input_tokens_seen": 15064544, "step": 24720 }, { "epoch": 7.671424139000931, "grad_norm": 0.07586175948381424, "learning_rate": 7.744301604838337e-06, "loss": 0.2273, "num_input_tokens_seen": 15068736, "step": 24725 }, { "epoch": 7.672975488675148, "grad_norm": 0.13248029351234436, "learning_rate": 7.743169837103327e-06, "loss": 0.2342, "num_input_tokens_seen": 15072832, "step": 24730 }, { "epoch": 7.674526838349364, "grad_norm": 0.08030455559492111, "learning_rate": 7.742037868261481e-06, "loss": 0.2275, "num_input_tokens_seen": 15077024, "step": 24735 }, { "epoch": 7.67607818802358, "grad_norm": 0.10915791243314743, "learning_rate": 7.740905698395788e-06, "loss": 0.2306, "num_input_tokens_seen": 15081152, "step": 24740 }, { "epoch": 7.677629537697797, "grad_norm": 0.15616931021213531, "learning_rate": 7.739773327589252e-06, "loss": 0.2345, "num_input_tokens_seen": 15085536, "step": 24745 }, { "epoch": 7.679180887372014, "grad_norm": 0.12890031933784485, "learning_rate": 7.738640755924886e-06, "loss": 0.2363, "num_input_tokens_seen": 15088544, "step": 24750 }, { "epoch": 7.6807322370462305, "grad_norm": 0.08637753129005432, "learning_rate": 7.737507983485724e-06, "loss": 0.2361, "num_input_tokens_seen": 15092096, "step": 24755 }, { "epoch": 7.682283586720446, "grad_norm": 0.06135560944676399, "learning_rate": 7.73637501035481e-06, "loss": 0.2303, "num_input_tokens_seen": 15094656, "step": 24760 }, { "epoch": 7.683834936394663, "grad_norm": 0.1339932382106781, "learning_rate": 7.735241836615202e-06, "loss": 0.2283, "num_input_tokens_seen": 15097280, "step": 24765 }, { "epoch": 7.68538628606888, "grad_norm": 0.2250041514635086, "learning_rate": 7.734108462349978e-06, "loss": 0.229, "num_input_tokens_seen": 15100000, "step": 24770 }, { "epoch": 7.686937635743097, "grad_norm": 0.08531489223241806, "learning_rate": 7.732974887642228e-06, "loss": 0.2299, "num_input_tokens_seen": 15102752, "step": 24775 }, { "epoch": 7.6884889854173135, "grad_norm": 0.1958286166191101, "learning_rate": 7.731841112575056e-06, "loss": 0.2356, "num_input_tokens_seen": 15106464, "step": 24780 }, { "epoch": 7.690040335091529, "grad_norm": 0.1313631385564804, "learning_rate": 7.730707137231578e-06, "loss": 0.234, "num_input_tokens_seen": 15109280, "step": 24785 }, { "epoch": 7.691591684765746, "grad_norm": 0.2143564373254776, "learning_rate": 7.729572961694933e-06, "loss": 0.2341, "num_input_tokens_seen": 15112896, "step": 24790 }, { "epoch": 7.693143034439963, "grad_norm": 0.09814535081386566, "learning_rate": 7.728438586048265e-06, "loss": 0.2295, "num_input_tokens_seen": 15117056, "step": 24795 }, { "epoch": 7.69469438411418, "grad_norm": 0.11886035650968552, "learning_rate": 7.727304010374741e-06, "loss": 0.2326, "num_input_tokens_seen": 15120608, "step": 24800 }, { "epoch": 7.696245733788396, "grad_norm": 0.1305488795042038, "learning_rate": 7.726169234757535e-06, "loss": 0.228, "num_input_tokens_seen": 15123776, "step": 24805 }, { "epoch": 7.697797083462612, "grad_norm": 0.12945003807544708, "learning_rate": 7.725034259279844e-06, "loss": 0.2296, "num_input_tokens_seen": 15127232, "step": 24810 }, { "epoch": 7.699348433136829, "grad_norm": 0.13054311275482178, "learning_rate": 7.72389908402487e-06, "loss": 0.2279, "num_input_tokens_seen": 15130048, "step": 24815 }, { "epoch": 7.700899782811046, "grad_norm": 0.14546656608581543, "learning_rate": 7.72276370907584e-06, "loss": 0.2313, "num_input_tokens_seen": 15134304, "step": 24820 }, { "epoch": 7.702451132485262, "grad_norm": 0.24993693828582764, "learning_rate": 7.721628134515983e-06, "loss": 0.2319, "num_input_tokens_seen": 15141088, "step": 24825 }, { "epoch": 7.704002482159479, "grad_norm": 0.13339917361736298, "learning_rate": 7.72049236042856e-06, "loss": 0.2344, "num_input_tokens_seen": 15143232, "step": 24830 }, { "epoch": 7.705553831833695, "grad_norm": 0.0931498110294342, "learning_rate": 7.71935638689683e-06, "loss": 0.2318, "num_input_tokens_seen": 15146528, "step": 24835 }, { "epoch": 7.707105181507912, "grad_norm": 0.1383538544178009, "learning_rate": 7.718220214004073e-06, "loss": 0.2275, "num_input_tokens_seen": 15148896, "step": 24840 }, { "epoch": 7.708656531182129, "grad_norm": 0.20240511000156403, "learning_rate": 7.717083841833587e-06, "loss": 0.2244, "num_input_tokens_seen": 15152512, "step": 24845 }, { "epoch": 7.710207880856345, "grad_norm": 0.17013533413410187, "learning_rate": 7.715947270468681e-06, "loss": 0.2363, "num_input_tokens_seen": 15156384, "step": 24850 }, { "epoch": 7.711759230530562, "grad_norm": 0.1834036409854889, "learning_rate": 7.714810499992678e-06, "loss": 0.2255, "num_input_tokens_seen": 15158656, "step": 24855 }, { "epoch": 7.713310580204778, "grad_norm": 0.15153665840625763, "learning_rate": 7.713673530488917e-06, "loss": 0.23, "num_input_tokens_seen": 15161440, "step": 24860 }, { "epoch": 7.714861929878995, "grad_norm": 0.08649859577417374, "learning_rate": 7.712536362040751e-06, "loss": 0.2338, "num_input_tokens_seen": 15163456, "step": 24865 }, { "epoch": 7.716413279553211, "grad_norm": 0.1992926001548767, "learning_rate": 7.71139899473155e-06, "loss": 0.2265, "num_input_tokens_seen": 15167456, "step": 24870 }, { "epoch": 7.717964629227428, "grad_norm": 0.1461144983768463, "learning_rate": 7.710261428644694e-06, "loss": 0.2358, "num_input_tokens_seen": 15169792, "step": 24875 }, { "epoch": 7.7195159789016445, "grad_norm": 0.1527630090713501, "learning_rate": 7.70912366386358e-06, "loss": 0.2218, "num_input_tokens_seen": 15173536, "step": 24880 }, { "epoch": 7.721067328575861, "grad_norm": 0.12667591869831085, "learning_rate": 7.707985700471621e-06, "loss": 0.2291, "num_input_tokens_seen": 15176576, "step": 24885 }, { "epoch": 7.722618678250077, "grad_norm": 0.15707261860370636, "learning_rate": 7.706847538552244e-06, "loss": 0.2373, "num_input_tokens_seen": 15181472, "step": 24890 }, { "epoch": 7.724170027924294, "grad_norm": 0.14912103116512299, "learning_rate": 7.705709178188887e-06, "loss": 0.2294, "num_input_tokens_seen": 15184640, "step": 24895 }, { "epoch": 7.725721377598511, "grad_norm": 0.11542104184627533, "learning_rate": 7.704570619465009e-06, "loss": 0.2311, "num_input_tokens_seen": 15187904, "step": 24900 }, { "epoch": 7.7272727272727275, "grad_norm": 0.1588389128446579, "learning_rate": 7.703431862464076e-06, "loss": 0.2392, "num_input_tokens_seen": 15191040, "step": 24905 }, { "epoch": 7.728824076946944, "grad_norm": 0.17850805819034576, "learning_rate": 7.702292907269574e-06, "loss": 0.234, "num_input_tokens_seen": 15194016, "step": 24910 }, { "epoch": 7.73037542662116, "grad_norm": 0.22629250586032867, "learning_rate": 7.701153753965002e-06, "loss": 0.2234, "num_input_tokens_seen": 15196128, "step": 24915 }, { "epoch": 7.731926776295377, "grad_norm": 0.1469022035598755, "learning_rate": 7.700014402633875e-06, "loss": 0.2336, "num_input_tokens_seen": 15198656, "step": 24920 }, { "epoch": 7.733478125969594, "grad_norm": 0.11753721535205841, "learning_rate": 7.698874853359717e-06, "loss": 0.2337, "num_input_tokens_seen": 15201696, "step": 24925 }, { "epoch": 7.7350294756438105, "grad_norm": 0.13582994043827057, "learning_rate": 7.697735106226075e-06, "loss": 0.2331, "num_input_tokens_seen": 15204800, "step": 24930 }, { "epoch": 7.736580825318026, "grad_norm": 0.13818566501140594, "learning_rate": 7.696595161316501e-06, "loss": 0.2391, "num_input_tokens_seen": 15208672, "step": 24935 }, { "epoch": 7.738132174992243, "grad_norm": 0.14513739943504333, "learning_rate": 7.695455018714571e-06, "loss": 0.2326, "num_input_tokens_seen": 15212160, "step": 24940 }, { "epoch": 7.73968352466646, "grad_norm": 0.13911496102809906, "learning_rate": 7.694314678503869e-06, "loss": 0.2274, "num_input_tokens_seen": 15215488, "step": 24945 }, { "epoch": 7.741234874340677, "grad_norm": 0.1318167895078659, "learning_rate": 7.693174140767996e-06, "loss": 0.2348, "num_input_tokens_seen": 15220256, "step": 24950 }, { "epoch": 7.742786224014893, "grad_norm": 0.1225249320268631, "learning_rate": 7.692033405590567e-06, "loss": 0.2284, "num_input_tokens_seen": 15223040, "step": 24955 }, { "epoch": 7.744337573689109, "grad_norm": 0.2439471334218979, "learning_rate": 7.69089247305521e-06, "loss": 0.238, "num_input_tokens_seen": 15226912, "step": 24960 }, { "epoch": 7.745888923363326, "grad_norm": 0.14023804664611816, "learning_rate": 7.68975134324557e-06, "loss": 0.2258, "num_input_tokens_seen": 15229568, "step": 24965 }, { "epoch": 7.747440273037543, "grad_norm": 0.14598999917507172, "learning_rate": 7.688610016245304e-06, "loss": 0.2356, "num_input_tokens_seen": 15232416, "step": 24970 }, { "epoch": 7.74899162271176, "grad_norm": 0.11317013949155807, "learning_rate": 7.687468492138085e-06, "loss": 0.2325, "num_input_tokens_seen": 15235136, "step": 24975 }, { "epoch": 7.750542972385976, "grad_norm": 0.24463844299316406, "learning_rate": 7.686326771007604e-06, "loss": 0.2357, "num_input_tokens_seen": 15237696, "step": 24980 }, { "epoch": 7.752094322060192, "grad_norm": 0.08486337959766388, "learning_rate": 7.68518485293756e-06, "loss": 0.23, "num_input_tokens_seen": 15240256, "step": 24985 }, { "epoch": 7.753645671734409, "grad_norm": 0.1331121176481247, "learning_rate": 7.684042738011668e-06, "loss": 0.2336, "num_input_tokens_seen": 15243008, "step": 24990 }, { "epoch": 7.755197021408626, "grad_norm": 0.0714772418141365, "learning_rate": 7.682900426313658e-06, "loss": 0.2285, "num_input_tokens_seen": 15245952, "step": 24995 }, { "epoch": 7.756748371082842, "grad_norm": 0.15462499856948853, "learning_rate": 7.68175791792728e-06, "loss": 0.2325, "num_input_tokens_seen": 15248832, "step": 25000 }, { "epoch": 7.7582997207570585, "grad_norm": 0.14427481591701508, "learning_rate": 7.680615212936285e-06, "loss": 0.2361, "num_input_tokens_seen": 15253504, "step": 25005 }, { "epoch": 7.759851070431275, "grad_norm": 0.09705336391925812, "learning_rate": 7.679472311424453e-06, "loss": 0.225, "num_input_tokens_seen": 15257248, "step": 25010 }, { "epoch": 7.761402420105492, "grad_norm": 0.1440073847770691, "learning_rate": 7.678329213475574e-06, "loss": 0.2268, "num_input_tokens_seen": 15260096, "step": 25015 }, { "epoch": 7.762953769779708, "grad_norm": 0.09812597185373306, "learning_rate": 7.677185919173446e-06, "loss": 0.2279, "num_input_tokens_seen": 15263552, "step": 25020 }, { "epoch": 7.764505119453925, "grad_norm": 0.13255691528320312, "learning_rate": 7.676042428601887e-06, "loss": 0.2401, "num_input_tokens_seen": 15266112, "step": 25025 }, { "epoch": 7.7660564691281415, "grad_norm": 0.07937286794185638, "learning_rate": 7.67489874184473e-06, "loss": 0.2327, "num_input_tokens_seen": 15269536, "step": 25030 }, { "epoch": 7.767607818802358, "grad_norm": 0.20898166298866272, "learning_rate": 7.673754858985817e-06, "loss": 0.2261, "num_input_tokens_seen": 15273088, "step": 25035 }, { "epoch": 7.769159168476575, "grad_norm": 0.1829838901758194, "learning_rate": 7.672610780109014e-06, "loss": 0.2364, "num_input_tokens_seen": 15275776, "step": 25040 }, { "epoch": 7.770710518150791, "grad_norm": 0.12895557284355164, "learning_rate": 7.67146650529819e-06, "loss": 0.2277, "num_input_tokens_seen": 15278560, "step": 25045 }, { "epoch": 7.772261867825008, "grad_norm": 0.11095786094665527, "learning_rate": 7.670322034637237e-06, "loss": 0.228, "num_input_tokens_seen": 15281696, "step": 25050 }, { "epoch": 7.7738132174992245, "grad_norm": 0.10630560666322708, "learning_rate": 7.669177368210059e-06, "loss": 0.2258, "num_input_tokens_seen": 15284512, "step": 25055 }, { "epoch": 7.775364567173441, "grad_norm": 0.21611319482326508, "learning_rate": 7.66803250610057e-06, "loss": 0.2214, "num_input_tokens_seen": 15287040, "step": 25060 }, { "epoch": 7.776915916847657, "grad_norm": 0.16803744435310364, "learning_rate": 7.666887448392704e-06, "loss": 0.2257, "num_input_tokens_seen": 15290176, "step": 25065 }, { "epoch": 7.778467266521874, "grad_norm": 0.1703965663909912, "learning_rate": 7.665742195170408e-06, "loss": 0.2346, "num_input_tokens_seen": 15293216, "step": 25070 }, { "epoch": 7.780018616196091, "grad_norm": 0.13271872699260712, "learning_rate": 7.664596746517643e-06, "loss": 0.2246, "num_input_tokens_seen": 15297152, "step": 25075 }, { "epoch": 7.7815699658703075, "grad_norm": 0.14024801552295685, "learning_rate": 7.663451102518383e-06, "loss": 0.2344, "num_input_tokens_seen": 15300000, "step": 25080 }, { "epoch": 7.783121315544523, "grad_norm": 0.13757985830307007, "learning_rate": 7.662305263256615e-06, "loss": 0.2276, "num_input_tokens_seen": 15302912, "step": 25085 }, { "epoch": 7.78467266521874, "grad_norm": 0.14579574763774872, "learning_rate": 7.661159228816345e-06, "loss": 0.2343, "num_input_tokens_seen": 15305408, "step": 25090 }, { "epoch": 7.786224014892957, "grad_norm": 0.14414767920970917, "learning_rate": 7.660012999281594e-06, "loss": 0.2262, "num_input_tokens_seen": 15308928, "step": 25095 }, { "epoch": 7.787775364567174, "grad_norm": 0.14404451847076416, "learning_rate": 7.658866574736387e-06, "loss": 0.2413, "num_input_tokens_seen": 15312192, "step": 25100 }, { "epoch": 7.7893267142413904, "grad_norm": 0.11568060517311096, "learning_rate": 7.657719955264777e-06, "loss": 0.2292, "num_input_tokens_seen": 15314144, "step": 25105 }, { "epoch": 7.790878063915606, "grad_norm": 0.10954410582780838, "learning_rate": 7.656573140950823e-06, "loss": 0.232, "num_input_tokens_seen": 15316928, "step": 25110 }, { "epoch": 7.792429413589823, "grad_norm": 0.16228340566158295, "learning_rate": 7.655426131878596e-06, "loss": 0.2283, "num_input_tokens_seen": 15319776, "step": 25115 }, { "epoch": 7.79398076326404, "grad_norm": 0.15930472314357758, "learning_rate": 7.65427892813219e-06, "loss": 0.2336, "num_input_tokens_seen": 15322688, "step": 25120 }, { "epoch": 7.795532112938257, "grad_norm": 0.18931499123573303, "learning_rate": 7.653131529795708e-06, "loss": 0.2294, "num_input_tokens_seen": 15327072, "step": 25125 }, { "epoch": 7.7970834626124725, "grad_norm": 0.184946209192276, "learning_rate": 7.651983936953266e-06, "loss": 0.2309, "num_input_tokens_seen": 15330496, "step": 25130 }, { "epoch": 7.798634812286689, "grad_norm": 0.18720702826976776, "learning_rate": 7.650836149688997e-06, "loss": 0.2313, "num_input_tokens_seen": 15332832, "step": 25135 }, { "epoch": 7.800186161960906, "grad_norm": 0.1691213846206665, "learning_rate": 7.649688168087049e-06, "loss": 0.2265, "num_input_tokens_seen": 15335456, "step": 25140 }, { "epoch": 7.801737511635123, "grad_norm": 0.13862623274326324, "learning_rate": 7.64853999223158e-06, "loss": 0.2308, "num_input_tokens_seen": 15337888, "step": 25145 }, { "epoch": 7.803288861309339, "grad_norm": 0.15442609786987305, "learning_rate": 7.647391622206767e-06, "loss": 0.2259, "num_input_tokens_seen": 15340960, "step": 25150 }, { "epoch": 7.8048402109835555, "grad_norm": 0.12844164669513702, "learning_rate": 7.646243058096798e-06, "loss": 0.2271, "num_input_tokens_seen": 15343968, "step": 25155 }, { "epoch": 7.806391560657772, "grad_norm": 0.3534455895423889, "learning_rate": 7.645094299985877e-06, "loss": 0.2301, "num_input_tokens_seen": 15346656, "step": 25160 }, { "epoch": 7.807942910331989, "grad_norm": 0.4028797149658203, "learning_rate": 7.64394534795822e-06, "loss": 0.2314, "num_input_tokens_seen": 15350528, "step": 25165 }, { "epoch": 7.809494260006206, "grad_norm": 0.17917446792125702, "learning_rate": 7.642796202098061e-06, "loss": 0.2295, "num_input_tokens_seen": 15353280, "step": 25170 }, { "epoch": 7.811045609680422, "grad_norm": 0.1309632658958435, "learning_rate": 7.641646862489644e-06, "loss": 0.2303, "num_input_tokens_seen": 15355936, "step": 25175 }, { "epoch": 7.8125969593546385, "grad_norm": 0.3219758868217468, "learning_rate": 7.640497329217232e-06, "loss": 0.233, "num_input_tokens_seen": 15358784, "step": 25180 }, { "epoch": 7.814148309028855, "grad_norm": 0.21982085704803467, "learning_rate": 7.639347602365095e-06, "loss": 0.2325, "num_input_tokens_seen": 15361184, "step": 25185 }, { "epoch": 7.815699658703072, "grad_norm": 0.18660326302051544, "learning_rate": 7.638197682017527e-06, "loss": 0.2357, "num_input_tokens_seen": 15365088, "step": 25190 }, { "epoch": 7.817251008377288, "grad_norm": 0.25446152687072754, "learning_rate": 7.637047568258826e-06, "loss": 0.227, "num_input_tokens_seen": 15367328, "step": 25195 }, { "epoch": 7.818802358051505, "grad_norm": 0.23468028008937836, "learning_rate": 7.635897261173313e-06, "loss": 0.2279, "num_input_tokens_seen": 15370400, "step": 25200 }, { "epoch": 7.8203537077257215, "grad_norm": 0.338684618473053, "learning_rate": 7.634746760845314e-06, "loss": 0.236, "num_input_tokens_seen": 15372736, "step": 25205 }, { "epoch": 7.821905057399938, "grad_norm": 0.394020676612854, "learning_rate": 7.633596067359182e-06, "loss": 0.2312, "num_input_tokens_seen": 15375328, "step": 25210 }, { "epoch": 7.823456407074154, "grad_norm": 0.1931363344192505, "learning_rate": 7.632445180799269e-06, "loss": 0.2316, "num_input_tokens_seen": 15377920, "step": 25215 }, { "epoch": 7.825007756748371, "grad_norm": 0.29158708453178406, "learning_rate": 7.63129410124995e-06, "loss": 0.2334, "num_input_tokens_seen": 15380992, "step": 25220 }, { "epoch": 7.826559106422588, "grad_norm": 0.15512549877166748, "learning_rate": 7.630142828795616e-06, "loss": 0.233, "num_input_tokens_seen": 15383232, "step": 25225 }, { "epoch": 7.8281104560968044, "grad_norm": 0.17186513543128967, "learning_rate": 7.6289913635206655e-06, "loss": 0.2332, "num_input_tokens_seen": 15386304, "step": 25230 }, { "epoch": 7.829661805771021, "grad_norm": 0.15924352407455444, "learning_rate": 7.627839705509519e-06, "loss": 0.2336, "num_input_tokens_seen": 15389312, "step": 25235 }, { "epoch": 7.831213155445237, "grad_norm": 0.08963629603385925, "learning_rate": 7.626687854846601e-06, "loss": 0.2279, "num_input_tokens_seen": 15391872, "step": 25240 }, { "epoch": 7.832764505119454, "grad_norm": 0.25911226868629456, "learning_rate": 7.62553581161636e-06, "loss": 0.2367, "num_input_tokens_seen": 15395392, "step": 25245 }, { "epoch": 7.834315854793671, "grad_norm": 0.11897850036621094, "learning_rate": 7.624383575903253e-06, "loss": 0.2372, "num_input_tokens_seen": 15398176, "step": 25250 }, { "epoch": 7.835867204467887, "grad_norm": 0.12064094096422195, "learning_rate": 7.623231147791752e-06, "loss": 0.2328, "num_input_tokens_seen": 15400896, "step": 25255 }, { "epoch": 7.837418554142103, "grad_norm": 0.22955453395843506, "learning_rate": 7.622078527366343e-06, "loss": 0.2291, "num_input_tokens_seen": 15404480, "step": 25260 }, { "epoch": 7.83896990381632, "grad_norm": 0.1932182013988495, "learning_rate": 7.62092571471153e-06, "loss": 0.2355, "num_input_tokens_seen": 15407680, "step": 25265 }, { "epoch": 7.840521253490537, "grad_norm": 0.23612764477729797, "learning_rate": 7.619772709911825e-06, "loss": 0.2361, "num_input_tokens_seen": 15410528, "step": 25270 }, { "epoch": 7.842072603164754, "grad_norm": 0.16626912355422974, "learning_rate": 7.618619513051757e-06, "loss": 0.2318, "num_input_tokens_seen": 15415936, "step": 25275 }, { "epoch": 7.8436239528389695, "grad_norm": 0.1315423548221588, "learning_rate": 7.617466124215871e-06, "loss": 0.2303, "num_input_tokens_seen": 15420000, "step": 25280 }, { "epoch": 7.845175302513186, "grad_norm": 0.15306957066059113, "learning_rate": 7.616312543488722e-06, "loss": 0.2286, "num_input_tokens_seen": 15422496, "step": 25285 }, { "epoch": 7.846726652187403, "grad_norm": 0.2122044414281845, "learning_rate": 7.615158770954883e-06, "loss": 0.2351, "num_input_tokens_seen": 15425024, "step": 25290 }, { "epoch": 7.84827800186162, "grad_norm": 0.11802154034376144, "learning_rate": 7.614004806698936e-06, "loss": 0.2275, "num_input_tokens_seen": 15428096, "step": 25295 }, { "epoch": 7.849829351535837, "grad_norm": 0.09872746467590332, "learning_rate": 7.6128506508054825e-06, "loss": 0.2294, "num_input_tokens_seen": 15431040, "step": 25300 }, { "epoch": 7.8513807012100525, "grad_norm": 0.1517445594072342, "learning_rate": 7.611696303359136e-06, "loss": 0.2352, "num_input_tokens_seen": 15433888, "step": 25305 }, { "epoch": 7.852932050884269, "grad_norm": 0.152143657207489, "learning_rate": 7.6105417644445234e-06, "loss": 0.2383, "num_input_tokens_seen": 15436704, "step": 25310 }, { "epoch": 7.854483400558486, "grad_norm": 0.13229425251483917, "learning_rate": 7.609387034146286e-06, "loss": 0.2353, "num_input_tokens_seen": 15438592, "step": 25315 }, { "epoch": 7.856034750232703, "grad_norm": 0.06777731329202652, "learning_rate": 7.608232112549079e-06, "loss": 0.228, "num_input_tokens_seen": 15441600, "step": 25320 }, { "epoch": 7.857586099906919, "grad_norm": 0.1369994878768921, "learning_rate": 7.607076999737571e-06, "loss": 0.2311, "num_input_tokens_seen": 15444224, "step": 25325 }, { "epoch": 7.8591374495811355, "grad_norm": 0.08885177224874496, "learning_rate": 7.605921695796448e-06, "loss": 0.2213, "num_input_tokens_seen": 15446944, "step": 25330 }, { "epoch": 7.860688799255352, "grad_norm": 0.21891625225543976, "learning_rate": 7.604766200810406e-06, "loss": 0.2307, "num_input_tokens_seen": 15449952, "step": 25335 }, { "epoch": 7.862240148929569, "grad_norm": 0.11295905709266663, "learning_rate": 7.603610514864156e-06, "loss": 0.231, "num_input_tokens_seen": 15452448, "step": 25340 }, { "epoch": 7.863791498603785, "grad_norm": 0.234029158949852, "learning_rate": 7.602454638042425e-06, "loss": 0.2312, "num_input_tokens_seen": 15456576, "step": 25345 }, { "epoch": 7.865342848278002, "grad_norm": 0.2609771192073822, "learning_rate": 7.601298570429949e-06, "loss": 0.2341, "num_input_tokens_seen": 15460480, "step": 25350 }, { "epoch": 7.8668941979522184, "grad_norm": 0.09336016327142715, "learning_rate": 7.600142312111486e-06, "loss": 0.2336, "num_input_tokens_seen": 15463552, "step": 25355 }, { "epoch": 7.868445547626435, "grad_norm": 0.09735685586929321, "learning_rate": 7.598985863171801e-06, "loss": 0.2378, "num_input_tokens_seen": 15466400, "step": 25360 }, { "epoch": 7.869996897300652, "grad_norm": 0.2025601714849472, "learning_rate": 7.597829223695674e-06, "loss": 0.23, "num_input_tokens_seen": 15469056, "step": 25365 }, { "epoch": 7.871548246974868, "grad_norm": 0.0682184025645256, "learning_rate": 7.596672393767903e-06, "loss": 0.227, "num_input_tokens_seen": 15471360, "step": 25370 }, { "epoch": 7.873099596649085, "grad_norm": 0.06689868122339249, "learning_rate": 7.595515373473297e-06, "loss": 0.2244, "num_input_tokens_seen": 15475424, "step": 25375 }, { "epoch": 7.874650946323301, "grad_norm": 0.12056756019592285, "learning_rate": 7.594358162896678e-06, "loss": 0.2318, "num_input_tokens_seen": 15478368, "step": 25380 }, { "epoch": 7.876202295997518, "grad_norm": 0.16365955770015717, "learning_rate": 7.593200762122883e-06, "loss": 0.2311, "num_input_tokens_seen": 15481760, "step": 25385 }, { "epoch": 7.877753645671734, "grad_norm": 0.12067107111215591, "learning_rate": 7.592043171236765e-06, "loss": 0.2299, "num_input_tokens_seen": 15484608, "step": 25390 }, { "epoch": 7.879304995345951, "grad_norm": 0.09530454874038696, "learning_rate": 7.590885390323188e-06, "loss": 0.2281, "num_input_tokens_seen": 15487808, "step": 25395 }, { "epoch": 7.880856345020168, "grad_norm": 0.1071595773100853, "learning_rate": 7.589727419467032e-06, "loss": 0.2253, "num_input_tokens_seen": 15491296, "step": 25400 }, { "epoch": 7.882407694694384, "grad_norm": 0.19320310652256012, "learning_rate": 7.588569258753188e-06, "loss": 0.2299, "num_input_tokens_seen": 15493760, "step": 25405 }, { "epoch": 7.8839590443686, "grad_norm": 0.21586139500141144, "learning_rate": 7.587410908266564e-06, "loss": 0.2291, "num_input_tokens_seen": 15496768, "step": 25410 }, { "epoch": 7.885510394042817, "grad_norm": 0.09314523637294769, "learning_rate": 7.5862523680920815e-06, "loss": 0.2262, "num_input_tokens_seen": 15499936, "step": 25415 }, { "epoch": 7.887061743717034, "grad_norm": 0.2224491536617279, "learning_rate": 7.585093638314676e-06, "loss": 0.2293, "num_input_tokens_seen": 15502784, "step": 25420 }, { "epoch": 7.888613093391251, "grad_norm": 0.08650266379117966, "learning_rate": 7.583934719019293e-06, "loss": 0.2302, "num_input_tokens_seen": 15505536, "step": 25425 }, { "epoch": 7.890164443065467, "grad_norm": 0.11246564984321594, "learning_rate": 7.582775610290896e-06, "loss": 0.2264, "num_input_tokens_seen": 15508224, "step": 25430 }, { "epoch": 7.891715792739683, "grad_norm": 0.1364181935787201, "learning_rate": 7.581616312214464e-06, "loss": 0.2254, "num_input_tokens_seen": 15512160, "step": 25435 }, { "epoch": 7.8932671424139, "grad_norm": 0.13656413555145264, "learning_rate": 7.580456824874984e-06, "loss": 0.2277, "num_input_tokens_seen": 15514528, "step": 25440 }, { "epoch": 7.894818492088117, "grad_norm": 0.1624877005815506, "learning_rate": 7.579297148357462e-06, "loss": 0.2331, "num_input_tokens_seen": 15519328, "step": 25445 }, { "epoch": 7.896369841762334, "grad_norm": 0.16008783876895905, "learning_rate": 7.578137282746916e-06, "loss": 0.2348, "num_input_tokens_seen": 15522304, "step": 25450 }, { "epoch": 7.8979211914365495, "grad_norm": 0.1729588359594345, "learning_rate": 7.576977228128377e-06, "loss": 0.2271, "num_input_tokens_seen": 15525472, "step": 25455 }, { "epoch": 7.899472541110766, "grad_norm": 0.15923143923282623, "learning_rate": 7.5758169845868915e-06, "loss": 0.2303, "num_input_tokens_seen": 15528416, "step": 25460 }, { "epoch": 7.901023890784983, "grad_norm": 0.10974475741386414, "learning_rate": 7.574656552207518e-06, "loss": 0.225, "num_input_tokens_seen": 15530784, "step": 25465 }, { "epoch": 7.9025752404592, "grad_norm": 0.228928342461586, "learning_rate": 7.573495931075332e-06, "loss": 0.2334, "num_input_tokens_seen": 15533696, "step": 25470 }, { "epoch": 7.904126590133416, "grad_norm": 0.3234756588935852, "learning_rate": 7.572335121275418e-06, "loss": 0.2334, "num_input_tokens_seen": 15536768, "step": 25475 }, { "epoch": 7.9056779398076324, "grad_norm": 0.3569674789905548, "learning_rate": 7.571174122892881e-06, "loss": 0.235, "num_input_tokens_seen": 15540128, "step": 25480 }, { "epoch": 7.907229289481849, "grad_norm": 0.22977541387081146, "learning_rate": 7.5700129360128316e-06, "loss": 0.2364, "num_input_tokens_seen": 15543712, "step": 25485 }, { "epoch": 7.908780639156066, "grad_norm": 0.25056251883506775, "learning_rate": 7.568851560720401e-06, "loss": 0.231, "num_input_tokens_seen": 15546592, "step": 25490 }, { "epoch": 7.910331988830283, "grad_norm": 0.21702833473682404, "learning_rate": 7.567689997100731e-06, "loss": 0.2319, "num_input_tokens_seen": 15549280, "step": 25495 }, { "epoch": 7.911883338504499, "grad_norm": 0.09092256426811218, "learning_rate": 7.566528245238978e-06, "loss": 0.2272, "num_input_tokens_seen": 15551616, "step": 25500 }, { "epoch": 7.913434688178715, "grad_norm": 0.2193065732717514, "learning_rate": 7.565366305220312e-06, "loss": 0.2282, "num_input_tokens_seen": 15554912, "step": 25505 }, { "epoch": 7.914986037852932, "grad_norm": 0.20021820068359375, "learning_rate": 7.564204177129919e-06, "loss": 0.2285, "num_input_tokens_seen": 15557600, "step": 25510 }, { "epoch": 7.916537387527149, "grad_norm": 0.20298877358436584, "learning_rate": 7.563041861052993e-06, "loss": 0.2186, "num_input_tokens_seen": 15559936, "step": 25515 }, { "epoch": 7.918088737201365, "grad_norm": 0.24464190006256104, "learning_rate": 7.56187935707475e-06, "loss": 0.222, "num_input_tokens_seen": 15564032, "step": 25520 }, { "epoch": 7.919640086875582, "grad_norm": 0.1722223311662674, "learning_rate": 7.560716665280412e-06, "loss": 0.2292, "num_input_tokens_seen": 15566432, "step": 25525 }, { "epoch": 7.921191436549798, "grad_norm": 0.16593347489833832, "learning_rate": 7.559553785755218e-06, "loss": 0.2433, "num_input_tokens_seen": 15570144, "step": 25530 }, { "epoch": 7.922742786224015, "grad_norm": 0.24045346677303314, "learning_rate": 7.558390718584423e-06, "loss": 0.2286, "num_input_tokens_seen": 15573120, "step": 25535 }, { "epoch": 7.924294135898231, "grad_norm": 0.16213375329971313, "learning_rate": 7.557227463853291e-06, "loss": 0.2285, "num_input_tokens_seen": 15576256, "step": 25540 }, { "epoch": 7.925845485572448, "grad_norm": 0.2413311004638672, "learning_rate": 7.556064021647105e-06, "loss": 0.2319, "num_input_tokens_seen": 15579552, "step": 25545 }, { "epoch": 7.927396835246665, "grad_norm": 0.20921500027179718, "learning_rate": 7.554900392051157e-06, "loss": 0.2191, "num_input_tokens_seen": 15582336, "step": 25550 }, { "epoch": 7.928948184920881, "grad_norm": 0.20700904726982117, "learning_rate": 7.553736575150757e-06, "loss": 0.2315, "num_input_tokens_seen": 15586176, "step": 25555 }, { "epoch": 7.930499534595098, "grad_norm": 0.20337076485157013, "learning_rate": 7.552572571031225e-06, "loss": 0.2266, "num_input_tokens_seen": 15589536, "step": 25560 }, { "epoch": 7.932050884269314, "grad_norm": 0.217006117105484, "learning_rate": 7.551408379777896e-06, "loss": 0.2363, "num_input_tokens_seen": 15592160, "step": 25565 }, { "epoch": 7.933602233943531, "grad_norm": 0.12346608936786652, "learning_rate": 7.550244001476119e-06, "loss": 0.2333, "num_input_tokens_seen": 15594464, "step": 25570 }, { "epoch": 7.935153583617748, "grad_norm": 0.27437058091163635, "learning_rate": 7.54907943621126e-06, "loss": 0.2209, "num_input_tokens_seen": 15597984, "step": 25575 }, { "epoch": 7.936704933291964, "grad_norm": 0.28545889258384705, "learning_rate": 7.547914684068689e-06, "loss": 0.24, "num_input_tokens_seen": 15602080, "step": 25580 }, { "epoch": 7.93825628296618, "grad_norm": 0.15441899001598358, "learning_rate": 7.546749745133802e-06, "loss": 0.2239, "num_input_tokens_seen": 15605088, "step": 25585 }, { "epoch": 7.939807632640397, "grad_norm": 0.3536722660064697, "learning_rate": 7.545584619492001e-06, "loss": 0.2359, "num_input_tokens_seen": 15607840, "step": 25590 }, { "epoch": 7.941358982314614, "grad_norm": 0.11254031211137772, "learning_rate": 7.544419307228703e-06, "loss": 0.2295, "num_input_tokens_seen": 15611264, "step": 25595 }, { "epoch": 7.942910331988831, "grad_norm": 0.23511123657226562, "learning_rate": 7.5432538084293385e-06, "loss": 0.2253, "num_input_tokens_seen": 15614336, "step": 25600 }, { "epoch": 7.9444616816630464, "grad_norm": 0.18119822442531586, "learning_rate": 7.5420881231793545e-06, "loss": 0.2367, "num_input_tokens_seen": 15616640, "step": 25605 }, { "epoch": 7.946013031337263, "grad_norm": 0.2260570526123047, "learning_rate": 7.5409222515642065e-06, "loss": 0.2182, "num_input_tokens_seen": 15620448, "step": 25610 }, { "epoch": 7.94756438101148, "grad_norm": 0.23401334881782532, "learning_rate": 7.5397561936693695e-06, "loss": 0.2282, "num_input_tokens_seen": 15624672, "step": 25615 }, { "epoch": 7.949115730685697, "grad_norm": 0.2726004123687744, "learning_rate": 7.5385899495803285e-06, "loss": 0.2253, "num_input_tokens_seen": 15627616, "step": 25620 }, { "epoch": 7.9506670803599135, "grad_norm": 0.41507479548454285, "learning_rate": 7.537423519382581e-06, "loss": 0.2384, "num_input_tokens_seen": 15630304, "step": 25625 }, { "epoch": 7.952218430034129, "grad_norm": 0.150675967335701, "learning_rate": 7.536256903161644e-06, "loss": 0.2247, "num_input_tokens_seen": 15632640, "step": 25630 }, { "epoch": 7.953769779708346, "grad_norm": 0.18027502298355103, "learning_rate": 7.5350901010030405e-06, "loss": 0.2234, "num_input_tokens_seen": 15635520, "step": 25635 }, { "epoch": 7.955321129382563, "grad_norm": 0.14754945039749146, "learning_rate": 7.533923112992314e-06, "loss": 0.2133, "num_input_tokens_seen": 15639264, "step": 25640 }, { "epoch": 7.95687247905678, "grad_norm": 0.32126033306121826, "learning_rate": 7.532755939215017e-06, "loss": 0.2322, "num_input_tokens_seen": 15641792, "step": 25645 }, { "epoch": 7.958423828730996, "grad_norm": 0.264165461063385, "learning_rate": 7.5315885797567166e-06, "loss": 0.2316, "num_input_tokens_seen": 15644800, "step": 25650 }, { "epoch": 7.959975178405212, "grad_norm": 0.20604294538497925, "learning_rate": 7.530421034702996e-06, "loss": 0.2571, "num_input_tokens_seen": 15647552, "step": 25655 }, { "epoch": 7.961526528079429, "grad_norm": 0.11844131350517273, "learning_rate": 7.529253304139448e-06, "loss": 0.2334, "num_input_tokens_seen": 15650112, "step": 25660 }, { "epoch": 7.963077877753646, "grad_norm": 0.14687079191207886, "learning_rate": 7.528085388151682e-06, "loss": 0.2467, "num_input_tokens_seen": 15653056, "step": 25665 }, { "epoch": 7.964629227427862, "grad_norm": 0.26534906029701233, "learning_rate": 7.526917286825322e-06, "loss": 0.2153, "num_input_tokens_seen": 15656960, "step": 25670 }, { "epoch": 7.966180577102079, "grad_norm": 0.11624165624380112, "learning_rate": 7.525749000245998e-06, "loss": 0.229, "num_input_tokens_seen": 15659872, "step": 25675 }, { "epoch": 7.967731926776295, "grad_norm": 0.17863576114177704, "learning_rate": 7.524580528499365e-06, "loss": 0.2291, "num_input_tokens_seen": 15663232, "step": 25680 }, { "epoch": 7.969283276450512, "grad_norm": 0.2918952405452728, "learning_rate": 7.523411871671085e-06, "loss": 0.2301, "num_input_tokens_seen": 15665600, "step": 25685 }, { "epoch": 7.970834626124729, "grad_norm": 0.14707617461681366, "learning_rate": 7.522243029846832e-06, "loss": 0.2361, "num_input_tokens_seen": 15668672, "step": 25690 }, { "epoch": 7.972385975798945, "grad_norm": 0.22736576199531555, "learning_rate": 7.521074003112295e-06, "loss": 0.2302, "num_input_tokens_seen": 15671872, "step": 25695 }, { "epoch": 7.973937325473162, "grad_norm": 0.1227823868393898, "learning_rate": 7.519904791553182e-06, "loss": 0.2272, "num_input_tokens_seen": 15674208, "step": 25700 }, { "epoch": 7.975488675147378, "grad_norm": 0.14483027160167694, "learning_rate": 7.518735395255208e-06, "loss": 0.2336, "num_input_tokens_seen": 15677088, "step": 25705 }, { "epoch": 7.977040024821595, "grad_norm": 0.263370156288147, "learning_rate": 7.517565814304102e-06, "loss": 0.2295, "num_input_tokens_seen": 15679904, "step": 25710 }, { "epoch": 7.978591374495811, "grad_norm": 0.26031243801116943, "learning_rate": 7.516396048785609e-06, "loss": 0.23, "num_input_tokens_seen": 15682784, "step": 25715 }, { "epoch": 7.980142724170028, "grad_norm": 0.19862699508666992, "learning_rate": 7.5152260987854885e-06, "loss": 0.2306, "num_input_tokens_seen": 15685376, "step": 25720 }, { "epoch": 7.981694073844245, "grad_norm": 0.12785452604293823, "learning_rate": 7.5140559643895095e-06, "loss": 0.2302, "num_input_tokens_seen": 15688000, "step": 25725 }, { "epoch": 7.983245423518461, "grad_norm": 0.1687946617603302, "learning_rate": 7.5128856456834565e-06, "loss": 0.2195, "num_input_tokens_seen": 15690784, "step": 25730 }, { "epoch": 7.984796773192677, "grad_norm": 0.1680743396282196, "learning_rate": 7.5117151427531305e-06, "loss": 0.2248, "num_input_tokens_seen": 15693280, "step": 25735 }, { "epoch": 7.986348122866894, "grad_norm": 0.22372789680957794, "learning_rate": 7.510544455684341e-06, "loss": 0.239, "num_input_tokens_seen": 15696480, "step": 25740 }, { "epoch": 7.987899472541111, "grad_norm": 0.21777154505252838, "learning_rate": 7.509373584562912e-06, "loss": 0.2267, "num_input_tokens_seen": 15700320, "step": 25745 }, { "epoch": 7.9894508222153275, "grad_norm": 0.09696401655673981, "learning_rate": 7.5082025294746854e-06, "loss": 0.2313, "num_input_tokens_seen": 15704736, "step": 25750 }, { "epoch": 7.991002171889544, "grad_norm": 0.08634790033102036, "learning_rate": 7.507031290505511e-06, "loss": 0.2325, "num_input_tokens_seen": 15707872, "step": 25755 }, { "epoch": 7.99255352156376, "grad_norm": 0.16206397116184235, "learning_rate": 7.505859867741255e-06, "loss": 0.228, "num_input_tokens_seen": 15710688, "step": 25760 }, { "epoch": 7.994104871237977, "grad_norm": 0.16627967357635498, "learning_rate": 7.504688261267799e-06, "loss": 0.2337, "num_input_tokens_seen": 15713056, "step": 25765 }, { "epoch": 7.995656220912194, "grad_norm": 0.11261166632175446, "learning_rate": 7.503516471171031e-06, "loss": 0.2323, "num_input_tokens_seen": 15715680, "step": 25770 }, { "epoch": 7.9972075705864105, "grad_norm": 0.13339176774024963, "learning_rate": 7.50234449753686e-06, "loss": 0.2232, "num_input_tokens_seen": 15717920, "step": 25775 }, { "epoch": 7.998758920260626, "grad_norm": 0.15333686769008636, "learning_rate": 7.501172340451206e-06, "loss": 0.2268, "num_input_tokens_seen": 15721792, "step": 25780 }, { "epoch": 8.0, "eval_loss": 0.2319650948047638, "eval_runtime": 34.7016, "eval_samples_per_second": 92.878, "eval_steps_per_second": 23.227, "num_input_tokens_seen": 15724160, "step": 25784 }, { "epoch": 8.000310269934843, "grad_norm": 0.23436994850635529, "learning_rate": 7.500000000000001e-06, "loss": 0.2337, "num_input_tokens_seen": 15724544, "step": 25785 }, { "epoch": 8.001861619609059, "grad_norm": 0.17996498942375183, "learning_rate": 7.4988274762691915e-06, "loss": 0.2303, "num_input_tokens_seen": 15727456, "step": 25790 }, { "epoch": 8.003412969283277, "grad_norm": 0.18309707939624786, "learning_rate": 7.497654769344737e-06, "loss": 0.2362, "num_input_tokens_seen": 15729792, "step": 25795 }, { "epoch": 8.004964318957493, "grad_norm": 0.22316952049732208, "learning_rate": 7.496481879312613e-06, "loss": 0.2318, "num_input_tokens_seen": 15732256, "step": 25800 }, { "epoch": 8.00651566863171, "grad_norm": 0.17440906167030334, "learning_rate": 7.495308806258803e-06, "loss": 0.2227, "num_input_tokens_seen": 15734720, "step": 25805 }, { "epoch": 8.008067018305926, "grad_norm": 0.1419811248779297, "learning_rate": 7.494135550269308e-06, "loss": 0.228, "num_input_tokens_seen": 15737952, "step": 25810 }, { "epoch": 8.009618367980142, "grad_norm": 0.25841137766838074, "learning_rate": 7.492962111430142e-06, "loss": 0.2268, "num_input_tokens_seen": 15740896, "step": 25815 }, { "epoch": 8.01116971765436, "grad_norm": 0.17453117668628693, "learning_rate": 7.491788489827333e-06, "loss": 0.2258, "num_input_tokens_seen": 15743584, "step": 25820 }, { "epoch": 8.012721067328576, "grad_norm": 0.18597741425037384, "learning_rate": 7.490614685546921e-06, "loss": 0.2216, "num_input_tokens_seen": 15746208, "step": 25825 }, { "epoch": 8.014272417002793, "grad_norm": 0.23632439970970154, "learning_rate": 7.489440698674959e-06, "loss": 0.2262, "num_input_tokens_seen": 15748960, "step": 25830 }, { "epoch": 8.01582376667701, "grad_norm": 0.2782953381538391, "learning_rate": 7.4882665292975135e-06, "loss": 0.2299, "num_input_tokens_seen": 15753632, "step": 25835 }, { "epoch": 8.017375116351225, "grad_norm": 0.16898220777511597, "learning_rate": 7.4870921775006665e-06, "loss": 0.2404, "num_input_tokens_seen": 15756384, "step": 25840 }, { "epoch": 8.018926466025443, "grad_norm": 0.3664412200450897, "learning_rate": 7.485917643370512e-06, "loss": 0.2298, "num_input_tokens_seen": 15759232, "step": 25845 }, { "epoch": 8.020477815699659, "grad_norm": 0.18344423174858093, "learning_rate": 7.484742926993156e-06, "loss": 0.2259, "num_input_tokens_seen": 15761824, "step": 25850 }, { "epoch": 8.022029165373874, "grad_norm": 0.211508110165596, "learning_rate": 7.48356802845472e-06, "loss": 0.2312, "num_input_tokens_seen": 15765600, "step": 25855 }, { "epoch": 8.023580515048092, "grad_norm": 0.34333163499832153, "learning_rate": 7.482392947841337e-06, "loss": 0.2243, "num_input_tokens_seen": 15768096, "step": 25860 }, { "epoch": 8.025131864722308, "grad_norm": 0.21685372292995453, "learning_rate": 7.481217685239155e-06, "loss": 0.2298, "num_input_tokens_seen": 15770880, "step": 25865 }, { "epoch": 8.026683214396526, "grad_norm": 0.18814241886138916, "learning_rate": 7.4800422407343355e-06, "loss": 0.2399, "num_input_tokens_seen": 15773760, "step": 25870 }, { "epoch": 8.028234564070742, "grad_norm": 0.3032738268375397, "learning_rate": 7.478866614413052e-06, "loss": 0.2393, "num_input_tokens_seen": 15776992, "step": 25875 }, { "epoch": 8.029785913744957, "grad_norm": 0.3492814004421234, "learning_rate": 7.477690806361489e-06, "loss": 0.2416, "num_input_tokens_seen": 15780128, "step": 25880 }, { "epoch": 8.031337263419175, "grad_norm": 0.3097100853919983, "learning_rate": 7.476514816665852e-06, "loss": 0.2336, "num_input_tokens_seen": 15783232, "step": 25885 }, { "epoch": 8.032888613093391, "grad_norm": 0.1720673143863678, "learning_rate": 7.4753386454123514e-06, "loss": 0.226, "num_input_tokens_seen": 15785888, "step": 25890 }, { "epoch": 8.034439962767609, "grad_norm": 0.11416615545749664, "learning_rate": 7.474162292687216e-06, "loss": 0.2218, "num_input_tokens_seen": 15789280, "step": 25895 }, { "epoch": 8.035991312441825, "grad_norm": 0.19875146448612213, "learning_rate": 7.472985758576686e-06, "loss": 0.2267, "num_input_tokens_seen": 15792928, "step": 25900 }, { "epoch": 8.03754266211604, "grad_norm": 0.11813554912805557, "learning_rate": 7.471809043167016e-06, "loss": 0.2375, "num_input_tokens_seen": 15795936, "step": 25905 }, { "epoch": 8.039094011790258, "grad_norm": 0.1820952594280243, "learning_rate": 7.470632146544472e-06, "loss": 0.2302, "num_input_tokens_seen": 15798592, "step": 25910 }, { "epoch": 8.040645361464474, "grad_norm": 0.15415221452713013, "learning_rate": 7.469455068795336e-06, "loss": 0.2288, "num_input_tokens_seen": 15801632, "step": 25915 }, { "epoch": 8.04219671113869, "grad_norm": 0.12512880563735962, "learning_rate": 7.4682778100058995e-06, "loss": 0.2235, "num_input_tokens_seen": 15805344, "step": 25920 }, { "epoch": 8.043748060812907, "grad_norm": 0.2572554051876068, "learning_rate": 7.467100370262472e-06, "loss": 0.2267, "num_input_tokens_seen": 15808800, "step": 25925 }, { "epoch": 8.045299410487123, "grad_norm": 0.13768988847732544, "learning_rate": 7.465922749651371e-06, "loss": 0.2371, "num_input_tokens_seen": 15811200, "step": 25930 }, { "epoch": 8.046850760161341, "grad_norm": 0.16872698068618774, "learning_rate": 7.464744948258933e-06, "loss": 0.2284, "num_input_tokens_seen": 15813792, "step": 25935 }, { "epoch": 8.048402109835557, "grad_norm": 0.2026432603597641, "learning_rate": 7.463566966171503e-06, "loss": 0.2376, "num_input_tokens_seen": 15817760, "step": 25940 }, { "epoch": 8.049953459509773, "grad_norm": 0.19948454201221466, "learning_rate": 7.46238880347544e-06, "loss": 0.2428, "num_input_tokens_seen": 15824480, "step": 25945 }, { "epoch": 8.05150480918399, "grad_norm": 0.13483461737632751, "learning_rate": 7.46121046025712e-06, "loss": 0.2242, "num_input_tokens_seen": 15827808, "step": 25950 }, { "epoch": 8.053056158858206, "grad_norm": 0.1504707932472229, "learning_rate": 7.460031936602927e-06, "loss": 0.2185, "num_input_tokens_seen": 15831584, "step": 25955 }, { "epoch": 8.054607508532424, "grad_norm": 0.23225542902946472, "learning_rate": 7.4588532325992636e-06, "loss": 0.2329, "num_input_tokens_seen": 15835104, "step": 25960 }, { "epoch": 8.05615885820664, "grad_norm": 0.29095080494880676, "learning_rate": 7.4576743483325395e-06, "loss": 0.2317, "num_input_tokens_seen": 15839200, "step": 25965 }, { "epoch": 8.057710207880856, "grad_norm": 0.22671398520469666, "learning_rate": 7.4564952838891835e-06, "loss": 0.2381, "num_input_tokens_seen": 15841568, "step": 25970 }, { "epoch": 8.059261557555073, "grad_norm": 0.15298643708229065, "learning_rate": 7.455316039355633e-06, "loss": 0.2236, "num_input_tokens_seen": 15844992, "step": 25975 }, { "epoch": 8.06081290722929, "grad_norm": 0.33196157217025757, "learning_rate": 7.454136614818343e-06, "loss": 0.2322, "num_input_tokens_seen": 15847456, "step": 25980 }, { "epoch": 8.062364256903505, "grad_norm": 0.16295810043811798, "learning_rate": 7.452957010363777e-06, "loss": 0.2303, "num_input_tokens_seen": 15850848, "step": 25985 }, { "epoch": 8.063915606577723, "grad_norm": 0.2997060716152191, "learning_rate": 7.451777226078415e-06, "loss": 0.2406, "num_input_tokens_seen": 15854080, "step": 25990 }, { "epoch": 8.065466956251939, "grad_norm": 0.2381281852722168, "learning_rate": 7.450597262048749e-06, "loss": 0.2334, "num_input_tokens_seen": 15856384, "step": 25995 }, { "epoch": 8.067018305926156, "grad_norm": 0.14822810888290405, "learning_rate": 7.449417118361284e-06, "loss": 0.231, "num_input_tokens_seen": 15859200, "step": 26000 }, { "epoch": 8.068569655600372, "grad_norm": 0.15835076570510864, "learning_rate": 7.448236795102537e-06, "loss": 0.2292, "num_input_tokens_seen": 15862432, "step": 26005 }, { "epoch": 8.070121005274588, "grad_norm": 0.13252952694892883, "learning_rate": 7.4470562923590425e-06, "loss": 0.2251, "num_input_tokens_seen": 15864832, "step": 26010 }, { "epoch": 8.071672354948806, "grad_norm": 0.16565664112567902, "learning_rate": 7.445875610217346e-06, "loss": 0.2372, "num_input_tokens_seen": 15867296, "step": 26015 }, { "epoch": 8.073223704623022, "grad_norm": 0.15047290921211243, "learning_rate": 7.4446947487640035e-06, "loss": 0.2317, "num_input_tokens_seen": 15870176, "step": 26020 }, { "epoch": 8.07477505429724, "grad_norm": 0.0824122503399849, "learning_rate": 7.443513708085586e-06, "loss": 0.2244, "num_input_tokens_seen": 15873824, "step": 26025 }, { "epoch": 8.076326403971455, "grad_norm": 0.10544401407241821, "learning_rate": 7.442332488268678e-06, "loss": 0.2306, "num_input_tokens_seen": 15877088, "step": 26030 }, { "epoch": 8.077877753645671, "grad_norm": 0.22210118174552917, "learning_rate": 7.4411510893998766e-06, "loss": 0.2277, "num_input_tokens_seen": 15881216, "step": 26035 }, { "epoch": 8.079429103319889, "grad_norm": 0.09725081920623779, "learning_rate": 7.4399695115657926e-06, "loss": 0.2261, "num_input_tokens_seen": 15883776, "step": 26040 }, { "epoch": 8.080980452994105, "grad_norm": 0.14745065569877625, "learning_rate": 7.4387877548530495e-06, "loss": 0.2279, "num_input_tokens_seen": 15886304, "step": 26045 }, { "epoch": 8.08253180266832, "grad_norm": 0.09440911561250687, "learning_rate": 7.437605819348286e-06, "loss": 0.2315, "num_input_tokens_seen": 15889408, "step": 26050 }, { "epoch": 8.084083152342538, "grad_norm": 0.1417079120874405, "learning_rate": 7.43642370513815e-06, "loss": 0.2365, "num_input_tokens_seen": 15892192, "step": 26055 }, { "epoch": 8.085634502016754, "grad_norm": 0.2149236500263214, "learning_rate": 7.4352414123093045e-06, "loss": 0.2355, "num_input_tokens_seen": 15895232, "step": 26060 }, { "epoch": 8.087185851690972, "grad_norm": 0.1410958170890808, "learning_rate": 7.434058940948424e-06, "loss": 0.234, "num_input_tokens_seen": 15898432, "step": 26065 }, { "epoch": 8.088737201365188, "grad_norm": 0.11933066695928574, "learning_rate": 7.432876291142202e-06, "loss": 0.227, "num_input_tokens_seen": 15900960, "step": 26070 }, { "epoch": 8.090288551039404, "grad_norm": 0.1126594990491867, "learning_rate": 7.431693462977338e-06, "loss": 0.2328, "num_input_tokens_seen": 15903872, "step": 26075 }, { "epoch": 8.091839900713621, "grad_norm": 0.20356014370918274, "learning_rate": 7.430510456540547e-06, "loss": 0.2254, "num_input_tokens_seen": 15906912, "step": 26080 }, { "epoch": 8.093391250387837, "grad_norm": 0.14259135723114014, "learning_rate": 7.429327271918559e-06, "loss": 0.2288, "num_input_tokens_seen": 15910848, "step": 26085 }, { "epoch": 8.094942600062055, "grad_norm": 0.14080476760864258, "learning_rate": 7.428143909198113e-06, "loss": 0.2214, "num_input_tokens_seen": 15913984, "step": 26090 }, { "epoch": 8.09649394973627, "grad_norm": 0.1593744456768036, "learning_rate": 7.426960368465965e-06, "loss": 0.2311, "num_input_tokens_seen": 15916672, "step": 26095 }, { "epoch": 8.098045299410487, "grad_norm": 0.1234503984451294, "learning_rate": 7.425776649808885e-06, "loss": 0.2223, "num_input_tokens_seen": 15919296, "step": 26100 }, { "epoch": 8.099596649084704, "grad_norm": 0.24176786839962006, "learning_rate": 7.424592753313649e-06, "loss": 0.2252, "num_input_tokens_seen": 15922336, "step": 26105 }, { "epoch": 8.10114799875892, "grad_norm": 0.23326237499713898, "learning_rate": 7.423408679067053e-06, "loss": 0.2271, "num_input_tokens_seen": 15926208, "step": 26110 }, { "epoch": 8.102699348433136, "grad_norm": 0.2804640531539917, "learning_rate": 7.422224427155903e-06, "loss": 0.2342, "num_input_tokens_seen": 15929408, "step": 26115 }, { "epoch": 8.104250698107354, "grad_norm": 0.18297509849071503, "learning_rate": 7.421039997667019e-06, "loss": 0.2266, "num_input_tokens_seen": 15933024, "step": 26120 }, { "epoch": 8.10580204778157, "grad_norm": 0.21717464923858643, "learning_rate": 7.419855390687234e-06, "loss": 0.2228, "num_input_tokens_seen": 15936448, "step": 26125 }, { "epoch": 8.107353397455787, "grad_norm": 0.21840061247348785, "learning_rate": 7.418670606303393e-06, "loss": 0.2403, "num_input_tokens_seen": 15939168, "step": 26130 }, { "epoch": 8.108904747130003, "grad_norm": 0.2233307659626007, "learning_rate": 7.417485644602355e-06, "loss": 0.2326, "num_input_tokens_seen": 15942784, "step": 26135 }, { "epoch": 8.110456096804219, "grad_norm": 0.2294934093952179, "learning_rate": 7.4163005056709925e-06, "loss": 0.2367, "num_input_tokens_seen": 15945248, "step": 26140 }, { "epoch": 8.112007446478437, "grad_norm": 0.2544231116771698, "learning_rate": 7.41511518959619e-06, "loss": 0.2182, "num_input_tokens_seen": 15949440, "step": 26145 }, { "epoch": 8.113558796152653, "grad_norm": 0.19105221331119537, "learning_rate": 7.413929696464845e-06, "loss": 0.2379, "num_input_tokens_seen": 15952224, "step": 26150 }, { "epoch": 8.11511014582687, "grad_norm": 0.3656887710094452, "learning_rate": 7.412744026363867e-06, "loss": 0.2306, "num_input_tokens_seen": 15954976, "step": 26155 }, { "epoch": 8.116661495501086, "grad_norm": 0.25168484449386597, "learning_rate": 7.411558179380183e-06, "loss": 0.2353, "num_input_tokens_seen": 15957952, "step": 26160 }, { "epoch": 8.118212845175302, "grad_norm": 0.3379073739051819, "learning_rate": 7.410372155600724e-06, "loss": 0.2225, "num_input_tokens_seen": 15961024, "step": 26165 }, { "epoch": 8.11976419484952, "grad_norm": 0.17587845027446747, "learning_rate": 7.4091859551124454e-06, "loss": 0.2317, "num_input_tokens_seen": 15963808, "step": 26170 }, { "epoch": 8.121315544523735, "grad_norm": 0.28866833448410034, "learning_rate": 7.407999578002307e-06, "loss": 0.2264, "num_input_tokens_seen": 15965984, "step": 26175 }, { "epoch": 8.122866894197951, "grad_norm": 0.2617958188056946, "learning_rate": 7.406813024357284e-06, "loss": 0.2299, "num_input_tokens_seen": 15969312, "step": 26180 }, { "epoch": 8.124418243872169, "grad_norm": 0.17802762985229492, "learning_rate": 7.405626294264366e-06, "loss": 0.2442, "num_input_tokens_seen": 15972032, "step": 26185 }, { "epoch": 8.125969593546385, "grad_norm": 0.16378793120384216, "learning_rate": 7.404439387810553e-06, "loss": 0.241, "num_input_tokens_seen": 15974688, "step": 26190 }, { "epoch": 8.127520943220603, "grad_norm": 0.12447670102119446, "learning_rate": 7.403252305082861e-06, "loss": 0.2246, "num_input_tokens_seen": 15977408, "step": 26195 }, { "epoch": 8.129072292894818, "grad_norm": 0.2886216938495636, "learning_rate": 7.402065046168314e-06, "loss": 0.2244, "num_input_tokens_seen": 15980160, "step": 26200 }, { "epoch": 8.130623642569034, "grad_norm": 0.172555610537529, "learning_rate": 7.4008776111539575e-06, "loss": 0.2409, "num_input_tokens_seen": 15982272, "step": 26205 }, { "epoch": 8.132174992243252, "grad_norm": 0.1566767543554306, "learning_rate": 7.399690000126839e-06, "loss": 0.2355, "num_input_tokens_seen": 15984512, "step": 26210 }, { "epoch": 8.133726341917468, "grad_norm": 0.18985460698604584, "learning_rate": 7.398502213174028e-06, "loss": 0.2314, "num_input_tokens_seen": 15989344, "step": 26215 }, { "epoch": 8.135277691591686, "grad_norm": 0.22628383338451385, "learning_rate": 7.397314250382602e-06, "loss": 0.2333, "num_input_tokens_seen": 15991840, "step": 26220 }, { "epoch": 8.136829041265901, "grad_norm": 0.15380454063415527, "learning_rate": 7.396126111839654e-06, "loss": 0.227, "num_input_tokens_seen": 15994688, "step": 26225 }, { "epoch": 8.138380390940117, "grad_norm": 0.23356840014457703, "learning_rate": 7.394937797632286e-06, "loss": 0.2227, "num_input_tokens_seen": 15997760, "step": 26230 }, { "epoch": 8.139931740614335, "grad_norm": 0.18991920351982117, "learning_rate": 7.3937493078476185e-06, "loss": 0.2333, "num_input_tokens_seen": 16001184, "step": 26235 }, { "epoch": 8.14148309028855, "grad_norm": 0.16016291081905365, "learning_rate": 7.39256064257278e-06, "loss": 0.2286, "num_input_tokens_seen": 16004832, "step": 26240 }, { "epoch": 8.143034439962767, "grad_norm": 0.2010127454996109, "learning_rate": 7.391371801894915e-06, "loss": 0.236, "num_input_tokens_seen": 16007584, "step": 26245 }, { "epoch": 8.144585789636984, "grad_norm": 0.14648742973804474, "learning_rate": 7.3901827859011775e-06, "loss": 0.2286, "num_input_tokens_seen": 16010976, "step": 26250 }, { "epoch": 8.1461371393112, "grad_norm": 0.2682783007621765, "learning_rate": 7.388993594678739e-06, "loss": 0.2306, "num_input_tokens_seen": 16014080, "step": 26255 }, { "epoch": 8.147688488985418, "grad_norm": 0.14864501357078552, "learning_rate": 7.387804228314778e-06, "loss": 0.2309, "num_input_tokens_seen": 16016640, "step": 26260 }, { "epoch": 8.149239838659634, "grad_norm": 0.22494742274284363, "learning_rate": 7.386614686896493e-06, "loss": 0.2228, "num_input_tokens_seen": 16020960, "step": 26265 }, { "epoch": 8.15079118833385, "grad_norm": 0.16897308826446533, "learning_rate": 7.385424970511088e-06, "loss": 0.2258, "num_input_tokens_seen": 16024320, "step": 26270 }, { "epoch": 8.152342538008067, "grad_norm": 0.17129994928836823, "learning_rate": 7.384235079245787e-06, "loss": 0.2286, "num_input_tokens_seen": 16027168, "step": 26275 }, { "epoch": 8.153893887682283, "grad_norm": 0.19255417585372925, "learning_rate": 7.38304501318782e-06, "loss": 0.2254, "num_input_tokens_seen": 16029696, "step": 26280 }, { "epoch": 8.155445237356501, "grad_norm": 0.09794500470161438, "learning_rate": 7.381854772424435e-06, "loss": 0.2335, "num_input_tokens_seen": 16031808, "step": 26285 }, { "epoch": 8.156996587030717, "grad_norm": 0.15353535115718842, "learning_rate": 7.380664357042889e-06, "loss": 0.2196, "num_input_tokens_seen": 16034112, "step": 26290 }, { "epoch": 8.158547936704933, "grad_norm": 0.2617892920970917, "learning_rate": 7.379473767130455e-06, "loss": 0.2295, "num_input_tokens_seen": 16037504, "step": 26295 }, { "epoch": 8.16009928637915, "grad_norm": 0.14386801421642303, "learning_rate": 7.378283002774417e-06, "loss": 0.2254, "num_input_tokens_seen": 16040384, "step": 26300 }, { "epoch": 8.161650636053366, "grad_norm": 0.17767201364040375, "learning_rate": 7.3770920640620715e-06, "loss": 0.2241, "num_input_tokens_seen": 16043200, "step": 26305 }, { "epoch": 8.163201985727582, "grad_norm": 0.21257761120796204, "learning_rate": 7.375900951080728e-06, "loss": 0.2273, "num_input_tokens_seen": 16045728, "step": 26310 }, { "epoch": 8.1647533354018, "grad_norm": 0.15214957296848297, "learning_rate": 7.374709663917711e-06, "loss": 0.2401, "num_input_tokens_seen": 16049472, "step": 26315 }, { "epoch": 8.166304685076016, "grad_norm": 0.17575693130493164, "learning_rate": 7.373518202660352e-06, "loss": 0.2326, "num_input_tokens_seen": 16052992, "step": 26320 }, { "epoch": 8.167856034750233, "grad_norm": 0.163572296500206, "learning_rate": 7.372326567396004e-06, "loss": 0.2305, "num_input_tokens_seen": 16056192, "step": 26325 }, { "epoch": 8.16940738442445, "grad_norm": 0.2716321647167206, "learning_rate": 7.371134758212027e-06, "loss": 0.2382, "num_input_tokens_seen": 16059296, "step": 26330 }, { "epoch": 8.170958734098665, "grad_norm": 0.1866185963153839, "learning_rate": 7.369942775195793e-06, "loss": 0.2382, "num_input_tokens_seen": 16062368, "step": 26335 }, { "epoch": 8.172510083772883, "grad_norm": 0.13812127709388733, "learning_rate": 7.36875061843469e-06, "loss": 0.2283, "num_input_tokens_seen": 16065408, "step": 26340 }, { "epoch": 8.174061433447099, "grad_norm": 0.12537629902362823, "learning_rate": 7.3675582880161155e-06, "loss": 0.2321, "num_input_tokens_seen": 16068416, "step": 26345 }, { "epoch": 8.175612783121316, "grad_norm": 0.21251504123210907, "learning_rate": 7.366365784027483e-06, "loss": 0.2248, "num_input_tokens_seen": 16071456, "step": 26350 }, { "epoch": 8.177164132795532, "grad_norm": 0.15193773806095123, "learning_rate": 7.365173106556216e-06, "loss": 0.2264, "num_input_tokens_seen": 16074400, "step": 26355 }, { "epoch": 8.178715482469748, "grad_norm": 0.18763576447963715, "learning_rate": 7.3639802556897535e-06, "loss": 0.2228, "num_input_tokens_seen": 16077216, "step": 26360 }, { "epoch": 8.180266832143966, "grad_norm": 0.15505550801753998, "learning_rate": 7.362787231515543e-06, "loss": 0.2312, "num_input_tokens_seen": 16079552, "step": 26365 }, { "epoch": 8.181818181818182, "grad_norm": 0.13310225307941437, "learning_rate": 7.3615940341210505e-06, "loss": 0.2307, "num_input_tokens_seen": 16081920, "step": 26370 }, { "epoch": 8.183369531492398, "grad_norm": 0.19544439017772675, "learning_rate": 7.360400663593748e-06, "loss": 0.2227, "num_input_tokens_seen": 16085920, "step": 26375 }, { "epoch": 8.184920881166615, "grad_norm": 0.16131111979484558, "learning_rate": 7.359207120021126e-06, "loss": 0.2283, "num_input_tokens_seen": 16089120, "step": 26380 }, { "epoch": 8.186472230840831, "grad_norm": 0.08544201403856277, "learning_rate": 7.3580134034906855e-06, "loss": 0.2364, "num_input_tokens_seen": 16093824, "step": 26385 }, { "epoch": 8.188023580515049, "grad_norm": 0.15845058858394623, "learning_rate": 7.356819514089939e-06, "loss": 0.2319, "num_input_tokens_seen": 16096736, "step": 26390 }, { "epoch": 8.189574930189265, "grad_norm": 0.1748119592666626, "learning_rate": 7.355625451906414e-06, "loss": 0.2279, "num_input_tokens_seen": 16099392, "step": 26395 }, { "epoch": 8.19112627986348, "grad_norm": 0.17088386416435242, "learning_rate": 7.354431217027649e-06, "loss": 0.2339, "num_input_tokens_seen": 16101888, "step": 26400 }, { "epoch": 8.192677629537698, "grad_norm": 0.17052412033081055, "learning_rate": 7.353236809541193e-06, "loss": 0.2286, "num_input_tokens_seen": 16105152, "step": 26405 }, { "epoch": 8.194228979211914, "grad_norm": 0.36167535185813904, "learning_rate": 7.352042229534615e-06, "loss": 0.2397, "num_input_tokens_seen": 16108608, "step": 26410 }, { "epoch": 8.195780328886132, "grad_norm": 0.14140048623085022, "learning_rate": 7.350847477095486e-06, "loss": 0.2322, "num_input_tokens_seen": 16111520, "step": 26415 }, { "epoch": 8.197331678560348, "grad_norm": 0.19416865706443787, "learning_rate": 7.3496525523114015e-06, "loss": 0.2305, "num_input_tokens_seen": 16115104, "step": 26420 }, { "epoch": 8.198883028234563, "grad_norm": 0.13235223293304443, "learning_rate": 7.348457455269959e-06, "loss": 0.2203, "num_input_tokens_seen": 16118240, "step": 26425 }, { "epoch": 8.200434377908781, "grad_norm": 0.16777634620666504, "learning_rate": 7.347262186058777e-06, "loss": 0.226, "num_input_tokens_seen": 16120864, "step": 26430 }, { "epoch": 8.201985727582997, "grad_norm": 0.2940947413444519, "learning_rate": 7.346066744765481e-06, "loss": 0.2281, "num_input_tokens_seen": 16123424, "step": 26435 }, { "epoch": 8.203537077257213, "grad_norm": 0.17199526727199554, "learning_rate": 7.34487113147771e-06, "loss": 0.2313, "num_input_tokens_seen": 16127616, "step": 26440 }, { "epoch": 8.20508842693143, "grad_norm": 0.20940865576267242, "learning_rate": 7.343675346283118e-06, "loss": 0.2318, "num_input_tokens_seen": 16130400, "step": 26445 }, { "epoch": 8.206639776605646, "grad_norm": 0.3061257600784302, "learning_rate": 7.342479389269369e-06, "loss": 0.2236, "num_input_tokens_seen": 16134272, "step": 26450 }, { "epoch": 8.208191126279864, "grad_norm": 0.26722630858421326, "learning_rate": 7.341283260524142e-06, "loss": 0.2493, "num_input_tokens_seen": 16136800, "step": 26455 }, { "epoch": 8.20974247595408, "grad_norm": 0.38085246086120605, "learning_rate": 7.340086960135127e-06, "loss": 0.2304, "num_input_tokens_seen": 16140992, "step": 26460 }, { "epoch": 8.211293825628296, "grad_norm": 0.2980286478996277, "learning_rate": 7.338890488190027e-06, "loss": 0.2264, "num_input_tokens_seen": 16145120, "step": 26465 }, { "epoch": 8.212845175302514, "grad_norm": 0.27288517355918884, "learning_rate": 7.337693844776559e-06, "loss": 0.2235, "num_input_tokens_seen": 16148320, "step": 26470 }, { "epoch": 8.21439652497673, "grad_norm": 0.24988089501857758, "learning_rate": 7.336497029982448e-06, "loss": 0.2305, "num_input_tokens_seen": 16150944, "step": 26475 }, { "epoch": 8.215947874650947, "grad_norm": 0.14864656329154968, "learning_rate": 7.335300043895438e-06, "loss": 0.2313, "num_input_tokens_seen": 16154784, "step": 26480 }, { "epoch": 8.217499224325163, "grad_norm": 0.21886898577213287, "learning_rate": 7.33410288660328e-06, "loss": 0.2306, "num_input_tokens_seen": 16157440, "step": 26485 }, { "epoch": 8.219050573999379, "grad_norm": 0.2583712637424469, "learning_rate": 7.33290555819374e-06, "loss": 0.2202, "num_input_tokens_seen": 16160768, "step": 26490 }, { "epoch": 8.220601923673597, "grad_norm": 0.48410892486572266, "learning_rate": 7.331708058754598e-06, "loss": 0.2298, "num_input_tokens_seen": 16163296, "step": 26495 }, { "epoch": 8.222153273347812, "grad_norm": 0.2610452473163605, "learning_rate": 7.330510388373643e-06, "loss": 0.2383, "num_input_tokens_seen": 16166240, "step": 26500 }, { "epoch": 8.223704623022028, "grad_norm": 0.2561180591583252, "learning_rate": 7.329312547138678e-06, "loss": 0.2436, "num_input_tokens_seen": 16169472, "step": 26505 }, { "epoch": 8.225255972696246, "grad_norm": 0.2601754665374756, "learning_rate": 7.328114535137523e-06, "loss": 0.2306, "num_input_tokens_seen": 16172320, "step": 26510 }, { "epoch": 8.226807322370462, "grad_norm": 0.15304560959339142, "learning_rate": 7.326916352458e-06, "loss": 0.2288, "num_input_tokens_seen": 16175360, "step": 26515 }, { "epoch": 8.22835867204468, "grad_norm": 0.1507510393857956, "learning_rate": 7.3257179991879555e-06, "loss": 0.2394, "num_input_tokens_seen": 16179136, "step": 26520 }, { "epoch": 8.229910021718895, "grad_norm": 0.19626908004283905, "learning_rate": 7.324519475415241e-06, "loss": 0.2342, "num_input_tokens_seen": 16181920, "step": 26525 }, { "epoch": 8.231461371393111, "grad_norm": 0.1526343822479248, "learning_rate": 7.323320781227723e-06, "loss": 0.2262, "num_input_tokens_seen": 16185120, "step": 26530 }, { "epoch": 8.233012721067329, "grad_norm": 0.1435394585132599, "learning_rate": 7.322121916713278e-06, "loss": 0.2322, "num_input_tokens_seen": 16187296, "step": 26535 }, { "epoch": 8.234564070741545, "grad_norm": 0.21632754802703857, "learning_rate": 7.3209228819598e-06, "loss": 0.2324, "num_input_tokens_seen": 16191168, "step": 26540 }, { "epoch": 8.236115420415763, "grad_norm": 0.13643419742584229, "learning_rate": 7.319723677055191e-06, "loss": 0.2359, "num_input_tokens_seen": 16193728, "step": 26545 }, { "epoch": 8.237666770089978, "grad_norm": 0.20560474693775177, "learning_rate": 7.318524302087366e-06, "loss": 0.2245, "num_input_tokens_seen": 16196576, "step": 26550 }, { "epoch": 8.239218119764194, "grad_norm": 0.2719040513038635, "learning_rate": 7.317324757144254e-06, "loss": 0.23, "num_input_tokens_seen": 16199488, "step": 26555 }, { "epoch": 8.240769469438412, "grad_norm": 0.18603169918060303, "learning_rate": 7.316125042313798e-06, "loss": 0.2211, "num_input_tokens_seen": 16202496, "step": 26560 }, { "epoch": 8.242320819112628, "grad_norm": 0.24558725953102112, "learning_rate": 7.314925157683947e-06, "loss": 0.2328, "num_input_tokens_seen": 16205472, "step": 26565 }, { "epoch": 8.243872168786844, "grad_norm": 0.29422953724861145, "learning_rate": 7.313725103342671e-06, "loss": 0.2215, "num_input_tokens_seen": 16209184, "step": 26570 }, { "epoch": 8.245423518461061, "grad_norm": 0.1348499357700348, "learning_rate": 7.3125248793779436e-06, "loss": 0.2276, "num_input_tokens_seen": 16212960, "step": 26575 }, { "epoch": 8.246974868135277, "grad_norm": 0.18831193447113037, "learning_rate": 7.31132448587776e-06, "loss": 0.2228, "num_input_tokens_seen": 16215968, "step": 26580 }, { "epoch": 8.248526217809495, "grad_norm": 0.1596958488225937, "learning_rate": 7.310123922930122e-06, "loss": 0.2337, "num_input_tokens_seen": 16218752, "step": 26585 }, { "epoch": 8.25007756748371, "grad_norm": 0.16321326792240143, "learning_rate": 7.308923190623043e-06, "loss": 0.2281, "num_input_tokens_seen": 16220992, "step": 26590 }, { "epoch": 8.251628917157927, "grad_norm": 0.13936667144298553, "learning_rate": 7.307722289044553e-06, "loss": 0.223, "num_input_tokens_seen": 16223648, "step": 26595 }, { "epoch": 8.253180266832144, "grad_norm": 0.16803695261478424, "learning_rate": 7.306521218282691e-06, "loss": 0.2213, "num_input_tokens_seen": 16227104, "step": 26600 }, { "epoch": 8.25473161650636, "grad_norm": 0.16965337097644806, "learning_rate": 7.3053199784255125e-06, "loss": 0.2263, "num_input_tokens_seen": 16229472, "step": 26605 }, { "epoch": 8.256282966180578, "grad_norm": 0.15828171372413635, "learning_rate": 7.304118569561078e-06, "loss": 0.2325, "num_input_tokens_seen": 16231968, "step": 26610 }, { "epoch": 8.257834315854794, "grad_norm": 0.2454203963279724, "learning_rate": 7.30291699177747e-06, "loss": 0.2447, "num_input_tokens_seen": 16235648, "step": 26615 }, { "epoch": 8.25938566552901, "grad_norm": 0.2567875385284424, "learning_rate": 7.301715245162775e-06, "loss": 0.2225, "num_input_tokens_seen": 16239072, "step": 26620 }, { "epoch": 8.260937015203227, "grad_norm": 0.2298262119293213, "learning_rate": 7.300513329805095e-06, "loss": 0.2347, "num_input_tokens_seen": 16242656, "step": 26625 }, { "epoch": 8.262488364877443, "grad_norm": 0.209718257188797, "learning_rate": 7.2993112457925465e-06, "loss": 0.2237, "num_input_tokens_seen": 16245728, "step": 26630 }, { "epoch": 8.264039714551659, "grad_norm": 0.3208390772342682, "learning_rate": 7.298108993213255e-06, "loss": 0.2297, "num_input_tokens_seen": 16249216, "step": 26635 }, { "epoch": 8.265591064225877, "grad_norm": 0.24790294468402863, "learning_rate": 7.296906572155362e-06, "loss": 0.2279, "num_input_tokens_seen": 16251968, "step": 26640 }, { "epoch": 8.267142413900093, "grad_norm": 0.1706364005804062, "learning_rate": 7.295703982707016e-06, "loss": 0.2286, "num_input_tokens_seen": 16255104, "step": 26645 }, { "epoch": 8.26869376357431, "grad_norm": 0.24958059191703796, "learning_rate": 7.294501224956385e-06, "loss": 0.2307, "num_input_tokens_seen": 16260992, "step": 26650 }, { "epoch": 8.270245113248526, "grad_norm": 0.25205183029174805, "learning_rate": 7.293298298991642e-06, "loss": 0.2351, "num_input_tokens_seen": 16264064, "step": 26655 }, { "epoch": 8.271796462922742, "grad_norm": 0.1706303209066391, "learning_rate": 7.292095204900977e-06, "loss": 0.2354, "num_input_tokens_seen": 16267808, "step": 26660 }, { "epoch": 8.27334781259696, "grad_norm": 0.206073597073555, "learning_rate": 7.290891942772592e-06, "loss": 0.2251, "num_input_tokens_seen": 16270048, "step": 26665 }, { "epoch": 8.274899162271176, "grad_norm": 0.13882802426815033, "learning_rate": 7.289688512694699e-06, "loss": 0.2292, "num_input_tokens_seen": 16272480, "step": 26670 }, { "epoch": 8.276450511945393, "grad_norm": 0.21961277723312378, "learning_rate": 7.288484914755524e-06, "loss": 0.2281, "num_input_tokens_seen": 16276576, "step": 26675 }, { "epoch": 8.27800186161961, "grad_norm": 0.2800447344779968, "learning_rate": 7.287281149043305e-06, "loss": 0.2309, "num_input_tokens_seen": 16280032, "step": 26680 }, { "epoch": 8.279553211293825, "grad_norm": 0.22449316084384918, "learning_rate": 7.2860772156462925e-06, "loss": 0.2262, "num_input_tokens_seen": 16282912, "step": 26685 }, { "epoch": 8.281104560968043, "grad_norm": 0.13387702405452728, "learning_rate": 7.284873114652748e-06, "loss": 0.2184, "num_input_tokens_seen": 16285120, "step": 26690 }, { "epoch": 8.282655910642259, "grad_norm": 0.28283730149269104, "learning_rate": 7.2836688461509465e-06, "loss": 0.2309, "num_input_tokens_seen": 16289088, "step": 26695 }, { "epoch": 8.284207260316474, "grad_norm": 0.19991372525691986, "learning_rate": 7.282464410229178e-06, "loss": 0.2239, "num_input_tokens_seen": 16291648, "step": 26700 }, { "epoch": 8.285758609990692, "grad_norm": 0.17973382771015167, "learning_rate": 7.281259806975739e-06, "loss": 0.2412, "num_input_tokens_seen": 16294272, "step": 26705 }, { "epoch": 8.287309959664908, "grad_norm": 0.2889038622379303, "learning_rate": 7.280055036478941e-06, "loss": 0.229, "num_input_tokens_seen": 16296480, "step": 26710 }, { "epoch": 8.288861309339126, "grad_norm": 0.5386566519737244, "learning_rate": 7.278850098827109e-06, "loss": 0.2259, "num_input_tokens_seen": 16298976, "step": 26715 }, { "epoch": 8.290412659013342, "grad_norm": 0.3451460003852844, "learning_rate": 7.27764499410858e-06, "loss": 0.2436, "num_input_tokens_seen": 16301312, "step": 26720 }, { "epoch": 8.291964008687557, "grad_norm": 0.23324424028396606, "learning_rate": 7.276439722411702e-06, "loss": 0.2313, "num_input_tokens_seen": 16304192, "step": 26725 }, { "epoch": 8.293515358361775, "grad_norm": 0.1783752143383026, "learning_rate": 7.275234283824835e-06, "loss": 0.2327, "num_input_tokens_seen": 16306816, "step": 26730 }, { "epoch": 8.295066708035991, "grad_norm": 0.23826158046722412, "learning_rate": 7.274028678436353e-06, "loss": 0.2303, "num_input_tokens_seen": 16310816, "step": 26735 }, { "epoch": 8.296618057710209, "grad_norm": 0.39774638414382935, "learning_rate": 7.272822906334639e-06, "loss": 0.2229, "num_input_tokens_seen": 16314144, "step": 26740 }, { "epoch": 8.298169407384425, "grad_norm": 0.21867786347866058, "learning_rate": 7.271616967608092e-06, "loss": 0.2316, "num_input_tokens_seen": 16316928, "step": 26745 }, { "epoch": 8.29972075705864, "grad_norm": 0.25707530975341797, "learning_rate": 7.270410862345121e-06, "loss": 0.2315, "num_input_tokens_seen": 16319808, "step": 26750 }, { "epoch": 8.301272106732858, "grad_norm": 0.35702162981033325, "learning_rate": 7.269204590634149e-06, "loss": 0.2368, "num_input_tokens_seen": 16324992, "step": 26755 }, { "epoch": 8.302823456407074, "grad_norm": 0.3204318583011627, "learning_rate": 7.267998152563609e-06, "loss": 0.2302, "num_input_tokens_seen": 16328000, "step": 26760 }, { "epoch": 8.304374806081292, "grad_norm": 0.3859688937664032, "learning_rate": 7.266791548221946e-06, "loss": 0.2123, "num_input_tokens_seen": 16331680, "step": 26765 }, { "epoch": 8.305926155755508, "grad_norm": 0.4930415749549866, "learning_rate": 7.265584777697621e-06, "loss": 0.2298, "num_input_tokens_seen": 16334208, "step": 26770 }, { "epoch": 8.307477505429723, "grad_norm": 0.6580550670623779, "learning_rate": 7.2643778410791025e-06, "loss": 0.2245, "num_input_tokens_seen": 16336960, "step": 26775 }, { "epoch": 8.309028855103941, "grad_norm": 0.4609318971633911, "learning_rate": 7.263170738454874e-06, "loss": 0.2179, "num_input_tokens_seen": 16339520, "step": 26780 }, { "epoch": 8.310580204778157, "grad_norm": 0.9440341591835022, "learning_rate": 7.2619634699134315e-06, "loss": 0.2363, "num_input_tokens_seen": 16342592, "step": 26785 }, { "epoch": 8.312131554452373, "grad_norm": 1.3208256959915161, "learning_rate": 7.260756035543279e-06, "loss": 0.2388, "num_input_tokens_seen": 16345472, "step": 26790 }, { "epoch": 8.31368290412659, "grad_norm": 0.5648731589317322, "learning_rate": 7.259548435432941e-06, "loss": 0.2377, "num_input_tokens_seen": 16348128, "step": 26795 }, { "epoch": 8.315234253800806, "grad_norm": 0.3560030460357666, "learning_rate": 7.258340669670942e-06, "loss": 0.2305, "num_input_tokens_seen": 16350720, "step": 26800 }, { "epoch": 8.316785603475024, "grad_norm": 0.4977398216724396, "learning_rate": 7.257132738345831e-06, "loss": 0.2306, "num_input_tokens_seen": 16353984, "step": 26805 }, { "epoch": 8.31833695314924, "grad_norm": 0.3205244541168213, "learning_rate": 7.255924641546163e-06, "loss": 0.2352, "num_input_tokens_seen": 16356320, "step": 26810 }, { "epoch": 8.319888302823456, "grad_norm": 0.48571357131004333, "learning_rate": 7.254716379360503e-06, "loss": 0.2293, "num_input_tokens_seen": 16359008, "step": 26815 }, { "epoch": 8.321439652497673, "grad_norm": 0.3260996639728546, "learning_rate": 7.253507951877432e-06, "loss": 0.2321, "num_input_tokens_seen": 16362176, "step": 26820 }, { "epoch": 8.32299100217189, "grad_norm": 0.44047480821609497, "learning_rate": 7.252299359185544e-06, "loss": 0.2225, "num_input_tokens_seen": 16366208, "step": 26825 }, { "epoch": 8.324542351846105, "grad_norm": 0.3600696325302124, "learning_rate": 7.251090601373441e-06, "loss": 0.2239, "num_input_tokens_seen": 16369152, "step": 26830 }, { "epoch": 8.326093701520323, "grad_norm": 0.491061806678772, "learning_rate": 7.24988167852974e-06, "loss": 0.252, "num_input_tokens_seen": 16372192, "step": 26835 }, { "epoch": 8.327645051194539, "grad_norm": 0.2227727174758911, "learning_rate": 7.248672590743069e-06, "loss": 0.2364, "num_input_tokens_seen": 16375264, "step": 26840 }, { "epoch": 8.329196400868756, "grad_norm": 0.2713332176208496, "learning_rate": 7.2474633381020694e-06, "loss": 0.2383, "num_input_tokens_seen": 16377344, "step": 26845 }, { "epoch": 8.330747750542972, "grad_norm": 0.5219558477401733, "learning_rate": 7.246253920695394e-06, "loss": 0.2394, "num_input_tokens_seen": 16380672, "step": 26850 }, { "epoch": 8.332299100217188, "grad_norm": 0.2389167845249176, "learning_rate": 7.245044338611705e-06, "loss": 0.234, "num_input_tokens_seen": 16383328, "step": 26855 }, { "epoch": 8.333850449891406, "grad_norm": 0.30849987268447876, "learning_rate": 7.243834591939682e-06, "loss": 0.2338, "num_input_tokens_seen": 16385824, "step": 26860 }, { "epoch": 8.335401799565622, "grad_norm": 0.24349892139434814, "learning_rate": 7.242624680768012e-06, "loss": 0.2294, "num_input_tokens_seen": 16390048, "step": 26865 }, { "epoch": 8.33695314923984, "grad_norm": 0.3583610951900482, "learning_rate": 7.241414605185396e-06, "loss": 0.2377, "num_input_tokens_seen": 16393056, "step": 26870 }, { "epoch": 8.338504498914055, "grad_norm": 0.2371511608362198, "learning_rate": 7.240204365280548e-06, "loss": 0.2312, "num_input_tokens_seen": 16395744, "step": 26875 }, { "epoch": 8.340055848588271, "grad_norm": 0.16710954904556274, "learning_rate": 7.2389939611421915e-06, "loss": 0.2297, "num_input_tokens_seen": 16398784, "step": 26880 }, { "epoch": 8.341607198262489, "grad_norm": 0.2341231107711792, "learning_rate": 7.237783392859064e-06, "loss": 0.2376, "num_input_tokens_seen": 16402496, "step": 26885 }, { "epoch": 8.343158547936705, "grad_norm": 0.2895292639732361, "learning_rate": 7.236572660519913e-06, "loss": 0.2364, "num_input_tokens_seen": 16405504, "step": 26890 }, { "epoch": 8.344709897610922, "grad_norm": 0.3683237135410309, "learning_rate": 7.235361764213502e-06, "loss": 0.2372, "num_input_tokens_seen": 16409440, "step": 26895 }, { "epoch": 8.346261247285138, "grad_norm": 0.3043365478515625, "learning_rate": 7.234150704028603e-06, "loss": 0.2366, "num_input_tokens_seen": 16413600, "step": 26900 }, { "epoch": 8.347812596959354, "grad_norm": 0.24609698355197906, "learning_rate": 7.232939480054002e-06, "loss": 0.2295, "num_input_tokens_seen": 16416896, "step": 26905 }, { "epoch": 8.349363946633572, "grad_norm": 0.15371371805667877, "learning_rate": 7.231728092378494e-06, "loss": 0.2242, "num_input_tokens_seen": 16420224, "step": 26910 }, { "epoch": 8.350915296307788, "grad_norm": 0.15739767253398895, "learning_rate": 7.23051654109089e-06, "loss": 0.2314, "num_input_tokens_seen": 16423648, "step": 26915 }, { "epoch": 8.352466645982004, "grad_norm": 0.19958074390888214, "learning_rate": 7.22930482628001e-06, "loss": 0.2186, "num_input_tokens_seen": 16427840, "step": 26920 }, { "epoch": 8.354017995656221, "grad_norm": 0.1648636907339096, "learning_rate": 7.228092948034687e-06, "loss": 0.2305, "num_input_tokens_seen": 16430752, "step": 26925 }, { "epoch": 8.355569345330437, "grad_norm": 0.2390732318162918, "learning_rate": 7.2268809064437675e-06, "loss": 0.2292, "num_input_tokens_seen": 16434848, "step": 26930 }, { "epoch": 8.357120695004655, "grad_norm": 0.24842849373817444, "learning_rate": 7.225668701596107e-06, "loss": 0.2287, "num_input_tokens_seen": 16437824, "step": 26935 }, { "epoch": 8.35867204467887, "grad_norm": 0.17260496318340302, "learning_rate": 7.224456333580574e-06, "loss": 0.2327, "num_input_tokens_seen": 16440512, "step": 26940 }, { "epoch": 8.360223394353087, "grad_norm": 0.1759345531463623, "learning_rate": 7.22324380248605e-06, "loss": 0.2298, "num_input_tokens_seen": 16443072, "step": 26945 }, { "epoch": 8.361774744027304, "grad_norm": 0.2274230569601059, "learning_rate": 7.222031108401429e-06, "loss": 0.2329, "num_input_tokens_seen": 16446720, "step": 26950 }, { "epoch": 8.36332609370152, "grad_norm": 0.2624265253543854, "learning_rate": 7.220818251415614e-06, "loss": 0.2302, "num_input_tokens_seen": 16449376, "step": 26955 }, { "epoch": 8.364877443375736, "grad_norm": 0.24029399454593658, "learning_rate": 7.219605231617524e-06, "loss": 0.2325, "num_input_tokens_seen": 16454112, "step": 26960 }, { "epoch": 8.366428793049954, "grad_norm": 0.18374337255954742, "learning_rate": 7.218392049096085e-06, "loss": 0.2454, "num_input_tokens_seen": 16457504, "step": 26965 }, { "epoch": 8.36798014272417, "grad_norm": 0.29089120030403137, "learning_rate": 7.217178703940241e-06, "loss": 0.2323, "num_input_tokens_seen": 16460256, "step": 26970 }, { "epoch": 8.369531492398387, "grad_norm": 0.3001610338687897, "learning_rate": 7.215965196238941e-06, "loss": 0.2298, "num_input_tokens_seen": 16463136, "step": 26975 }, { "epoch": 8.371082842072603, "grad_norm": 0.13592776656150818, "learning_rate": 7.214751526081152e-06, "loss": 0.232, "num_input_tokens_seen": 16465920, "step": 26980 }, { "epoch": 8.372634191746819, "grad_norm": 0.1794939935207367, "learning_rate": 7.21353769355585e-06, "loss": 0.2336, "num_input_tokens_seen": 16468000, "step": 26985 }, { "epoch": 8.374185541421037, "grad_norm": 0.18350322544574738, "learning_rate": 7.212323698752022e-06, "loss": 0.2265, "num_input_tokens_seen": 16470592, "step": 26990 }, { "epoch": 8.375736891095253, "grad_norm": 0.1851268708705902, "learning_rate": 7.211109541758669e-06, "loss": 0.2263, "num_input_tokens_seen": 16473696, "step": 26995 }, { "epoch": 8.37728824076947, "grad_norm": 0.20768097043037415, "learning_rate": 7.209895222664803e-06, "loss": 0.2284, "num_input_tokens_seen": 16475936, "step": 27000 }, { "epoch": 8.378839590443686, "grad_norm": 0.23373174667358398, "learning_rate": 7.208680741559449e-06, "loss": 0.2328, "num_input_tokens_seen": 16478880, "step": 27005 }, { "epoch": 8.380390940117902, "grad_norm": 0.30493825674057007, "learning_rate": 7.207466098531642e-06, "loss": 0.2275, "num_input_tokens_seen": 16482432, "step": 27010 }, { "epoch": 8.38194228979212, "grad_norm": 0.15171144902706146, "learning_rate": 7.2062512936704284e-06, "loss": 0.2287, "num_input_tokens_seen": 16485088, "step": 27015 }, { "epoch": 8.383493639466336, "grad_norm": 0.21669934689998627, "learning_rate": 7.205036327064869e-06, "loss": 0.228, "num_input_tokens_seen": 16487040, "step": 27020 }, { "epoch": 8.385044989140553, "grad_norm": 0.27667364478111267, "learning_rate": 7.203821198804036e-06, "loss": 0.2306, "num_input_tokens_seen": 16490208, "step": 27025 }, { "epoch": 8.386596338814769, "grad_norm": 0.17635290324687958, "learning_rate": 7.202605908977013e-06, "loss": 0.2287, "num_input_tokens_seen": 16494944, "step": 27030 }, { "epoch": 8.388147688488985, "grad_norm": 0.2456492930650711, "learning_rate": 7.201390457672892e-06, "loss": 0.2347, "num_input_tokens_seen": 16498048, "step": 27035 }, { "epoch": 8.389699038163203, "grad_norm": 0.22428376972675323, "learning_rate": 7.200174844980784e-06, "loss": 0.226, "num_input_tokens_seen": 16500608, "step": 27040 }, { "epoch": 8.391250387837419, "grad_norm": 0.4587500989437103, "learning_rate": 7.198959070989805e-06, "loss": 0.2284, "num_input_tokens_seen": 16504352, "step": 27045 }, { "epoch": 8.392801737511634, "grad_norm": 0.2869911193847656, "learning_rate": 7.197743135789087e-06, "loss": 0.227, "num_input_tokens_seen": 16507264, "step": 27050 }, { "epoch": 8.394353087185852, "grad_norm": 0.2325560748577118, "learning_rate": 7.196527039467772e-06, "loss": 0.236, "num_input_tokens_seen": 16509408, "step": 27055 }, { "epoch": 8.395904436860068, "grad_norm": 0.2018752098083496, "learning_rate": 7.195310782115013e-06, "loss": 0.2217, "num_input_tokens_seen": 16512832, "step": 27060 }, { "epoch": 8.397455786534286, "grad_norm": 0.2400071620941162, "learning_rate": 7.194094363819979e-06, "loss": 0.2369, "num_input_tokens_seen": 16515392, "step": 27065 }, { "epoch": 8.399007136208501, "grad_norm": 0.14386321604251862, "learning_rate": 7.1928777846718454e-06, "loss": 0.2303, "num_input_tokens_seen": 16518624, "step": 27070 }, { "epoch": 8.400558485882717, "grad_norm": 0.24220514297485352, "learning_rate": 7.191661044759804e-06, "loss": 0.2363, "num_input_tokens_seen": 16524224, "step": 27075 }, { "epoch": 8.402109835556935, "grad_norm": 0.21263335645198822, "learning_rate": 7.190444144173052e-06, "loss": 0.2271, "num_input_tokens_seen": 16526624, "step": 27080 }, { "epoch": 8.403661185231151, "grad_norm": 0.15411372482776642, "learning_rate": 7.189227083000807e-06, "loss": 0.2279, "num_input_tokens_seen": 16529280, "step": 27085 }, { "epoch": 8.405212534905367, "grad_norm": 0.2906665503978729, "learning_rate": 7.1880098613322924e-06, "loss": 0.2289, "num_input_tokens_seen": 16532256, "step": 27090 }, { "epoch": 8.406763884579584, "grad_norm": 0.24600495398044586, "learning_rate": 7.186792479256746e-06, "loss": 0.2327, "num_input_tokens_seen": 16535008, "step": 27095 }, { "epoch": 8.4083152342538, "grad_norm": 0.27822333574295044, "learning_rate": 7.185574936863413e-06, "loss": 0.2273, "num_input_tokens_seen": 16538176, "step": 27100 }, { "epoch": 8.409866583928018, "grad_norm": 0.253467321395874, "learning_rate": 7.1843572342415576e-06, "loss": 0.2319, "num_input_tokens_seen": 16542400, "step": 27105 }, { "epoch": 8.411417933602234, "grad_norm": 0.21598154306411743, "learning_rate": 7.183139371480451e-06, "loss": 0.2272, "num_input_tokens_seen": 16545600, "step": 27110 }, { "epoch": 8.41296928327645, "grad_norm": 0.20686808228492737, "learning_rate": 7.181921348669375e-06, "loss": 0.2331, "num_input_tokens_seen": 16547968, "step": 27115 }, { "epoch": 8.414520632950667, "grad_norm": 0.2383982539176941, "learning_rate": 7.180703165897627e-06, "loss": 0.2307, "num_input_tokens_seen": 16550848, "step": 27120 }, { "epoch": 8.416071982624883, "grad_norm": 0.29367363452911377, "learning_rate": 7.179484823254513e-06, "loss": 0.2258, "num_input_tokens_seen": 16554752, "step": 27125 }, { "epoch": 8.417623332299101, "grad_norm": 0.2865016758441925, "learning_rate": 7.178266320829354e-06, "loss": 0.2278, "num_input_tokens_seen": 16556960, "step": 27130 }, { "epoch": 8.419174681973317, "grad_norm": 0.1744803488254547, "learning_rate": 7.177047658711478e-06, "loss": 0.2204, "num_input_tokens_seen": 16558784, "step": 27135 }, { "epoch": 8.420726031647533, "grad_norm": 0.2588542103767395, "learning_rate": 7.175828836990227e-06, "loss": 0.2248, "num_input_tokens_seen": 16561216, "step": 27140 }, { "epoch": 8.42227738132175, "grad_norm": 0.366624653339386, "learning_rate": 7.1746098557549585e-06, "loss": 0.2297, "num_input_tokens_seen": 16563520, "step": 27145 }, { "epoch": 8.423828730995966, "grad_norm": 0.2870407998561859, "learning_rate": 7.1733907150950355e-06, "loss": 0.2374, "num_input_tokens_seen": 16566080, "step": 27150 }, { "epoch": 8.425380080670184, "grad_norm": 0.2581429183483124, "learning_rate": 7.172171415099837e-06, "loss": 0.2311, "num_input_tokens_seen": 16568096, "step": 27155 }, { "epoch": 8.4269314303444, "grad_norm": 0.30829259753227234, "learning_rate": 7.1709519558587516e-06, "loss": 0.2272, "num_input_tokens_seen": 16570368, "step": 27160 }, { "epoch": 8.428482780018616, "grad_norm": 0.268854558467865, "learning_rate": 7.169732337461179e-06, "loss": 0.2238, "num_input_tokens_seen": 16572960, "step": 27165 }, { "epoch": 8.430034129692833, "grad_norm": 0.3122897148132324, "learning_rate": 7.168512559996533e-06, "loss": 0.2246, "num_input_tokens_seen": 16575648, "step": 27170 }, { "epoch": 8.43158547936705, "grad_norm": 0.4114765226840973, "learning_rate": 7.167292623554237e-06, "loss": 0.2359, "num_input_tokens_seen": 16578880, "step": 27175 }, { "epoch": 8.433136829041265, "grad_norm": 0.6818765997886658, "learning_rate": 7.166072528223729e-06, "loss": 0.2171, "num_input_tokens_seen": 16582880, "step": 27180 }, { "epoch": 8.434688178715483, "grad_norm": 0.5752264261245728, "learning_rate": 7.164852274094453e-06, "loss": 0.2465, "num_input_tokens_seen": 16585792, "step": 27185 }, { "epoch": 8.436239528389699, "grad_norm": 0.30204612016677856, "learning_rate": 7.163631861255869e-06, "loss": 0.2154, "num_input_tokens_seen": 16588576, "step": 27190 }, { "epoch": 8.437790878063916, "grad_norm": 0.16671887040138245, "learning_rate": 7.1624112897974485e-06, "loss": 0.2375, "num_input_tokens_seen": 16591296, "step": 27195 }, { "epoch": 8.439342227738132, "grad_norm": 0.23849605023860931, "learning_rate": 7.161190559808675e-06, "loss": 0.2251, "num_input_tokens_seen": 16594304, "step": 27200 }, { "epoch": 8.440893577412348, "grad_norm": 0.21747232973575592, "learning_rate": 7.159969671379039e-06, "loss": 0.2434, "num_input_tokens_seen": 16596640, "step": 27205 }, { "epoch": 8.442444927086566, "grad_norm": 0.614895224571228, "learning_rate": 7.15874862459805e-06, "loss": 0.226, "num_input_tokens_seen": 16599712, "step": 27210 }, { "epoch": 8.443996276760782, "grad_norm": 0.2713693380355835, "learning_rate": 7.157527419555223e-06, "loss": 0.2414, "num_input_tokens_seen": 16602592, "step": 27215 }, { "epoch": 8.445547626434998, "grad_norm": 0.23212499916553497, "learning_rate": 7.156306056340087e-06, "loss": 0.2333, "num_input_tokens_seen": 16605376, "step": 27220 }, { "epoch": 8.447098976109215, "grad_norm": 0.40964168310165405, "learning_rate": 7.155084535042183e-06, "loss": 0.2371, "num_input_tokens_seen": 16610720, "step": 27225 }, { "epoch": 8.448650325783431, "grad_norm": 0.18852487206459045, "learning_rate": 7.1538628557510614e-06, "loss": 0.2273, "num_input_tokens_seen": 16613568, "step": 27230 }, { "epoch": 8.450201675457649, "grad_norm": 0.1609502136707306, "learning_rate": 7.1526410185562875e-06, "loss": 0.2427, "num_input_tokens_seen": 16616416, "step": 27235 }, { "epoch": 8.451753025131865, "grad_norm": 0.14583076536655426, "learning_rate": 7.1514190235474365e-06, "loss": 0.2324, "num_input_tokens_seen": 16619392, "step": 27240 }, { "epoch": 8.45330437480608, "grad_norm": 0.2136511206626892, "learning_rate": 7.150196870814095e-06, "loss": 0.2271, "num_input_tokens_seen": 16622240, "step": 27245 }, { "epoch": 8.454855724480298, "grad_norm": 0.24391117691993713, "learning_rate": 7.148974560445859e-06, "loss": 0.2246, "num_input_tokens_seen": 16625952, "step": 27250 }, { "epoch": 8.456407074154514, "grad_norm": 0.22921493649482727, "learning_rate": 7.147752092532341e-06, "loss": 0.2528, "num_input_tokens_seen": 16628448, "step": 27255 }, { "epoch": 8.457958423828732, "grad_norm": 0.1355157494544983, "learning_rate": 7.146529467163161e-06, "loss": 0.2352, "num_input_tokens_seen": 16630752, "step": 27260 }, { "epoch": 8.459509773502948, "grad_norm": 0.20486697554588318, "learning_rate": 7.1453066844279525e-06, "loss": 0.2271, "num_input_tokens_seen": 16634464, "step": 27265 }, { "epoch": 8.461061123177164, "grad_norm": 0.21626746654510498, "learning_rate": 7.14408374441636e-06, "loss": 0.2363, "num_input_tokens_seen": 16636800, "step": 27270 }, { "epoch": 8.462612472851381, "grad_norm": 0.09924304485321045, "learning_rate": 7.14286064721804e-06, "loss": 0.2315, "num_input_tokens_seen": 16640096, "step": 27275 }, { "epoch": 8.464163822525597, "grad_norm": 0.27519404888153076, "learning_rate": 7.1416373929226565e-06, "loss": 0.2374, "num_input_tokens_seen": 16642944, "step": 27280 }, { "epoch": 8.465715172199815, "grad_norm": 0.14242467284202576, "learning_rate": 7.1404139816198935e-06, "loss": 0.226, "num_input_tokens_seen": 16645824, "step": 27285 }, { "epoch": 8.46726652187403, "grad_norm": 0.16117706894874573, "learning_rate": 7.13919041339944e-06, "loss": 0.2319, "num_input_tokens_seen": 16648672, "step": 27290 }, { "epoch": 8.468817871548247, "grad_norm": 0.16924448311328888, "learning_rate": 7.1379666883509964e-06, "loss": 0.2225, "num_input_tokens_seen": 16651936, "step": 27295 }, { "epoch": 8.470369221222464, "grad_norm": 0.22025848925113678, "learning_rate": 7.1367428065642775e-06, "loss": 0.2272, "num_input_tokens_seen": 16655104, "step": 27300 }, { "epoch": 8.47192057089668, "grad_norm": 0.131989523768425, "learning_rate": 7.135518768129008e-06, "loss": 0.2334, "num_input_tokens_seen": 16658528, "step": 27305 }, { "epoch": 8.473471920570896, "grad_norm": 0.22423246502876282, "learning_rate": 7.134294573134925e-06, "loss": 0.2236, "num_input_tokens_seen": 16661440, "step": 27310 }, { "epoch": 8.475023270245114, "grad_norm": 0.14958104491233826, "learning_rate": 7.133070221671775e-06, "loss": 0.2368, "num_input_tokens_seen": 16663648, "step": 27315 }, { "epoch": 8.47657461991933, "grad_norm": 0.1318279653787613, "learning_rate": 7.131845713829318e-06, "loss": 0.2254, "num_input_tokens_seen": 16666464, "step": 27320 }, { "epoch": 8.478125969593547, "grad_norm": 0.1425129622220993, "learning_rate": 7.130621049697327e-06, "loss": 0.2331, "num_input_tokens_seen": 16670688, "step": 27325 }, { "epoch": 8.479677319267763, "grad_norm": 0.18011926114559174, "learning_rate": 7.129396229365582e-06, "loss": 0.231, "num_input_tokens_seen": 16673632, "step": 27330 }, { "epoch": 8.481228668941979, "grad_norm": 0.2167033851146698, "learning_rate": 7.128171252923877e-06, "loss": 0.2367, "num_input_tokens_seen": 16677120, "step": 27335 }, { "epoch": 8.482780018616197, "grad_norm": 0.11162293702363968, "learning_rate": 7.126946120462018e-06, "loss": 0.229, "num_input_tokens_seen": 16681088, "step": 27340 }, { "epoch": 8.484331368290412, "grad_norm": 0.14032109081745148, "learning_rate": 7.125720832069822e-06, "loss": 0.2266, "num_input_tokens_seen": 16683776, "step": 27345 }, { "epoch": 8.485882717964628, "grad_norm": 0.16283950209617615, "learning_rate": 7.1244953878371155e-06, "loss": 0.2343, "num_input_tokens_seen": 16688992, "step": 27350 }, { "epoch": 8.487434067638846, "grad_norm": 0.20871052145957947, "learning_rate": 7.123269787853741e-06, "loss": 0.2359, "num_input_tokens_seen": 16692672, "step": 27355 }, { "epoch": 8.488985417313062, "grad_norm": 0.08128507435321808, "learning_rate": 7.122044032209548e-06, "loss": 0.2353, "num_input_tokens_seen": 16696000, "step": 27360 }, { "epoch": 8.49053676698728, "grad_norm": 0.25601598620414734, "learning_rate": 7.120818120994397e-06, "loss": 0.2244, "num_input_tokens_seen": 16698400, "step": 27365 }, { "epoch": 8.492088116661495, "grad_norm": 0.16843433678150177, "learning_rate": 7.1195920542981655e-06, "loss": 0.231, "num_input_tokens_seen": 16701440, "step": 27370 }, { "epoch": 8.493639466335711, "grad_norm": 0.14537908136844635, "learning_rate": 7.118365832210735e-06, "loss": 0.2277, "num_input_tokens_seen": 16704256, "step": 27375 }, { "epoch": 8.495190816009929, "grad_norm": 0.17837804555892944, "learning_rate": 7.117139454822004e-06, "loss": 0.2236, "num_input_tokens_seen": 16707488, "step": 27380 }, { "epoch": 8.496742165684145, "grad_norm": 0.1357979029417038, "learning_rate": 7.115912922221881e-06, "loss": 0.2319, "num_input_tokens_seen": 16709888, "step": 27385 }, { "epoch": 8.498293515358363, "grad_norm": 0.25055187940597534, "learning_rate": 7.114686234500284e-06, "loss": 0.2258, "num_input_tokens_seen": 16713856, "step": 27390 }, { "epoch": 8.499844865032578, "grad_norm": 0.20451213419437408, "learning_rate": 7.1134593917471435e-06, "loss": 0.2328, "num_input_tokens_seen": 16716320, "step": 27395 }, { "epoch": 8.501396214706794, "grad_norm": 0.2241506576538086, "learning_rate": 7.112232394052404e-06, "loss": 0.2208, "num_input_tokens_seen": 16720672, "step": 27400 }, { "epoch": 8.502947564381012, "grad_norm": 0.17854857444763184, "learning_rate": 7.1110052415060175e-06, "loss": 0.2311, "num_input_tokens_seen": 16724128, "step": 27405 }, { "epoch": 8.504498914055228, "grad_norm": 0.1539432555437088, "learning_rate": 7.109777934197948e-06, "loss": 0.234, "num_input_tokens_seen": 16727264, "step": 27410 }, { "epoch": 8.506050263729446, "grad_norm": 0.17435097694396973, "learning_rate": 7.108550472218173e-06, "loss": 0.2217, "num_input_tokens_seen": 16730176, "step": 27415 }, { "epoch": 8.507601613403661, "grad_norm": 0.16348817944526672, "learning_rate": 7.1073228556566785e-06, "loss": 0.233, "num_input_tokens_seen": 16732640, "step": 27420 }, { "epoch": 8.509152963077877, "grad_norm": 0.20645304024219513, "learning_rate": 7.106095084603466e-06, "loss": 0.2294, "num_input_tokens_seen": 16736384, "step": 27425 }, { "epoch": 8.510704312752095, "grad_norm": 0.16004586219787598, "learning_rate": 7.104867159148542e-06, "loss": 0.2181, "num_input_tokens_seen": 16739936, "step": 27430 }, { "epoch": 8.51225566242631, "grad_norm": 0.20315033197402954, "learning_rate": 7.103639079381931e-06, "loss": 0.2354, "num_input_tokens_seen": 16743104, "step": 27435 }, { "epoch": 8.513807012100527, "grad_norm": 0.17234405875205994, "learning_rate": 7.102410845393665e-06, "loss": 0.2235, "num_input_tokens_seen": 16745568, "step": 27440 }, { "epoch": 8.515358361774744, "grad_norm": 0.31120485067367554, "learning_rate": 7.1011824572737865e-06, "loss": 0.2239, "num_input_tokens_seen": 16748288, "step": 27445 }, { "epoch": 8.51690971144896, "grad_norm": 0.1710747480392456, "learning_rate": 7.099953915112353e-06, "loss": 0.2222, "num_input_tokens_seen": 16751424, "step": 27450 }, { "epoch": 8.518461061123178, "grad_norm": 0.23477154970169067, "learning_rate": 7.0987252189994295e-06, "loss": 0.2297, "num_input_tokens_seen": 16753792, "step": 27455 }, { "epoch": 8.520012410797394, "grad_norm": 0.2812924385070801, "learning_rate": 7.097496369025094e-06, "loss": 0.2356, "num_input_tokens_seen": 16756960, "step": 27460 }, { "epoch": 8.52156376047161, "grad_norm": 0.1280994862318039, "learning_rate": 7.096267365279439e-06, "loss": 0.2261, "num_input_tokens_seen": 16759616, "step": 27465 }, { "epoch": 8.523115110145827, "grad_norm": 0.18767711520195007, "learning_rate": 7.0950382078525616e-06, "loss": 0.2245, "num_input_tokens_seen": 16762496, "step": 27470 }, { "epoch": 8.524666459820043, "grad_norm": 0.20407885313034058, "learning_rate": 7.093808896834574e-06, "loss": 0.242, "num_input_tokens_seen": 16765184, "step": 27475 }, { "epoch": 8.52621780949426, "grad_norm": 0.19749848544597626, "learning_rate": 7.092579432315601e-06, "loss": 0.2257, "num_input_tokens_seen": 16769984, "step": 27480 }, { "epoch": 8.527769159168477, "grad_norm": 0.1700364649295807, "learning_rate": 7.091349814385775e-06, "loss": 0.2414, "num_input_tokens_seen": 16772640, "step": 27485 }, { "epoch": 8.529320508842693, "grad_norm": 0.19573678076267242, "learning_rate": 7.0901200431352424e-06, "loss": 0.2304, "num_input_tokens_seen": 16775488, "step": 27490 }, { "epoch": 8.53087185851691, "grad_norm": 0.27916255593299866, "learning_rate": 7.08889011865416e-06, "loss": 0.2341, "num_input_tokens_seen": 16777888, "step": 27495 }, { "epoch": 8.532423208191126, "grad_norm": 0.095223568379879, "learning_rate": 7.0876600410326964e-06, "loss": 0.2242, "num_input_tokens_seen": 16780608, "step": 27500 }, { "epoch": 8.533974557865342, "grad_norm": 0.20112520456314087, "learning_rate": 7.08642981036103e-06, "loss": 0.2389, "num_input_tokens_seen": 16784096, "step": 27505 }, { "epoch": 8.53552590753956, "grad_norm": 0.2931216061115265, "learning_rate": 7.085199426729351e-06, "loss": 0.2401, "num_input_tokens_seen": 16786496, "step": 27510 }, { "epoch": 8.537077257213776, "grad_norm": 0.23956196010112762, "learning_rate": 7.08396889022786e-06, "loss": 0.2325, "num_input_tokens_seen": 16788832, "step": 27515 }, { "epoch": 8.538628606887993, "grad_norm": 0.20886555314064026, "learning_rate": 7.082738200946774e-06, "loss": 0.23, "num_input_tokens_seen": 16791488, "step": 27520 }, { "epoch": 8.54017995656221, "grad_norm": 0.20293956995010376, "learning_rate": 7.0815073589763136e-06, "loss": 0.2233, "num_input_tokens_seen": 16795392, "step": 27525 }, { "epoch": 8.541731306236425, "grad_norm": 0.1760815978050232, "learning_rate": 7.080276364406716e-06, "loss": 0.2338, "num_input_tokens_seen": 16798368, "step": 27530 }, { "epoch": 8.543282655910643, "grad_norm": 0.16061033308506012, "learning_rate": 7.079045217328224e-06, "loss": 0.2297, "num_input_tokens_seen": 16800640, "step": 27535 }, { "epoch": 8.544834005584859, "grad_norm": 0.24789799749851227, "learning_rate": 7.0778139178311e-06, "loss": 0.2289, "num_input_tokens_seen": 16803520, "step": 27540 }, { "epoch": 8.546385355259076, "grad_norm": 0.27500513195991516, "learning_rate": 7.0765824660056114e-06, "loss": 0.2297, "num_input_tokens_seen": 16806336, "step": 27545 }, { "epoch": 8.547936704933292, "grad_norm": 0.25790104269981384, "learning_rate": 7.075350861942037e-06, "loss": 0.2364, "num_input_tokens_seen": 16810368, "step": 27550 }, { "epoch": 8.549488054607508, "grad_norm": 0.16226215660572052, "learning_rate": 7.074119105730668e-06, "loss": 0.2247, "num_input_tokens_seen": 16813312, "step": 27555 }, { "epoch": 8.551039404281726, "grad_norm": 0.18155443668365479, "learning_rate": 7.072887197461809e-06, "loss": 0.2281, "num_input_tokens_seen": 16815968, "step": 27560 }, { "epoch": 8.552590753955942, "grad_norm": 0.18942052125930786, "learning_rate": 7.0716551372257705e-06, "loss": 0.2312, "num_input_tokens_seen": 16819200, "step": 27565 }, { "epoch": 8.554142103630157, "grad_norm": 0.19326046109199524, "learning_rate": 7.070422925112877e-06, "loss": 0.2256, "num_input_tokens_seen": 16822944, "step": 27570 }, { "epoch": 8.555693453304375, "grad_norm": 0.17184288799762726, "learning_rate": 7.069190561213467e-06, "loss": 0.2264, "num_input_tokens_seen": 16825888, "step": 27575 }, { "epoch": 8.557244802978591, "grad_norm": 0.17901641130447388, "learning_rate": 7.067958045617886e-06, "loss": 0.2306, "num_input_tokens_seen": 16828576, "step": 27580 }, { "epoch": 8.558796152652809, "grad_norm": 0.18224966526031494, "learning_rate": 7.066725378416492e-06, "loss": 0.2367, "num_input_tokens_seen": 16831168, "step": 27585 }, { "epoch": 8.560347502327025, "grad_norm": 0.2288299947977066, "learning_rate": 7.065492559699653e-06, "loss": 0.2236, "num_input_tokens_seen": 16833728, "step": 27590 }, { "epoch": 8.56189885200124, "grad_norm": 0.2686428427696228, "learning_rate": 7.064259589557752e-06, "loss": 0.2346, "num_input_tokens_seen": 16836864, "step": 27595 }, { "epoch": 8.563450201675458, "grad_norm": 0.2209586501121521, "learning_rate": 7.063026468081178e-06, "loss": 0.2334, "num_input_tokens_seen": 16839776, "step": 27600 }, { "epoch": 8.565001551349674, "grad_norm": 0.3196496069431305, "learning_rate": 7.061793195360334e-06, "loss": 0.2318, "num_input_tokens_seen": 16843712, "step": 27605 }, { "epoch": 8.56655290102389, "grad_norm": 0.1630798727273941, "learning_rate": 7.060559771485633e-06, "loss": 0.231, "num_input_tokens_seen": 16846848, "step": 27610 }, { "epoch": 8.568104250698108, "grad_norm": 0.27491381764411926, "learning_rate": 7.0593261965475e-06, "loss": 0.2312, "num_input_tokens_seen": 16849696, "step": 27615 }, { "epoch": 8.569655600372323, "grad_norm": 0.38152071833610535, "learning_rate": 7.058092470636372e-06, "loss": 0.2382, "num_input_tokens_seen": 16853216, "step": 27620 }, { "epoch": 8.571206950046541, "grad_norm": 0.239314466714859, "learning_rate": 7.056858593842694e-06, "loss": 0.2287, "num_input_tokens_seen": 16857408, "step": 27625 }, { "epoch": 8.572758299720757, "grad_norm": 0.2556200623512268, "learning_rate": 7.055624566256923e-06, "loss": 0.2254, "num_input_tokens_seen": 16861376, "step": 27630 }, { "epoch": 8.574309649394973, "grad_norm": 0.1849883794784546, "learning_rate": 7.05439038796953e-06, "loss": 0.2245, "num_input_tokens_seen": 16863968, "step": 27635 }, { "epoch": 8.57586099906919, "grad_norm": 0.18757231533527374, "learning_rate": 7.0531560590709945e-06, "loss": 0.2249, "num_input_tokens_seen": 16866784, "step": 27640 }, { "epoch": 8.577412348743406, "grad_norm": 0.2646050751209259, "learning_rate": 7.051921579651806e-06, "loss": 0.2319, "num_input_tokens_seen": 16869728, "step": 27645 }, { "epoch": 8.578963698417624, "grad_norm": 0.20779339969158173, "learning_rate": 7.050686949802466e-06, "loss": 0.2308, "num_input_tokens_seen": 16872288, "step": 27650 }, { "epoch": 8.58051504809184, "grad_norm": 0.3132324814796448, "learning_rate": 7.049452169613491e-06, "loss": 0.2167, "num_input_tokens_seen": 16875232, "step": 27655 }, { "epoch": 8.582066397766056, "grad_norm": 0.24726268649101257, "learning_rate": 7.048217239175402e-06, "loss": 0.2311, "num_input_tokens_seen": 16877440, "step": 27660 }, { "epoch": 8.583617747440274, "grad_norm": 0.37140828371047974, "learning_rate": 7.046982158578736e-06, "loss": 0.2329, "num_input_tokens_seen": 16880576, "step": 27665 }, { "epoch": 8.58516909711449, "grad_norm": 0.29025083780288696, "learning_rate": 7.0457469279140364e-06, "loss": 0.2261, "num_input_tokens_seen": 16883552, "step": 27670 }, { "epoch": 8.586720446788707, "grad_norm": 0.1714889407157898, "learning_rate": 7.044511547271862e-06, "loss": 0.2314, "num_input_tokens_seen": 16886400, "step": 27675 }, { "epoch": 8.588271796462923, "grad_norm": 0.4160712659358978, "learning_rate": 7.043276016742781e-06, "loss": 0.2397, "num_input_tokens_seen": 16889600, "step": 27680 }, { "epoch": 8.589823146137139, "grad_norm": 0.19351044297218323, "learning_rate": 7.0420403364173715e-06, "loss": 0.2359, "num_input_tokens_seen": 16891776, "step": 27685 }, { "epoch": 8.591374495811356, "grad_norm": 0.3446459472179413, "learning_rate": 7.0408045063862255e-06, "loss": 0.2401, "num_input_tokens_seen": 16895968, "step": 27690 }, { "epoch": 8.592925845485572, "grad_norm": 0.2429138869047165, "learning_rate": 7.039568526739941e-06, "loss": 0.2305, "num_input_tokens_seen": 16898400, "step": 27695 }, { "epoch": 8.594477195159788, "grad_norm": 0.20235219597816467, "learning_rate": 7.038332397569131e-06, "loss": 0.236, "num_input_tokens_seen": 16902272, "step": 27700 }, { "epoch": 8.596028544834006, "grad_norm": 0.2580605149269104, "learning_rate": 7.0370961189644195e-06, "loss": 0.2395, "num_input_tokens_seen": 16905440, "step": 27705 }, { "epoch": 8.597579894508222, "grad_norm": 0.2137528955936432, "learning_rate": 7.035859691016439e-06, "loss": 0.2281, "num_input_tokens_seen": 16908160, "step": 27710 }, { "epoch": 8.59913124418244, "grad_norm": 0.3897552788257599, "learning_rate": 7.034623113815835e-06, "loss": 0.2331, "num_input_tokens_seen": 16912192, "step": 27715 }, { "epoch": 8.600682593856655, "grad_norm": 0.16415533423423767, "learning_rate": 7.0333863874532636e-06, "loss": 0.2316, "num_input_tokens_seen": 16915232, "step": 27720 }, { "epoch": 8.602233943530871, "grad_norm": 0.21046121418476105, "learning_rate": 7.032149512019392e-06, "loss": 0.2276, "num_input_tokens_seen": 16918528, "step": 27725 }, { "epoch": 8.603785293205089, "grad_norm": 0.2843952476978302, "learning_rate": 7.030912487604895e-06, "loss": 0.2275, "num_input_tokens_seen": 16923232, "step": 27730 }, { "epoch": 8.605336642879305, "grad_norm": 0.2033073753118515, "learning_rate": 7.0296753143004644e-06, "loss": 0.2356, "num_input_tokens_seen": 16925984, "step": 27735 }, { "epoch": 8.60688799255352, "grad_norm": 0.2816694378852844, "learning_rate": 7.028437992196798e-06, "loss": 0.2286, "num_input_tokens_seen": 16929216, "step": 27740 }, { "epoch": 8.608439342227738, "grad_norm": 0.2177019566297531, "learning_rate": 7.027200521384607e-06, "loss": 0.2224, "num_input_tokens_seen": 16933088, "step": 27745 }, { "epoch": 8.609990691901954, "grad_norm": 0.2741051912307739, "learning_rate": 7.025962901954611e-06, "loss": 0.2372, "num_input_tokens_seen": 16936576, "step": 27750 }, { "epoch": 8.611542041576172, "grad_norm": 0.25414472818374634, "learning_rate": 7.024725133997545e-06, "loss": 0.2297, "num_input_tokens_seen": 16940704, "step": 27755 }, { "epoch": 8.613093391250388, "grad_norm": 0.1979799121618271, "learning_rate": 7.023487217604149e-06, "loss": 0.2319, "num_input_tokens_seen": 16943360, "step": 27760 }, { "epoch": 8.614644740924604, "grad_norm": 0.1938708871603012, "learning_rate": 7.02224915286518e-06, "loss": 0.222, "num_input_tokens_seen": 16945984, "step": 27765 }, { "epoch": 8.616196090598821, "grad_norm": 0.21585409343242645, "learning_rate": 7.021010939871398e-06, "loss": 0.2287, "num_input_tokens_seen": 16949056, "step": 27770 }, { "epoch": 8.617747440273037, "grad_norm": 0.19048146903514862, "learning_rate": 7.019772578713583e-06, "loss": 0.2254, "num_input_tokens_seen": 16952256, "step": 27775 }, { "epoch": 8.619298789947255, "grad_norm": 0.22249308228492737, "learning_rate": 7.018534069482521e-06, "loss": 0.2333, "num_input_tokens_seen": 16955200, "step": 27780 }, { "epoch": 8.62085013962147, "grad_norm": 0.2225773185491562, "learning_rate": 7.017295412269009e-06, "loss": 0.2254, "num_input_tokens_seen": 16957824, "step": 27785 }, { "epoch": 8.622401489295687, "grad_norm": 0.19570736587047577, "learning_rate": 7.016056607163854e-06, "loss": 0.2251, "num_input_tokens_seen": 16960000, "step": 27790 }, { "epoch": 8.623952838969904, "grad_norm": 0.16761091351509094, "learning_rate": 7.014817654257876e-06, "loss": 0.2386, "num_input_tokens_seen": 16962752, "step": 27795 }, { "epoch": 8.62550418864412, "grad_norm": 0.22897055745124817, "learning_rate": 7.013578553641906e-06, "loss": 0.2301, "num_input_tokens_seen": 16965184, "step": 27800 }, { "epoch": 8.627055538318338, "grad_norm": 0.2658021152019501, "learning_rate": 7.012339305406782e-06, "loss": 0.2348, "num_input_tokens_seen": 16968064, "step": 27805 }, { "epoch": 8.628606887992554, "grad_norm": 0.19586819410324097, "learning_rate": 7.011099909643359e-06, "loss": 0.2348, "num_input_tokens_seen": 16970496, "step": 27810 }, { "epoch": 8.63015823766677, "grad_norm": 0.23734737932682037, "learning_rate": 7.009860366442497e-06, "loss": 0.2276, "num_input_tokens_seen": 16973152, "step": 27815 }, { "epoch": 8.631709587340987, "grad_norm": 0.2298574447631836, "learning_rate": 7.008620675895069e-06, "loss": 0.2225, "num_input_tokens_seen": 16975776, "step": 27820 }, { "epoch": 8.633260937015203, "grad_norm": 0.29943475127220154, "learning_rate": 7.007380838091961e-06, "loss": 0.2373, "num_input_tokens_seen": 16978496, "step": 27825 }, { "epoch": 8.634812286689419, "grad_norm": 0.18435297906398773, "learning_rate": 7.0061408531240645e-06, "loss": 0.2296, "num_input_tokens_seen": 16980672, "step": 27830 }, { "epoch": 8.636363636363637, "grad_norm": 0.24615399539470673, "learning_rate": 7.004900721082289e-06, "loss": 0.2325, "num_input_tokens_seen": 16983232, "step": 27835 }, { "epoch": 8.637914986037853, "grad_norm": 0.3203918933868408, "learning_rate": 7.003660442057549e-06, "loss": 0.231, "num_input_tokens_seen": 16986112, "step": 27840 }, { "epoch": 8.63946633571207, "grad_norm": 0.25158244371414185, "learning_rate": 7.002420016140772e-06, "loss": 0.2398, "num_input_tokens_seen": 16988928, "step": 27845 }, { "epoch": 8.641017685386286, "grad_norm": 0.24338342249393463, "learning_rate": 7.001179443422896e-06, "loss": 0.2288, "num_input_tokens_seen": 16991328, "step": 27850 }, { "epoch": 8.642569035060502, "grad_norm": 0.3443821668624878, "learning_rate": 6.999938723994868e-06, "loss": 0.2235, "num_input_tokens_seen": 16994240, "step": 27855 }, { "epoch": 8.64412038473472, "grad_norm": 0.21289308369159698, "learning_rate": 6.998697857947648e-06, "loss": 0.2284, "num_input_tokens_seen": 16997280, "step": 27860 }, { "epoch": 8.645671734408936, "grad_norm": 0.372272253036499, "learning_rate": 6.997456845372208e-06, "loss": 0.2339, "num_input_tokens_seen": 17000512, "step": 27865 }, { "epoch": 8.647223084083151, "grad_norm": 0.3234701454639435, "learning_rate": 6.996215686359529e-06, "loss": 0.2305, "num_input_tokens_seen": 17003776, "step": 27870 }, { "epoch": 8.648774433757369, "grad_norm": 0.4562557637691498, "learning_rate": 6.9949743810006e-06, "loss": 0.2305, "num_input_tokens_seen": 17007296, "step": 27875 }, { "epoch": 8.650325783431585, "grad_norm": 0.2778646945953369, "learning_rate": 6.993732929386426e-06, "loss": 0.2303, "num_input_tokens_seen": 17010272, "step": 27880 }, { "epoch": 8.651877133105803, "grad_norm": 0.23054063320159912, "learning_rate": 6.992491331608018e-06, "loss": 0.2285, "num_input_tokens_seen": 17012448, "step": 27885 }, { "epoch": 8.653428482780019, "grad_norm": 0.23143596947193146, "learning_rate": 6.9912495877564e-06, "loss": 0.2355, "num_input_tokens_seen": 17015392, "step": 27890 }, { "epoch": 8.654979832454234, "grad_norm": 0.19534841179847717, "learning_rate": 6.99000769792261e-06, "loss": 0.2245, "num_input_tokens_seen": 17018688, "step": 27895 }, { "epoch": 8.656531182128452, "grad_norm": 0.2873234152793884, "learning_rate": 6.988765662197687e-06, "loss": 0.2322, "num_input_tokens_seen": 17021504, "step": 27900 }, { "epoch": 8.658082531802668, "grad_norm": 0.26354649662971497, "learning_rate": 6.9875234806726925e-06, "loss": 0.2142, "num_input_tokens_seen": 17024288, "step": 27905 }, { "epoch": 8.659633881476886, "grad_norm": 0.28361859917640686, "learning_rate": 6.9862811534386894e-06, "loss": 0.2166, "num_input_tokens_seen": 17026848, "step": 27910 }, { "epoch": 8.661185231151102, "grad_norm": 0.8530133366584778, "learning_rate": 6.985038680586759e-06, "loss": 0.222, "num_input_tokens_seen": 17029568, "step": 27915 }, { "epoch": 8.662736580825317, "grad_norm": 0.2746097445487976, "learning_rate": 6.983796062207986e-06, "loss": 0.2326, "num_input_tokens_seen": 17032096, "step": 27920 }, { "epoch": 8.664287930499535, "grad_norm": 0.40305095911026, "learning_rate": 6.982553298393469e-06, "loss": 0.2446, "num_input_tokens_seen": 17034560, "step": 27925 }, { "epoch": 8.665839280173751, "grad_norm": 0.3622974157333374, "learning_rate": 6.9813103892343205e-06, "loss": 0.2268, "num_input_tokens_seen": 17037472, "step": 27930 }, { "epoch": 8.667390629847969, "grad_norm": 0.33458760380744934, "learning_rate": 6.9800673348216564e-06, "loss": 0.2335, "num_input_tokens_seen": 17040320, "step": 27935 }, { "epoch": 8.668941979522184, "grad_norm": 0.4424971342086792, "learning_rate": 6.97882413524661e-06, "loss": 0.2212, "num_input_tokens_seen": 17043520, "step": 27940 }, { "epoch": 8.6704933291964, "grad_norm": 0.4105192720890045, "learning_rate": 6.977580790600323e-06, "loss": 0.2346, "num_input_tokens_seen": 17046464, "step": 27945 }, { "epoch": 8.672044678870618, "grad_norm": 0.4240957796573639, "learning_rate": 6.976337300973943e-06, "loss": 0.2217, "num_input_tokens_seen": 17049472, "step": 27950 }, { "epoch": 8.673596028544834, "grad_norm": 0.3988337814807892, "learning_rate": 6.975093666458637e-06, "loss": 0.2285, "num_input_tokens_seen": 17051744, "step": 27955 }, { "epoch": 8.67514737821905, "grad_norm": 0.23509949445724487, "learning_rate": 6.973849887145577e-06, "loss": 0.2306, "num_input_tokens_seen": 17056672, "step": 27960 }, { "epoch": 8.676698727893267, "grad_norm": 0.4601147174835205, "learning_rate": 6.972605963125945e-06, "loss": 0.2296, "num_input_tokens_seen": 17060672, "step": 27965 }, { "epoch": 8.678250077567483, "grad_norm": 0.42578521370887756, "learning_rate": 6.971361894490938e-06, "loss": 0.233, "num_input_tokens_seen": 17063360, "step": 27970 }, { "epoch": 8.679801427241701, "grad_norm": 0.29420408606529236, "learning_rate": 6.9701176813317596e-06, "loss": 0.2236, "num_input_tokens_seen": 17065792, "step": 27975 }, { "epoch": 8.681352776915917, "grad_norm": 0.3073570728302002, "learning_rate": 6.968873323739624e-06, "loss": 0.2285, "num_input_tokens_seen": 17072704, "step": 27980 }, { "epoch": 8.682904126590133, "grad_norm": 0.36787882447242737, "learning_rate": 6.967628821805761e-06, "loss": 0.2371, "num_input_tokens_seen": 17075264, "step": 27985 }, { "epoch": 8.68445547626435, "grad_norm": 0.4896329641342163, "learning_rate": 6.966384175621404e-06, "loss": 0.2309, "num_input_tokens_seen": 17078272, "step": 27990 }, { "epoch": 8.686006825938566, "grad_norm": 0.3567539155483246, "learning_rate": 6.965139385277803e-06, "loss": 0.2354, "num_input_tokens_seen": 17080896, "step": 27995 }, { "epoch": 8.687558175612782, "grad_norm": 0.34433406591415405, "learning_rate": 6.963894450866212e-06, "loss": 0.2334, "num_input_tokens_seen": 17083712, "step": 28000 }, { "epoch": 8.689109525287, "grad_norm": 0.37105825543403625, "learning_rate": 6.962649372477903e-06, "loss": 0.231, "num_input_tokens_seen": 17087040, "step": 28005 }, { "epoch": 8.690660874961216, "grad_norm": 0.23305776715278625, "learning_rate": 6.961404150204154e-06, "loss": 0.232, "num_input_tokens_seen": 17090496, "step": 28010 }, { "epoch": 8.692212224635433, "grad_norm": 0.4917480945587158, "learning_rate": 6.960158784136254e-06, "loss": 0.2392, "num_input_tokens_seen": 17093344, "step": 28015 }, { "epoch": 8.69376357430965, "grad_norm": 0.3972581923007965, "learning_rate": 6.958913274365503e-06, "loss": 0.2306, "num_input_tokens_seen": 17096480, "step": 28020 }, { "epoch": 8.695314923983865, "grad_norm": 0.27666613459587097, "learning_rate": 6.9576676209832115e-06, "loss": 0.2198, "num_input_tokens_seen": 17098912, "step": 28025 }, { "epoch": 8.696866273658083, "grad_norm": 0.18819354474544525, "learning_rate": 6.9564218240807015e-06, "loss": 0.232, "num_input_tokens_seen": 17102688, "step": 28030 }, { "epoch": 8.698417623332299, "grad_norm": 0.23849573731422424, "learning_rate": 6.9551758837493055e-06, "loss": 0.229, "num_input_tokens_seen": 17104896, "step": 28035 }, { "epoch": 8.699968973006516, "grad_norm": 0.3579436242580414, "learning_rate": 6.953929800080363e-06, "loss": 0.2272, "num_input_tokens_seen": 17108128, "step": 28040 }, { "epoch": 8.701520322680732, "grad_norm": 0.23651833832263947, "learning_rate": 6.952683573165229e-06, "loss": 0.232, "num_input_tokens_seen": 17110848, "step": 28045 }, { "epoch": 8.703071672354948, "grad_norm": 0.3598651885986328, "learning_rate": 6.951437203095266e-06, "loss": 0.2342, "num_input_tokens_seen": 17113632, "step": 28050 }, { "epoch": 8.704623022029166, "grad_norm": 0.409107506275177, "learning_rate": 6.950190689961847e-06, "loss": 0.225, "num_input_tokens_seen": 17116512, "step": 28055 }, { "epoch": 8.706174371703382, "grad_norm": 0.3483237028121948, "learning_rate": 6.9489440338563575e-06, "loss": 0.221, "num_input_tokens_seen": 17119744, "step": 28060 }, { "epoch": 8.7077257213776, "grad_norm": 0.4377267360687256, "learning_rate": 6.9476972348701895e-06, "loss": 0.2403, "num_input_tokens_seen": 17122688, "step": 28065 }, { "epoch": 8.709277071051815, "grad_norm": 0.23889432847499847, "learning_rate": 6.946450293094752e-06, "loss": 0.2264, "num_input_tokens_seen": 17125760, "step": 28070 }, { "epoch": 8.710828420726031, "grad_norm": 0.3534879684448242, "learning_rate": 6.9452032086214585e-06, "loss": 0.2399, "num_input_tokens_seen": 17128672, "step": 28075 }, { "epoch": 8.712379770400249, "grad_norm": 0.23191095888614655, "learning_rate": 6.9439559815417345e-06, "loss": 0.2305, "num_input_tokens_seen": 17131648, "step": 28080 }, { "epoch": 8.713931120074465, "grad_norm": 0.3551565706729889, "learning_rate": 6.9427086119470155e-06, "loss": 0.2303, "num_input_tokens_seen": 17134368, "step": 28085 }, { "epoch": 8.71548246974868, "grad_norm": 0.4229218363761902, "learning_rate": 6.941461099928752e-06, "loss": 0.2477, "num_input_tokens_seen": 17137600, "step": 28090 }, { "epoch": 8.717033819422898, "grad_norm": 0.3156066834926605, "learning_rate": 6.9402134455783994e-06, "loss": 0.2296, "num_input_tokens_seen": 17140384, "step": 28095 }, { "epoch": 8.718585169097114, "grad_norm": 0.10187237709760666, "learning_rate": 6.938965648987426e-06, "loss": 0.2343, "num_input_tokens_seen": 17143296, "step": 28100 }, { "epoch": 8.720136518771332, "grad_norm": 0.19456636905670166, "learning_rate": 6.93771771024731e-06, "loss": 0.2246, "num_input_tokens_seen": 17145888, "step": 28105 }, { "epoch": 8.721687868445548, "grad_norm": 0.1515008956193924, "learning_rate": 6.9364696294495384e-06, "loss": 0.2387, "num_input_tokens_seen": 17148672, "step": 28110 }, { "epoch": 8.723239218119764, "grad_norm": 0.22515784204006195, "learning_rate": 6.935221406685613e-06, "loss": 0.2302, "num_input_tokens_seen": 17151744, "step": 28115 }, { "epoch": 8.724790567793981, "grad_norm": 0.16381500661373138, "learning_rate": 6.933973042047042e-06, "loss": 0.2251, "num_input_tokens_seen": 17154080, "step": 28120 }, { "epoch": 8.726341917468197, "grad_norm": 0.24204067885875702, "learning_rate": 6.932724535625344e-06, "loss": 0.2392, "num_input_tokens_seen": 17156960, "step": 28125 }, { "epoch": 8.727893267142413, "grad_norm": 0.21076516807079315, "learning_rate": 6.9314758875120525e-06, "loss": 0.2297, "num_input_tokens_seen": 17159264, "step": 28130 }, { "epoch": 8.72944461681663, "grad_norm": 0.2246556133031845, "learning_rate": 6.930227097798705e-06, "loss": 0.2286, "num_input_tokens_seen": 17162400, "step": 28135 }, { "epoch": 8.730995966490847, "grad_norm": 0.15184497833251953, "learning_rate": 6.928978166576854e-06, "loss": 0.2288, "num_input_tokens_seen": 17165408, "step": 28140 }, { "epoch": 8.732547316165064, "grad_norm": 0.280666708946228, "learning_rate": 6.92772909393806e-06, "loss": 0.2308, "num_input_tokens_seen": 17168512, "step": 28145 }, { "epoch": 8.73409866583928, "grad_norm": 0.11981282383203506, "learning_rate": 6.926479879973897e-06, "loss": 0.2359, "num_input_tokens_seen": 17172256, "step": 28150 }, { "epoch": 8.735650015513496, "grad_norm": 0.2099745273590088, "learning_rate": 6.925230524775945e-06, "loss": 0.2331, "num_input_tokens_seen": 17176608, "step": 28155 }, { "epoch": 8.737201365187714, "grad_norm": 0.16497325897216797, "learning_rate": 6.923981028435799e-06, "loss": 0.2311, "num_input_tokens_seen": 17179360, "step": 28160 }, { "epoch": 8.73875271486193, "grad_norm": 0.3205946981906891, "learning_rate": 6.922731391045059e-06, "loss": 0.2284, "num_input_tokens_seen": 17182368, "step": 28165 }, { "epoch": 8.740304064536147, "grad_norm": 0.24982792139053345, "learning_rate": 6.92148161269534e-06, "loss": 0.2348, "num_input_tokens_seen": 17185920, "step": 28170 }, { "epoch": 8.741855414210363, "grad_norm": 0.2382572591304779, "learning_rate": 6.920231693478265e-06, "loss": 0.2268, "num_input_tokens_seen": 17188800, "step": 28175 }, { "epoch": 8.743406763884579, "grad_norm": 0.24369722604751587, "learning_rate": 6.918981633485468e-06, "loss": 0.2331, "num_input_tokens_seen": 17191520, "step": 28180 }, { "epoch": 8.744958113558797, "grad_norm": 0.16166779398918152, "learning_rate": 6.917731432808593e-06, "loss": 0.2238, "num_input_tokens_seen": 17194336, "step": 28185 }, { "epoch": 8.746509463233012, "grad_norm": 0.20550376176834106, "learning_rate": 6.916481091539296e-06, "loss": 0.2333, "num_input_tokens_seen": 17197280, "step": 28190 }, { "epoch": 8.74806081290723, "grad_norm": 0.18778270483016968, "learning_rate": 6.915230609769239e-06, "loss": 0.224, "num_input_tokens_seen": 17200768, "step": 28195 }, { "epoch": 8.749612162581446, "grad_norm": 0.22200478613376617, "learning_rate": 6.913979987590098e-06, "loss": 0.2296, "num_input_tokens_seen": 17203648, "step": 28200 }, { "epoch": 8.751163512255662, "grad_norm": 0.2952456772327423, "learning_rate": 6.912729225093559e-06, "loss": 0.2349, "num_input_tokens_seen": 17206816, "step": 28205 }, { "epoch": 8.75271486192988, "grad_norm": 0.15766699612140656, "learning_rate": 6.911478322371319e-06, "loss": 0.2326, "num_input_tokens_seen": 17209792, "step": 28210 }, { "epoch": 8.754266211604095, "grad_norm": 0.18875890970230103, "learning_rate": 6.910227279515082e-06, "loss": 0.2308, "num_input_tokens_seen": 17212480, "step": 28215 }, { "epoch": 8.755817561278311, "grad_norm": 0.20279234647750854, "learning_rate": 6.9089760966165645e-06, "loss": 0.2336, "num_input_tokens_seen": 17215040, "step": 28220 }, { "epoch": 8.757368910952529, "grad_norm": 0.14578253030776978, "learning_rate": 6.907724773767495e-06, "loss": 0.227, "num_input_tokens_seen": 17217632, "step": 28225 }, { "epoch": 8.758920260626745, "grad_norm": 0.21464455127716064, "learning_rate": 6.9064733110596075e-06, "loss": 0.2284, "num_input_tokens_seen": 17221504, "step": 28230 }, { "epoch": 8.760471610300963, "grad_norm": 0.2305803894996643, "learning_rate": 6.905221708584649e-06, "loss": 0.2246, "num_input_tokens_seen": 17224128, "step": 28235 }, { "epoch": 8.762022959975178, "grad_norm": 0.31479349732398987, "learning_rate": 6.9039699664343805e-06, "loss": 0.2308, "num_input_tokens_seen": 17227360, "step": 28240 }, { "epoch": 8.763574309649394, "grad_norm": 0.15608838200569153, "learning_rate": 6.902718084700566e-06, "loss": 0.2344, "num_input_tokens_seen": 17231104, "step": 28245 }, { "epoch": 8.765125659323612, "grad_norm": 0.15764470398426056, "learning_rate": 6.901466063474984e-06, "loss": 0.24, "num_input_tokens_seen": 17234400, "step": 28250 }, { "epoch": 8.766677008997828, "grad_norm": 0.27651292085647583, "learning_rate": 6.900213902849424e-06, "loss": 0.2348, "num_input_tokens_seen": 17237824, "step": 28255 }, { "epoch": 8.768228358672044, "grad_norm": 0.13771341741085052, "learning_rate": 6.898961602915682e-06, "loss": 0.2329, "num_input_tokens_seen": 17240704, "step": 28260 }, { "epoch": 8.769779708346261, "grad_norm": 0.20764872431755066, "learning_rate": 6.897709163765568e-06, "loss": 0.228, "num_input_tokens_seen": 17243872, "step": 28265 }, { "epoch": 8.771331058020477, "grad_norm": 0.20357957482337952, "learning_rate": 6.896456585490901e-06, "loss": 0.2257, "num_input_tokens_seen": 17246112, "step": 28270 }, { "epoch": 8.772882407694695, "grad_norm": 0.18301816284656525, "learning_rate": 6.895203868183507e-06, "loss": 0.2355, "num_input_tokens_seen": 17249504, "step": 28275 }, { "epoch": 8.77443375736891, "grad_norm": 0.20610330998897552, "learning_rate": 6.893951011935227e-06, "loss": 0.2185, "num_input_tokens_seen": 17251744, "step": 28280 }, { "epoch": 8.775985107043127, "grad_norm": 0.3019401431083679, "learning_rate": 6.89269801683791e-06, "loss": 0.2385, "num_input_tokens_seen": 17255584, "step": 28285 }, { "epoch": 8.777536456717344, "grad_norm": 0.23990097641944885, "learning_rate": 6.891444882983416e-06, "loss": 0.2251, "num_input_tokens_seen": 17258720, "step": 28290 }, { "epoch": 8.77908780639156, "grad_norm": 0.24257433414459229, "learning_rate": 6.8901916104636146e-06, "loss": 0.2204, "num_input_tokens_seen": 17261088, "step": 28295 }, { "epoch": 8.780639156065778, "grad_norm": 0.2510926425457001, "learning_rate": 6.888938199370385e-06, "loss": 0.2298, "num_input_tokens_seen": 17264576, "step": 28300 }, { "epoch": 8.782190505739994, "grad_norm": 0.11483106017112732, "learning_rate": 6.887684649795616e-06, "loss": 0.2267, "num_input_tokens_seen": 17268160, "step": 28305 }, { "epoch": 8.78374185541421, "grad_norm": 0.2946043014526367, "learning_rate": 6.886430961831209e-06, "loss": 0.2216, "num_input_tokens_seen": 17270560, "step": 28310 }, { "epoch": 8.785293205088427, "grad_norm": 0.18400534987449646, "learning_rate": 6.885177135569074e-06, "loss": 0.2334, "num_input_tokens_seen": 17272544, "step": 28315 }, { "epoch": 8.786844554762643, "grad_norm": 0.22273075580596924, "learning_rate": 6.883923171101131e-06, "loss": 0.2418, "num_input_tokens_seen": 17275392, "step": 28320 }, { "epoch": 8.788395904436861, "grad_norm": 0.235478937625885, "learning_rate": 6.882669068519311e-06, "loss": 0.2308, "num_input_tokens_seen": 17278240, "step": 28325 }, { "epoch": 8.789947254111077, "grad_norm": 0.28337788581848145, "learning_rate": 6.881414827915553e-06, "loss": 0.2347, "num_input_tokens_seen": 17281280, "step": 28330 }, { "epoch": 8.791498603785293, "grad_norm": 0.1912815123796463, "learning_rate": 6.880160449381811e-06, "loss": 0.2243, "num_input_tokens_seen": 17284192, "step": 28335 }, { "epoch": 8.79304995345951, "grad_norm": 0.26998427510261536, "learning_rate": 6.8789059330100404e-06, "loss": 0.228, "num_input_tokens_seen": 17286816, "step": 28340 }, { "epoch": 8.794601303133726, "grad_norm": 0.20853058993816376, "learning_rate": 6.877651278892218e-06, "loss": 0.2297, "num_input_tokens_seen": 17289152, "step": 28345 }, { "epoch": 8.796152652807942, "grad_norm": 0.2433151751756668, "learning_rate": 6.8763964871203214e-06, "loss": 0.2329, "num_input_tokens_seen": 17291552, "step": 28350 }, { "epoch": 8.79770400248216, "grad_norm": 0.1389828324317932, "learning_rate": 6.8751415577863425e-06, "loss": 0.2264, "num_input_tokens_seen": 17294432, "step": 28355 }, { "epoch": 8.799255352156376, "grad_norm": 0.29931944608688354, "learning_rate": 6.873886490982282e-06, "loss": 0.2168, "num_input_tokens_seen": 17298176, "step": 28360 }, { "epoch": 8.800806701830593, "grad_norm": 0.24107050895690918, "learning_rate": 6.8726312868001525e-06, "loss": 0.2314, "num_input_tokens_seen": 17302592, "step": 28365 }, { "epoch": 8.80235805150481, "grad_norm": 0.2726427912712097, "learning_rate": 6.871375945331973e-06, "loss": 0.2378, "num_input_tokens_seen": 17305024, "step": 28370 }, { "epoch": 8.803909401179025, "grad_norm": 0.22797365486621857, "learning_rate": 6.870120466669778e-06, "loss": 0.223, "num_input_tokens_seen": 17308384, "step": 28375 }, { "epoch": 8.805460750853243, "grad_norm": 0.19580642879009247, "learning_rate": 6.868864850905606e-06, "loss": 0.2324, "num_input_tokens_seen": 17310752, "step": 28380 }, { "epoch": 8.807012100527459, "grad_norm": 0.334399938583374, "learning_rate": 6.86760909813151e-06, "loss": 0.235, "num_input_tokens_seen": 17314656, "step": 28385 }, { "epoch": 8.808563450201675, "grad_norm": 0.34815794229507446, "learning_rate": 6.866353208439551e-06, "loss": 0.234, "num_input_tokens_seen": 17317312, "step": 28390 }, { "epoch": 8.810114799875892, "grad_norm": 0.27064812183380127, "learning_rate": 6.865097181921802e-06, "loss": 0.2162, "num_input_tokens_seen": 17319808, "step": 28395 }, { "epoch": 8.811666149550108, "grad_norm": 0.3470843732357025, "learning_rate": 6.863841018670341e-06, "loss": 0.2289, "num_input_tokens_seen": 17322816, "step": 28400 }, { "epoch": 8.813217499224326, "grad_norm": 0.20159795880317688, "learning_rate": 6.8625847187772645e-06, "loss": 0.2303, "num_input_tokens_seen": 17326432, "step": 28405 }, { "epoch": 8.814768848898542, "grad_norm": 0.2891068756580353, "learning_rate": 6.861328282334672e-06, "loss": 0.2339, "num_input_tokens_seen": 17328896, "step": 28410 }, { "epoch": 8.816320198572758, "grad_norm": 0.3128388822078705, "learning_rate": 6.860071709434674e-06, "loss": 0.2178, "num_input_tokens_seen": 17331712, "step": 28415 }, { "epoch": 8.817871548246975, "grad_norm": 0.313191682100296, "learning_rate": 6.8588150001693935e-06, "loss": 0.2368, "num_input_tokens_seen": 17335488, "step": 28420 }, { "epoch": 8.819422897921191, "grad_norm": 0.2657018005847931, "learning_rate": 6.8575581546309614e-06, "loss": 0.2292, "num_input_tokens_seen": 17338496, "step": 28425 }, { "epoch": 8.820974247595409, "grad_norm": 0.24761618673801422, "learning_rate": 6.8563011729115204e-06, "loss": 0.2304, "num_input_tokens_seen": 17341728, "step": 28430 }, { "epoch": 8.822525597269625, "grad_norm": 0.2872997522354126, "learning_rate": 6.855044055103219e-06, "loss": 0.2361, "num_input_tokens_seen": 17345440, "step": 28435 }, { "epoch": 8.82407694694384, "grad_norm": 0.18372157216072083, "learning_rate": 6.8537868012982244e-06, "loss": 0.2282, "num_input_tokens_seen": 17348096, "step": 28440 }, { "epoch": 8.825628296618058, "grad_norm": 0.23599888384342194, "learning_rate": 6.852529411588704e-06, "loss": 0.2275, "num_input_tokens_seen": 17351040, "step": 28445 }, { "epoch": 8.827179646292274, "grad_norm": 0.3101281523704529, "learning_rate": 6.851271886066842e-06, "loss": 0.2258, "num_input_tokens_seen": 17354784, "step": 28450 }, { "epoch": 8.828730995966492, "grad_norm": 0.19102230668067932, "learning_rate": 6.850014224824827e-06, "loss": 0.2337, "num_input_tokens_seen": 17357952, "step": 28455 }, { "epoch": 8.830282345640708, "grad_norm": 0.3155595064163208, "learning_rate": 6.848756427954861e-06, "loss": 0.2291, "num_input_tokens_seen": 17362592, "step": 28460 }, { "epoch": 8.831833695314923, "grad_norm": 0.3118048310279846, "learning_rate": 6.847498495549159e-06, "loss": 0.2159, "num_input_tokens_seen": 17365792, "step": 28465 }, { "epoch": 8.833385044989141, "grad_norm": 0.3329842984676361, "learning_rate": 6.846240427699936e-06, "loss": 0.2398, "num_input_tokens_seen": 17368704, "step": 28470 }, { "epoch": 8.834936394663357, "grad_norm": 0.424973726272583, "learning_rate": 6.844982224499429e-06, "loss": 0.2312, "num_input_tokens_seen": 17371328, "step": 28475 }, { "epoch": 8.836487744337573, "grad_norm": 0.23695170879364014, "learning_rate": 6.843723886039877e-06, "loss": 0.2252, "num_input_tokens_seen": 17374112, "step": 28480 }, { "epoch": 8.83803909401179, "grad_norm": 0.3971700966358185, "learning_rate": 6.842465412413531e-06, "loss": 0.2352, "num_input_tokens_seen": 17377824, "step": 28485 }, { "epoch": 8.839590443686006, "grad_norm": 0.6569007039070129, "learning_rate": 6.841206803712652e-06, "loss": 0.2458, "num_input_tokens_seen": 17382080, "step": 28490 }, { "epoch": 8.841141793360224, "grad_norm": 0.18985934555530548, "learning_rate": 6.839948060029512e-06, "loss": 0.2302, "num_input_tokens_seen": 17384992, "step": 28495 }, { "epoch": 8.84269314303444, "grad_norm": 0.37871840596199036, "learning_rate": 6.8386891814563906e-06, "loss": 0.228, "num_input_tokens_seen": 17388480, "step": 28500 }, { "epoch": 8.844244492708656, "grad_norm": 0.2550343871116638, "learning_rate": 6.837430168085579e-06, "loss": 0.2399, "num_input_tokens_seen": 17390816, "step": 28505 }, { "epoch": 8.845795842382874, "grad_norm": 0.2052842080593109, "learning_rate": 6.836171020009378e-06, "loss": 0.2347, "num_input_tokens_seen": 17394720, "step": 28510 }, { "epoch": 8.84734719205709, "grad_norm": 0.3883604109287262, "learning_rate": 6.834911737320097e-06, "loss": 0.2372, "num_input_tokens_seen": 17397952, "step": 28515 }, { "epoch": 8.848898541731305, "grad_norm": 0.24666829407215118, "learning_rate": 6.833652320110057e-06, "loss": 0.227, "num_input_tokens_seen": 17400768, "step": 28520 }, { "epoch": 8.850449891405523, "grad_norm": 0.3608214557170868, "learning_rate": 6.832392768471588e-06, "loss": 0.2258, "num_input_tokens_seen": 17403712, "step": 28525 }, { "epoch": 8.852001241079739, "grad_norm": 0.14208024740219116, "learning_rate": 6.8311330824970305e-06, "loss": 0.2356, "num_input_tokens_seen": 17406720, "step": 28530 }, { "epoch": 8.853552590753957, "grad_norm": 0.24027766287326813, "learning_rate": 6.829873262278734e-06, "loss": 0.2297, "num_input_tokens_seen": 17409408, "step": 28535 }, { "epoch": 8.855103940428172, "grad_norm": 0.13686245679855347, "learning_rate": 6.828613307909059e-06, "loss": 0.2395, "num_input_tokens_seen": 17412160, "step": 28540 }, { "epoch": 8.856655290102388, "grad_norm": 0.26796597242355347, "learning_rate": 6.827353219480375e-06, "loss": 0.2239, "num_input_tokens_seen": 17414688, "step": 28545 }, { "epoch": 8.858206639776606, "grad_norm": 0.24156981706619263, "learning_rate": 6.8260929970850595e-06, "loss": 0.2255, "num_input_tokens_seen": 17417888, "step": 28550 }, { "epoch": 8.859757989450822, "grad_norm": 0.48638206720352173, "learning_rate": 6.824832640815504e-06, "loss": 0.2237, "num_input_tokens_seen": 17421824, "step": 28555 }, { "epoch": 8.86130933912504, "grad_norm": 0.16753606498241425, "learning_rate": 6.823572150764106e-06, "loss": 0.2379, "num_input_tokens_seen": 17425824, "step": 28560 }, { "epoch": 8.862860688799255, "grad_norm": 0.2544424533843994, "learning_rate": 6.822311527023276e-06, "loss": 0.2288, "num_input_tokens_seen": 17428672, "step": 28565 }, { "epoch": 8.864412038473471, "grad_norm": 0.2157856523990631, "learning_rate": 6.821050769685431e-06, "loss": 0.2339, "num_input_tokens_seen": 17431936, "step": 28570 }, { "epoch": 8.865963388147689, "grad_norm": 0.31182020902633667, "learning_rate": 6.819789878843001e-06, "loss": 0.2311, "num_input_tokens_seen": 17434272, "step": 28575 }, { "epoch": 8.867514737821905, "grad_norm": 0.17352978885173798, "learning_rate": 6.8185288545884235e-06, "loss": 0.241, "num_input_tokens_seen": 17436480, "step": 28580 }, { "epoch": 8.869066087496122, "grad_norm": 0.35243532061576843, "learning_rate": 6.8172676970141446e-06, "loss": 0.2323, "num_input_tokens_seen": 17439968, "step": 28585 }, { "epoch": 8.870617437170338, "grad_norm": 0.2631552219390869, "learning_rate": 6.816006406212624e-06, "loss": 0.2352, "num_input_tokens_seen": 17442912, "step": 28590 }, { "epoch": 8.872168786844554, "grad_norm": 0.26035043597221375, "learning_rate": 6.81474498227633e-06, "loss": 0.2323, "num_input_tokens_seen": 17445472, "step": 28595 }, { "epoch": 8.873720136518772, "grad_norm": 0.287824422121048, "learning_rate": 6.81348342529774e-06, "loss": 0.2295, "num_input_tokens_seen": 17447968, "step": 28600 }, { "epoch": 8.875271486192988, "grad_norm": 0.22098307311534882, "learning_rate": 6.812221735369339e-06, "loss": 0.2254, "num_input_tokens_seen": 17451168, "step": 28605 }, { "epoch": 8.876822835867204, "grad_norm": 0.18467193841934204, "learning_rate": 6.810959912583624e-06, "loss": 0.2265, "num_input_tokens_seen": 17453312, "step": 28610 }, { "epoch": 8.878374185541421, "grad_norm": 0.2175474613904953, "learning_rate": 6.809697957033103e-06, "loss": 0.2331, "num_input_tokens_seen": 17456160, "step": 28615 }, { "epoch": 8.879925535215637, "grad_norm": 0.20596526563167572, "learning_rate": 6.808435868810293e-06, "loss": 0.2264, "num_input_tokens_seen": 17459264, "step": 28620 }, { "epoch": 8.881476884889855, "grad_norm": 0.25576329231262207, "learning_rate": 6.807173648007718e-06, "loss": 0.2289, "num_input_tokens_seen": 17462560, "step": 28625 }, { "epoch": 8.88302823456407, "grad_norm": 0.18781965970993042, "learning_rate": 6.805911294717914e-06, "loss": 0.2274, "num_input_tokens_seen": 17465184, "step": 28630 }, { "epoch": 8.884579584238287, "grad_norm": 0.3069000244140625, "learning_rate": 6.804648809033428e-06, "loss": 0.238, "num_input_tokens_seen": 17467936, "step": 28635 }, { "epoch": 8.886130933912504, "grad_norm": 0.2826651930809021, "learning_rate": 6.8033861910468125e-06, "loss": 0.2235, "num_input_tokens_seen": 17470496, "step": 28640 }, { "epoch": 8.88768228358672, "grad_norm": 0.5052248239517212, "learning_rate": 6.8021234408506345e-06, "loss": 0.2398, "num_input_tokens_seen": 17473600, "step": 28645 }, { "epoch": 8.889233633260936, "grad_norm": 0.45016101002693176, "learning_rate": 6.800860558537467e-06, "loss": 0.2328, "num_input_tokens_seen": 17476416, "step": 28650 }, { "epoch": 8.890784982935154, "grad_norm": 0.21539488434791565, "learning_rate": 6.799597544199896e-06, "loss": 0.2325, "num_input_tokens_seen": 17478816, "step": 28655 }, { "epoch": 8.89233633260937, "grad_norm": 0.2507629990577698, "learning_rate": 6.798334397930515e-06, "loss": 0.2359, "num_input_tokens_seen": 17480928, "step": 28660 }, { "epoch": 8.893887682283587, "grad_norm": 0.14160402119159698, "learning_rate": 6.797071119821927e-06, "loss": 0.2353, "num_input_tokens_seen": 17483712, "step": 28665 }, { "epoch": 8.895439031957803, "grad_norm": 0.2934337854385376, "learning_rate": 6.795807709966745e-06, "loss": 0.2341, "num_input_tokens_seen": 17486368, "step": 28670 }, { "epoch": 8.896990381632019, "grad_norm": 0.28469106554985046, "learning_rate": 6.794544168457593e-06, "loss": 0.2407, "num_input_tokens_seen": 17489216, "step": 28675 }, { "epoch": 8.898541731306237, "grad_norm": 0.3430992364883423, "learning_rate": 6.7932804953871024e-06, "loss": 0.2235, "num_input_tokens_seen": 17492096, "step": 28680 }, { "epoch": 8.900093080980453, "grad_norm": 0.2736962139606476, "learning_rate": 6.792016690847917e-06, "loss": 0.2291, "num_input_tokens_seen": 17498432, "step": 28685 }, { "epoch": 8.90164443065467, "grad_norm": 0.21241495013237, "learning_rate": 6.7907527549326855e-06, "loss": 0.2267, "num_input_tokens_seen": 17501056, "step": 28690 }, { "epoch": 8.903195780328886, "grad_norm": 0.29402846097946167, "learning_rate": 6.789488687734074e-06, "loss": 0.2299, "num_input_tokens_seen": 17504640, "step": 28695 }, { "epoch": 8.904747130003102, "grad_norm": 0.22274483740329742, "learning_rate": 6.788224489344748e-06, "loss": 0.2373, "num_input_tokens_seen": 17507840, "step": 28700 }, { "epoch": 8.90629847967732, "grad_norm": 0.2220296710729599, "learning_rate": 6.786960159857393e-06, "loss": 0.2366, "num_input_tokens_seen": 17510848, "step": 28705 }, { "epoch": 8.907849829351536, "grad_norm": 0.271557092666626, "learning_rate": 6.785695699364696e-06, "loss": 0.2302, "num_input_tokens_seen": 17513664, "step": 28710 }, { "epoch": 8.909401179025753, "grad_norm": 0.4022219479084015, "learning_rate": 6.78443110795936e-06, "loss": 0.2288, "num_input_tokens_seen": 17516256, "step": 28715 }, { "epoch": 8.91095252869997, "grad_norm": 0.2703897953033447, "learning_rate": 6.783166385734092e-06, "loss": 0.2361, "num_input_tokens_seen": 17518528, "step": 28720 }, { "epoch": 8.912503878374185, "grad_norm": 0.2570085823535919, "learning_rate": 6.781901532781612e-06, "loss": 0.2338, "num_input_tokens_seen": 17521600, "step": 28725 }, { "epoch": 8.914055228048403, "grad_norm": 0.1725325733423233, "learning_rate": 6.780636549194649e-06, "loss": 0.2275, "num_input_tokens_seen": 17523648, "step": 28730 }, { "epoch": 8.915606577722619, "grad_norm": 0.18318237364292145, "learning_rate": 6.779371435065942e-06, "loss": 0.23, "num_input_tokens_seen": 17526208, "step": 28735 }, { "epoch": 8.917157927396834, "grad_norm": 0.31844690442085266, "learning_rate": 6.778106190488238e-06, "loss": 0.2326, "num_input_tokens_seen": 17529920, "step": 28740 }, { "epoch": 8.918709277071052, "grad_norm": 0.1821020245552063, "learning_rate": 6.776840815554293e-06, "loss": 0.2247, "num_input_tokens_seen": 17533728, "step": 28745 }, { "epoch": 8.920260626745268, "grad_norm": 0.2085142433643341, "learning_rate": 6.775575310356876e-06, "loss": 0.2265, "num_input_tokens_seen": 17537440, "step": 28750 }, { "epoch": 8.921811976419486, "grad_norm": 0.3775125741958618, "learning_rate": 6.7743096749887626e-06, "loss": 0.2257, "num_input_tokens_seen": 17540096, "step": 28755 }, { "epoch": 8.923363326093702, "grad_norm": 0.21996445953845978, "learning_rate": 6.773043909542739e-06, "loss": 0.2242, "num_input_tokens_seen": 17543968, "step": 28760 }, { "epoch": 8.924914675767917, "grad_norm": 0.26100656390190125, "learning_rate": 6.771778014111601e-06, "loss": 0.2277, "num_input_tokens_seen": 17546592, "step": 28765 }, { "epoch": 8.926466025442135, "grad_norm": 0.30933839082717896, "learning_rate": 6.770511988788153e-06, "loss": 0.2363, "num_input_tokens_seen": 17549152, "step": 28770 }, { "epoch": 8.928017375116351, "grad_norm": 0.31403684616088867, "learning_rate": 6.769245833665212e-06, "loss": 0.2323, "num_input_tokens_seen": 17551488, "step": 28775 }, { "epoch": 8.929568724790569, "grad_norm": 0.17272229492664337, "learning_rate": 6.767979548835599e-06, "loss": 0.224, "num_input_tokens_seen": 17554464, "step": 28780 }, { "epoch": 8.931120074464785, "grad_norm": 0.22873835265636444, "learning_rate": 6.766713134392148e-06, "loss": 0.237, "num_input_tokens_seen": 17557056, "step": 28785 }, { "epoch": 8.932671424139, "grad_norm": 0.21962808072566986, "learning_rate": 6.7654465904277045e-06, "loss": 0.2322, "num_input_tokens_seen": 17560128, "step": 28790 }, { "epoch": 8.934222773813218, "grad_norm": 0.2618701756000519, "learning_rate": 6.76417991703512e-06, "loss": 0.238, "num_input_tokens_seen": 17563168, "step": 28795 }, { "epoch": 8.935774123487434, "grad_norm": 0.2998585104942322, "learning_rate": 6.762913114307257e-06, "loss": 0.2366, "num_input_tokens_seen": 17566144, "step": 28800 }, { "epoch": 8.93732547316165, "grad_norm": 0.264016330242157, "learning_rate": 6.761646182336986e-06, "loss": 0.2258, "num_input_tokens_seen": 17569664, "step": 28805 }, { "epoch": 8.938876822835867, "grad_norm": 0.2377404272556305, "learning_rate": 6.760379121217189e-06, "loss": 0.2337, "num_input_tokens_seen": 17572224, "step": 28810 }, { "epoch": 8.940428172510083, "grad_norm": 0.21321669220924377, "learning_rate": 6.759111931040756e-06, "loss": 0.2279, "num_input_tokens_seen": 17575168, "step": 28815 }, { "epoch": 8.941979522184301, "grad_norm": 0.23847562074661255, "learning_rate": 6.757844611900588e-06, "loss": 0.2289, "num_input_tokens_seen": 17577696, "step": 28820 }, { "epoch": 8.943530871858517, "grad_norm": 0.28928616642951965, "learning_rate": 6.756577163889594e-06, "loss": 0.2339, "num_input_tokens_seen": 17580736, "step": 28825 }, { "epoch": 8.945082221532733, "grad_norm": 0.25549137592315674, "learning_rate": 6.755309587100692e-06, "loss": 0.2264, "num_input_tokens_seen": 17584480, "step": 28830 }, { "epoch": 8.94663357120695, "grad_norm": 0.23243334889411926, "learning_rate": 6.754041881626813e-06, "loss": 0.2284, "num_input_tokens_seen": 17586688, "step": 28835 }, { "epoch": 8.948184920881166, "grad_norm": 0.18959473073482513, "learning_rate": 6.752774047560892e-06, "loss": 0.2297, "num_input_tokens_seen": 17589056, "step": 28840 }, { "epoch": 8.949736270555384, "grad_norm": 0.21525153517723083, "learning_rate": 6.751506084995875e-06, "loss": 0.2316, "num_input_tokens_seen": 17591968, "step": 28845 }, { "epoch": 8.9512876202296, "grad_norm": 0.17397721111774445, "learning_rate": 6.750237994024725e-06, "loss": 0.2334, "num_input_tokens_seen": 17594720, "step": 28850 }, { "epoch": 8.952838969903816, "grad_norm": 0.21939021348953247, "learning_rate": 6.7489697747404024e-06, "loss": 0.2339, "num_input_tokens_seen": 17599232, "step": 28855 }, { "epoch": 8.954390319578033, "grad_norm": 0.25141459703445435, "learning_rate": 6.747701427235884e-06, "loss": 0.2386, "num_input_tokens_seen": 17603584, "step": 28860 }, { "epoch": 8.95594166925225, "grad_norm": 0.340406209230423, "learning_rate": 6.746432951604156e-06, "loss": 0.242, "num_input_tokens_seen": 17606368, "step": 28865 }, { "epoch": 8.957493018926465, "grad_norm": 0.1366320252418518, "learning_rate": 6.745164347938212e-06, "loss": 0.2196, "num_input_tokens_seen": 17609312, "step": 28870 }, { "epoch": 8.959044368600683, "grad_norm": 0.17018809914588928, "learning_rate": 6.743895616331055e-06, "loss": 0.23, "num_input_tokens_seen": 17615168, "step": 28875 }, { "epoch": 8.960595718274899, "grad_norm": 0.1903974711894989, "learning_rate": 6.742626756875699e-06, "loss": 0.233, "num_input_tokens_seen": 17619296, "step": 28880 }, { "epoch": 8.962147067949116, "grad_norm": 0.22357122600078583, "learning_rate": 6.741357769665165e-06, "loss": 0.2306, "num_input_tokens_seen": 17623392, "step": 28885 }, { "epoch": 8.963698417623332, "grad_norm": 0.2802318036556244, "learning_rate": 6.740088654792486e-06, "loss": 0.2356, "num_input_tokens_seen": 17627104, "step": 28890 }, { "epoch": 8.965249767297548, "grad_norm": 0.12229851633310318, "learning_rate": 6.738819412350703e-06, "loss": 0.228, "num_input_tokens_seen": 17629760, "step": 28895 }, { "epoch": 8.966801116971766, "grad_norm": 0.11442917585372925, "learning_rate": 6.737550042432866e-06, "loss": 0.2355, "num_input_tokens_seen": 17632992, "step": 28900 }, { "epoch": 8.968352466645982, "grad_norm": 0.2023760974407196, "learning_rate": 6.736280545132035e-06, "loss": 0.2314, "num_input_tokens_seen": 17635776, "step": 28905 }, { "epoch": 8.9699038163202, "grad_norm": 0.15461869537830353, "learning_rate": 6.735010920541281e-06, "loss": 0.2228, "num_input_tokens_seen": 17638720, "step": 28910 }, { "epoch": 8.971455165994415, "grad_norm": 0.14051714539527893, "learning_rate": 6.733741168753679e-06, "loss": 0.2296, "num_input_tokens_seen": 17641088, "step": 28915 }, { "epoch": 8.973006515668631, "grad_norm": 0.20803171396255493, "learning_rate": 6.73247128986232e-06, "loss": 0.2411, "num_input_tokens_seen": 17644416, "step": 28920 }, { "epoch": 8.974557865342849, "grad_norm": 0.19842877984046936, "learning_rate": 6.7312012839603004e-06, "loss": 0.2318, "num_input_tokens_seen": 17647424, "step": 28925 }, { "epoch": 8.976109215017065, "grad_norm": 0.2877463698387146, "learning_rate": 6.729931151140725e-06, "loss": 0.2295, "num_input_tokens_seen": 17650880, "step": 28930 }, { "epoch": 8.97766056469128, "grad_norm": 0.15177175402641296, "learning_rate": 6.728660891496711e-06, "loss": 0.2338, "num_input_tokens_seen": 17653696, "step": 28935 }, { "epoch": 8.979211914365498, "grad_norm": 0.27331462502479553, "learning_rate": 6.727390505121383e-06, "loss": 0.2343, "num_input_tokens_seen": 17656608, "step": 28940 }, { "epoch": 8.980763264039714, "grad_norm": 0.09322945773601532, "learning_rate": 6.726119992107877e-06, "loss": 0.232, "num_input_tokens_seen": 17659936, "step": 28945 }, { "epoch": 8.982314613713932, "grad_norm": 0.2111997902393341, "learning_rate": 6.724849352549334e-06, "loss": 0.2276, "num_input_tokens_seen": 17663328, "step": 28950 }, { "epoch": 8.983865963388148, "grad_norm": 0.19009630382061005, "learning_rate": 6.723578586538908e-06, "loss": 0.2336, "num_input_tokens_seen": 17665760, "step": 28955 }, { "epoch": 8.985417313062364, "grad_norm": 0.17040549218654633, "learning_rate": 6.722307694169762e-06, "loss": 0.2343, "num_input_tokens_seen": 17667936, "step": 28960 }, { "epoch": 8.986968662736581, "grad_norm": 0.15481393039226532, "learning_rate": 6.721036675535066e-06, "loss": 0.2249, "num_input_tokens_seen": 17671104, "step": 28965 }, { "epoch": 8.988520012410797, "grad_norm": 0.13440147042274475, "learning_rate": 6.719765530728002e-06, "loss": 0.2269, "num_input_tokens_seen": 17674272, "step": 28970 }, { "epoch": 8.990071362085015, "grad_norm": 0.1910376250743866, "learning_rate": 6.71849425984176e-06, "loss": 0.2349, "num_input_tokens_seen": 17677152, "step": 28975 }, { "epoch": 8.99162271175923, "grad_norm": 0.1552218645811081, "learning_rate": 6.717222862969539e-06, "loss": 0.2328, "num_input_tokens_seen": 17680224, "step": 28980 }, { "epoch": 8.993174061433447, "grad_norm": 0.1510736495256424, "learning_rate": 6.7159513402045464e-06, "loss": 0.2297, "num_input_tokens_seen": 17683264, "step": 28985 }, { "epoch": 8.994725411107664, "grad_norm": 0.15550680458545685, "learning_rate": 6.714679691640001e-06, "loss": 0.2352, "num_input_tokens_seen": 17685408, "step": 28990 }, { "epoch": 8.99627676078188, "grad_norm": 0.12897664308547974, "learning_rate": 6.713407917369129e-06, "loss": 0.2268, "num_input_tokens_seen": 17688192, "step": 28995 }, { "epoch": 8.997828110456096, "grad_norm": 0.16689179837703705, "learning_rate": 6.712136017485168e-06, "loss": 0.2302, "num_input_tokens_seen": 17690560, "step": 29000 }, { "epoch": 8.999379460130314, "grad_norm": 0.13417409360408783, "learning_rate": 6.710863992081361e-06, "loss": 0.2286, "num_input_tokens_seen": 17692960, "step": 29005 }, { "epoch": 9.00093080980453, "grad_norm": 0.1587679237127304, "learning_rate": 6.7095918412509645e-06, "loss": 0.2311, "num_input_tokens_seen": 17695376, "step": 29010 }, { "epoch": 9.002482159478747, "grad_norm": 0.2447175234556198, "learning_rate": 6.7083195650872405e-06, "loss": 0.2285, "num_input_tokens_seen": 17698192, "step": 29015 }, { "epoch": 9.004033509152963, "grad_norm": 0.12751224637031555, "learning_rate": 6.707047163683462e-06, "loss": 0.2346, "num_input_tokens_seen": 17700624, "step": 29020 }, { "epoch": 9.005584858827179, "grad_norm": 0.16139279305934906, "learning_rate": 6.7057746371329135e-06, "loss": 0.2345, "num_input_tokens_seen": 17704016, "step": 29025 }, { "epoch": 9.007136208501397, "grad_norm": 0.18264421820640564, "learning_rate": 6.7045019855288855e-06, "loss": 0.2302, "num_input_tokens_seen": 17706960, "step": 29030 }, { "epoch": 9.008687558175613, "grad_norm": 0.15752893686294556, "learning_rate": 6.703229208964677e-06, "loss": 0.2284, "num_input_tokens_seen": 17709264, "step": 29035 }, { "epoch": 9.01023890784983, "grad_norm": 0.155290886759758, "learning_rate": 6.701956307533597e-06, "loss": 0.2356, "num_input_tokens_seen": 17712848, "step": 29040 }, { "epoch": 9.011790257524046, "grad_norm": 0.14023160934448242, "learning_rate": 6.7006832813289654e-06, "loss": 0.2286, "num_input_tokens_seen": 17715568, "step": 29045 }, { "epoch": 9.013341607198262, "grad_norm": 0.1636120229959488, "learning_rate": 6.699410130444112e-06, "loss": 0.2352, "num_input_tokens_seen": 17718576, "step": 29050 }, { "epoch": 9.01489295687248, "grad_norm": 0.21339787542819977, "learning_rate": 6.6981368549723705e-06, "loss": 0.2257, "num_input_tokens_seen": 17721968, "step": 29055 }, { "epoch": 9.016444306546695, "grad_norm": 0.17502547800540924, "learning_rate": 6.69686345500709e-06, "loss": 0.2353, "num_input_tokens_seen": 17724624, "step": 29060 }, { "epoch": 9.017995656220911, "grad_norm": 0.19448447227478027, "learning_rate": 6.695589930641624e-06, "loss": 0.2356, "num_input_tokens_seen": 17727216, "step": 29065 }, { "epoch": 9.019547005895129, "grad_norm": 0.12895150482654572, "learning_rate": 6.6943162819693366e-06, "loss": 0.2256, "num_input_tokens_seen": 17730608, "step": 29070 }, { "epoch": 9.021098355569345, "grad_norm": 0.12599948048591614, "learning_rate": 6.693042509083603e-06, "loss": 0.2269, "num_input_tokens_seen": 17733808, "step": 29075 }, { "epoch": 9.022649705243563, "grad_norm": 0.0946471095085144, "learning_rate": 6.6917686120778035e-06, "loss": 0.229, "num_input_tokens_seen": 17736528, "step": 29080 }, { "epoch": 9.024201054917778, "grad_norm": 0.21992139518260956, "learning_rate": 6.690494591045331e-06, "loss": 0.2321, "num_input_tokens_seen": 17738768, "step": 29085 }, { "epoch": 9.025752404591994, "grad_norm": 0.32083460688591003, "learning_rate": 6.689220446079588e-06, "loss": 0.2338, "num_input_tokens_seen": 17742064, "step": 29090 }, { "epoch": 9.027303754266212, "grad_norm": 0.12689639627933502, "learning_rate": 6.687946177273983e-06, "loss": 0.2294, "num_input_tokens_seen": 17744144, "step": 29095 }, { "epoch": 9.028855103940428, "grad_norm": 0.21960732340812683, "learning_rate": 6.686671784721933e-06, "loss": 0.2294, "num_input_tokens_seen": 17746928, "step": 29100 }, { "epoch": 9.030406453614646, "grad_norm": 0.11609358340501785, "learning_rate": 6.685397268516869e-06, "loss": 0.2301, "num_input_tokens_seen": 17749840, "step": 29105 }, { "epoch": 9.031957803288861, "grad_norm": 0.21327228844165802, "learning_rate": 6.6841226287522275e-06, "loss": 0.2224, "num_input_tokens_seen": 17752688, "step": 29110 }, { "epoch": 9.033509152963077, "grad_norm": 0.22902435064315796, "learning_rate": 6.682847865521455e-06, "loss": 0.227, "num_input_tokens_seen": 17755088, "step": 29115 }, { "epoch": 9.035060502637295, "grad_norm": 0.18225215375423431, "learning_rate": 6.681572978918005e-06, "loss": 0.235, "num_input_tokens_seen": 17758640, "step": 29120 }, { "epoch": 9.03661185231151, "grad_norm": 0.26779699325561523, "learning_rate": 6.680297969035343e-06, "loss": 0.2317, "num_input_tokens_seen": 17761712, "step": 29125 }, { "epoch": 9.038163201985727, "grad_norm": 0.2008616030216217, "learning_rate": 6.679022835966943e-06, "loss": 0.2274, "num_input_tokens_seen": 17764912, "step": 29130 }, { "epoch": 9.039714551659944, "grad_norm": 0.20015567541122437, "learning_rate": 6.677747579806285e-06, "loss": 0.2264, "num_input_tokens_seen": 17768176, "step": 29135 }, { "epoch": 9.04126590133416, "grad_norm": 0.27560725808143616, "learning_rate": 6.676472200646863e-06, "loss": 0.2332, "num_input_tokens_seen": 17771056, "step": 29140 }, { "epoch": 9.042817251008378, "grad_norm": 0.1632167249917984, "learning_rate": 6.675196698582176e-06, "loss": 0.2253, "num_input_tokens_seen": 17774000, "step": 29145 }, { "epoch": 9.044368600682594, "grad_norm": 0.13930048048496246, "learning_rate": 6.673921073705734e-06, "loss": 0.2308, "num_input_tokens_seen": 17778160, "step": 29150 }, { "epoch": 9.04591995035681, "grad_norm": 0.21698741614818573, "learning_rate": 6.672645326111056e-06, "loss": 0.2249, "num_input_tokens_seen": 17780880, "step": 29155 }, { "epoch": 9.047471300031027, "grad_norm": 0.2550620138645172, "learning_rate": 6.671369455891666e-06, "loss": 0.2204, "num_input_tokens_seen": 17783920, "step": 29160 }, { "epoch": 9.049022649705243, "grad_norm": 0.30535393953323364, "learning_rate": 6.670093463141104e-06, "loss": 0.2294, "num_input_tokens_seen": 17788080, "step": 29165 }, { "epoch": 9.050573999379461, "grad_norm": 0.22219392657279968, "learning_rate": 6.668817347952914e-06, "loss": 0.2437, "num_input_tokens_seen": 17790640, "step": 29170 }, { "epoch": 9.052125349053677, "grad_norm": 0.2972548305988312, "learning_rate": 6.667541110420652e-06, "loss": 0.2182, "num_input_tokens_seen": 17792976, "step": 29175 }, { "epoch": 9.053676698727893, "grad_norm": 0.23802204430103302, "learning_rate": 6.666264750637879e-06, "loss": 0.2303, "num_input_tokens_seen": 17795760, "step": 29180 }, { "epoch": 9.05522804840211, "grad_norm": 0.21206969022750854, "learning_rate": 6.6649882686981685e-06, "loss": 0.2389, "num_input_tokens_seen": 17798640, "step": 29185 }, { "epoch": 9.056779398076326, "grad_norm": 0.2900053560733795, "learning_rate": 6.663711664695101e-06, "loss": 0.2275, "num_input_tokens_seen": 17801520, "step": 29190 }, { "epoch": 9.058330747750542, "grad_norm": 0.18665121495723724, "learning_rate": 6.662434938722268e-06, "loss": 0.2305, "num_input_tokens_seen": 17805328, "step": 29195 }, { "epoch": 9.05988209742476, "grad_norm": 0.23049341142177582, "learning_rate": 6.661158090873267e-06, "loss": 0.2412, "num_input_tokens_seen": 17807600, "step": 29200 }, { "epoch": 9.061433447098976, "grad_norm": 0.2872185707092285, "learning_rate": 6.659881121241707e-06, "loss": 0.2237, "num_input_tokens_seen": 17811152, "step": 29205 }, { "epoch": 9.062984796773193, "grad_norm": 0.1791830211877823, "learning_rate": 6.658604029921205e-06, "loss": 0.2309, "num_input_tokens_seen": 17813968, "step": 29210 }, { "epoch": 9.06453614644741, "grad_norm": 0.25659674406051636, "learning_rate": 6.657326817005387e-06, "loss": 0.2344, "num_input_tokens_seen": 17816336, "step": 29215 }, { "epoch": 9.066087496121625, "grad_norm": 0.20465679466724396, "learning_rate": 6.6560494825878875e-06, "loss": 0.2288, "num_input_tokens_seen": 17819504, "step": 29220 }, { "epoch": 9.067638845795843, "grad_norm": 0.23014885187149048, "learning_rate": 6.65477202676235e-06, "loss": 0.2262, "num_input_tokens_seen": 17822448, "step": 29225 }, { "epoch": 9.069190195470059, "grad_norm": 0.1691979467868805, "learning_rate": 6.653494449622428e-06, "loss": 0.2163, "num_input_tokens_seen": 17825584, "step": 29230 }, { "epoch": 9.070741545144276, "grad_norm": 0.16672097146511078, "learning_rate": 6.652216751261783e-06, "loss": 0.2384, "num_input_tokens_seen": 17829136, "step": 29235 }, { "epoch": 9.072292894818492, "grad_norm": 0.21617992222309113, "learning_rate": 6.650938931774085e-06, "loss": 0.2262, "num_input_tokens_seen": 17832688, "step": 29240 }, { "epoch": 9.073844244492708, "grad_norm": 0.2373717576265335, "learning_rate": 6.649660991253012e-06, "loss": 0.236, "num_input_tokens_seen": 17835952, "step": 29245 }, { "epoch": 9.075395594166926, "grad_norm": 0.2633947432041168, "learning_rate": 6.6483829297922544e-06, "loss": 0.2262, "num_input_tokens_seen": 17838032, "step": 29250 }, { "epoch": 9.076946943841142, "grad_norm": 0.2383640855550766, "learning_rate": 6.647104747485508e-06, "loss": 0.2289, "num_input_tokens_seen": 17840208, "step": 29255 }, { "epoch": 9.078498293515358, "grad_norm": 0.17991259694099426, "learning_rate": 6.64582644442648e-06, "loss": 0.2312, "num_input_tokens_seen": 17843600, "step": 29260 }, { "epoch": 9.080049643189575, "grad_norm": 0.13088390231132507, "learning_rate": 6.644548020708882e-06, "loss": 0.2371, "num_input_tokens_seen": 17845904, "step": 29265 }, { "epoch": 9.081600992863791, "grad_norm": 0.15342482924461365, "learning_rate": 6.6432694764264415e-06, "loss": 0.2486, "num_input_tokens_seen": 17850256, "step": 29270 }, { "epoch": 9.083152342538009, "grad_norm": 0.13615594804286957, "learning_rate": 6.641990811672888e-06, "loss": 0.2375, "num_input_tokens_seen": 17854192, "step": 29275 }, { "epoch": 9.084703692212225, "grad_norm": 0.14481034874916077, "learning_rate": 6.640712026541963e-06, "loss": 0.2215, "num_input_tokens_seen": 17856976, "step": 29280 }, { "epoch": 9.08625504188644, "grad_norm": 0.3156697452068329, "learning_rate": 6.63943312112742e-06, "loss": 0.2357, "num_input_tokens_seen": 17861712, "step": 29285 }, { "epoch": 9.087806391560658, "grad_norm": 0.20974121987819672, "learning_rate": 6.638154095523014e-06, "loss": 0.2201, "num_input_tokens_seen": 17864624, "step": 29290 }, { "epoch": 9.089357741234874, "grad_norm": 0.16559794545173645, "learning_rate": 6.636874949822515e-06, "loss": 0.2304, "num_input_tokens_seen": 17867792, "step": 29295 }, { "epoch": 9.090909090909092, "grad_norm": 0.17839157581329346, "learning_rate": 6.635595684119698e-06, "loss": 0.2265, "num_input_tokens_seen": 17870192, "step": 29300 }, { "epoch": 9.092460440583308, "grad_norm": 0.24427330493927002, "learning_rate": 6.6343162985083474e-06, "loss": 0.2336, "num_input_tokens_seen": 17873648, "step": 29305 }, { "epoch": 9.094011790257523, "grad_norm": 0.18494126200675964, "learning_rate": 6.633036793082261e-06, "loss": 0.2327, "num_input_tokens_seen": 17877776, "step": 29310 }, { "epoch": 9.095563139931741, "grad_norm": 0.42346975207328796, "learning_rate": 6.63175716793524e-06, "loss": 0.2301, "num_input_tokens_seen": 17880624, "step": 29315 }, { "epoch": 9.097114489605957, "grad_norm": 0.15353256464004517, "learning_rate": 6.630477423161095e-06, "loss": 0.2287, "num_input_tokens_seen": 17883984, "step": 29320 }, { "epoch": 9.098665839280173, "grad_norm": 0.31531792879104614, "learning_rate": 6.629197558853647e-06, "loss": 0.2289, "num_input_tokens_seen": 17888016, "step": 29325 }, { "epoch": 9.10021718895439, "grad_norm": 0.3731744587421417, "learning_rate": 6.627917575106726e-06, "loss": 0.2273, "num_input_tokens_seen": 17895824, "step": 29330 }, { "epoch": 9.101768538628606, "grad_norm": 0.17706933617591858, "learning_rate": 6.626637472014168e-06, "loss": 0.2287, "num_input_tokens_seen": 17898832, "step": 29335 }, { "epoch": 9.103319888302824, "grad_norm": 0.21021993458271027, "learning_rate": 6.625357249669823e-06, "loss": 0.2323, "num_input_tokens_seen": 17901840, "step": 29340 }, { "epoch": 9.10487123797704, "grad_norm": 0.2152874916791916, "learning_rate": 6.624076908167543e-06, "loss": 0.2242, "num_input_tokens_seen": 17904848, "step": 29345 }, { "epoch": 9.106422587651256, "grad_norm": 0.14389225840568542, "learning_rate": 6.622796447601194e-06, "loss": 0.2251, "num_input_tokens_seen": 17908560, "step": 29350 }, { "epoch": 9.107973937325474, "grad_norm": 0.22437813878059387, "learning_rate": 6.621515868064648e-06, "loss": 0.2238, "num_input_tokens_seen": 17911024, "step": 29355 }, { "epoch": 9.10952528699969, "grad_norm": 0.419882208108902, "learning_rate": 6.620235169651787e-06, "loss": 0.2434, "num_input_tokens_seen": 17914000, "step": 29360 }, { "epoch": 9.111076636673907, "grad_norm": 0.28831586241722107, "learning_rate": 6.6189543524565026e-06, "loss": 0.2289, "num_input_tokens_seen": 17916336, "step": 29365 }, { "epoch": 9.112627986348123, "grad_norm": 0.3079048693180084, "learning_rate": 6.617673416572693e-06, "loss": 0.2362, "num_input_tokens_seen": 17918768, "step": 29370 }, { "epoch": 9.114179336022339, "grad_norm": 0.27461162209510803, "learning_rate": 6.616392362094266e-06, "loss": 0.2281, "num_input_tokens_seen": 17921744, "step": 29375 }, { "epoch": 9.115730685696557, "grad_norm": 0.20176656544208527, "learning_rate": 6.6151111891151374e-06, "loss": 0.2216, "num_input_tokens_seen": 17925808, "step": 29380 }, { "epoch": 9.117282035370772, "grad_norm": 0.13910242915153503, "learning_rate": 6.613829897729234e-06, "loss": 0.2184, "num_input_tokens_seen": 17928656, "step": 29385 }, { "epoch": 9.118833385044988, "grad_norm": 0.1894080489873886, "learning_rate": 6.612548488030487e-06, "loss": 0.2305, "num_input_tokens_seen": 17931216, "step": 29390 }, { "epoch": 9.120384734719206, "grad_norm": 0.22137446701526642, "learning_rate": 6.611266960112841e-06, "loss": 0.2236, "num_input_tokens_seen": 17934512, "step": 29395 }, { "epoch": 9.121936084393422, "grad_norm": 0.2489195168018341, "learning_rate": 6.609985314070246e-06, "loss": 0.2219, "num_input_tokens_seen": 17936944, "step": 29400 }, { "epoch": 9.12348743406764, "grad_norm": 0.3405507504940033, "learning_rate": 6.608703549996662e-06, "loss": 0.2297, "num_input_tokens_seen": 17940048, "step": 29405 }, { "epoch": 9.125038783741855, "grad_norm": 0.2302720993757248, "learning_rate": 6.60742166798606e-06, "loss": 0.23, "num_input_tokens_seen": 17942864, "step": 29410 }, { "epoch": 9.126590133416071, "grad_norm": 0.238139346241951, "learning_rate": 6.606139668132412e-06, "loss": 0.2322, "num_input_tokens_seen": 17945648, "step": 29415 }, { "epoch": 9.128141483090289, "grad_norm": 0.4812220335006714, "learning_rate": 6.604857550529709e-06, "loss": 0.2166, "num_input_tokens_seen": 17949552, "step": 29420 }, { "epoch": 9.129692832764505, "grad_norm": 0.20179089903831482, "learning_rate": 6.6035753152719426e-06, "loss": 0.2253, "num_input_tokens_seen": 17953456, "step": 29425 }, { "epoch": 9.131244182438722, "grad_norm": 0.19241230189800262, "learning_rate": 6.602292962453116e-06, "loss": 0.2279, "num_input_tokens_seen": 17955536, "step": 29430 }, { "epoch": 9.132795532112938, "grad_norm": 0.38353368639945984, "learning_rate": 6.601010492167243e-06, "loss": 0.2187, "num_input_tokens_seen": 17958288, "step": 29435 }, { "epoch": 9.134346881787154, "grad_norm": 0.3761970102787018, "learning_rate": 6.599727904508341e-06, "loss": 0.2289, "num_input_tokens_seen": 17962384, "step": 29440 }, { "epoch": 9.135898231461372, "grad_norm": 0.5075197815895081, "learning_rate": 6.59844519957044e-06, "loss": 0.2323, "num_input_tokens_seen": 17964688, "step": 29445 }, { "epoch": 9.137449581135588, "grad_norm": 0.4063670039176941, "learning_rate": 6.597162377447577e-06, "loss": 0.2286, "num_input_tokens_seen": 17967600, "step": 29450 }, { "epoch": 9.139000930809804, "grad_norm": 0.5479270219802856, "learning_rate": 6.595879438233799e-06, "loss": 0.2353, "num_input_tokens_seen": 17970832, "step": 29455 }, { "epoch": 9.140552280484021, "grad_norm": 0.448811799287796, "learning_rate": 6.59459638202316e-06, "loss": 0.2435, "num_input_tokens_seen": 17973296, "step": 29460 }, { "epoch": 9.142103630158237, "grad_norm": 0.421939879655838, "learning_rate": 6.593313208909724e-06, "loss": 0.236, "num_input_tokens_seen": 17976848, "step": 29465 }, { "epoch": 9.143654979832455, "grad_norm": 0.4071846008300781, "learning_rate": 6.592029918987562e-06, "loss": 0.2281, "num_input_tokens_seen": 17979888, "step": 29470 }, { "epoch": 9.14520632950667, "grad_norm": 0.35624414682388306, "learning_rate": 6.590746512350752e-06, "loss": 0.2296, "num_input_tokens_seen": 17985168, "step": 29475 }, { "epoch": 9.146757679180887, "grad_norm": 0.3466757535934448, "learning_rate": 6.589462989093387e-06, "loss": 0.2263, "num_input_tokens_seen": 17988496, "step": 29480 }, { "epoch": 9.148309028855104, "grad_norm": 0.2991260588169098, "learning_rate": 6.588179349309564e-06, "loss": 0.2306, "num_input_tokens_seen": 17991376, "step": 29485 }, { "epoch": 9.14986037852932, "grad_norm": 0.2502967119216919, "learning_rate": 6.586895593093386e-06, "loss": 0.2219, "num_input_tokens_seen": 17994096, "step": 29490 }, { "epoch": 9.151411728203538, "grad_norm": 0.3982725143432617, "learning_rate": 6.5856117205389716e-06, "loss": 0.2324, "num_input_tokens_seen": 17996848, "step": 29495 }, { "epoch": 9.152963077877754, "grad_norm": 0.40309157967567444, "learning_rate": 6.584327731740441e-06, "loss": 0.2305, "num_input_tokens_seen": 18001040, "step": 29500 }, { "epoch": 9.15451442755197, "grad_norm": 0.257684588432312, "learning_rate": 6.583043626791925e-06, "loss": 0.2167, "num_input_tokens_seen": 18004560, "step": 29505 }, { "epoch": 9.156065777226187, "grad_norm": 0.24535605311393738, "learning_rate": 6.581759405787566e-06, "loss": 0.2263, "num_input_tokens_seen": 18007504, "step": 29510 }, { "epoch": 9.157617126900403, "grad_norm": 0.44809648394584656, "learning_rate": 6.580475068821512e-06, "loss": 0.2253, "num_input_tokens_seen": 18009840, "step": 29515 }, { "epoch": 9.159168476574619, "grad_norm": 0.32477959990501404, "learning_rate": 6.579190615987919e-06, "loss": 0.2287, "num_input_tokens_seen": 18012432, "step": 29520 }, { "epoch": 9.160719826248837, "grad_norm": 0.4129081070423126, "learning_rate": 6.5779060473809545e-06, "loss": 0.2316, "num_input_tokens_seen": 18016784, "step": 29525 }, { "epoch": 9.162271175923053, "grad_norm": 0.6977484226226807, "learning_rate": 6.576621363094791e-06, "loss": 0.2186, "num_input_tokens_seen": 18020016, "step": 29530 }, { "epoch": 9.16382252559727, "grad_norm": 0.4705640375614166, "learning_rate": 6.575336563223611e-06, "loss": 0.2372, "num_input_tokens_seen": 18022704, "step": 29535 }, { "epoch": 9.165373875271486, "grad_norm": 0.4419527053833008, "learning_rate": 6.574051647861607e-06, "loss": 0.2217, "num_input_tokens_seen": 18025168, "step": 29540 }, { "epoch": 9.166925224945702, "grad_norm": 0.4956662058830261, "learning_rate": 6.572766617102977e-06, "loss": 0.2237, "num_input_tokens_seen": 18027920, "step": 29545 }, { "epoch": 9.16847657461992, "grad_norm": 0.7746727466583252, "learning_rate": 6.57148147104193e-06, "loss": 0.215, "num_input_tokens_seen": 18030256, "step": 29550 }, { "epoch": 9.170027924294136, "grad_norm": 1.0019265413284302, "learning_rate": 6.570196209772682e-06, "loss": 0.2183, "num_input_tokens_seen": 18032784, "step": 29555 }, { "epoch": 9.171579273968353, "grad_norm": 0.5898739099502563, "learning_rate": 6.568910833389458e-06, "loss": 0.2347, "num_input_tokens_seen": 18034992, "step": 29560 }, { "epoch": 9.17313062364257, "grad_norm": 0.4052552878856659, "learning_rate": 6.567625341986491e-06, "loss": 0.2269, "num_input_tokens_seen": 18037872, "step": 29565 }, { "epoch": 9.174681973316785, "grad_norm": 0.45989346504211426, "learning_rate": 6.5663397356580234e-06, "loss": 0.2238, "num_input_tokens_seen": 18040944, "step": 29570 }, { "epoch": 9.176233322991003, "grad_norm": 0.3837839663028717, "learning_rate": 6.565054014498305e-06, "loss": 0.2258, "num_input_tokens_seen": 18044368, "step": 29575 }, { "epoch": 9.177784672665219, "grad_norm": 0.6782793998718262, "learning_rate": 6.563768178601594e-06, "loss": 0.2333, "num_input_tokens_seen": 18046992, "step": 29580 }, { "epoch": 9.179336022339434, "grad_norm": 0.4237406253814697, "learning_rate": 6.562482228062158e-06, "loss": 0.2376, "num_input_tokens_seen": 18049648, "step": 29585 }, { "epoch": 9.180887372013652, "grad_norm": 0.3732234239578247, "learning_rate": 6.56119616297427e-06, "loss": 0.2379, "num_input_tokens_seen": 18053136, "step": 29590 }, { "epoch": 9.182438721687868, "grad_norm": 0.9016228914260864, "learning_rate": 6.559909983432218e-06, "loss": 0.2257, "num_input_tokens_seen": 18056464, "step": 29595 }, { "epoch": 9.183990071362086, "grad_norm": 0.47703027725219727, "learning_rate": 6.558623689530293e-06, "loss": 0.2411, "num_input_tokens_seen": 18059312, "step": 29600 }, { "epoch": 9.185541421036302, "grad_norm": 0.4838729500770569, "learning_rate": 6.557337281362793e-06, "loss": 0.2355, "num_input_tokens_seen": 18062352, "step": 29605 }, { "epoch": 9.187092770710517, "grad_norm": 0.5638741850852966, "learning_rate": 6.556050759024028e-06, "loss": 0.2247, "num_input_tokens_seen": 18065488, "step": 29610 }, { "epoch": 9.188644120384735, "grad_norm": 0.44788047671318054, "learning_rate": 6.554764122608317e-06, "loss": 0.2224, "num_input_tokens_seen": 18067952, "step": 29615 }, { "epoch": 9.190195470058951, "grad_norm": 0.3289619982242584, "learning_rate": 6.553477372209985e-06, "loss": 0.224, "num_input_tokens_seen": 18071120, "step": 29620 }, { "epoch": 9.191746819733169, "grad_norm": 0.2889353930950165, "learning_rate": 6.552190507923366e-06, "loss": 0.2342, "num_input_tokens_seen": 18073680, "step": 29625 }, { "epoch": 9.193298169407385, "grad_norm": 0.2854796350002289, "learning_rate": 6.550903529842803e-06, "loss": 0.2386, "num_input_tokens_seen": 18076720, "step": 29630 }, { "epoch": 9.1948495190816, "grad_norm": 0.3274117410182953, "learning_rate": 6.549616438062644e-06, "loss": 0.2367, "num_input_tokens_seen": 18079248, "step": 29635 }, { "epoch": 9.196400868755818, "grad_norm": 0.3084247410297394, "learning_rate": 6.548329232677249e-06, "loss": 0.2314, "num_input_tokens_seen": 18081456, "step": 29640 }, { "epoch": 9.197952218430034, "grad_norm": 0.2956722676753998, "learning_rate": 6.547041913780988e-06, "loss": 0.228, "num_input_tokens_seen": 18084944, "step": 29645 }, { "epoch": 9.19950356810425, "grad_norm": 0.5897151827812195, "learning_rate": 6.545754481468235e-06, "loss": 0.2325, "num_input_tokens_seen": 18088048, "step": 29650 }, { "epoch": 9.201054917778468, "grad_norm": 0.46947234869003296, "learning_rate": 6.5444669358333735e-06, "loss": 0.2204, "num_input_tokens_seen": 18091088, "step": 29655 }, { "epoch": 9.202606267452683, "grad_norm": 0.4716440439224243, "learning_rate": 6.543179276970797e-06, "loss": 0.2339, "num_input_tokens_seen": 18093776, "step": 29660 }, { "epoch": 9.204157617126901, "grad_norm": 0.4474058747291565, "learning_rate": 6.541891504974904e-06, "loss": 0.2356, "num_input_tokens_seen": 18095984, "step": 29665 }, { "epoch": 9.205708966801117, "grad_norm": 0.4883273243904114, "learning_rate": 6.540603619940107e-06, "loss": 0.2319, "num_input_tokens_seen": 18098992, "step": 29670 }, { "epoch": 9.207260316475333, "grad_norm": 0.4709955155849457, "learning_rate": 6.53931562196082e-06, "loss": 0.2184, "num_input_tokens_seen": 18102000, "step": 29675 }, { "epoch": 9.20881166614955, "grad_norm": 0.3614434003829956, "learning_rate": 6.538027511131469e-06, "loss": 0.2287, "num_input_tokens_seen": 18104240, "step": 29680 }, { "epoch": 9.210363015823766, "grad_norm": 0.37467873096466064, "learning_rate": 6.53673928754649e-06, "loss": 0.2261, "num_input_tokens_seen": 18107376, "step": 29685 }, { "epoch": 9.211914365497984, "grad_norm": 0.2976283133029938, "learning_rate": 6.535450951300323e-06, "loss": 0.2283, "num_input_tokens_seen": 18110256, "step": 29690 }, { "epoch": 9.2134657151722, "grad_norm": 0.26549068093299866, "learning_rate": 6.534162502487419e-06, "loss": 0.2193, "num_input_tokens_seen": 18113104, "step": 29695 }, { "epoch": 9.215017064846416, "grad_norm": 0.3891907036304474, "learning_rate": 6.532873941202236e-06, "loss": 0.2335, "num_input_tokens_seen": 18117136, "step": 29700 }, { "epoch": 9.216568414520633, "grad_norm": 0.35543444752693176, "learning_rate": 6.531585267539241e-06, "loss": 0.2337, "num_input_tokens_seen": 18120048, "step": 29705 }, { "epoch": 9.21811976419485, "grad_norm": 0.38857993483543396, "learning_rate": 6.530296481592909e-06, "loss": 0.226, "num_input_tokens_seen": 18122320, "step": 29710 }, { "epoch": 9.219671113869065, "grad_norm": 0.515846848487854, "learning_rate": 6.5290075834577245e-06, "loss": 0.2339, "num_input_tokens_seen": 18125744, "step": 29715 }, { "epoch": 9.221222463543283, "grad_norm": 0.3574053943157196, "learning_rate": 6.527718573228177e-06, "loss": 0.2148, "num_input_tokens_seen": 18129264, "step": 29720 }, { "epoch": 9.222773813217499, "grad_norm": 0.4292166531085968, "learning_rate": 6.526429450998767e-06, "loss": 0.2429, "num_input_tokens_seen": 18132592, "step": 29725 }, { "epoch": 9.224325162891716, "grad_norm": 0.45665886998176575, "learning_rate": 6.525140216864003e-06, "loss": 0.2285, "num_input_tokens_seen": 18135536, "step": 29730 }, { "epoch": 9.225876512565932, "grad_norm": 0.3808872401714325, "learning_rate": 6.523850870918402e-06, "loss": 0.2182, "num_input_tokens_seen": 18137776, "step": 29735 }, { "epoch": 9.227427862240148, "grad_norm": 0.5604586601257324, "learning_rate": 6.5225614132564855e-06, "loss": 0.2223, "num_input_tokens_seen": 18141968, "step": 29740 }, { "epoch": 9.228979211914366, "grad_norm": 0.3657499849796295, "learning_rate": 6.521271843972787e-06, "loss": 0.2307, "num_input_tokens_seen": 18144304, "step": 29745 }, { "epoch": 9.230530561588582, "grad_norm": 0.5394580960273743, "learning_rate": 6.51998216316185e-06, "loss": 0.2356, "num_input_tokens_seen": 18146768, "step": 29750 }, { "epoch": 9.2320819112628, "grad_norm": 0.5215045213699341, "learning_rate": 6.518692370918219e-06, "loss": 0.2207, "num_input_tokens_seen": 18149360, "step": 29755 }, { "epoch": 9.233633260937015, "grad_norm": 0.3698697090148926, "learning_rate": 6.517402467336455e-06, "loss": 0.2226, "num_input_tokens_seen": 18152080, "step": 29760 }, { "epoch": 9.235184610611231, "grad_norm": 0.5280438661575317, "learning_rate": 6.516112452511121e-06, "loss": 0.2178, "num_input_tokens_seen": 18155248, "step": 29765 }, { "epoch": 9.236735960285449, "grad_norm": 0.5926714539527893, "learning_rate": 6.514822326536789e-06, "loss": 0.2188, "num_input_tokens_seen": 18158704, "step": 29770 }, { "epoch": 9.238287309959665, "grad_norm": 0.37977349758148193, "learning_rate": 6.513532089508045e-06, "loss": 0.2367, "num_input_tokens_seen": 18161552, "step": 29775 }, { "epoch": 9.23983865963388, "grad_norm": 0.45096635818481445, "learning_rate": 6.512241741519474e-06, "loss": 0.2102, "num_input_tokens_seen": 18164304, "step": 29780 }, { "epoch": 9.241390009308098, "grad_norm": 0.5292826294898987, "learning_rate": 6.5109512826656765e-06, "loss": 0.2281, "num_input_tokens_seen": 18167568, "step": 29785 }, { "epoch": 9.242941358982314, "grad_norm": 0.28300756216049194, "learning_rate": 6.509660713041257e-06, "loss": 0.2248, "num_input_tokens_seen": 18170544, "step": 29790 }, { "epoch": 9.244492708656532, "grad_norm": 0.4935232102870941, "learning_rate": 6.50837003274083e-06, "loss": 0.2443, "num_input_tokens_seen": 18174672, "step": 29795 }, { "epoch": 9.246044058330748, "grad_norm": 0.4431411921977997, "learning_rate": 6.507079241859019e-06, "loss": 0.221, "num_input_tokens_seen": 18178736, "step": 29800 }, { "epoch": 9.247595408004964, "grad_norm": 0.5685586333274841, "learning_rate": 6.505788340490453e-06, "loss": 0.2461, "num_input_tokens_seen": 18181328, "step": 29805 }, { "epoch": 9.249146757679181, "grad_norm": 0.4005923867225647, "learning_rate": 6.5044973287297695e-06, "loss": 0.2309, "num_input_tokens_seen": 18183920, "step": 29810 }, { "epoch": 9.250698107353397, "grad_norm": 0.36726436018943787, "learning_rate": 6.503206206671617e-06, "loss": 0.2277, "num_input_tokens_seen": 18188784, "step": 29815 }, { "epoch": 9.252249457027615, "grad_norm": 0.36384668946266174, "learning_rate": 6.501914974410648e-06, "loss": 0.2364, "num_input_tokens_seen": 18193264, "step": 29820 }, { "epoch": 9.25380080670183, "grad_norm": 0.48386508226394653, "learning_rate": 6.500623632041527e-06, "loss": 0.2216, "num_input_tokens_seen": 18196048, "step": 29825 }, { "epoch": 9.255352156376047, "grad_norm": 0.45581045746803284, "learning_rate": 6.499332179658923e-06, "loss": 0.2246, "num_input_tokens_seen": 18198800, "step": 29830 }, { "epoch": 9.256903506050264, "grad_norm": 0.4082182049751282, "learning_rate": 6.498040617357515e-06, "loss": 0.2201, "num_input_tokens_seen": 18202192, "step": 29835 }, { "epoch": 9.25845485572448, "grad_norm": 0.34679344296455383, "learning_rate": 6.496748945231991e-06, "loss": 0.2293, "num_input_tokens_seen": 18205072, "step": 29840 }, { "epoch": 9.260006205398696, "grad_norm": 0.5704792737960815, "learning_rate": 6.495457163377046e-06, "loss": 0.2203, "num_input_tokens_seen": 18208592, "step": 29845 }, { "epoch": 9.261557555072914, "grad_norm": 0.36550432443618774, "learning_rate": 6.494165271887379e-06, "loss": 0.2326, "num_input_tokens_seen": 18210736, "step": 29850 }, { "epoch": 9.26310890474713, "grad_norm": 0.30552786588668823, "learning_rate": 6.492873270857707e-06, "loss": 0.2261, "num_input_tokens_seen": 18212752, "step": 29855 }, { "epoch": 9.264660254421347, "grad_norm": 0.42590346932411194, "learning_rate": 6.491581160382745e-06, "loss": 0.2369, "num_input_tokens_seen": 18216272, "step": 29860 }, { "epoch": 9.266211604095563, "grad_norm": 0.42049640417099, "learning_rate": 6.490288940557221e-06, "loss": 0.2169, "num_input_tokens_seen": 18218832, "step": 29865 }, { "epoch": 9.267762953769779, "grad_norm": 0.347854882478714, "learning_rate": 6.4889966114758704e-06, "loss": 0.2328, "num_input_tokens_seen": 18221872, "step": 29870 }, { "epoch": 9.269314303443997, "grad_norm": 0.6507933139801025, "learning_rate": 6.487704173233436e-06, "loss": 0.2193, "num_input_tokens_seen": 18224272, "step": 29875 }, { "epoch": 9.270865653118213, "grad_norm": 0.49148207902908325, "learning_rate": 6.4864116259246665e-06, "loss": 0.2378, "num_input_tokens_seen": 18227216, "step": 29880 }, { "epoch": 9.27241700279243, "grad_norm": 0.3104003369808197, "learning_rate": 6.485118969644324e-06, "loss": 0.2222, "num_input_tokens_seen": 18230128, "step": 29885 }, { "epoch": 9.273968352466646, "grad_norm": 0.5116109251976013, "learning_rate": 6.4838262044871755e-06, "loss": 0.2243, "num_input_tokens_seen": 18233264, "step": 29890 }, { "epoch": 9.275519702140862, "grad_norm": 0.44973066449165344, "learning_rate": 6.482533330547995e-06, "loss": 0.2241, "num_input_tokens_seen": 18236240, "step": 29895 }, { "epoch": 9.27707105181508, "grad_norm": 0.3276880979537964, "learning_rate": 6.481240347921566e-06, "loss": 0.2296, "num_input_tokens_seen": 18239408, "step": 29900 }, { "epoch": 9.278622401489296, "grad_norm": 0.8387464880943298, "learning_rate": 6.479947256702679e-06, "loss": 0.2389, "num_input_tokens_seen": 18241680, "step": 29905 }, { "epoch": 9.280173751163511, "grad_norm": 0.7106526494026184, "learning_rate": 6.4786540569861315e-06, "loss": 0.2162, "num_input_tokens_seen": 18245328, "step": 29910 }, { "epoch": 9.281725100837729, "grad_norm": 0.5005519390106201, "learning_rate": 6.477360748866732e-06, "loss": 0.2261, "num_input_tokens_seen": 18248432, "step": 29915 }, { "epoch": 9.283276450511945, "grad_norm": 0.5255812406539917, "learning_rate": 6.476067332439295e-06, "loss": 0.2351, "num_input_tokens_seen": 18251888, "step": 29920 }, { "epoch": 9.284827800186163, "grad_norm": 0.4585478603839874, "learning_rate": 6.474773807798644e-06, "loss": 0.2233, "num_input_tokens_seen": 18254864, "step": 29925 }, { "epoch": 9.286379149860378, "grad_norm": 0.6038764715194702, "learning_rate": 6.4734801750396095e-06, "loss": 0.2336, "num_input_tokens_seen": 18257040, "step": 29930 }, { "epoch": 9.287930499534594, "grad_norm": 0.4824659824371338, "learning_rate": 6.472186434257028e-06, "loss": 0.2203, "num_input_tokens_seen": 18259728, "step": 29935 }, { "epoch": 9.289481849208812, "grad_norm": 0.6645903587341309, "learning_rate": 6.470892585545749e-06, "loss": 0.2424, "num_input_tokens_seen": 18264144, "step": 29940 }, { "epoch": 9.291033198883028, "grad_norm": 0.6829733848571777, "learning_rate": 6.469598629000625e-06, "loss": 0.2243, "num_input_tokens_seen": 18267408, "step": 29945 }, { "epoch": 9.292584548557246, "grad_norm": 0.4301622211933136, "learning_rate": 6.468304564716519e-06, "loss": 0.2303, "num_input_tokens_seen": 18270192, "step": 29950 }, { "epoch": 9.294135898231461, "grad_norm": 0.671834409236908, "learning_rate": 6.467010392788301e-06, "loss": 0.2405, "num_input_tokens_seen": 18273456, "step": 29955 }, { "epoch": 9.295687247905677, "grad_norm": 0.28240692615509033, "learning_rate": 6.465716113310849e-06, "loss": 0.2342, "num_input_tokens_seen": 18275792, "step": 29960 }, { "epoch": 9.297238597579895, "grad_norm": 0.5290594100952148, "learning_rate": 6.464421726379049e-06, "loss": 0.2452, "num_input_tokens_seen": 18279152, "step": 29965 }, { "epoch": 9.298789947254111, "grad_norm": 0.43236806988716125, "learning_rate": 6.4631272320877955e-06, "loss": 0.2302, "num_input_tokens_seen": 18281680, "step": 29970 }, { "epoch": 9.300341296928327, "grad_norm": 0.2715842127799988, "learning_rate": 6.46183263053199e-06, "loss": 0.2249, "num_input_tokens_seen": 18284240, "step": 29975 }, { "epoch": 9.301892646602544, "grad_norm": 0.6311061382293701, "learning_rate": 6.46053792180654e-06, "loss": 0.2524, "num_input_tokens_seen": 18288208, "step": 29980 }, { "epoch": 9.30344399627676, "grad_norm": 0.17653195559978485, "learning_rate": 6.459243106006367e-06, "loss": 0.2265, "num_input_tokens_seen": 18290576, "step": 29985 }, { "epoch": 9.304995345950978, "grad_norm": 0.35090067982673645, "learning_rate": 6.4579481832263946e-06, "loss": 0.2275, "num_input_tokens_seen": 18294832, "step": 29990 }, { "epoch": 9.306546695625194, "grad_norm": 0.36199086904525757, "learning_rate": 6.456653153561555e-06, "loss": 0.2334, "num_input_tokens_seen": 18297584, "step": 29995 }, { "epoch": 9.30809804529941, "grad_norm": 0.575244665145874, "learning_rate": 6.455358017106789e-06, "loss": 0.23, "num_input_tokens_seen": 18301424, "step": 30000 }, { "epoch": 9.309649394973627, "grad_norm": 0.2379070371389389, "learning_rate": 6.454062773957048e-06, "loss": 0.2307, "num_input_tokens_seen": 18303824, "step": 30005 }, { "epoch": 9.311200744647843, "grad_norm": 0.27801313996315, "learning_rate": 6.452767424207287e-06, "loss": 0.2274, "num_input_tokens_seen": 18306576, "step": 30010 }, { "epoch": 9.312752094322061, "grad_norm": 0.2349725216627121, "learning_rate": 6.4514719679524695e-06, "loss": 0.2316, "num_input_tokens_seen": 18309552, "step": 30015 }, { "epoch": 9.314303443996277, "grad_norm": 0.1728503704071045, "learning_rate": 6.450176405287569e-06, "loss": 0.2289, "num_input_tokens_seen": 18312944, "step": 30020 }, { "epoch": 9.315854793670493, "grad_norm": 0.27502870559692383, "learning_rate": 6.448880736307566e-06, "loss": 0.2336, "num_input_tokens_seen": 18316016, "step": 30025 }, { "epoch": 9.31740614334471, "grad_norm": 0.5197232365608215, "learning_rate": 6.447584961107447e-06, "loss": 0.2267, "num_input_tokens_seen": 18318672, "step": 30030 }, { "epoch": 9.318957493018926, "grad_norm": 0.3814423382282257, "learning_rate": 6.446289079782209e-06, "loss": 0.2289, "num_input_tokens_seen": 18321616, "step": 30035 }, { "epoch": 9.320508842693144, "grad_norm": 0.3197024166584015, "learning_rate": 6.444993092426852e-06, "loss": 0.2218, "num_input_tokens_seen": 18324272, "step": 30040 }, { "epoch": 9.32206019236736, "grad_norm": 0.3494347035884857, "learning_rate": 6.443696999136393e-06, "loss": 0.244, "num_input_tokens_seen": 18327408, "step": 30045 }, { "epoch": 9.323611542041576, "grad_norm": 0.4768792390823364, "learning_rate": 6.442400800005848e-06, "loss": 0.2389, "num_input_tokens_seen": 18330672, "step": 30050 }, { "epoch": 9.325162891715793, "grad_norm": 0.32143595814704895, "learning_rate": 6.441104495130242e-06, "loss": 0.229, "num_input_tokens_seen": 18333584, "step": 30055 }, { "epoch": 9.32671424139001, "grad_norm": 0.34818094968795776, "learning_rate": 6.439808084604613e-06, "loss": 0.2377, "num_input_tokens_seen": 18338064, "step": 30060 }, { "epoch": 9.328265591064225, "grad_norm": 0.2110781967639923, "learning_rate": 6.4385115685240016e-06, "loss": 0.2222, "num_input_tokens_seen": 18340304, "step": 30065 }, { "epoch": 9.329816940738443, "grad_norm": 0.33997946977615356, "learning_rate": 6.437214946983457e-06, "loss": 0.2259, "num_input_tokens_seen": 18342864, "step": 30070 }, { "epoch": 9.331368290412659, "grad_norm": 0.40381836891174316, "learning_rate": 6.435918220078038e-06, "loss": 0.2315, "num_input_tokens_seen": 18348624, "step": 30075 }, { "epoch": 9.332919640086876, "grad_norm": 0.3514193594455719, "learning_rate": 6.434621387902808e-06, "loss": 0.2261, "num_input_tokens_seen": 18351760, "step": 30080 }, { "epoch": 9.334470989761092, "grad_norm": 0.3340289890766144, "learning_rate": 6.433324450552845e-06, "loss": 0.2302, "num_input_tokens_seen": 18354096, "step": 30085 }, { "epoch": 9.336022339435308, "grad_norm": 0.29361364245414734, "learning_rate": 6.432027408123224e-06, "loss": 0.24, "num_input_tokens_seen": 18357392, "step": 30090 }, { "epoch": 9.337573689109526, "grad_norm": 0.2421642243862152, "learning_rate": 6.430730260709037e-06, "loss": 0.232, "num_input_tokens_seen": 18360432, "step": 30095 }, { "epoch": 9.339125038783742, "grad_norm": 0.33244210481643677, "learning_rate": 6.429433008405379e-06, "loss": 0.2192, "num_input_tokens_seen": 18363216, "step": 30100 }, { "epoch": 9.340676388457958, "grad_norm": 0.49383780360221863, "learning_rate": 6.4281356513073545e-06, "loss": 0.2349, "num_input_tokens_seen": 18365840, "step": 30105 }, { "epoch": 9.342227738132175, "grad_norm": 0.40890270471572876, "learning_rate": 6.426838189510076e-06, "loss": 0.2398, "num_input_tokens_seen": 18368816, "step": 30110 }, { "epoch": 9.343779087806391, "grad_norm": 0.23336248099803925, "learning_rate": 6.425540623108662e-06, "loss": 0.2322, "num_input_tokens_seen": 18371024, "step": 30115 }, { "epoch": 9.345330437480609, "grad_norm": 0.3542748689651489, "learning_rate": 6.424242952198239e-06, "loss": 0.215, "num_input_tokens_seen": 18374256, "step": 30120 }, { "epoch": 9.346881787154825, "grad_norm": 0.2827928066253662, "learning_rate": 6.422945176873942e-06, "loss": 0.2422, "num_input_tokens_seen": 18377136, "step": 30125 }, { "epoch": 9.34843313682904, "grad_norm": 0.4026448428630829, "learning_rate": 6.421647297230913e-06, "loss": 0.2312, "num_input_tokens_seen": 18380496, "step": 30130 }, { "epoch": 9.349984486503258, "grad_norm": 0.4354228973388672, "learning_rate": 6.420349313364303e-06, "loss": 0.2265, "num_input_tokens_seen": 18383344, "step": 30135 }, { "epoch": 9.351535836177474, "grad_norm": 0.3620607256889343, "learning_rate": 6.419051225369268e-06, "loss": 0.2359, "num_input_tokens_seen": 18385584, "step": 30140 }, { "epoch": 9.353087185851692, "grad_norm": 0.37809717655181885, "learning_rate": 6.417753033340974e-06, "loss": 0.2375, "num_input_tokens_seen": 18389456, "step": 30145 }, { "epoch": 9.354638535525908, "grad_norm": 0.2911559045314789, "learning_rate": 6.416454737374595e-06, "loss": 0.243, "num_input_tokens_seen": 18392272, "step": 30150 }, { "epoch": 9.356189885200124, "grad_norm": 0.3593146502971649, "learning_rate": 6.41515633756531e-06, "loss": 0.2426, "num_input_tokens_seen": 18396432, "step": 30155 }, { "epoch": 9.357741234874341, "grad_norm": 0.3611564040184021, "learning_rate": 6.413857834008307e-06, "loss": 0.2336, "num_input_tokens_seen": 18399472, "step": 30160 }, { "epoch": 9.359292584548557, "grad_norm": 0.40889865159988403, "learning_rate": 6.412559226798782e-06, "loss": 0.2217, "num_input_tokens_seen": 18402576, "step": 30165 }, { "epoch": 9.360843934222775, "grad_norm": 0.33610203862190247, "learning_rate": 6.411260516031939e-06, "loss": 0.2201, "num_input_tokens_seen": 18405744, "step": 30170 }, { "epoch": 9.36239528389699, "grad_norm": 0.35789254307746887, "learning_rate": 6.409961701802988e-06, "loss": 0.2321, "num_input_tokens_seen": 18409392, "step": 30175 }, { "epoch": 9.363946633571206, "grad_norm": 0.30151793360710144, "learning_rate": 6.408662784207149e-06, "loss": 0.2338, "num_input_tokens_seen": 18412208, "step": 30180 }, { "epoch": 9.365497983245424, "grad_norm": 0.4597570598125458, "learning_rate": 6.407363763339646e-06, "loss": 0.233, "num_input_tokens_seen": 18414768, "step": 30185 }, { "epoch": 9.36704933291964, "grad_norm": 0.2361305207014084, "learning_rate": 6.4060646392957124e-06, "loss": 0.2251, "num_input_tokens_seen": 18417488, "step": 30190 }, { "epoch": 9.368600682593856, "grad_norm": 0.31501439213752747, "learning_rate": 6.404765412170593e-06, "loss": 0.2274, "num_input_tokens_seen": 18420016, "step": 30195 }, { "epoch": 9.370152032268074, "grad_norm": 0.6198977828025818, "learning_rate": 6.403466082059533e-06, "loss": 0.2351, "num_input_tokens_seen": 18423056, "step": 30200 }, { "epoch": 9.37170338194229, "grad_norm": 0.3825250566005707, "learning_rate": 6.402166649057788e-06, "loss": 0.2341, "num_input_tokens_seen": 18426960, "step": 30205 }, { "epoch": 9.373254731616507, "grad_norm": 0.32568877935409546, "learning_rate": 6.400867113260625e-06, "loss": 0.233, "num_input_tokens_seen": 18430352, "step": 30210 }, { "epoch": 9.374806081290723, "grad_norm": 0.21453239023685455, "learning_rate": 6.399567474763314e-06, "loss": 0.2404, "num_input_tokens_seen": 18435152, "step": 30215 }, { "epoch": 9.376357430964939, "grad_norm": 0.2595535218715668, "learning_rate": 6.398267733661133e-06, "loss": 0.2328, "num_input_tokens_seen": 18437456, "step": 30220 }, { "epoch": 9.377908780639157, "grad_norm": 0.3099747896194458, "learning_rate": 6.39696789004937e-06, "loss": 0.2281, "num_input_tokens_seen": 18439600, "step": 30225 }, { "epoch": 9.379460130313372, "grad_norm": 0.2717802822589874, "learning_rate": 6.395667944023318e-06, "loss": 0.2359, "num_input_tokens_seen": 18442000, "step": 30230 }, { "epoch": 9.381011479987588, "grad_norm": 0.28838062286376953, "learning_rate": 6.394367895678277e-06, "loss": 0.2267, "num_input_tokens_seen": 18445360, "step": 30235 }, { "epoch": 9.382562829661806, "grad_norm": 0.2624318599700928, "learning_rate": 6.393067745109558e-06, "loss": 0.2298, "num_input_tokens_seen": 18449136, "step": 30240 }, { "epoch": 9.384114179336022, "grad_norm": 0.3451862335205078, "learning_rate": 6.3917674924124785e-06, "loss": 0.2245, "num_input_tokens_seen": 18454896, "step": 30245 }, { "epoch": 9.38566552901024, "grad_norm": 0.3038361072540283, "learning_rate": 6.390467137682359e-06, "loss": 0.2356, "num_input_tokens_seen": 18458160, "step": 30250 }, { "epoch": 9.387216878684455, "grad_norm": 0.32184767723083496, "learning_rate": 6.389166681014534e-06, "loss": 0.2205, "num_input_tokens_seen": 18461520, "step": 30255 }, { "epoch": 9.388768228358671, "grad_norm": 0.2175295054912567, "learning_rate": 6.38786612250434e-06, "loss": 0.2333, "num_input_tokens_seen": 18464560, "step": 30260 }, { "epoch": 9.390319578032889, "grad_norm": 0.3742258846759796, "learning_rate": 6.3865654622471255e-06, "loss": 0.2346, "num_input_tokens_seen": 18467152, "step": 30265 }, { "epoch": 9.391870927707105, "grad_norm": 0.4492568373680115, "learning_rate": 6.385264700338243e-06, "loss": 0.2261, "num_input_tokens_seen": 18470800, "step": 30270 }, { "epoch": 9.393422277381323, "grad_norm": 0.41456514596939087, "learning_rate": 6.383963836873054e-06, "loss": 0.2356, "num_input_tokens_seen": 18473328, "step": 30275 }, { "epoch": 9.394973627055538, "grad_norm": 0.29873818159103394, "learning_rate": 6.3826628719469265e-06, "loss": 0.227, "num_input_tokens_seen": 18476336, "step": 30280 }, { "epoch": 9.396524976729754, "grad_norm": 0.32048723101615906, "learning_rate": 6.381361805655237e-06, "loss": 0.222, "num_input_tokens_seen": 18479888, "step": 30285 }, { "epoch": 9.398076326403972, "grad_norm": 0.33443546295166016, "learning_rate": 6.380060638093371e-06, "loss": 0.2283, "num_input_tokens_seen": 18483664, "step": 30290 }, { "epoch": 9.399627676078188, "grad_norm": 0.42001232504844666, "learning_rate": 6.378759369356716e-06, "loss": 0.237, "num_input_tokens_seen": 18486512, "step": 30295 }, { "epoch": 9.401179025752405, "grad_norm": 0.29118525981903076, "learning_rate": 6.377457999540672e-06, "loss": 0.2292, "num_input_tokens_seen": 18489648, "step": 30300 }, { "epoch": 9.402730375426621, "grad_norm": 0.38666507601737976, "learning_rate": 6.376156528740648e-06, "loss": 0.2429, "num_input_tokens_seen": 18492880, "step": 30305 }, { "epoch": 9.404281725100837, "grad_norm": 0.4566086530685425, "learning_rate": 6.3748549570520524e-06, "loss": 0.2314, "num_input_tokens_seen": 18495824, "step": 30310 }, { "epoch": 9.405833074775055, "grad_norm": 0.40079978108406067, "learning_rate": 6.373553284570309e-06, "loss": 0.2325, "num_input_tokens_seen": 18498416, "step": 30315 }, { "epoch": 9.40738442444927, "grad_norm": 0.378909707069397, "learning_rate": 6.372251511390842e-06, "loss": 0.2216, "num_input_tokens_seen": 18501680, "step": 30320 }, { "epoch": 9.408935774123487, "grad_norm": 0.39149221777915955, "learning_rate": 6.370949637609092e-06, "loss": 0.2294, "num_input_tokens_seen": 18504048, "step": 30325 }, { "epoch": 9.410487123797704, "grad_norm": 0.16552165150642395, "learning_rate": 6.3696476633204975e-06, "loss": 0.22, "num_input_tokens_seen": 18506832, "step": 30330 }, { "epoch": 9.41203847347192, "grad_norm": 0.45178335905075073, "learning_rate": 6.36834558862051e-06, "loss": 0.2406, "num_input_tokens_seen": 18509584, "step": 30335 }, { "epoch": 9.413589823146138, "grad_norm": 0.33155009150505066, "learning_rate": 6.3670434136045876e-06, "loss": 0.2243, "num_input_tokens_seen": 18512848, "step": 30340 }, { "epoch": 9.415141172820354, "grad_norm": 0.23004139959812164, "learning_rate": 6.365741138368196e-06, "loss": 0.2302, "num_input_tokens_seen": 18515504, "step": 30345 }, { "epoch": 9.41669252249457, "grad_norm": 0.3492445945739746, "learning_rate": 6.364438763006805e-06, "loss": 0.2212, "num_input_tokens_seen": 18518832, "step": 30350 }, { "epoch": 9.418243872168787, "grad_norm": 0.4525054693222046, "learning_rate": 6.363136287615894e-06, "loss": 0.2335, "num_input_tokens_seen": 18521552, "step": 30355 }, { "epoch": 9.419795221843003, "grad_norm": 0.33309659361839294, "learning_rate": 6.361833712290953e-06, "loss": 0.2239, "num_input_tokens_seen": 18524784, "step": 30360 }, { "epoch": 9.421346571517219, "grad_norm": 0.36217960715293884, "learning_rate": 6.360531037127474e-06, "loss": 0.2317, "num_input_tokens_seen": 18528400, "step": 30365 }, { "epoch": 9.422897921191437, "grad_norm": 0.2998080253601074, "learning_rate": 6.359228262220959e-06, "loss": 0.2274, "num_input_tokens_seen": 18530704, "step": 30370 }, { "epoch": 9.424449270865653, "grad_norm": 0.3645622730255127, "learning_rate": 6.357925387666917e-06, "loss": 0.2333, "num_input_tokens_seen": 18534224, "step": 30375 }, { "epoch": 9.42600062053987, "grad_norm": 0.3702489733695984, "learning_rate": 6.356622413560863e-06, "loss": 0.2241, "num_input_tokens_seen": 18537136, "step": 30380 }, { "epoch": 9.427551970214086, "grad_norm": 0.33780235052108765, "learning_rate": 6.355319339998322e-06, "loss": 0.2337, "num_input_tokens_seen": 18540720, "step": 30385 }, { "epoch": 9.429103319888302, "grad_norm": 0.2552822530269623, "learning_rate": 6.354016167074824e-06, "loss": 0.2302, "num_input_tokens_seen": 18542928, "step": 30390 }, { "epoch": 9.43065466956252, "grad_norm": 0.19718694686889648, "learning_rate": 6.352712894885906e-06, "loss": 0.2265, "num_input_tokens_seen": 18545264, "step": 30395 }, { "epoch": 9.432206019236736, "grad_norm": 0.39689111709594727, "learning_rate": 6.351409523527115e-06, "loss": 0.2338, "num_input_tokens_seen": 18549232, "step": 30400 }, { "epoch": 9.433757368910953, "grad_norm": 0.41931208968162537, "learning_rate": 6.350106053094004e-06, "loss": 0.2338, "num_input_tokens_seen": 18552272, "step": 30405 }, { "epoch": 9.43530871858517, "grad_norm": 0.4349074959754944, "learning_rate": 6.348802483682131e-06, "loss": 0.2334, "num_input_tokens_seen": 18555344, "step": 30410 }, { "epoch": 9.436860068259385, "grad_norm": 0.4478464126586914, "learning_rate": 6.347498815387061e-06, "loss": 0.2296, "num_input_tokens_seen": 18558448, "step": 30415 }, { "epoch": 9.438411417933603, "grad_norm": 0.3333100974559784, "learning_rate": 6.346195048304373e-06, "loss": 0.2259, "num_input_tokens_seen": 18561904, "step": 30420 }, { "epoch": 9.439962767607819, "grad_norm": 0.26950693130493164, "learning_rate": 6.344891182529648e-06, "loss": 0.2303, "num_input_tokens_seen": 18566576, "step": 30425 }, { "epoch": 9.441514117282036, "grad_norm": 0.28857624530792236, "learning_rate": 6.343587218158472e-06, "loss": 0.2333, "num_input_tokens_seen": 18569680, "step": 30430 }, { "epoch": 9.443065466956252, "grad_norm": 0.2719840109348297, "learning_rate": 6.342283155286443e-06, "loss": 0.2236, "num_input_tokens_seen": 18572272, "step": 30435 }, { "epoch": 9.444616816630468, "grad_norm": 0.3596717119216919, "learning_rate": 6.340978994009164e-06, "loss": 0.2272, "num_input_tokens_seen": 18575408, "step": 30440 }, { "epoch": 9.446168166304686, "grad_norm": 0.3366219103336334, "learning_rate": 6.339674734422244e-06, "loss": 0.2256, "num_input_tokens_seen": 18579600, "step": 30445 }, { "epoch": 9.447719515978902, "grad_norm": 0.38990575075149536, "learning_rate": 6.338370376621303e-06, "loss": 0.2318, "num_input_tokens_seen": 18582320, "step": 30450 }, { "epoch": 9.449270865653117, "grad_norm": 0.24457469582557678, "learning_rate": 6.337065920701963e-06, "loss": 0.2235, "num_input_tokens_seen": 18585296, "step": 30455 }, { "epoch": 9.450822215327335, "grad_norm": 0.27584734559059143, "learning_rate": 6.335761366759859e-06, "loss": 0.2175, "num_input_tokens_seen": 18587472, "step": 30460 }, { "epoch": 9.452373565001551, "grad_norm": 0.37683629989624023, "learning_rate": 6.3344567148906284e-06, "loss": 0.2233, "num_input_tokens_seen": 18591696, "step": 30465 }, { "epoch": 9.453924914675769, "grad_norm": 0.482959121465683, "learning_rate": 6.33315196518992e-06, "loss": 0.2262, "num_input_tokens_seen": 18594608, "step": 30470 }, { "epoch": 9.455476264349985, "grad_norm": 0.31113389134407043, "learning_rate": 6.331847117753385e-06, "loss": 0.2342, "num_input_tokens_seen": 18597424, "step": 30475 }, { "epoch": 9.4570276140242, "grad_norm": 0.35403549671173096, "learning_rate": 6.3305421726766835e-06, "loss": 0.2357, "num_input_tokens_seen": 18600016, "step": 30480 }, { "epoch": 9.458578963698418, "grad_norm": 0.33919480443000793, "learning_rate": 6.329237130055487e-06, "loss": 0.2247, "num_input_tokens_seen": 18602832, "step": 30485 }, { "epoch": 9.460130313372634, "grad_norm": 0.39347147941589355, "learning_rate": 6.327931989985465e-06, "loss": 0.2263, "num_input_tokens_seen": 18606288, "step": 30490 }, { "epoch": 9.46168166304685, "grad_norm": 0.2612588405609131, "learning_rate": 6.3266267525623055e-06, "loss": 0.2302, "num_input_tokens_seen": 18608912, "step": 30495 }, { "epoch": 9.463233012721068, "grad_norm": 0.21253931522369385, "learning_rate": 6.3253214178816955e-06, "loss": 0.2281, "num_input_tokens_seen": 18612656, "step": 30500 }, { "epoch": 9.464784362395283, "grad_norm": 0.37612006068229675, "learning_rate": 6.324015986039331e-06, "loss": 0.2263, "num_input_tokens_seen": 18614864, "step": 30505 }, { "epoch": 9.466335712069501, "grad_norm": 0.34915226697921753, "learning_rate": 6.322710457130916e-06, "loss": 0.2295, "num_input_tokens_seen": 18617968, "step": 30510 }, { "epoch": 9.467887061743717, "grad_norm": 0.3291149139404297, "learning_rate": 6.3214048312521604e-06, "loss": 0.2285, "num_input_tokens_seen": 18620176, "step": 30515 }, { "epoch": 9.469438411417933, "grad_norm": 0.37582385540008545, "learning_rate": 6.320099108498785e-06, "loss": 0.2315, "num_input_tokens_seen": 18622640, "step": 30520 }, { "epoch": 9.47098976109215, "grad_norm": 0.3913080394268036, "learning_rate": 6.31879328896651e-06, "loss": 0.2213, "num_input_tokens_seen": 18625072, "step": 30525 }, { "epoch": 9.472541110766366, "grad_norm": 0.4061570465564728, "learning_rate": 6.317487372751072e-06, "loss": 0.2276, "num_input_tokens_seen": 18627760, "step": 30530 }, { "epoch": 9.474092460440584, "grad_norm": 0.45623189210891724, "learning_rate": 6.316181359948208e-06, "loss": 0.2296, "num_input_tokens_seen": 18630384, "step": 30535 }, { "epoch": 9.4756438101148, "grad_norm": 0.5471447110176086, "learning_rate": 6.314875250653665e-06, "loss": 0.2293, "num_input_tokens_seen": 18633456, "step": 30540 }, { "epoch": 9.477195159789016, "grad_norm": 0.5831302404403687, "learning_rate": 6.313569044963193e-06, "loss": 0.2303, "num_input_tokens_seen": 18636368, "step": 30545 }, { "epoch": 9.478746509463233, "grad_norm": 0.5409619808197021, "learning_rate": 6.312262742972557e-06, "loss": 0.2383, "num_input_tokens_seen": 18639984, "step": 30550 }, { "epoch": 9.48029785913745, "grad_norm": 0.25451648235321045, "learning_rate": 6.3109563447775214e-06, "loss": 0.2291, "num_input_tokens_seen": 18642224, "step": 30555 }, { "epoch": 9.481849208811667, "grad_norm": 0.38426506519317627, "learning_rate": 6.309649850473862e-06, "loss": 0.2129, "num_input_tokens_seen": 18645040, "step": 30560 }, { "epoch": 9.483400558485883, "grad_norm": 0.3003656268119812, "learning_rate": 6.308343260157361e-06, "loss": 0.2338, "num_input_tokens_seen": 18647664, "step": 30565 }, { "epoch": 9.484951908160099, "grad_norm": 0.4493451714515686, "learning_rate": 6.307036573923806e-06, "loss": 0.2441, "num_input_tokens_seen": 18650512, "step": 30570 }, { "epoch": 9.486503257834316, "grad_norm": 0.38169029355049133, "learning_rate": 6.305729791868992e-06, "loss": 0.2354, "num_input_tokens_seen": 18652720, "step": 30575 }, { "epoch": 9.488054607508532, "grad_norm": 0.27200645208358765, "learning_rate": 6.304422914088722e-06, "loss": 0.2266, "num_input_tokens_seen": 18655696, "step": 30580 }, { "epoch": 9.489605957182748, "grad_norm": 0.6015222072601318, "learning_rate": 6.303115940678806e-06, "loss": 0.2344, "num_input_tokens_seen": 18660016, "step": 30585 }, { "epoch": 9.491157306856966, "grad_norm": 0.3049349784851074, "learning_rate": 6.30180887173506e-06, "loss": 0.2335, "num_input_tokens_seen": 18662800, "step": 30590 }, { "epoch": 9.492708656531182, "grad_norm": 0.4567252993583679, "learning_rate": 6.300501707353308e-06, "loss": 0.2239, "num_input_tokens_seen": 18665808, "step": 30595 }, { "epoch": 9.4942600062054, "grad_norm": 0.21836307644844055, "learning_rate": 6.29919444762938e-06, "loss": 0.2277, "num_input_tokens_seen": 18668688, "step": 30600 }, { "epoch": 9.495811355879615, "grad_norm": 0.41131338477134705, "learning_rate": 6.297887092659115e-06, "loss": 0.2263, "num_input_tokens_seen": 18671696, "step": 30605 }, { "epoch": 9.497362705553831, "grad_norm": 0.3033078908920288, "learning_rate": 6.296579642538356e-06, "loss": 0.2331, "num_input_tokens_seen": 18673904, "step": 30610 }, { "epoch": 9.498914055228049, "grad_norm": 0.42173781991004944, "learning_rate": 6.295272097362956e-06, "loss": 0.2348, "num_input_tokens_seen": 18676656, "step": 30615 }, { "epoch": 9.500465404902265, "grad_norm": 0.37909746170043945, "learning_rate": 6.293964457228773e-06, "loss": 0.2311, "num_input_tokens_seen": 18679600, "step": 30620 }, { "epoch": 9.50201675457648, "grad_norm": 0.27889153361320496, "learning_rate": 6.292656722231672e-06, "loss": 0.2234, "num_input_tokens_seen": 18683120, "step": 30625 }, { "epoch": 9.503568104250698, "grad_norm": 0.319750040769577, "learning_rate": 6.291348892467527e-06, "loss": 0.2356, "num_input_tokens_seen": 18686000, "step": 30630 }, { "epoch": 9.505119453924914, "grad_norm": 0.3297862410545349, "learning_rate": 6.290040968032216e-06, "loss": 0.2236, "num_input_tokens_seen": 18688944, "step": 30635 }, { "epoch": 9.506670803599132, "grad_norm": 0.27363672852516174, "learning_rate": 6.288732949021625e-06, "loss": 0.2298, "num_input_tokens_seen": 18692304, "step": 30640 }, { "epoch": 9.508222153273348, "grad_norm": 0.3162922263145447, "learning_rate": 6.287424835531648e-06, "loss": 0.2207, "num_input_tokens_seen": 18695792, "step": 30645 }, { "epoch": 9.509773502947564, "grad_norm": 0.4277004897594452, "learning_rate": 6.286116627658185e-06, "loss": 0.2291, "num_input_tokens_seen": 18698896, "step": 30650 }, { "epoch": 9.511324852621781, "grad_norm": 0.5516321063041687, "learning_rate": 6.284808325497145e-06, "loss": 0.2366, "num_input_tokens_seen": 18701296, "step": 30655 }, { "epoch": 9.512876202295997, "grad_norm": 0.3079093396663666, "learning_rate": 6.283499929144439e-06, "loss": 0.2371, "num_input_tokens_seen": 18703888, "step": 30660 }, { "epoch": 9.514427551970215, "grad_norm": 0.4337453842163086, "learning_rate": 6.2821914386959894e-06, "loss": 0.2391, "num_input_tokens_seen": 18706800, "step": 30665 }, { "epoch": 9.51597890164443, "grad_norm": 0.2801889181137085, "learning_rate": 6.2808828542477225e-06, "loss": 0.2343, "num_input_tokens_seen": 18709776, "step": 30670 }, { "epoch": 9.517530251318647, "grad_norm": 0.35822421312332153, "learning_rate": 6.279574175895576e-06, "loss": 0.2274, "num_input_tokens_seen": 18712784, "step": 30675 }, { "epoch": 9.519081600992864, "grad_norm": 0.2665993273258209, "learning_rate": 6.278265403735491e-06, "loss": 0.2315, "num_input_tokens_seen": 18715920, "step": 30680 }, { "epoch": 9.52063295066708, "grad_norm": 0.362554132938385, "learning_rate": 6.276956537863413e-06, "loss": 0.221, "num_input_tokens_seen": 18718448, "step": 30685 }, { "epoch": 9.522184300341298, "grad_norm": 0.43241867423057556, "learning_rate": 6.2756475783753e-06, "loss": 0.2251, "num_input_tokens_seen": 18722128, "step": 30690 }, { "epoch": 9.523735650015514, "grad_norm": 0.4574381709098816, "learning_rate": 6.274338525367114e-06, "loss": 0.2185, "num_input_tokens_seen": 18725776, "step": 30695 }, { "epoch": 9.52528699968973, "grad_norm": 0.32918068766593933, "learning_rate": 6.273029378934824e-06, "loss": 0.2361, "num_input_tokens_seen": 18728368, "step": 30700 }, { "epoch": 9.526838349363947, "grad_norm": 0.28180432319641113, "learning_rate": 6.271720139174404e-06, "loss": 0.225, "num_input_tokens_seen": 18731216, "step": 30705 }, { "epoch": 9.528389699038163, "grad_norm": 0.4389680325984955, "learning_rate": 6.27041080618184e-06, "loss": 0.2276, "num_input_tokens_seen": 18734672, "step": 30710 }, { "epoch": 9.529941048712379, "grad_norm": 0.3230978548526764, "learning_rate": 6.26910138005312e-06, "loss": 0.233, "num_input_tokens_seen": 18736944, "step": 30715 }, { "epoch": 9.531492398386597, "grad_norm": 0.22690366208553314, "learning_rate": 6.267791860884241e-06, "loss": 0.2167, "num_input_tokens_seen": 18739600, "step": 30720 }, { "epoch": 9.533043748060813, "grad_norm": 0.5272845029830933, "learning_rate": 6.266482248771203e-06, "loss": 0.2248, "num_input_tokens_seen": 18741968, "step": 30725 }, { "epoch": 9.53459509773503, "grad_norm": 0.5521706938743591, "learning_rate": 6.26517254381002e-06, "loss": 0.2314, "num_input_tokens_seen": 18745520, "step": 30730 }, { "epoch": 9.536146447409246, "grad_norm": 0.2751433849334717, "learning_rate": 6.263862746096707e-06, "loss": 0.2395, "num_input_tokens_seen": 18747920, "step": 30735 }, { "epoch": 9.537697797083462, "grad_norm": 0.41712114214897156, "learning_rate": 6.262552855727289e-06, "loss": 0.2177, "num_input_tokens_seen": 18750736, "step": 30740 }, { "epoch": 9.53924914675768, "grad_norm": 0.6407169699668884, "learning_rate": 6.261242872797796e-06, "loss": 0.2243, "num_input_tokens_seen": 18753552, "step": 30745 }, { "epoch": 9.540800496431896, "grad_norm": 0.4220219552516937, "learning_rate": 6.259932797404265e-06, "loss": 0.2265, "num_input_tokens_seen": 18756656, "step": 30750 }, { "epoch": 9.542351846106111, "grad_norm": 0.4297489523887634, "learning_rate": 6.258622629642741e-06, "loss": 0.2258, "num_input_tokens_seen": 18759280, "step": 30755 }, { "epoch": 9.543903195780329, "grad_norm": 0.5004351139068604, "learning_rate": 6.257312369609273e-06, "loss": 0.2268, "num_input_tokens_seen": 18762736, "step": 30760 }, { "epoch": 9.545454545454545, "grad_norm": 0.5632019639015198, "learning_rate": 6.256002017399921e-06, "loss": 0.2262, "num_input_tokens_seen": 18765424, "step": 30765 }, { "epoch": 9.547005895128763, "grad_norm": 0.5749183297157288, "learning_rate": 6.254691573110745e-06, "loss": 0.2384, "num_input_tokens_seen": 18768368, "step": 30770 }, { "epoch": 9.548557244802979, "grad_norm": 0.5883527398109436, "learning_rate": 6.253381036837821e-06, "loss": 0.2185, "num_input_tokens_seen": 18771312, "step": 30775 }, { "epoch": 9.550108594477194, "grad_norm": 0.7503089904785156, "learning_rate": 6.252070408677224e-06, "loss": 0.2189, "num_input_tokens_seen": 18774224, "step": 30780 }, { "epoch": 9.551659944151412, "grad_norm": 0.41305577754974365, "learning_rate": 6.25075968872504e-06, "loss": 0.2249, "num_input_tokens_seen": 18776624, "step": 30785 }, { "epoch": 9.553211293825628, "grad_norm": 0.5909034609794617, "learning_rate": 6.249448877077359e-06, "loss": 0.2227, "num_input_tokens_seen": 18779728, "step": 30790 }, { "epoch": 9.554762643499846, "grad_norm": 0.5361271500587463, "learning_rate": 6.2481379738302794e-06, "loss": 0.2404, "num_input_tokens_seen": 18782256, "step": 30795 }, { "epoch": 9.556313993174061, "grad_norm": 0.5358310341835022, "learning_rate": 6.246826979079907e-06, "loss": 0.2325, "num_input_tokens_seen": 18784784, "step": 30800 }, { "epoch": 9.557865342848277, "grad_norm": 0.9128548502922058, "learning_rate": 6.245515892922351e-06, "loss": 0.2259, "num_input_tokens_seen": 18787856, "step": 30805 }, { "epoch": 9.559416692522495, "grad_norm": 0.8100122809410095, "learning_rate": 6.244204715453732e-06, "loss": 0.2345, "num_input_tokens_seen": 18790416, "step": 30810 }, { "epoch": 9.560968042196711, "grad_norm": 0.7341654896736145, "learning_rate": 6.242893446770173e-06, "loss": 0.2156, "num_input_tokens_seen": 18793296, "step": 30815 }, { "epoch": 9.562519391870929, "grad_norm": 1.1797151565551758, "learning_rate": 6.241582086967808e-06, "loss": 0.2289, "num_input_tokens_seen": 18796112, "step": 30820 }, { "epoch": 9.564070741545144, "grad_norm": 1.4433971643447876, "learning_rate": 6.2402706361427725e-06, "loss": 0.2431, "num_input_tokens_seen": 18799344, "step": 30825 }, { "epoch": 9.56562209121936, "grad_norm": 0.5673738121986389, "learning_rate": 6.2389590943912114e-06, "loss": 0.2199, "num_input_tokens_seen": 18801968, "step": 30830 }, { "epoch": 9.567173440893578, "grad_norm": 0.7273882031440735, "learning_rate": 6.2376474618092785e-06, "loss": 0.2283, "num_input_tokens_seen": 18804880, "step": 30835 }, { "epoch": 9.568724790567794, "grad_norm": 0.7800704836845398, "learning_rate": 6.23633573849313e-06, "loss": 0.2203, "num_input_tokens_seen": 18808144, "step": 30840 }, { "epoch": 9.57027614024201, "grad_norm": 0.632915198802948, "learning_rate": 6.235023924538932e-06, "loss": 0.2412, "num_input_tokens_seen": 18811184, "step": 30845 }, { "epoch": 9.571827489916227, "grad_norm": 0.7703281044960022, "learning_rate": 6.233712020042856e-06, "loss": 0.2303, "num_input_tokens_seen": 18815280, "step": 30850 }, { "epoch": 9.573378839590443, "grad_norm": 1.220409631729126, "learning_rate": 6.232400025101078e-06, "loss": 0.245, "num_input_tokens_seen": 18818544, "step": 30855 }, { "epoch": 9.574930189264661, "grad_norm": 1.2441213130950928, "learning_rate": 6.2310879398097854e-06, "loss": 0.2408, "num_input_tokens_seen": 18821616, "step": 30860 }, { "epoch": 9.576481538938877, "grad_norm": 0.5124188661575317, "learning_rate": 6.229775764265167e-06, "loss": 0.2243, "num_input_tokens_seen": 18824624, "step": 30865 }, { "epoch": 9.578032888613093, "grad_norm": 1.5023316144943237, "learning_rate": 6.228463498563424e-06, "loss": 0.2199, "num_input_tokens_seen": 18827536, "step": 30870 }, { "epoch": 9.57958423828731, "grad_norm": 0.8962253332138062, "learning_rate": 6.227151142800759e-06, "loss": 0.231, "num_input_tokens_seen": 18830672, "step": 30875 }, { "epoch": 9.581135587961526, "grad_norm": 0.9638836979866028, "learning_rate": 6.225838697073384e-06, "loss": 0.2243, "num_input_tokens_seen": 18833552, "step": 30880 }, { "epoch": 9.582686937635742, "grad_norm": 0.5732321739196777, "learning_rate": 6.2245261614775155e-06, "loss": 0.2237, "num_input_tokens_seen": 18837232, "step": 30885 }, { "epoch": 9.58423828730996, "grad_norm": 0.8003467917442322, "learning_rate": 6.223213536109381e-06, "loss": 0.2337, "num_input_tokens_seen": 18840752, "step": 30890 }, { "epoch": 9.585789636984176, "grad_norm": 0.522222101688385, "learning_rate": 6.221900821065206e-06, "loss": 0.2197, "num_input_tokens_seen": 18843568, "step": 30895 }, { "epoch": 9.587340986658393, "grad_norm": 0.813452959060669, "learning_rate": 6.220588016441234e-06, "loss": 0.241, "num_input_tokens_seen": 18846320, "step": 30900 }, { "epoch": 9.58889233633261, "grad_norm": 0.8100584745407104, "learning_rate": 6.219275122333706e-06, "loss": 0.224, "num_input_tokens_seen": 18850064, "step": 30905 }, { "epoch": 9.590443686006825, "grad_norm": 0.9052654504776001, "learning_rate": 6.217962138838872e-06, "loss": 0.215, "num_input_tokens_seen": 18853136, "step": 30910 }, { "epoch": 9.591995035681043, "grad_norm": 0.9082737565040588, "learning_rate": 6.216649066052991e-06, "loss": 0.2328, "num_input_tokens_seen": 18856912, "step": 30915 }, { "epoch": 9.593546385355259, "grad_norm": 0.715690553188324, "learning_rate": 6.215335904072326e-06, "loss": 0.2143, "num_input_tokens_seen": 18859760, "step": 30920 }, { "epoch": 9.595097735029476, "grad_norm": 0.9742071628570557, "learning_rate": 6.214022652993147e-06, "loss": 0.2383, "num_input_tokens_seen": 18863216, "step": 30925 }, { "epoch": 9.596649084703692, "grad_norm": 3.4091668128967285, "learning_rate": 6.2127093129117324e-06, "loss": 0.2137, "num_input_tokens_seen": 18865680, "step": 30930 }, { "epoch": 9.598200434377908, "grad_norm": 0.8498044013977051, "learning_rate": 6.211395883924364e-06, "loss": 0.2236, "num_input_tokens_seen": 18868784, "step": 30935 }, { "epoch": 9.599751784052126, "grad_norm": 0.8483030796051025, "learning_rate": 6.210082366127333e-06, "loss": 0.2217, "num_input_tokens_seen": 18871408, "step": 30940 }, { "epoch": 9.601303133726342, "grad_norm": 1.190388560295105, "learning_rate": 6.2087687596169335e-06, "loss": 0.2273, "num_input_tokens_seen": 18874576, "step": 30945 }, { "epoch": 9.60285448340056, "grad_norm": 1.281536340713501, "learning_rate": 6.2074550644894714e-06, "loss": 0.2176, "num_input_tokens_seen": 18878960, "step": 30950 }, { "epoch": 9.604405833074775, "grad_norm": 1.1702669858932495, "learning_rate": 6.206141280841253e-06, "loss": 0.2335, "num_input_tokens_seen": 18882064, "step": 30955 }, { "epoch": 9.605957182748991, "grad_norm": 0.8997696042060852, "learning_rate": 6.2048274087685975e-06, "loss": 0.2259, "num_input_tokens_seen": 18885328, "step": 30960 }, { "epoch": 9.607508532423209, "grad_norm": 1.7355589866638184, "learning_rate": 6.203513448367826e-06, "loss": 0.2388, "num_input_tokens_seen": 18887664, "step": 30965 }, { "epoch": 9.609059882097425, "grad_norm": 0.8859462738037109, "learning_rate": 6.202199399735266e-06, "loss": 0.2362, "num_input_tokens_seen": 18890512, "step": 30970 }, { "epoch": 9.61061123177164, "grad_norm": 0.6852632761001587, "learning_rate": 6.200885262967254e-06, "loss": 0.2478, "num_input_tokens_seen": 18893936, "step": 30975 }, { "epoch": 9.612162581445858, "grad_norm": 0.7227914929389954, "learning_rate": 6.199571038160132e-06, "loss": 0.2068, "num_input_tokens_seen": 18897232, "step": 30980 }, { "epoch": 9.613713931120074, "grad_norm": 0.49291372299194336, "learning_rate": 6.198256725410247e-06, "loss": 0.2391, "num_input_tokens_seen": 18899920, "step": 30985 }, { "epoch": 9.615265280794292, "grad_norm": 0.9045130610466003, "learning_rate": 6.196942324813955e-06, "loss": 0.2376, "num_input_tokens_seen": 18903024, "step": 30990 }, { "epoch": 9.616816630468508, "grad_norm": 1.0278114080429077, "learning_rate": 6.195627836467616e-06, "loss": 0.2219, "num_input_tokens_seen": 18906320, "step": 30995 }, { "epoch": 9.618367980142724, "grad_norm": 0.6526980996131897, "learning_rate": 6.194313260467599e-06, "loss": 0.2252, "num_input_tokens_seen": 18909200, "step": 31000 }, { "epoch": 9.619919329816941, "grad_norm": 0.7543724775314331, "learning_rate": 6.192998596910278e-06, "loss": 0.2291, "num_input_tokens_seen": 18912368, "step": 31005 }, { "epoch": 9.621470679491157, "grad_norm": 0.4033236801624298, "learning_rate": 6.191683845892032e-06, "loss": 0.2407, "num_input_tokens_seen": 18915312, "step": 31010 }, { "epoch": 9.623022029165373, "grad_norm": 0.4356725811958313, "learning_rate": 6.190369007509247e-06, "loss": 0.2219, "num_input_tokens_seen": 18918640, "step": 31015 }, { "epoch": 9.62457337883959, "grad_norm": 0.5024083256721497, "learning_rate": 6.189054081858319e-06, "loss": 0.2214, "num_input_tokens_seen": 18921584, "step": 31020 }, { "epoch": 9.626124728513807, "grad_norm": 0.5379118919372559, "learning_rate": 6.187739069035647e-06, "loss": 0.2326, "num_input_tokens_seen": 18924240, "step": 31025 }, { "epoch": 9.627676078188024, "grad_norm": 1.5430914163589478, "learning_rate": 6.186423969137635e-06, "loss": 0.227, "num_input_tokens_seen": 18927344, "step": 31030 }, { "epoch": 9.62922742786224, "grad_norm": 0.8693633079528809, "learning_rate": 6.185108782260696e-06, "loss": 0.2129, "num_input_tokens_seen": 18930128, "step": 31035 }, { "epoch": 9.630778777536456, "grad_norm": 0.7241694927215576, "learning_rate": 6.183793508501251e-06, "loss": 0.2326, "num_input_tokens_seen": 18932784, "step": 31040 }, { "epoch": 9.632330127210674, "grad_norm": 0.7746721506118774, "learning_rate": 6.1824781479557235e-06, "loss": 0.2445, "num_input_tokens_seen": 18935216, "step": 31045 }, { "epoch": 9.63388147688489, "grad_norm": 0.5804651379585266, "learning_rate": 6.1811627007205455e-06, "loss": 0.2564, "num_input_tokens_seen": 18938288, "step": 31050 }, { "epoch": 9.635432826559107, "grad_norm": 0.693200945854187, "learning_rate": 6.179847166892153e-06, "loss": 0.2431, "num_input_tokens_seen": 18941296, "step": 31055 }, { "epoch": 9.636984176233323, "grad_norm": 0.4949784576892853, "learning_rate": 6.178531546566993e-06, "loss": 0.2265, "num_input_tokens_seen": 18945936, "step": 31060 }, { "epoch": 9.638535525907539, "grad_norm": 0.8992601633071899, "learning_rate": 6.177215839841514e-06, "loss": 0.2533, "num_input_tokens_seen": 18948272, "step": 31065 }, { "epoch": 9.640086875581757, "grad_norm": 0.3373676836490631, "learning_rate": 6.175900046812173e-06, "loss": 0.2246, "num_input_tokens_seen": 18950416, "step": 31070 }, { "epoch": 9.641638225255972, "grad_norm": 0.4286569356918335, "learning_rate": 6.174584167575434e-06, "loss": 0.2276, "num_input_tokens_seen": 18952848, "step": 31075 }, { "epoch": 9.64318957493019, "grad_norm": 0.3345526158809662, "learning_rate": 6.173268202227769e-06, "loss": 0.2233, "num_input_tokens_seen": 18956400, "step": 31080 }, { "epoch": 9.644740924604406, "grad_norm": 0.3931068778038025, "learning_rate": 6.171952150865649e-06, "loss": 0.2309, "num_input_tokens_seen": 18958960, "step": 31085 }, { "epoch": 9.646292274278622, "grad_norm": 0.5764011740684509, "learning_rate": 6.170636013585558e-06, "loss": 0.2363, "num_input_tokens_seen": 18962128, "step": 31090 }, { "epoch": 9.64784362395284, "grad_norm": 0.5084288120269775, "learning_rate": 6.1693197904839865e-06, "loss": 0.2375, "num_input_tokens_seen": 18964624, "step": 31095 }, { "epoch": 9.649394973627055, "grad_norm": 0.15145346522331238, "learning_rate": 6.168003481657427e-06, "loss": 0.2263, "num_input_tokens_seen": 18966832, "step": 31100 }, { "epoch": 9.650946323301271, "grad_norm": 0.5094130635261536, "learning_rate": 6.16668708720238e-06, "loss": 0.2328, "num_input_tokens_seen": 18969552, "step": 31105 }, { "epoch": 9.652497672975489, "grad_norm": 0.28206735849380493, "learning_rate": 6.165370607215354e-06, "loss": 0.2286, "num_input_tokens_seen": 18971984, "step": 31110 }, { "epoch": 9.654049022649705, "grad_norm": 0.24868249893188477, "learning_rate": 6.164054041792861e-06, "loss": 0.2395, "num_input_tokens_seen": 18973936, "step": 31115 }, { "epoch": 9.655600372323923, "grad_norm": 0.6067829728126526, "learning_rate": 6.162737391031426e-06, "loss": 0.2232, "num_input_tokens_seen": 18977360, "step": 31120 }, { "epoch": 9.657151721998138, "grad_norm": 0.29402410984039307, "learning_rate": 6.161420655027569e-06, "loss": 0.2306, "num_input_tokens_seen": 18980368, "step": 31125 }, { "epoch": 9.658703071672354, "grad_norm": 0.4961742162704468, "learning_rate": 6.1601038338778255e-06, "loss": 0.2228, "num_input_tokens_seen": 18985008, "step": 31130 }, { "epoch": 9.660254421346572, "grad_norm": 0.31767094135284424, "learning_rate": 6.1587869276787325e-06, "loss": 0.2382, "num_input_tokens_seen": 18987344, "step": 31135 }, { "epoch": 9.661805771020788, "grad_norm": 0.6301977038383484, "learning_rate": 6.157469936526837e-06, "loss": 0.2302, "num_input_tokens_seen": 18991280, "step": 31140 }, { "epoch": 9.663357120695004, "grad_norm": 0.3614828288555145, "learning_rate": 6.156152860518687e-06, "loss": 0.2287, "num_input_tokens_seen": 18993872, "step": 31145 }, { "epoch": 9.664908470369221, "grad_norm": 0.30482614040374756, "learning_rate": 6.154835699750843e-06, "loss": 0.225, "num_input_tokens_seen": 18996336, "step": 31150 }, { "epoch": 9.666459820043437, "grad_norm": 0.3024557828903198, "learning_rate": 6.153518454319866e-06, "loss": 0.2157, "num_input_tokens_seen": 18998448, "step": 31155 }, { "epoch": 9.668011169717655, "grad_norm": 0.3481554090976715, "learning_rate": 6.152201124322327e-06, "loss": 0.2274, "num_input_tokens_seen": 19002256, "step": 31160 }, { "epoch": 9.66956251939187, "grad_norm": 0.44917529821395874, "learning_rate": 6.150883709854801e-06, "loss": 0.2353, "num_input_tokens_seen": 19005584, "step": 31165 }, { "epoch": 9.671113869066087, "grad_norm": 0.4749499559402466, "learning_rate": 6.149566211013871e-06, "loss": 0.2567, "num_input_tokens_seen": 19008656, "step": 31170 }, { "epoch": 9.672665218740304, "grad_norm": 0.5912392139434814, "learning_rate": 6.148248627896123e-06, "loss": 0.2187, "num_input_tokens_seen": 19011952, "step": 31175 }, { "epoch": 9.67421656841452, "grad_norm": 0.4295589327812195, "learning_rate": 6.146930960598155e-06, "loss": 0.2214, "num_input_tokens_seen": 19015856, "step": 31180 }, { "epoch": 9.675767918088738, "grad_norm": 0.5226146578788757, "learning_rate": 6.145613209216567e-06, "loss": 0.2252, "num_input_tokens_seen": 19018224, "step": 31185 }, { "epoch": 9.677319267762954, "grad_norm": 0.4110593795776367, "learning_rate": 6.144295373847963e-06, "loss": 0.2305, "num_input_tokens_seen": 19021584, "step": 31190 }, { "epoch": 9.67887061743717, "grad_norm": 0.4293605387210846, "learning_rate": 6.142977454588957e-06, "loss": 0.2234, "num_input_tokens_seen": 19025936, "step": 31195 }, { "epoch": 9.680421967111387, "grad_norm": 0.3853223919868469, "learning_rate": 6.1416594515361706e-06, "loss": 0.2293, "num_input_tokens_seen": 19030064, "step": 31200 }, { "epoch": 9.681973316785603, "grad_norm": 0.3214711546897888, "learning_rate": 6.140341364786226e-06, "loss": 0.2203, "num_input_tokens_seen": 19033392, "step": 31205 }, { "epoch": 9.683524666459821, "grad_norm": 0.3717966675758362, "learning_rate": 6.139023194435756e-06, "loss": 0.2319, "num_input_tokens_seen": 19036112, "step": 31210 }, { "epoch": 9.685076016134037, "grad_norm": 0.4741426706314087, "learning_rate": 6.137704940581399e-06, "loss": 0.2305, "num_input_tokens_seen": 19039248, "step": 31215 }, { "epoch": 9.686627365808253, "grad_norm": 0.3636969029903412, "learning_rate": 6.136386603319795e-06, "loss": 0.2336, "num_input_tokens_seen": 19041648, "step": 31220 }, { "epoch": 9.68817871548247, "grad_norm": 0.30952733755111694, "learning_rate": 6.135068182747598e-06, "loss": 0.2252, "num_input_tokens_seen": 19043856, "step": 31225 }, { "epoch": 9.689730065156686, "grad_norm": 0.3068277835845947, "learning_rate": 6.133749678961461e-06, "loss": 0.2198, "num_input_tokens_seen": 19046544, "step": 31230 }, { "epoch": 9.691281414830902, "grad_norm": 0.31864580512046814, "learning_rate": 6.132431092058047e-06, "loss": 0.2234, "num_input_tokens_seen": 19049776, "step": 31235 }, { "epoch": 9.69283276450512, "grad_norm": 0.4292040467262268, "learning_rate": 6.1311124221340235e-06, "loss": 0.2171, "num_input_tokens_seen": 19052944, "step": 31240 }, { "epoch": 9.694384114179336, "grad_norm": 0.4261265993118286, "learning_rate": 6.129793669286066e-06, "loss": 0.2227, "num_input_tokens_seen": 19055376, "step": 31245 }, { "epoch": 9.695935463853553, "grad_norm": 0.40857475996017456, "learning_rate": 6.128474833610853e-06, "loss": 0.227, "num_input_tokens_seen": 19057872, "step": 31250 }, { "epoch": 9.69748681352777, "grad_norm": 0.42280516028404236, "learning_rate": 6.127155915205073e-06, "loss": 0.2146, "num_input_tokens_seen": 19061584, "step": 31255 }, { "epoch": 9.699038163201985, "grad_norm": 0.536166250705719, "learning_rate": 6.125836914165416e-06, "loss": 0.2319, "num_input_tokens_seen": 19064144, "step": 31260 }, { "epoch": 9.700589512876203, "grad_norm": 0.4969187080860138, "learning_rate": 6.124517830588581e-06, "loss": 0.2332, "num_input_tokens_seen": 19067536, "step": 31265 }, { "epoch": 9.702140862550419, "grad_norm": 0.3710913360118866, "learning_rate": 6.123198664571274e-06, "loss": 0.2184, "num_input_tokens_seen": 19070032, "step": 31270 }, { "epoch": 9.703692212224635, "grad_norm": 0.5610102415084839, "learning_rate": 6.121879416210204e-06, "loss": 0.2193, "num_input_tokens_seen": 19073552, "step": 31275 }, { "epoch": 9.705243561898852, "grad_norm": 0.4266074299812317, "learning_rate": 6.1205600856020865e-06, "loss": 0.2417, "num_input_tokens_seen": 19078736, "step": 31280 }, { "epoch": 9.706794911573068, "grad_norm": 0.3636525273323059, "learning_rate": 6.119240672843646e-06, "loss": 0.216, "num_input_tokens_seen": 19082000, "step": 31285 }, { "epoch": 9.708346261247286, "grad_norm": 0.45441606640815735, "learning_rate": 6.1179211780316094e-06, "loss": 0.2285, "num_input_tokens_seen": 19084592, "step": 31290 }, { "epoch": 9.709897610921502, "grad_norm": 0.8637814521789551, "learning_rate": 6.1166016012627126e-06, "loss": 0.2301, "num_input_tokens_seen": 19089360, "step": 31295 }, { "epoch": 9.711448960595717, "grad_norm": 0.47167131304740906, "learning_rate": 6.115281942633696e-06, "loss": 0.2411, "num_input_tokens_seen": 19092784, "step": 31300 }, { "epoch": 9.713000310269935, "grad_norm": 0.32372012734413147, "learning_rate": 6.113962202241307e-06, "loss": 0.2385, "num_input_tokens_seen": 19097424, "step": 31305 }, { "epoch": 9.714551659944151, "grad_norm": 0.4620715081691742, "learning_rate": 6.1126423801822965e-06, "loss": 0.2406, "num_input_tokens_seen": 19100400, "step": 31310 }, { "epoch": 9.716103009618369, "grad_norm": 0.38250163197517395, "learning_rate": 6.111322476553425e-06, "loss": 0.2357, "num_input_tokens_seen": 19104144, "step": 31315 }, { "epoch": 9.717654359292585, "grad_norm": 0.4793521463871002, "learning_rate": 6.110002491451455e-06, "loss": 0.2189, "num_input_tokens_seen": 19106864, "step": 31320 }, { "epoch": 9.7192057089668, "grad_norm": 0.6285690665245056, "learning_rate": 6.108682424973157e-06, "loss": 0.2237, "num_input_tokens_seen": 19109200, "step": 31325 }, { "epoch": 9.720757058641018, "grad_norm": 0.3187572658061981, "learning_rate": 6.1073622772153094e-06, "loss": 0.2291, "num_input_tokens_seen": 19113712, "step": 31330 }, { "epoch": 9.722308408315234, "grad_norm": 0.35387706756591797, "learning_rate": 6.106042048274694e-06, "loss": 0.2349, "num_input_tokens_seen": 19116112, "step": 31335 }, { "epoch": 9.723859757989452, "grad_norm": 0.4251892566680908, "learning_rate": 6.104721738248099e-06, "loss": 0.2336, "num_input_tokens_seen": 19118416, "step": 31340 }, { "epoch": 9.725411107663668, "grad_norm": 0.4591800570487976, "learning_rate": 6.103401347232318e-06, "loss": 0.2376, "num_input_tokens_seen": 19121008, "step": 31345 }, { "epoch": 9.726962457337883, "grad_norm": 0.4844726622104645, "learning_rate": 6.102080875324153e-06, "loss": 0.2286, "num_input_tokens_seen": 19125136, "step": 31350 }, { "epoch": 9.728513807012101, "grad_norm": 0.33802279829978943, "learning_rate": 6.100760322620409e-06, "loss": 0.226, "num_input_tokens_seen": 19127440, "step": 31355 }, { "epoch": 9.730065156686317, "grad_norm": 0.32431504130363464, "learning_rate": 6.099439689217898e-06, "loss": 0.2369, "num_input_tokens_seen": 19130064, "step": 31360 }, { "epoch": 9.731616506360533, "grad_norm": 0.37295567989349365, "learning_rate": 6.09811897521344e-06, "loss": 0.2261, "num_input_tokens_seen": 19132816, "step": 31365 }, { "epoch": 9.73316785603475, "grad_norm": 0.33836570382118225, "learning_rate": 6.096798180703854e-06, "loss": 0.2271, "num_input_tokens_seen": 19136720, "step": 31370 }, { "epoch": 9.734719205708966, "grad_norm": 0.31392452120780945, "learning_rate": 6.095477305785976e-06, "loss": 0.2314, "num_input_tokens_seen": 19139792, "step": 31375 }, { "epoch": 9.736270555383184, "grad_norm": 0.38675349950790405, "learning_rate": 6.094156350556639e-06, "loss": 0.2324, "num_input_tokens_seen": 19142704, "step": 31380 }, { "epoch": 9.7378219050574, "grad_norm": 0.4013572335243225, "learning_rate": 6.092835315112684e-06, "loss": 0.2317, "num_input_tokens_seen": 19145616, "step": 31385 }, { "epoch": 9.739373254731616, "grad_norm": 0.2658527195453644, "learning_rate": 6.09151419955096e-06, "loss": 0.2321, "num_input_tokens_seen": 19148720, "step": 31390 }, { "epoch": 9.740924604405834, "grad_norm": 0.32657042145729065, "learning_rate": 6.090193003968319e-06, "loss": 0.2326, "num_input_tokens_seen": 19151216, "step": 31395 }, { "epoch": 9.74247595408005, "grad_norm": 0.5863398313522339, "learning_rate": 6.088871728461621e-06, "loss": 0.2495, "num_input_tokens_seen": 19154224, "step": 31400 }, { "epoch": 9.744027303754265, "grad_norm": 0.7080817818641663, "learning_rate": 6.087550373127732e-06, "loss": 0.2343, "num_input_tokens_seen": 19157648, "step": 31405 }, { "epoch": 9.745578653428483, "grad_norm": 0.31652942299842834, "learning_rate": 6.086228938063522e-06, "loss": 0.2341, "num_input_tokens_seen": 19161040, "step": 31410 }, { "epoch": 9.747130003102699, "grad_norm": 0.5835241675376892, "learning_rate": 6.084907423365868e-06, "loss": 0.2376, "num_input_tokens_seen": 19164496, "step": 31415 }, { "epoch": 9.748681352776916, "grad_norm": 0.5145591497421265, "learning_rate": 6.083585829131652e-06, "loss": 0.2404, "num_input_tokens_seen": 19167568, "step": 31420 }, { "epoch": 9.750232702451132, "grad_norm": 0.4729386568069458, "learning_rate": 6.082264155457764e-06, "loss": 0.2332, "num_input_tokens_seen": 19170128, "step": 31425 }, { "epoch": 9.751784052125348, "grad_norm": 0.2692227065563202, "learning_rate": 6.080942402441095e-06, "loss": 0.2277, "num_input_tokens_seen": 19173488, "step": 31430 }, { "epoch": 9.753335401799566, "grad_norm": 0.21238096058368683, "learning_rate": 6.0796205701785495e-06, "loss": 0.224, "num_input_tokens_seen": 19176688, "step": 31435 }, { "epoch": 9.754886751473782, "grad_norm": 0.29121536016464233, "learning_rate": 6.078298658767032e-06, "loss": 0.2353, "num_input_tokens_seen": 19179856, "step": 31440 }, { "epoch": 9.756438101148, "grad_norm": 0.5536774396896362, "learning_rate": 6.076976668303454e-06, "loss": 0.2244, "num_input_tokens_seen": 19183344, "step": 31445 }, { "epoch": 9.757989450822215, "grad_norm": 0.29668521881103516, "learning_rate": 6.075654598884732e-06, "loss": 0.228, "num_input_tokens_seen": 19186000, "step": 31450 }, { "epoch": 9.759540800496431, "grad_norm": 0.261724054813385, "learning_rate": 6.07433245060779e-06, "loss": 0.2334, "num_input_tokens_seen": 19188944, "step": 31455 }, { "epoch": 9.761092150170649, "grad_norm": 0.36841970682144165, "learning_rate": 6.073010223569559e-06, "loss": 0.22, "num_input_tokens_seen": 19191856, "step": 31460 }, { "epoch": 9.762643499844865, "grad_norm": 0.21493098139762878, "learning_rate": 6.07168791786697e-06, "loss": 0.222, "num_input_tokens_seen": 19194672, "step": 31465 }, { "epoch": 9.764194849519082, "grad_norm": 0.4385201334953308, "learning_rate": 6.070365533596968e-06, "loss": 0.223, "num_input_tokens_seen": 19197744, "step": 31470 }, { "epoch": 9.765746199193298, "grad_norm": 0.31743159890174866, "learning_rate": 6.069043070856496e-06, "loss": 0.233, "num_input_tokens_seen": 19200496, "step": 31475 }, { "epoch": 9.767297548867514, "grad_norm": 0.48939087986946106, "learning_rate": 6.067720529742509e-06, "loss": 0.2365, "num_input_tokens_seen": 19204144, "step": 31480 }, { "epoch": 9.768848898541732, "grad_norm": 0.3592425286769867, "learning_rate": 6.066397910351962e-06, "loss": 0.2235, "num_input_tokens_seen": 19207536, "step": 31485 }, { "epoch": 9.770400248215948, "grad_norm": 0.2706175148487091, "learning_rate": 6.065075212781819e-06, "loss": 0.2407, "num_input_tokens_seen": 19210224, "step": 31490 }, { "epoch": 9.771951597890164, "grad_norm": 0.3021032512187958, "learning_rate": 6.063752437129053e-06, "loss": 0.2361, "num_input_tokens_seen": 19212656, "step": 31495 }, { "epoch": 9.773502947564381, "grad_norm": 0.24061621725559235, "learning_rate": 6.062429583490635e-06, "loss": 0.2195, "num_input_tokens_seen": 19215088, "step": 31500 }, { "epoch": 9.775054297238597, "grad_norm": 0.3717830181121826, "learning_rate": 6.061106651963548e-06, "loss": 0.2451, "num_input_tokens_seen": 19218096, "step": 31505 }, { "epoch": 9.776605646912815, "grad_norm": 0.2067226618528366, "learning_rate": 6.05978364264478e-06, "loss": 0.2283, "num_input_tokens_seen": 19220624, "step": 31510 }, { "epoch": 9.77815699658703, "grad_norm": 0.36277469992637634, "learning_rate": 6.058460555631319e-06, "loss": 0.2293, "num_input_tokens_seen": 19223504, "step": 31515 }, { "epoch": 9.779708346261247, "grad_norm": 0.29811424016952515, "learning_rate": 6.057137391020166e-06, "loss": 0.2239, "num_input_tokens_seen": 19226576, "step": 31520 }, { "epoch": 9.781259695935464, "grad_norm": 0.4619463086128235, "learning_rate": 6.055814148908323e-06, "loss": 0.2407, "num_input_tokens_seen": 19229584, "step": 31525 }, { "epoch": 9.78281104560968, "grad_norm": 0.3938966393470764, "learning_rate": 6.054490829392802e-06, "loss": 0.2231, "num_input_tokens_seen": 19233904, "step": 31530 }, { "epoch": 9.784362395283896, "grad_norm": 0.26170921325683594, "learning_rate": 6.053167432570614e-06, "loss": 0.2179, "num_input_tokens_seen": 19237648, "step": 31535 }, { "epoch": 9.785913744958114, "grad_norm": 0.7477288842201233, "learning_rate": 6.051843958538783e-06, "loss": 0.226, "num_input_tokens_seen": 19240016, "step": 31540 }, { "epoch": 9.78746509463233, "grad_norm": 0.3714250326156616, "learning_rate": 6.0505204073943344e-06, "loss": 0.2439, "num_input_tokens_seen": 19243440, "step": 31545 }, { "epoch": 9.789016444306547, "grad_norm": 0.4348876178264618, "learning_rate": 6.0491967792342985e-06, "loss": 0.2225, "num_input_tokens_seen": 19246256, "step": 31550 }, { "epoch": 9.790567793980763, "grad_norm": 0.5518752932548523, "learning_rate": 6.047873074155716e-06, "loss": 0.2387, "num_input_tokens_seen": 19250672, "step": 31555 }, { "epoch": 9.792119143654979, "grad_norm": 0.3752800226211548, "learning_rate": 6.046549292255628e-06, "loss": 0.2173, "num_input_tokens_seen": 19253136, "step": 31560 }, { "epoch": 9.793670493329197, "grad_norm": 0.39745014905929565, "learning_rate": 6.045225433631083e-06, "loss": 0.2252, "num_input_tokens_seen": 19255824, "step": 31565 }, { "epoch": 9.795221843003413, "grad_norm": 0.4012010097503662, "learning_rate": 6.043901498379138e-06, "loss": 0.2252, "num_input_tokens_seen": 19259248, "step": 31570 }, { "epoch": 9.79677319267763, "grad_norm": 0.3694298565387726, "learning_rate": 6.04257748659685e-06, "loss": 0.2279, "num_input_tokens_seen": 19262256, "step": 31575 }, { "epoch": 9.798324542351846, "grad_norm": 0.4192580580711365, "learning_rate": 6.0412533983812874e-06, "loss": 0.2397, "num_input_tokens_seen": 19265008, "step": 31580 }, { "epoch": 9.799875892026062, "grad_norm": 0.46199852228164673, "learning_rate": 6.03992923382952e-06, "loss": 0.2429, "num_input_tokens_seen": 19268976, "step": 31585 }, { "epoch": 9.80142724170028, "grad_norm": 0.6234238743782043, "learning_rate": 6.038604993038625e-06, "loss": 0.2277, "num_input_tokens_seen": 19272912, "step": 31590 }, { "epoch": 9.802978591374496, "grad_norm": 0.4485631585121155, "learning_rate": 6.037280676105685e-06, "loss": 0.2277, "num_input_tokens_seen": 19276304, "step": 31595 }, { "epoch": 9.804529941048713, "grad_norm": 0.3845391571521759, "learning_rate": 6.035956283127789e-06, "loss": 0.2208, "num_input_tokens_seen": 19278672, "step": 31600 }, { "epoch": 9.806081290722929, "grad_norm": 0.2586769163608551, "learning_rate": 6.034631814202029e-06, "loss": 0.2356, "num_input_tokens_seen": 19281200, "step": 31605 }, { "epoch": 9.807632640397145, "grad_norm": 0.3645835816860199, "learning_rate": 6.033307269425503e-06, "loss": 0.2296, "num_input_tokens_seen": 19284464, "step": 31610 }, { "epoch": 9.809183990071363, "grad_norm": 0.2682251036167145, "learning_rate": 6.031982648895321e-06, "loss": 0.2401, "num_input_tokens_seen": 19287952, "step": 31615 }, { "epoch": 9.810735339745579, "grad_norm": 0.2710365056991577, "learning_rate": 6.030657952708591e-06, "loss": 0.2324, "num_input_tokens_seen": 19289840, "step": 31620 }, { "epoch": 9.812286689419794, "grad_norm": 0.49087777733802795, "learning_rate": 6.029333180962426e-06, "loss": 0.2195, "num_input_tokens_seen": 19292624, "step": 31625 }, { "epoch": 9.813838039094012, "grad_norm": 0.4512263536453247, "learning_rate": 6.028008333753949e-06, "loss": 0.2393, "num_input_tokens_seen": 19295120, "step": 31630 }, { "epoch": 9.815389388768228, "grad_norm": 0.37798699736595154, "learning_rate": 6.02668341118029e-06, "loss": 0.2128, "num_input_tokens_seen": 19298416, "step": 31635 }, { "epoch": 9.816940738442446, "grad_norm": 0.4068123400211334, "learning_rate": 6.025358413338579e-06, "loss": 0.2288, "num_input_tokens_seen": 19301008, "step": 31640 }, { "epoch": 9.818492088116662, "grad_norm": 0.5870955586433411, "learning_rate": 6.024033340325954e-06, "loss": 0.2224, "num_input_tokens_seen": 19303792, "step": 31645 }, { "epoch": 9.820043437790877, "grad_norm": 0.5098356008529663, "learning_rate": 6.022708192239558e-06, "loss": 0.2304, "num_input_tokens_seen": 19306768, "step": 31650 }, { "epoch": 9.821594787465095, "grad_norm": 0.5129152536392212, "learning_rate": 6.021382969176541e-06, "loss": 0.2377, "num_input_tokens_seen": 19309552, "step": 31655 }, { "epoch": 9.823146137139311, "grad_norm": 0.5470303893089294, "learning_rate": 6.0200576712340585e-06, "loss": 0.2234, "num_input_tokens_seen": 19313808, "step": 31660 }, { "epoch": 9.824697486813527, "grad_norm": 0.5395456552505493, "learning_rate": 6.018732298509269e-06, "loss": 0.2196, "num_input_tokens_seen": 19316752, "step": 31665 }, { "epoch": 9.826248836487744, "grad_norm": 0.3485807478427887, "learning_rate": 6.017406851099338e-06, "loss": 0.2319, "num_input_tokens_seen": 19319536, "step": 31670 }, { "epoch": 9.82780018616196, "grad_norm": 0.4341370761394501, "learning_rate": 6.0160813291014375e-06, "loss": 0.2354, "num_input_tokens_seen": 19322704, "step": 31675 }, { "epoch": 9.829351535836178, "grad_norm": 0.40473049879074097, "learning_rate": 6.014755732612742e-06, "loss": 0.227, "num_input_tokens_seen": 19325552, "step": 31680 }, { "epoch": 9.830902885510394, "grad_norm": 0.5502253770828247, "learning_rate": 6.013430061730435e-06, "loss": 0.2303, "num_input_tokens_seen": 19328112, "step": 31685 }, { "epoch": 9.83245423518461, "grad_norm": 0.9628447890281677, "learning_rate": 6.012104316551704e-06, "loss": 0.2392, "num_input_tokens_seen": 19332880, "step": 31690 }, { "epoch": 9.834005584858827, "grad_norm": 0.4628852903842926, "learning_rate": 6.010778497173743e-06, "loss": 0.2266, "num_input_tokens_seen": 19335376, "step": 31695 }, { "epoch": 9.835556934533043, "grad_norm": 0.5080543160438538, "learning_rate": 6.009452603693747e-06, "loss": 0.2359, "num_input_tokens_seen": 19339376, "step": 31700 }, { "epoch": 9.837108284207261, "grad_norm": 0.3270127773284912, "learning_rate": 6.008126636208922e-06, "loss": 0.2405, "num_input_tokens_seen": 19341840, "step": 31705 }, { "epoch": 9.838659633881477, "grad_norm": 0.5061001777648926, "learning_rate": 6.006800594816478e-06, "loss": 0.2333, "num_input_tokens_seen": 19344624, "step": 31710 }, { "epoch": 9.840210983555693, "grad_norm": 0.20540596544742584, "learning_rate": 6.005474479613625e-06, "loss": 0.2318, "num_input_tokens_seen": 19348016, "step": 31715 }, { "epoch": 9.84176233322991, "grad_norm": 0.3427632451057434, "learning_rate": 6.004148290697589e-06, "loss": 0.214, "num_input_tokens_seen": 19351408, "step": 31720 }, { "epoch": 9.843313682904126, "grad_norm": 0.6218007802963257, "learning_rate": 6.002822028165591e-06, "loss": 0.237, "num_input_tokens_seen": 19355312, "step": 31725 }, { "epoch": 9.844865032578344, "grad_norm": 0.23870941996574402, "learning_rate": 6.001495692114863e-06, "loss": 0.2352, "num_input_tokens_seen": 19357872, "step": 31730 }, { "epoch": 9.84641638225256, "grad_norm": 0.33120501041412354, "learning_rate": 6.000169282642641e-06, "loss": 0.226, "num_input_tokens_seen": 19360112, "step": 31735 }, { "epoch": 9.847967731926776, "grad_norm": 0.5148958563804626, "learning_rate": 5.998842799846168e-06, "loss": 0.2275, "num_input_tokens_seen": 19363280, "step": 31740 }, { "epoch": 9.849519081600993, "grad_norm": 0.3246402442455292, "learning_rate": 5.997516243822689e-06, "loss": 0.2238, "num_input_tokens_seen": 19366064, "step": 31745 }, { "epoch": 9.85107043127521, "grad_norm": 0.4977015256881714, "learning_rate": 5.996189614669457e-06, "loss": 0.2434, "num_input_tokens_seen": 19369520, "step": 31750 }, { "epoch": 9.852621780949425, "grad_norm": 0.3822042644023895, "learning_rate": 5.994862912483729e-06, "loss": 0.229, "num_input_tokens_seen": 19372176, "step": 31755 }, { "epoch": 9.854173130623643, "grad_norm": 0.2691214680671692, "learning_rate": 5.99353613736277e-06, "loss": 0.234, "num_input_tokens_seen": 19375632, "step": 31760 }, { "epoch": 9.855724480297859, "grad_norm": 0.30344679951667786, "learning_rate": 5.992209289403845e-06, "loss": 0.216, "num_input_tokens_seen": 19378320, "step": 31765 }, { "epoch": 9.857275829972076, "grad_norm": 0.25426843762397766, "learning_rate": 5.990882368704232e-06, "loss": 0.2391, "num_input_tokens_seen": 19381328, "step": 31770 }, { "epoch": 9.858827179646292, "grad_norm": 0.27424201369285583, "learning_rate": 5.989555375361206e-06, "loss": 0.2285, "num_input_tokens_seen": 19384496, "step": 31775 }, { "epoch": 9.860378529320508, "grad_norm": 0.29706019163131714, "learning_rate": 5.988228309472053e-06, "loss": 0.2303, "num_input_tokens_seen": 19387664, "step": 31780 }, { "epoch": 9.861929878994726, "grad_norm": 0.47630542516708374, "learning_rate": 5.986901171134063e-06, "loss": 0.2271, "num_input_tokens_seen": 19390768, "step": 31785 }, { "epoch": 9.863481228668942, "grad_norm": 0.2546805143356323, "learning_rate": 5.985573960444529e-06, "loss": 0.2285, "num_input_tokens_seen": 19393008, "step": 31790 }, { "epoch": 9.865032578343158, "grad_norm": 0.2610114812850952, "learning_rate": 5.984246677500755e-06, "loss": 0.2206, "num_input_tokens_seen": 19395920, "step": 31795 }, { "epoch": 9.866583928017375, "grad_norm": 0.37181708216667175, "learning_rate": 5.982919322400044e-06, "loss": 0.2265, "num_input_tokens_seen": 19399120, "step": 31800 }, { "epoch": 9.868135277691591, "grad_norm": 0.42490601539611816, "learning_rate": 5.981591895239705e-06, "loss": 0.2374, "num_input_tokens_seen": 19403312, "step": 31805 }, { "epoch": 9.869686627365809, "grad_norm": 0.4261598587036133, "learning_rate": 5.980264396117057e-06, "loss": 0.2196, "num_input_tokens_seen": 19406288, "step": 31810 }, { "epoch": 9.871237977040025, "grad_norm": 0.2909732162952423, "learning_rate": 5.978936825129422e-06, "loss": 0.2157, "num_input_tokens_seen": 19409392, "step": 31815 }, { "epoch": 9.87278932671424, "grad_norm": 0.29580891132354736, "learning_rate": 5.977609182374124e-06, "loss": 0.2242, "num_input_tokens_seen": 19411728, "step": 31820 }, { "epoch": 9.874340676388458, "grad_norm": 0.3862057626247406, "learning_rate": 5.976281467948498e-06, "loss": 0.2149, "num_input_tokens_seen": 19414896, "step": 31825 }, { "epoch": 9.875892026062674, "grad_norm": 0.28285109996795654, "learning_rate": 5.974953681949878e-06, "loss": 0.2375, "num_input_tokens_seen": 19417488, "step": 31830 }, { "epoch": 9.877443375736892, "grad_norm": 0.36256128549575806, "learning_rate": 5.973625824475609e-06, "loss": 0.2311, "num_input_tokens_seen": 19420400, "step": 31835 }, { "epoch": 9.878994725411108, "grad_norm": 0.23231518268585205, "learning_rate": 5.9722978956230385e-06, "loss": 0.2412, "num_input_tokens_seen": 19423536, "step": 31840 }, { "epoch": 9.880546075085324, "grad_norm": 0.46859994530677795, "learning_rate": 5.970969895489517e-06, "loss": 0.2368, "num_input_tokens_seen": 19426128, "step": 31845 }, { "epoch": 9.882097424759541, "grad_norm": 0.3445844054222107, "learning_rate": 5.969641824172404e-06, "loss": 0.2509, "num_input_tokens_seen": 19428784, "step": 31850 }, { "epoch": 9.883648774433757, "grad_norm": 0.5114994645118713, "learning_rate": 5.968313681769064e-06, "loss": 0.2378, "num_input_tokens_seen": 19432336, "step": 31855 }, { "epoch": 9.885200124107975, "grad_norm": 0.5149898529052734, "learning_rate": 5.966985468376864e-06, "loss": 0.2309, "num_input_tokens_seen": 19434640, "step": 31860 }, { "epoch": 9.88675147378219, "grad_norm": 0.30555737018585205, "learning_rate": 5.965657184093176e-06, "loss": 0.2369, "num_input_tokens_seen": 19437616, "step": 31865 }, { "epoch": 9.888302823456407, "grad_norm": 0.5059952139854431, "learning_rate": 5.964328829015385e-06, "loss": 0.239, "num_input_tokens_seen": 19440592, "step": 31870 }, { "epoch": 9.889854173130624, "grad_norm": 0.39046064019203186, "learning_rate": 5.963000403240869e-06, "loss": 0.2356, "num_input_tokens_seen": 19444304, "step": 31875 }, { "epoch": 9.89140552280484, "grad_norm": 0.30513158440589905, "learning_rate": 5.961671906867022e-06, "loss": 0.2334, "num_input_tokens_seen": 19446384, "step": 31880 }, { "epoch": 9.892956872479056, "grad_norm": 0.3097825050354004, "learning_rate": 5.9603433399912345e-06, "loss": 0.2294, "num_input_tokens_seen": 19450640, "step": 31885 }, { "epoch": 9.894508222153274, "grad_norm": 0.45043623447418213, "learning_rate": 5.959014702710908e-06, "loss": 0.2234, "num_input_tokens_seen": 19454288, "step": 31890 }, { "epoch": 9.89605957182749, "grad_norm": 0.27801963686943054, "learning_rate": 5.957685995123449e-06, "loss": 0.2366, "num_input_tokens_seen": 19457040, "step": 31895 }, { "epoch": 9.897610921501707, "grad_norm": 0.34379637241363525, "learning_rate": 5.956357217326265e-06, "loss": 0.238, "num_input_tokens_seen": 19459696, "step": 31900 }, { "epoch": 9.899162271175923, "grad_norm": 0.23743289709091187, "learning_rate": 5.955028369416771e-06, "loss": 0.2234, "num_input_tokens_seen": 19462192, "step": 31905 }, { "epoch": 9.900713620850139, "grad_norm": 0.4187246859073639, "learning_rate": 5.953699451492389e-06, "loss": 0.2233, "num_input_tokens_seen": 19468368, "step": 31910 }, { "epoch": 9.902264970524357, "grad_norm": 0.30783453583717346, "learning_rate": 5.952370463650544e-06, "loss": 0.2338, "num_input_tokens_seen": 19470800, "step": 31915 }, { "epoch": 9.903816320198572, "grad_norm": 0.4616951644420624, "learning_rate": 5.951041405988666e-06, "loss": 0.2198, "num_input_tokens_seen": 19474192, "step": 31920 }, { "epoch": 9.905367669872788, "grad_norm": 0.23351511359214783, "learning_rate": 5.949712278604192e-06, "loss": 0.2261, "num_input_tokens_seen": 19476624, "step": 31925 }, { "epoch": 9.906919019547006, "grad_norm": 0.4471251964569092, "learning_rate": 5.94838308159456e-06, "loss": 0.2356, "num_input_tokens_seen": 19479024, "step": 31930 }, { "epoch": 9.908470369221222, "grad_norm": 0.2602474093437195, "learning_rate": 5.947053815057219e-06, "loss": 0.2292, "num_input_tokens_seen": 19481552, "step": 31935 }, { "epoch": 9.91002171889544, "grad_norm": 0.2784152030944824, "learning_rate": 5.945724479089616e-06, "loss": 0.2321, "num_input_tokens_seen": 19484336, "step": 31940 }, { "epoch": 9.911573068569655, "grad_norm": 0.38731181621551514, "learning_rate": 5.944395073789212e-06, "loss": 0.2296, "num_input_tokens_seen": 19486768, "step": 31945 }, { "epoch": 9.913124418243871, "grad_norm": 0.4697522521018982, "learning_rate": 5.9430655992534654e-06, "loss": 0.2273, "num_input_tokens_seen": 19489648, "step": 31950 }, { "epoch": 9.914675767918089, "grad_norm": 0.3511449694633484, "learning_rate": 5.9417360555798434e-06, "loss": 0.2365, "num_input_tokens_seen": 19492784, "step": 31955 }, { "epoch": 9.916227117592305, "grad_norm": 0.2718878388404846, "learning_rate": 5.940406442865816e-06, "loss": 0.2322, "num_input_tokens_seen": 19495280, "step": 31960 }, { "epoch": 9.917778467266523, "grad_norm": 0.43943315744400024, "learning_rate": 5.939076761208861e-06, "loss": 0.229, "num_input_tokens_seen": 19499280, "step": 31965 }, { "epoch": 9.919329816940738, "grad_norm": 0.3664143681526184, "learning_rate": 5.937747010706457e-06, "loss": 0.2262, "num_input_tokens_seen": 19502960, "step": 31970 }, { "epoch": 9.920881166614954, "grad_norm": 0.3022310435771942, "learning_rate": 5.936417191456094e-06, "loss": 0.2259, "num_input_tokens_seen": 19507248, "step": 31975 }, { "epoch": 9.922432516289172, "grad_norm": 0.3084385395050049, "learning_rate": 5.935087303555263e-06, "loss": 0.2281, "num_input_tokens_seen": 19510128, "step": 31980 }, { "epoch": 9.923983865963388, "grad_norm": 0.3010764718055725, "learning_rate": 5.933757347101459e-06, "loss": 0.22, "num_input_tokens_seen": 19512560, "step": 31985 }, { "epoch": 9.925535215637606, "grad_norm": 0.23612375557422638, "learning_rate": 5.932427322192182e-06, "loss": 0.229, "num_input_tokens_seen": 19516080, "step": 31990 }, { "epoch": 9.927086565311821, "grad_norm": 0.5262511968612671, "learning_rate": 5.931097228924943e-06, "loss": 0.2284, "num_input_tokens_seen": 19519248, "step": 31995 }, { "epoch": 9.928637914986037, "grad_norm": 0.5988909602165222, "learning_rate": 5.92976706739725e-06, "loss": 0.24, "num_input_tokens_seen": 19522448, "step": 32000 }, { "epoch": 9.930189264660255, "grad_norm": 0.2003776878118515, "learning_rate": 5.9284368377066215e-06, "loss": 0.2274, "num_input_tokens_seen": 19525584, "step": 32005 }, { "epoch": 9.93174061433447, "grad_norm": 0.3603587746620178, "learning_rate": 5.927106539950579e-06, "loss": 0.2312, "num_input_tokens_seen": 19528688, "step": 32010 }, { "epoch": 9.933291964008687, "grad_norm": 0.3319437801837921, "learning_rate": 5.925776174226648e-06, "loss": 0.2338, "num_input_tokens_seen": 19531088, "step": 32015 }, { "epoch": 9.934843313682904, "grad_norm": 0.34402209520339966, "learning_rate": 5.924445740632361e-06, "loss": 0.232, "num_input_tokens_seen": 19533520, "step": 32020 }, { "epoch": 9.93639466335712, "grad_norm": 0.5250240564346313, "learning_rate": 5.9231152392652534e-06, "loss": 0.2337, "num_input_tokens_seen": 19536720, "step": 32025 }, { "epoch": 9.937946013031338, "grad_norm": 0.21121542155742645, "learning_rate": 5.921784670222867e-06, "loss": 0.234, "num_input_tokens_seen": 19539024, "step": 32030 }, { "epoch": 9.939497362705554, "grad_norm": 0.4180419445037842, "learning_rate": 5.92045403360275e-06, "loss": 0.2378, "num_input_tokens_seen": 19541360, "step": 32035 }, { "epoch": 9.94104871237977, "grad_norm": 0.30515527725219727, "learning_rate": 5.919123329502452e-06, "loss": 0.2418, "num_input_tokens_seen": 19544208, "step": 32040 }, { "epoch": 9.942600062053987, "grad_norm": 0.40411025285720825, "learning_rate": 5.917792558019531e-06, "loss": 0.2261, "num_input_tokens_seen": 19548240, "step": 32045 }, { "epoch": 9.944151411728203, "grad_norm": 0.3347192704677582, "learning_rate": 5.916461719251545e-06, "loss": 0.2342, "num_input_tokens_seen": 19550768, "step": 32050 }, { "epoch": 9.945702761402421, "grad_norm": 0.23908977210521698, "learning_rate": 5.915130813296064e-06, "loss": 0.2336, "num_input_tokens_seen": 19553360, "step": 32055 }, { "epoch": 9.947254111076637, "grad_norm": 0.459635466337204, "learning_rate": 5.913799840250656e-06, "loss": 0.2264, "num_input_tokens_seen": 19556304, "step": 32060 }, { "epoch": 9.948805460750853, "grad_norm": 0.29628461599349976, "learning_rate": 5.9124688002129e-06, "loss": 0.2369, "num_input_tokens_seen": 19558512, "step": 32065 }, { "epoch": 9.95035681042507, "grad_norm": 0.6309347152709961, "learning_rate": 5.911137693280376e-06, "loss": 0.2357, "num_input_tokens_seen": 19561296, "step": 32070 }, { "epoch": 9.951908160099286, "grad_norm": 0.3181273639202118, "learning_rate": 5.909806519550669e-06, "loss": 0.2339, "num_input_tokens_seen": 19564784, "step": 32075 }, { "epoch": 9.953459509773502, "grad_norm": 0.47956612706184387, "learning_rate": 5.9084752791213706e-06, "loss": 0.2212, "num_input_tokens_seen": 19567536, "step": 32080 }, { "epoch": 9.95501085944772, "grad_norm": 0.3637533187866211, "learning_rate": 5.907143972090076e-06, "loss": 0.2288, "num_input_tokens_seen": 19570480, "step": 32085 }, { "epoch": 9.956562209121936, "grad_norm": 0.5610751509666443, "learning_rate": 5.905812598554387e-06, "loss": 0.2349, "num_input_tokens_seen": 19573200, "step": 32090 }, { "epoch": 9.958113558796153, "grad_norm": 0.3686087429523468, "learning_rate": 5.904481158611906e-06, "loss": 0.2231, "num_input_tokens_seen": 19576112, "step": 32095 }, { "epoch": 9.95966490847037, "grad_norm": 0.4808257520198822, "learning_rate": 5.903149652360249e-06, "loss": 0.226, "num_input_tokens_seen": 19579440, "step": 32100 }, { "epoch": 9.961216258144585, "grad_norm": 0.25151124596595764, "learning_rate": 5.901818079897024e-06, "loss": 0.2299, "num_input_tokens_seen": 19581552, "step": 32105 }, { "epoch": 9.962767607818803, "grad_norm": 0.3124472200870514, "learning_rate": 5.900486441319857e-06, "loss": 0.2282, "num_input_tokens_seen": 19584208, "step": 32110 }, { "epoch": 9.964318957493019, "grad_norm": 0.4693269431591034, "learning_rate": 5.899154736726369e-06, "loss": 0.2408, "num_input_tokens_seen": 19587440, "step": 32115 }, { "epoch": 9.965870307167236, "grad_norm": 0.23527014255523682, "learning_rate": 5.89782296621419e-06, "loss": 0.2328, "num_input_tokens_seen": 19590096, "step": 32120 }, { "epoch": 9.967421656841452, "grad_norm": 0.356453537940979, "learning_rate": 5.896491129880958e-06, "loss": 0.2275, "num_input_tokens_seen": 19592688, "step": 32125 }, { "epoch": 9.968973006515668, "grad_norm": 0.4588179886341095, "learning_rate": 5.89515922782431e-06, "loss": 0.2429, "num_input_tokens_seen": 19596464, "step": 32130 }, { "epoch": 9.970524356189886, "grad_norm": 0.15501561760902405, "learning_rate": 5.89382726014189e-06, "loss": 0.2302, "num_input_tokens_seen": 19599504, "step": 32135 }, { "epoch": 9.972075705864102, "grad_norm": 0.3540312349796295, "learning_rate": 5.892495226931348e-06, "loss": 0.2285, "num_input_tokens_seen": 19602544, "step": 32140 }, { "epoch": 9.973627055538318, "grad_norm": 0.2718927264213562, "learning_rate": 5.8911631282903355e-06, "loss": 0.2311, "num_input_tokens_seen": 19605200, "step": 32145 }, { "epoch": 9.975178405212535, "grad_norm": 0.28181084990501404, "learning_rate": 5.889830964316514e-06, "loss": 0.2309, "num_input_tokens_seen": 19607952, "step": 32150 }, { "epoch": 9.976729754886751, "grad_norm": 0.28320056200027466, "learning_rate": 5.888498735107545e-06, "loss": 0.2452, "num_input_tokens_seen": 19611248, "step": 32155 }, { "epoch": 9.978281104560969, "grad_norm": 0.24862994253635406, "learning_rate": 5.8871664407610984e-06, "loss": 0.2305, "num_input_tokens_seen": 19613520, "step": 32160 }, { "epoch": 9.979832454235185, "grad_norm": 0.14463494718074799, "learning_rate": 5.885834081374845e-06, "loss": 0.2253, "num_input_tokens_seen": 19616304, "step": 32165 }, { "epoch": 9.9813838039094, "grad_norm": 0.4207337498664856, "learning_rate": 5.8845016570464645e-06, "loss": 0.2291, "num_input_tokens_seen": 19619184, "step": 32170 }, { "epoch": 9.982935153583618, "grad_norm": 0.26452192664146423, "learning_rate": 5.883169167873638e-06, "loss": 0.2341, "num_input_tokens_seen": 19622832, "step": 32175 }, { "epoch": 9.984486503257834, "grad_norm": 0.3600510060787201, "learning_rate": 5.881836613954052e-06, "loss": 0.2225, "num_input_tokens_seen": 19625552, "step": 32180 }, { "epoch": 9.986037852932052, "grad_norm": 0.2830483913421631, "learning_rate": 5.8805039953854e-06, "loss": 0.2277, "num_input_tokens_seen": 19629616, "step": 32185 }, { "epoch": 9.987589202606268, "grad_norm": 0.23431843519210815, "learning_rate": 5.8791713122653785e-06, "loss": 0.2285, "num_input_tokens_seen": 19633008, "step": 32190 }, { "epoch": 9.989140552280483, "grad_norm": 0.3756887912750244, "learning_rate": 5.87783856469169e-06, "loss": 0.2329, "num_input_tokens_seen": 19635760, "step": 32195 }, { "epoch": 9.990691901954701, "grad_norm": 0.31835439801216125, "learning_rate": 5.876505752762038e-06, "loss": 0.2223, "num_input_tokens_seen": 19638640, "step": 32200 }, { "epoch": 9.992243251628917, "grad_norm": 0.4140779972076416, "learning_rate": 5.875172876574137e-06, "loss": 0.233, "num_input_tokens_seen": 19641264, "step": 32205 }, { "epoch": 9.993794601303133, "grad_norm": 0.2084285020828247, "learning_rate": 5.873839936225699e-06, "loss": 0.2228, "num_input_tokens_seen": 19643856, "step": 32210 }, { "epoch": 9.99534595097735, "grad_norm": 0.26011887192726135, "learning_rate": 5.872506931814447e-06, "loss": 0.2307, "num_input_tokens_seen": 19646992, "step": 32215 }, { "epoch": 9.996897300651566, "grad_norm": 0.27690890431404114, "learning_rate": 5.8711738634381045e-06, "loss": 0.2234, "num_input_tokens_seen": 19649392, "step": 32220 }, { "epoch": 9.998448650325784, "grad_norm": 0.4391305148601532, "learning_rate": 5.869840731194401e-06, "loss": 0.2374, "num_input_tokens_seen": 19652464, "step": 32225 }, { "epoch": 10.0, "grad_norm": 0.6724677681922913, "learning_rate": 5.8685075351810725e-06, "loss": 0.248, "num_input_tokens_seen": 19655824, "step": 32230 }, { "epoch": 10.0, "eval_loss": 0.2318788319826126, "eval_runtime": 34.4616, "eval_samples_per_second": 93.524, "eval_steps_per_second": 23.388, "num_input_tokens_seen": 19655824, "step": 32230 }, { "epoch": 10.001551349674216, "grad_norm": 0.36959922313690186, "learning_rate": 5.867174275495856e-06, "loss": 0.2324, "num_input_tokens_seen": 19659056, "step": 32235 }, { "epoch": 10.003102699348434, "grad_norm": 0.34881100058555603, "learning_rate": 5.865840952236497e-06, "loss": 0.2261, "num_input_tokens_seen": 19662736, "step": 32240 }, { "epoch": 10.00465404902265, "grad_norm": 0.23975339531898499, "learning_rate": 5.864507565500742e-06, "loss": 0.2297, "num_input_tokens_seen": 19664944, "step": 32245 }, { "epoch": 10.006205398696867, "grad_norm": 0.29877394437789917, "learning_rate": 5.8631741153863455e-06, "loss": 0.2287, "num_input_tokens_seen": 19667472, "step": 32250 }, { "epoch": 10.007756748371083, "grad_norm": 0.36310407519340515, "learning_rate": 5.8618406019910655e-06, "loss": 0.2203, "num_input_tokens_seen": 19670576, "step": 32255 }, { "epoch": 10.009308098045299, "grad_norm": 0.3131287097930908, "learning_rate": 5.860507025412663e-06, "loss": 0.2175, "num_input_tokens_seen": 19673840, "step": 32260 }, { "epoch": 10.010859447719517, "grad_norm": 0.3333371877670288, "learning_rate": 5.859173385748907e-06, "loss": 0.2347, "num_input_tokens_seen": 19676048, "step": 32265 }, { "epoch": 10.012410797393732, "grad_norm": 0.3876143991947174, "learning_rate": 5.857839683097566e-06, "loss": 0.2392, "num_input_tokens_seen": 19679600, "step": 32270 }, { "epoch": 10.013962147067948, "grad_norm": 0.289020299911499, "learning_rate": 5.856505917556418e-06, "loss": 0.2287, "num_input_tokens_seen": 19681904, "step": 32275 }, { "epoch": 10.015513496742166, "grad_norm": 0.47192156314849854, "learning_rate": 5.855172089223244e-06, "loss": 0.2267, "num_input_tokens_seen": 19687024, "step": 32280 }, { "epoch": 10.017064846416382, "grad_norm": 0.29290205240249634, "learning_rate": 5.853838198195829e-06, "loss": 0.2318, "num_input_tokens_seen": 19689840, "step": 32285 }, { "epoch": 10.0186161960906, "grad_norm": 0.22590290009975433, "learning_rate": 5.852504244571964e-06, "loss": 0.2246, "num_input_tokens_seen": 19692240, "step": 32290 }, { "epoch": 10.020167545764815, "grad_norm": 0.2552660405635834, "learning_rate": 5.8511702284494424e-06, "loss": 0.2306, "num_input_tokens_seen": 19695280, "step": 32295 }, { "epoch": 10.021718895439031, "grad_norm": 0.30444324016571045, "learning_rate": 5.849836149926064e-06, "loss": 0.2246, "num_input_tokens_seen": 19697808, "step": 32300 }, { "epoch": 10.023270245113249, "grad_norm": 0.4093632698059082, "learning_rate": 5.848502009099631e-06, "loss": 0.2261, "num_input_tokens_seen": 19700656, "step": 32305 }, { "epoch": 10.024821594787465, "grad_norm": 0.44185543060302734, "learning_rate": 5.847167806067952e-06, "loss": 0.2295, "num_input_tokens_seen": 19704464, "step": 32310 }, { "epoch": 10.026372944461682, "grad_norm": 0.4368995726108551, "learning_rate": 5.845833540928844e-06, "loss": 0.2273, "num_input_tokens_seen": 19707600, "step": 32315 }, { "epoch": 10.027924294135898, "grad_norm": 0.2472875565290451, "learning_rate": 5.844499213780119e-06, "loss": 0.2222, "num_input_tokens_seen": 19710608, "step": 32320 }, { "epoch": 10.029475643810114, "grad_norm": 0.360063761472702, "learning_rate": 5.8431648247196025e-06, "loss": 0.2322, "num_input_tokens_seen": 19713520, "step": 32325 }, { "epoch": 10.031026993484332, "grad_norm": 0.2921960949897766, "learning_rate": 5.841830373845119e-06, "loss": 0.2356, "num_input_tokens_seen": 19716048, "step": 32330 }, { "epoch": 10.032578343158548, "grad_norm": 0.2714434266090393, "learning_rate": 5.840495861254501e-06, "loss": 0.2363, "num_input_tokens_seen": 19718576, "step": 32335 }, { "epoch": 10.034129692832764, "grad_norm": 0.3651655912399292, "learning_rate": 5.839161287045583e-06, "loss": 0.228, "num_input_tokens_seen": 19721200, "step": 32340 }, { "epoch": 10.035681042506981, "grad_norm": 0.36877137422561646, "learning_rate": 5.8378266513162055e-06, "loss": 0.2357, "num_input_tokens_seen": 19723792, "step": 32345 }, { "epoch": 10.037232392181197, "grad_norm": 0.5055186152458191, "learning_rate": 5.836491954164213e-06, "loss": 0.2307, "num_input_tokens_seen": 19726320, "step": 32350 }, { "epoch": 10.038783741855415, "grad_norm": 0.4451121389865875, "learning_rate": 5.835157195687456e-06, "loss": 0.2322, "num_input_tokens_seen": 19730288, "step": 32355 }, { "epoch": 10.04033509152963, "grad_norm": 0.6525634527206421, "learning_rate": 5.833822375983785e-06, "loss": 0.2318, "num_input_tokens_seen": 19733808, "step": 32360 }, { "epoch": 10.041886441203847, "grad_norm": 0.7916449904441833, "learning_rate": 5.83248749515106e-06, "loss": 0.2224, "num_input_tokens_seen": 19736880, "step": 32365 }, { "epoch": 10.043437790878064, "grad_norm": 0.43939074873924255, "learning_rate": 5.8311525532871445e-06, "loss": 0.2258, "num_input_tokens_seen": 19739952, "step": 32370 }, { "epoch": 10.04498914055228, "grad_norm": 0.3612096607685089, "learning_rate": 5.8298175504899035e-06, "loss": 0.2232, "num_input_tokens_seen": 19742416, "step": 32375 }, { "epoch": 10.046540490226498, "grad_norm": 0.400004118680954, "learning_rate": 5.82848248685721e-06, "loss": 0.2251, "num_input_tokens_seen": 19745040, "step": 32380 }, { "epoch": 10.048091839900714, "grad_norm": 0.7393637299537659, "learning_rate": 5.82714736248694e-06, "loss": 0.243, "num_input_tokens_seen": 19748048, "step": 32385 }, { "epoch": 10.04964318957493, "grad_norm": 0.34367990493774414, "learning_rate": 5.8258121774769725e-06, "loss": 0.2235, "num_input_tokens_seen": 19751024, "step": 32390 }, { "epoch": 10.051194539249147, "grad_norm": 0.3627162277698517, "learning_rate": 5.824476931925195e-06, "loss": 0.2224, "num_input_tokens_seen": 19753168, "step": 32395 }, { "epoch": 10.052745888923363, "grad_norm": 0.3789682388305664, "learning_rate": 5.823141625929494e-06, "loss": 0.2164, "num_input_tokens_seen": 19756112, "step": 32400 }, { "epoch": 10.054297238597579, "grad_norm": 0.45436805486679077, "learning_rate": 5.821806259587764e-06, "loss": 0.2304, "num_input_tokens_seen": 19759184, "step": 32405 }, { "epoch": 10.055848588271797, "grad_norm": 0.4557183086872101, "learning_rate": 5.820470832997904e-06, "loss": 0.2352, "num_input_tokens_seen": 19763024, "step": 32410 }, { "epoch": 10.057399937946013, "grad_norm": 0.5932743549346924, "learning_rate": 5.819135346257816e-06, "loss": 0.2363, "num_input_tokens_seen": 19765680, "step": 32415 }, { "epoch": 10.05895128762023, "grad_norm": 0.5434921383857727, "learning_rate": 5.817799799465407e-06, "loss": 0.229, "num_input_tokens_seen": 19768336, "step": 32420 }, { "epoch": 10.060502637294446, "grad_norm": 0.5801302194595337, "learning_rate": 5.8164641927185896e-06, "loss": 0.2249, "num_input_tokens_seen": 19771216, "step": 32425 }, { "epoch": 10.062053986968662, "grad_norm": 0.5637505054473877, "learning_rate": 5.815128526115277e-06, "loss": 0.2311, "num_input_tokens_seen": 19774000, "step": 32430 }, { "epoch": 10.06360533664288, "grad_norm": 0.39734500646591187, "learning_rate": 5.813792799753391e-06, "loss": 0.2199, "num_input_tokens_seen": 19777456, "step": 32435 }, { "epoch": 10.065156686317096, "grad_norm": 0.4857270419597626, "learning_rate": 5.812457013730855e-06, "loss": 0.2252, "num_input_tokens_seen": 19780336, "step": 32440 }, { "epoch": 10.066708035991313, "grad_norm": 0.5818551778793335, "learning_rate": 5.8111211681456014e-06, "loss": 0.223, "num_input_tokens_seen": 19783952, "step": 32445 }, { "epoch": 10.06825938566553, "grad_norm": 0.4256228506565094, "learning_rate": 5.80978526309556e-06, "loss": 0.2167, "num_input_tokens_seen": 19787056, "step": 32450 }, { "epoch": 10.069810735339745, "grad_norm": 0.3966648578643799, "learning_rate": 5.808449298678669e-06, "loss": 0.2289, "num_input_tokens_seen": 19790384, "step": 32455 }, { "epoch": 10.071362085013963, "grad_norm": 0.519624650478363, "learning_rate": 5.807113274992873e-06, "loss": 0.2194, "num_input_tokens_seen": 19792624, "step": 32460 }, { "epoch": 10.072913434688179, "grad_norm": 0.7470789551734924, "learning_rate": 5.805777192136114e-06, "loss": 0.2214, "num_input_tokens_seen": 19795920, "step": 32465 }, { "epoch": 10.074464784362394, "grad_norm": 0.7313746809959412, "learning_rate": 5.804441050206346e-06, "loss": 0.2198, "num_input_tokens_seen": 19799088, "step": 32470 }, { "epoch": 10.076016134036612, "grad_norm": 0.6310725808143616, "learning_rate": 5.803104849301523e-06, "loss": 0.214, "num_input_tokens_seen": 19801936, "step": 32475 }, { "epoch": 10.077567483710828, "grad_norm": 0.5507174134254456, "learning_rate": 5.801768589519603e-06, "loss": 0.2173, "num_input_tokens_seen": 19804272, "step": 32480 }, { "epoch": 10.079118833385046, "grad_norm": 0.7722460031509399, "learning_rate": 5.8004322709585515e-06, "loss": 0.2224, "num_input_tokens_seen": 19807248, "step": 32485 }, { "epoch": 10.080670183059262, "grad_norm": 0.9207651019096375, "learning_rate": 5.799095893716338e-06, "loss": 0.2296, "num_input_tokens_seen": 19810352, "step": 32490 }, { "epoch": 10.082221532733477, "grad_norm": 0.6966132521629333, "learning_rate": 5.797759457890932e-06, "loss": 0.2288, "num_input_tokens_seen": 19812848, "step": 32495 }, { "epoch": 10.083772882407695, "grad_norm": 0.4698309600353241, "learning_rate": 5.7964229635803095e-06, "loss": 0.2303, "num_input_tokens_seen": 19815600, "step": 32500 }, { "epoch": 10.085324232081911, "grad_norm": 0.7172631621360779, "learning_rate": 5.795086410882452e-06, "loss": 0.2297, "num_input_tokens_seen": 19818416, "step": 32505 }, { "epoch": 10.086875581756129, "grad_norm": 0.7214159369468689, "learning_rate": 5.793749799895348e-06, "loss": 0.2102, "num_input_tokens_seen": 19822224, "step": 32510 }, { "epoch": 10.088426931430345, "grad_norm": 1.1286978721618652, "learning_rate": 5.792413130716983e-06, "loss": 0.224, "num_input_tokens_seen": 19825584, "step": 32515 }, { "epoch": 10.08997828110456, "grad_norm": 0.5832887887954712, "learning_rate": 5.791076403445352e-06, "loss": 0.2233, "num_input_tokens_seen": 19827632, "step": 32520 }, { "epoch": 10.091529630778778, "grad_norm": 0.8168708086013794, "learning_rate": 5.789739618178453e-06, "loss": 0.2381, "num_input_tokens_seen": 19830288, "step": 32525 }, { "epoch": 10.093080980452994, "grad_norm": 0.6257369518280029, "learning_rate": 5.788402775014288e-06, "loss": 0.2287, "num_input_tokens_seen": 19833168, "step": 32530 }, { "epoch": 10.09463233012721, "grad_norm": 0.47243165969848633, "learning_rate": 5.787065874050863e-06, "loss": 0.2361, "num_input_tokens_seen": 19835760, "step": 32535 }, { "epoch": 10.096183679801428, "grad_norm": 1.1771931648254395, "learning_rate": 5.785728915386191e-06, "loss": 0.2367, "num_input_tokens_seen": 19838800, "step": 32540 }, { "epoch": 10.097735029475643, "grad_norm": 0.5758445858955383, "learning_rate": 5.784391899118283e-06, "loss": 0.2103, "num_input_tokens_seen": 19842192, "step": 32545 }, { "epoch": 10.099286379149861, "grad_norm": 0.9432080984115601, "learning_rate": 5.783054825345161e-06, "loss": 0.2375, "num_input_tokens_seen": 19845136, "step": 32550 }, { "epoch": 10.100837728824077, "grad_norm": 0.6434993743896484, "learning_rate": 5.781717694164848e-06, "loss": 0.2403, "num_input_tokens_seen": 19847792, "step": 32555 }, { "epoch": 10.102389078498293, "grad_norm": 1.0287153720855713, "learning_rate": 5.78038050567537e-06, "loss": 0.2358, "num_input_tokens_seen": 19851440, "step": 32560 }, { "epoch": 10.10394042817251, "grad_norm": 0.5447304248809814, "learning_rate": 5.779043259974759e-06, "loss": 0.2064, "num_input_tokens_seen": 19854896, "step": 32565 }, { "epoch": 10.105491777846726, "grad_norm": 0.8874340057373047, "learning_rate": 5.777705957161054e-06, "loss": 0.232, "num_input_tokens_seen": 19857232, "step": 32570 }, { "epoch": 10.107043127520944, "grad_norm": 0.6711286306381226, "learning_rate": 5.7763685973322925e-06, "loss": 0.1991, "num_input_tokens_seen": 19861328, "step": 32575 }, { "epoch": 10.10859447719516, "grad_norm": 0.7291198372840881, "learning_rate": 5.775031180586519e-06, "loss": 0.2328, "num_input_tokens_seen": 19864112, "step": 32580 }, { "epoch": 10.110145826869376, "grad_norm": 0.5119507908821106, "learning_rate": 5.773693707021783e-06, "loss": 0.2206, "num_input_tokens_seen": 19867472, "step": 32585 }, { "epoch": 10.111697176543593, "grad_norm": 0.46855732798576355, "learning_rate": 5.7723561767361346e-06, "loss": 0.2179, "num_input_tokens_seen": 19870384, "step": 32590 }, { "epoch": 10.11324852621781, "grad_norm": 0.4724161922931671, "learning_rate": 5.771018589827634e-06, "loss": 0.2187, "num_input_tokens_seen": 19873104, "step": 32595 }, { "epoch": 10.114799875892025, "grad_norm": 0.984345555305481, "learning_rate": 5.76968094639434e-06, "loss": 0.2337, "num_input_tokens_seen": 19875824, "step": 32600 }, { "epoch": 10.116351225566243, "grad_norm": 1.013150930404663, "learning_rate": 5.768343246534318e-06, "loss": 0.2187, "num_input_tokens_seen": 19878640, "step": 32605 }, { "epoch": 10.117902575240459, "grad_norm": 0.5470888018608093, "learning_rate": 5.767005490345637e-06, "loss": 0.2388, "num_input_tokens_seen": 19880784, "step": 32610 }, { "epoch": 10.119453924914676, "grad_norm": 1.4718754291534424, "learning_rate": 5.765667677926372e-06, "loss": 0.2415, "num_input_tokens_seen": 19884144, "step": 32615 }, { "epoch": 10.121005274588892, "grad_norm": 1.055117130279541, "learning_rate": 5.7643298093745995e-06, "loss": 0.256, "num_input_tokens_seen": 19887344, "step": 32620 }, { "epoch": 10.122556624263108, "grad_norm": 0.8223236203193665, "learning_rate": 5.762991884788398e-06, "loss": 0.2644, "num_input_tokens_seen": 19890192, "step": 32625 }, { "epoch": 10.124107973937326, "grad_norm": 0.4030059576034546, "learning_rate": 5.761653904265859e-06, "loss": 0.2233, "num_input_tokens_seen": 19892784, "step": 32630 }, { "epoch": 10.125659323611542, "grad_norm": 0.777033805847168, "learning_rate": 5.760315867905069e-06, "loss": 0.2302, "num_input_tokens_seen": 19895504, "step": 32635 }, { "epoch": 10.12721067328576, "grad_norm": 0.4318546652793884, "learning_rate": 5.758977775804122e-06, "loss": 0.2335, "num_input_tokens_seen": 19898384, "step": 32640 }, { "epoch": 10.128762022959975, "grad_norm": 0.46986812353134155, "learning_rate": 5.757639628061117e-06, "loss": 0.2267, "num_input_tokens_seen": 19901424, "step": 32645 }, { "epoch": 10.130313372634191, "grad_norm": 0.4572584629058838, "learning_rate": 5.7563014247741554e-06, "loss": 0.2347, "num_input_tokens_seen": 19904208, "step": 32650 }, { "epoch": 10.131864722308409, "grad_norm": 0.3287888467311859, "learning_rate": 5.754963166041343e-06, "loss": 0.2242, "num_input_tokens_seen": 19907632, "step": 32655 }, { "epoch": 10.133416071982625, "grad_norm": 0.9773184657096863, "learning_rate": 5.753624851960791e-06, "loss": 0.2347, "num_input_tokens_seen": 19912208, "step": 32660 }, { "epoch": 10.13496742165684, "grad_norm": 0.7804239392280579, "learning_rate": 5.7522864826306125e-06, "loss": 0.2169, "num_input_tokens_seen": 19915248, "step": 32665 }, { "epoch": 10.136518771331058, "grad_norm": 0.8484684228897095, "learning_rate": 5.750948058148926e-06, "loss": 0.2136, "num_input_tokens_seen": 19917936, "step": 32670 }, { "epoch": 10.138070121005274, "grad_norm": 1.101577639579773, "learning_rate": 5.749609578613855e-06, "loss": 0.2347, "num_input_tokens_seen": 19920816, "step": 32675 }, { "epoch": 10.139621470679492, "grad_norm": 0.5240450501441956, "learning_rate": 5.7482710441235254e-06, "loss": 0.2092, "num_input_tokens_seen": 19923344, "step": 32680 }, { "epoch": 10.141172820353708, "grad_norm": 0.7467389106750488, "learning_rate": 5.746932454776066e-06, "loss": 0.2235, "num_input_tokens_seen": 19925904, "step": 32685 }, { "epoch": 10.142724170027924, "grad_norm": 0.7975060939788818, "learning_rate": 5.745593810669614e-06, "loss": 0.221, "num_input_tokens_seen": 19928144, "step": 32690 }, { "epoch": 10.144275519702141, "grad_norm": 0.9198117852210999, "learning_rate": 5.744255111902306e-06, "loss": 0.2248, "num_input_tokens_seen": 19931216, "step": 32695 }, { "epoch": 10.145826869376357, "grad_norm": 0.9768708944320679, "learning_rate": 5.742916358572286e-06, "loss": 0.2317, "num_input_tokens_seen": 19933712, "step": 32700 }, { "epoch": 10.147378219050575, "grad_norm": 0.7668578028678894, "learning_rate": 5.7415775507776985e-06, "loss": 0.2216, "num_input_tokens_seen": 19936080, "step": 32705 }, { "epoch": 10.14892956872479, "grad_norm": 1.8603670597076416, "learning_rate": 5.740238688616695e-06, "loss": 0.2576, "num_input_tokens_seen": 19940560, "step": 32710 }, { "epoch": 10.150480918399007, "grad_norm": 0.47767525911331177, "learning_rate": 5.7388997721874304e-06, "loss": 0.2346, "num_input_tokens_seen": 19943824, "step": 32715 }, { "epoch": 10.152032268073224, "grad_norm": 0.6346036791801453, "learning_rate": 5.737560801588062e-06, "loss": 0.2309, "num_input_tokens_seen": 19946384, "step": 32720 }, { "epoch": 10.15358361774744, "grad_norm": 0.7345063090324402, "learning_rate": 5.7362217769167535e-06, "loss": 0.2274, "num_input_tokens_seen": 19949456, "step": 32725 }, { "epoch": 10.155134967421656, "grad_norm": 1.2879616022109985, "learning_rate": 5.734882698271671e-06, "loss": 0.2497, "num_input_tokens_seen": 19954064, "step": 32730 }, { "epoch": 10.156686317095874, "grad_norm": 0.8662071228027344, "learning_rate": 5.7335435657509834e-06, "loss": 0.2323, "num_input_tokens_seen": 19957680, "step": 32735 }, { "epoch": 10.15823766677009, "grad_norm": 0.5166003704071045, "learning_rate": 5.732204379452866e-06, "loss": 0.222, "num_input_tokens_seen": 19960816, "step": 32740 }, { "epoch": 10.159789016444307, "grad_norm": 0.5348240733146667, "learning_rate": 5.730865139475497e-06, "loss": 0.2236, "num_input_tokens_seen": 19962800, "step": 32745 }, { "epoch": 10.161340366118523, "grad_norm": 0.9862552881240845, "learning_rate": 5.729525845917058e-06, "loss": 0.2206, "num_input_tokens_seen": 19966160, "step": 32750 }, { "epoch": 10.162891715792739, "grad_norm": 0.6919369101524353, "learning_rate": 5.728186498875736e-06, "loss": 0.2425, "num_input_tokens_seen": 19969552, "step": 32755 }, { "epoch": 10.164443065466957, "grad_norm": 0.6495134234428406, "learning_rate": 5.7268470984497215e-06, "loss": 0.22, "num_input_tokens_seen": 19973328, "step": 32760 }, { "epoch": 10.165994415141173, "grad_norm": 0.8767880797386169, "learning_rate": 5.725507644737207e-06, "loss": 0.2384, "num_input_tokens_seen": 19975792, "step": 32765 }, { "epoch": 10.16754576481539, "grad_norm": 0.5008128881454468, "learning_rate": 5.72416813783639e-06, "loss": 0.2312, "num_input_tokens_seen": 19978416, "step": 32770 }, { "epoch": 10.169097114489606, "grad_norm": 0.6823640465736389, "learning_rate": 5.722828577845474e-06, "loss": 0.2293, "num_input_tokens_seen": 19981424, "step": 32775 }, { "epoch": 10.170648464163822, "grad_norm": 0.9017747044563293, "learning_rate": 5.721488964862663e-06, "loss": 0.2206, "num_input_tokens_seen": 19984240, "step": 32780 }, { "epoch": 10.17219981383804, "grad_norm": 0.570580005645752, "learning_rate": 5.720149298986167e-06, "loss": 0.2206, "num_input_tokens_seen": 19987664, "step": 32785 }, { "epoch": 10.173751163512256, "grad_norm": 0.47453033924102783, "learning_rate": 5.718809580314201e-06, "loss": 0.2393, "num_input_tokens_seen": 19990320, "step": 32790 }, { "epoch": 10.175302513186471, "grad_norm": 0.4677395224571228, "learning_rate": 5.71746980894498e-06, "loss": 0.2151, "num_input_tokens_seen": 19993040, "step": 32795 }, { "epoch": 10.176853862860689, "grad_norm": 0.5524698495864868, "learning_rate": 5.716129984976726e-06, "loss": 0.2385, "num_input_tokens_seen": 19996208, "step": 32800 }, { "epoch": 10.178405212534905, "grad_norm": 0.5503283739089966, "learning_rate": 5.714790108507664e-06, "loss": 0.2305, "num_input_tokens_seen": 19998512, "step": 32805 }, { "epoch": 10.179956562209123, "grad_norm": 1.1078823804855347, "learning_rate": 5.713450179636022e-06, "loss": 0.2296, "num_input_tokens_seen": 20001744, "step": 32810 }, { "epoch": 10.181507911883338, "grad_norm": 0.3781926929950714, "learning_rate": 5.7121101984600345e-06, "loss": 0.2307, "num_input_tokens_seen": 20004080, "step": 32815 }, { "epoch": 10.183059261557554, "grad_norm": 0.4596903622150421, "learning_rate": 5.710770165077934e-06, "loss": 0.2325, "num_input_tokens_seen": 20006192, "step": 32820 }, { "epoch": 10.184610611231772, "grad_norm": 0.2819710671901703, "learning_rate": 5.7094300795879665e-06, "loss": 0.2471, "num_input_tokens_seen": 20010480, "step": 32825 }, { "epoch": 10.186161960905988, "grad_norm": 0.6014285683631897, "learning_rate": 5.708089942088373e-06, "loss": 0.2221, "num_input_tokens_seen": 20012848, "step": 32830 }, { "epoch": 10.187713310580206, "grad_norm": 0.5906281471252441, "learning_rate": 5.706749752677401e-06, "loss": 0.2281, "num_input_tokens_seen": 20015920, "step": 32835 }, { "epoch": 10.189264660254421, "grad_norm": 0.5534579157829285, "learning_rate": 5.705409511453302e-06, "loss": 0.2493, "num_input_tokens_seen": 20019984, "step": 32840 }, { "epoch": 10.190816009928637, "grad_norm": 0.5644808411598206, "learning_rate": 5.704069218514335e-06, "loss": 0.235, "num_input_tokens_seen": 20022608, "step": 32845 }, { "epoch": 10.192367359602855, "grad_norm": 0.6202318072319031, "learning_rate": 5.7027288739587545e-06, "loss": 0.2029, "num_input_tokens_seen": 20025776, "step": 32850 }, { "epoch": 10.193918709277071, "grad_norm": 0.5613641142845154, "learning_rate": 5.701388477884827e-06, "loss": 0.233, "num_input_tokens_seen": 20032528, "step": 32855 }, { "epoch": 10.195470058951287, "grad_norm": 0.38797906041145325, "learning_rate": 5.700048030390819e-06, "loss": 0.2291, "num_input_tokens_seen": 20035472, "step": 32860 }, { "epoch": 10.197021408625504, "grad_norm": 0.4003472924232483, "learning_rate": 5.6987075315749995e-06, "loss": 0.2243, "num_input_tokens_seen": 20038288, "step": 32865 }, { "epoch": 10.19857275829972, "grad_norm": 0.5027625560760498, "learning_rate": 5.697366981535644e-06, "loss": 0.2387, "num_input_tokens_seen": 20040848, "step": 32870 }, { "epoch": 10.200124107973938, "grad_norm": 0.328559547662735, "learning_rate": 5.696026380371032e-06, "loss": 0.2196, "num_input_tokens_seen": 20045008, "step": 32875 }, { "epoch": 10.201675457648154, "grad_norm": 0.7022257447242737, "learning_rate": 5.694685728179442e-06, "loss": 0.2294, "num_input_tokens_seen": 20047440, "step": 32880 }, { "epoch": 10.20322680732237, "grad_norm": 0.4801291227340698, "learning_rate": 5.693345025059164e-06, "loss": 0.2338, "num_input_tokens_seen": 20051088, "step": 32885 }, { "epoch": 10.204778156996587, "grad_norm": 0.3631557822227478, "learning_rate": 5.6920042711084836e-06, "loss": 0.2194, "num_input_tokens_seen": 20053584, "step": 32890 }, { "epoch": 10.206329506670803, "grad_norm": 0.9187966585159302, "learning_rate": 5.6906634664256966e-06, "loss": 0.2366, "num_input_tokens_seen": 20056208, "step": 32895 }, { "epoch": 10.207880856345021, "grad_norm": 0.5770383477210999, "learning_rate": 5.6893226111090985e-06, "loss": 0.2219, "num_input_tokens_seen": 20060528, "step": 32900 }, { "epoch": 10.209432206019237, "grad_norm": 0.5484739542007446, "learning_rate": 5.687981705256989e-06, "loss": 0.2427, "num_input_tokens_seen": 20063472, "step": 32905 }, { "epoch": 10.210983555693453, "grad_norm": 0.8518034815788269, "learning_rate": 5.686640748967676e-06, "loss": 0.2198, "num_input_tokens_seen": 20066672, "step": 32910 }, { "epoch": 10.21253490536767, "grad_norm": 0.5829988121986389, "learning_rate": 5.685299742339464e-06, "loss": 0.2161, "num_input_tokens_seen": 20068912, "step": 32915 }, { "epoch": 10.214086255041886, "grad_norm": 0.4501565992832184, "learning_rate": 5.683958685470665e-06, "loss": 0.2302, "num_input_tokens_seen": 20070960, "step": 32920 }, { "epoch": 10.215637604716102, "grad_norm": 0.44526246190071106, "learning_rate": 5.682617578459596e-06, "loss": 0.2274, "num_input_tokens_seen": 20074768, "step": 32925 }, { "epoch": 10.21718895439032, "grad_norm": 0.495227575302124, "learning_rate": 5.681276421404574e-06, "loss": 0.2361, "num_input_tokens_seen": 20076944, "step": 32930 }, { "epoch": 10.218740304064536, "grad_norm": 0.41076788306236267, "learning_rate": 5.679935214403924e-06, "loss": 0.2292, "num_input_tokens_seen": 20079600, "step": 32935 }, { "epoch": 10.220291653738753, "grad_norm": 0.561714768409729, "learning_rate": 5.67859395755597e-06, "loss": 0.2306, "num_input_tokens_seen": 20082128, "step": 32940 }, { "epoch": 10.22184300341297, "grad_norm": 0.5592831373214722, "learning_rate": 5.677252650959044e-06, "loss": 0.2278, "num_input_tokens_seen": 20084144, "step": 32945 }, { "epoch": 10.223394353087185, "grad_norm": 0.954703688621521, "learning_rate": 5.675911294711478e-06, "loss": 0.2259, "num_input_tokens_seen": 20088656, "step": 32950 }, { "epoch": 10.224945702761403, "grad_norm": 0.4397277235984802, "learning_rate": 5.6745698889116115e-06, "loss": 0.2384, "num_input_tokens_seen": 20091472, "step": 32955 }, { "epoch": 10.226497052435619, "grad_norm": 0.7733102440834045, "learning_rate": 5.673228433657784e-06, "loss": 0.2506, "num_input_tokens_seen": 20095216, "step": 32960 }, { "epoch": 10.228048402109836, "grad_norm": 0.45808443427085876, "learning_rate": 5.6718869290483395e-06, "loss": 0.2214, "num_input_tokens_seen": 20098576, "step": 32965 }, { "epoch": 10.229599751784052, "grad_norm": 0.511962890625, "learning_rate": 5.670545375181626e-06, "loss": 0.2336, "num_input_tokens_seen": 20101840, "step": 32970 }, { "epoch": 10.231151101458268, "grad_norm": 0.5539218187332153, "learning_rate": 5.669203772155997e-06, "loss": 0.2346, "num_input_tokens_seen": 20104592, "step": 32975 }, { "epoch": 10.232702451132486, "grad_norm": 0.478822261095047, "learning_rate": 5.667862120069807e-06, "loss": 0.2282, "num_input_tokens_seen": 20107952, "step": 32980 }, { "epoch": 10.234253800806702, "grad_norm": 0.526032567024231, "learning_rate": 5.666520419021414e-06, "loss": 0.2293, "num_input_tokens_seen": 20110704, "step": 32985 }, { "epoch": 10.235805150480918, "grad_norm": 0.7077295184135437, "learning_rate": 5.6651786691091826e-06, "loss": 0.242, "num_input_tokens_seen": 20114672, "step": 32990 }, { "epoch": 10.237356500155135, "grad_norm": 0.3093375265598297, "learning_rate": 5.663836870431477e-06, "loss": 0.2351, "num_input_tokens_seen": 20117840, "step": 32995 }, { "epoch": 10.238907849829351, "grad_norm": 0.7339089512825012, "learning_rate": 5.662495023086667e-06, "loss": 0.227, "num_input_tokens_seen": 20120944, "step": 33000 }, { "epoch": 10.240459199503569, "grad_norm": 0.6945838928222656, "learning_rate": 5.661153127173128e-06, "loss": 0.2268, "num_input_tokens_seen": 20124176, "step": 33005 }, { "epoch": 10.242010549177785, "grad_norm": 0.5080917477607727, "learning_rate": 5.659811182789235e-06, "loss": 0.2291, "num_input_tokens_seen": 20127984, "step": 33010 }, { "epoch": 10.243561898852, "grad_norm": 0.35245877504348755, "learning_rate": 5.65846919003337e-06, "loss": 0.2293, "num_input_tokens_seen": 20130480, "step": 33015 }, { "epoch": 10.245113248526218, "grad_norm": 0.3334295451641083, "learning_rate": 5.657127149003915e-06, "loss": 0.2393, "num_input_tokens_seen": 20132880, "step": 33020 }, { "epoch": 10.246664598200434, "grad_norm": 0.2845138907432556, "learning_rate": 5.655785059799259e-06, "loss": 0.2316, "num_input_tokens_seen": 20135952, "step": 33025 }, { "epoch": 10.248215947874652, "grad_norm": 0.49005836248397827, "learning_rate": 5.654442922517792e-06, "loss": 0.2307, "num_input_tokens_seen": 20140048, "step": 33030 }, { "epoch": 10.249767297548868, "grad_norm": 0.48699328303337097, "learning_rate": 5.653100737257909e-06, "loss": 0.2208, "num_input_tokens_seen": 20142672, "step": 33035 }, { "epoch": 10.251318647223084, "grad_norm": 0.45829665660858154, "learning_rate": 5.651758504118009e-06, "loss": 0.2233, "num_input_tokens_seen": 20146192, "step": 33040 }, { "epoch": 10.252869996897301, "grad_norm": 0.3463996946811676, "learning_rate": 5.650416223196492e-06, "loss": 0.2278, "num_input_tokens_seen": 20148336, "step": 33045 }, { "epoch": 10.254421346571517, "grad_norm": 0.5713897347450256, "learning_rate": 5.649073894591764e-06, "loss": 0.2143, "num_input_tokens_seen": 20151440, "step": 33050 }, { "epoch": 10.255972696245733, "grad_norm": 0.30480530858039856, "learning_rate": 5.647731518402233e-06, "loss": 0.2379, "num_input_tokens_seen": 20154608, "step": 33055 }, { "epoch": 10.25752404591995, "grad_norm": 0.758684515953064, "learning_rate": 5.64638909472631e-06, "loss": 0.2326, "num_input_tokens_seen": 20157776, "step": 33060 }, { "epoch": 10.259075395594166, "grad_norm": 0.8535345792770386, "learning_rate": 5.645046623662413e-06, "loss": 0.2274, "num_input_tokens_seen": 20160624, "step": 33065 }, { "epoch": 10.260626745268384, "grad_norm": 0.46794936060905457, "learning_rate": 5.64370410530896e-06, "loss": 0.2206, "num_input_tokens_seen": 20163312, "step": 33070 }, { "epoch": 10.2621780949426, "grad_norm": 0.5193493962287903, "learning_rate": 5.6423615397643734e-06, "loss": 0.226, "num_input_tokens_seen": 20165776, "step": 33075 }, { "epoch": 10.263729444616816, "grad_norm": 0.5104600787162781, "learning_rate": 5.64101892712708e-06, "loss": 0.2389, "num_input_tokens_seen": 20168880, "step": 33080 }, { "epoch": 10.265280794291034, "grad_norm": 0.8629378080368042, "learning_rate": 5.639676267495508e-06, "loss": 0.2246, "num_input_tokens_seen": 20172464, "step": 33085 }, { "epoch": 10.26683214396525, "grad_norm": 0.5768038630485535, "learning_rate": 5.638333560968091e-06, "loss": 0.2159, "num_input_tokens_seen": 20175120, "step": 33090 }, { "epoch": 10.268383493639467, "grad_norm": 0.45863422751426697, "learning_rate": 5.636990807643265e-06, "loss": 0.2306, "num_input_tokens_seen": 20178000, "step": 33095 }, { "epoch": 10.269934843313683, "grad_norm": 0.41168212890625, "learning_rate": 5.63564800761947e-06, "loss": 0.2297, "num_input_tokens_seen": 20179856, "step": 33100 }, { "epoch": 10.271486192987899, "grad_norm": 0.4558357000350952, "learning_rate": 5.634305160995148e-06, "loss": 0.2205, "num_input_tokens_seen": 20183088, "step": 33105 }, { "epoch": 10.273037542662117, "grad_norm": 0.4411795139312744, "learning_rate": 5.632962267868747e-06, "loss": 0.2243, "num_input_tokens_seen": 20187024, "step": 33110 }, { "epoch": 10.274588892336332, "grad_norm": 0.5619028806686401, "learning_rate": 5.631619328338717e-06, "loss": 0.2272, "num_input_tokens_seen": 20189712, "step": 33115 }, { "epoch": 10.276140242010548, "grad_norm": 0.8532229065895081, "learning_rate": 5.630276342503509e-06, "loss": 0.2244, "num_input_tokens_seen": 20192496, "step": 33120 }, { "epoch": 10.277691591684766, "grad_norm": 0.5895193815231323, "learning_rate": 5.6289333104615825e-06, "loss": 0.2281, "num_input_tokens_seen": 20195856, "step": 33125 }, { "epoch": 10.279242941358982, "grad_norm": 0.7319860458374023, "learning_rate": 5.6275902323113975e-06, "loss": 0.2042, "num_input_tokens_seen": 20198832, "step": 33130 }, { "epoch": 10.2807942910332, "grad_norm": 0.5952509641647339, "learning_rate": 5.626247108151414e-06, "loss": 0.2085, "num_input_tokens_seen": 20200976, "step": 33135 }, { "epoch": 10.282345640707415, "grad_norm": 0.655066967010498, "learning_rate": 5.624903938080103e-06, "loss": 0.2492, "num_input_tokens_seen": 20203440, "step": 33140 }, { "epoch": 10.283896990381631, "grad_norm": 0.48836570978164673, "learning_rate": 5.623560722195934e-06, "loss": 0.2307, "num_input_tokens_seen": 20206000, "step": 33145 }, { "epoch": 10.285448340055849, "grad_norm": 0.6059862971305847, "learning_rate": 5.62221746059738e-06, "loss": 0.2313, "num_input_tokens_seen": 20209392, "step": 33150 }, { "epoch": 10.286999689730065, "grad_norm": 0.42575570940971375, "learning_rate": 5.620874153382918e-06, "loss": 0.2389, "num_input_tokens_seen": 20212528, "step": 33155 }, { "epoch": 10.288551039404283, "grad_norm": 0.8587070107460022, "learning_rate": 5.619530800651027e-06, "loss": 0.2234, "num_input_tokens_seen": 20216496, "step": 33160 }, { "epoch": 10.290102389078498, "grad_norm": 0.5744319558143616, "learning_rate": 5.618187402500194e-06, "loss": 0.2089, "num_input_tokens_seen": 20219024, "step": 33165 }, { "epoch": 10.291653738752714, "grad_norm": 0.6353322267532349, "learning_rate": 5.6168439590289025e-06, "loss": 0.2279, "num_input_tokens_seen": 20222480, "step": 33170 }, { "epoch": 10.293205088426932, "grad_norm": 0.39054229855537415, "learning_rate": 5.615500470335645e-06, "loss": 0.2457, "num_input_tokens_seen": 20225072, "step": 33175 }, { "epoch": 10.294756438101148, "grad_norm": 0.6222186088562012, "learning_rate": 5.6141569365189144e-06, "loss": 0.2046, "num_input_tokens_seen": 20230288, "step": 33180 }, { "epoch": 10.296307787775364, "grad_norm": 1.036427617073059, "learning_rate": 5.612813357677207e-06, "loss": 0.2196, "num_input_tokens_seen": 20233488, "step": 33185 }, { "epoch": 10.297859137449581, "grad_norm": 1.0207722187042236, "learning_rate": 5.611469733909024e-06, "loss": 0.2214, "num_input_tokens_seen": 20236848, "step": 33190 }, { "epoch": 10.299410487123797, "grad_norm": 0.674889326095581, "learning_rate": 5.610126065312868e-06, "loss": 0.2118, "num_input_tokens_seen": 20240336, "step": 33195 }, { "epoch": 10.300961836798015, "grad_norm": 0.6647543907165527, "learning_rate": 5.608782351987248e-06, "loss": 0.2284, "num_input_tokens_seen": 20243184, "step": 33200 }, { "epoch": 10.30251318647223, "grad_norm": 0.6704895496368408, "learning_rate": 5.607438594030671e-06, "loss": 0.2301, "num_input_tokens_seen": 20245744, "step": 33205 }, { "epoch": 10.304064536146447, "grad_norm": 0.8171505928039551, "learning_rate": 5.606094791541653e-06, "loss": 0.2204, "num_input_tokens_seen": 20248848, "step": 33210 }, { "epoch": 10.305615885820664, "grad_norm": 0.5783675909042358, "learning_rate": 5.6047509446187085e-06, "loss": 0.2228, "num_input_tokens_seen": 20251504, "step": 33215 }, { "epoch": 10.30716723549488, "grad_norm": 0.8859720826148987, "learning_rate": 5.603407053360359e-06, "loss": 0.213, "num_input_tokens_seen": 20255152, "step": 33220 }, { "epoch": 10.308718585169098, "grad_norm": 1.014034628868103, "learning_rate": 5.602063117865127e-06, "loss": 0.2395, "num_input_tokens_seen": 20258032, "step": 33225 }, { "epoch": 10.310269934843314, "grad_norm": 0.782126247882843, "learning_rate": 5.600719138231539e-06, "loss": 0.2287, "num_input_tokens_seen": 20261680, "step": 33230 }, { "epoch": 10.31182128451753, "grad_norm": 0.562385082244873, "learning_rate": 5.5993751145581255e-06, "loss": 0.2291, "num_input_tokens_seen": 20265424, "step": 33235 }, { "epoch": 10.313372634191747, "grad_norm": 1.0771995782852173, "learning_rate": 5.5980310469434174e-06, "loss": 0.2244, "num_input_tokens_seen": 20268528, "step": 33240 }, { "epoch": 10.314923983865963, "grad_norm": 0.8113213777542114, "learning_rate": 5.596686935485952e-06, "loss": 0.235, "num_input_tokens_seen": 20271760, "step": 33245 }, { "epoch": 10.316475333540179, "grad_norm": 0.6315094232559204, "learning_rate": 5.595342780284269e-06, "loss": 0.2389, "num_input_tokens_seen": 20274448, "step": 33250 }, { "epoch": 10.318026683214397, "grad_norm": 0.431648313999176, "learning_rate": 5.593998581436908e-06, "loss": 0.2259, "num_input_tokens_seen": 20277712, "step": 33255 }, { "epoch": 10.319578032888613, "grad_norm": 0.8498551845550537, "learning_rate": 5.592654339042419e-06, "loss": 0.2157, "num_input_tokens_seen": 20280368, "step": 33260 }, { "epoch": 10.32112938256283, "grad_norm": 0.49093005061149597, "learning_rate": 5.591310053199349e-06, "loss": 0.2246, "num_input_tokens_seen": 20283888, "step": 33265 }, { "epoch": 10.322680732237046, "grad_norm": 0.3460127115249634, "learning_rate": 5.58996572400625e-06, "loss": 0.2168, "num_input_tokens_seen": 20286608, "step": 33270 }, { "epoch": 10.324232081911262, "grad_norm": 0.49007925391197205, "learning_rate": 5.588621351561676e-06, "loss": 0.2364, "num_input_tokens_seen": 20289520, "step": 33275 }, { "epoch": 10.32578343158548, "grad_norm": 0.38452523946762085, "learning_rate": 5.5872769359641875e-06, "loss": 0.2367, "num_input_tokens_seen": 20292400, "step": 33280 }, { "epoch": 10.327334781259696, "grad_norm": 0.7657303810119629, "learning_rate": 5.585932477312344e-06, "loss": 0.2318, "num_input_tokens_seen": 20295568, "step": 33285 }, { "epoch": 10.328886130933913, "grad_norm": 0.7132109999656677, "learning_rate": 5.584587975704714e-06, "loss": 0.2084, "num_input_tokens_seen": 20298960, "step": 33290 }, { "epoch": 10.33043748060813, "grad_norm": 0.8218289017677307, "learning_rate": 5.583243431239861e-06, "loss": 0.2234, "num_input_tokens_seen": 20301904, "step": 33295 }, { "epoch": 10.331988830282345, "grad_norm": 0.9794231057167053, "learning_rate": 5.581898844016358e-06, "loss": 0.2369, "num_input_tokens_seen": 20305872, "step": 33300 }, { "epoch": 10.333540179956563, "grad_norm": 0.742385745048523, "learning_rate": 5.58055421413278e-06, "loss": 0.2157, "num_input_tokens_seen": 20309776, "step": 33305 }, { "epoch": 10.335091529630779, "grad_norm": 0.965707540512085, "learning_rate": 5.579209541687702e-06, "loss": 0.2265, "num_input_tokens_seen": 20313456, "step": 33310 }, { "epoch": 10.336642879304996, "grad_norm": 0.5191265344619751, "learning_rate": 5.577864826779705e-06, "loss": 0.2609, "num_input_tokens_seen": 20316336, "step": 33315 }, { "epoch": 10.338194228979212, "grad_norm": 0.7073953747749329, "learning_rate": 5.576520069507374e-06, "loss": 0.2263, "num_input_tokens_seen": 20319216, "step": 33320 }, { "epoch": 10.339745578653428, "grad_norm": 0.7239944934844971, "learning_rate": 5.575175269969297e-06, "loss": 0.2273, "num_input_tokens_seen": 20322704, "step": 33325 }, { "epoch": 10.341296928327646, "grad_norm": 0.46780046820640564, "learning_rate": 5.573830428264059e-06, "loss": 0.2335, "num_input_tokens_seen": 20325072, "step": 33330 }, { "epoch": 10.342848278001862, "grad_norm": 0.5147444009780884, "learning_rate": 5.5724855444902556e-06, "loss": 0.2324, "num_input_tokens_seen": 20328400, "step": 33335 }, { "epoch": 10.344399627676077, "grad_norm": 1.0641534328460693, "learning_rate": 5.571140618746485e-06, "loss": 0.2231, "num_input_tokens_seen": 20331984, "step": 33340 }, { "epoch": 10.345950977350295, "grad_norm": 0.7188686728477478, "learning_rate": 5.56979565113134e-06, "loss": 0.2326, "num_input_tokens_seen": 20334576, "step": 33345 }, { "epoch": 10.347502327024511, "grad_norm": 0.5489305853843689, "learning_rate": 5.568450641743429e-06, "loss": 0.2301, "num_input_tokens_seen": 20336752, "step": 33350 }, { "epoch": 10.349053676698729, "grad_norm": 1.0046347379684448, "learning_rate": 5.567105590681354e-06, "loss": 0.2351, "num_input_tokens_seen": 20339952, "step": 33355 }, { "epoch": 10.350605026372945, "grad_norm": 1.082560658454895, "learning_rate": 5.565760498043723e-06, "loss": 0.2339, "num_input_tokens_seen": 20343952, "step": 33360 }, { "epoch": 10.35215637604716, "grad_norm": 0.4421245753765106, "learning_rate": 5.564415363929149e-06, "loss": 0.2517, "num_input_tokens_seen": 20346576, "step": 33365 }, { "epoch": 10.353707725721378, "grad_norm": 0.6055351495742798, "learning_rate": 5.563070188436245e-06, "loss": 0.2246, "num_input_tokens_seen": 20351728, "step": 33370 }, { "epoch": 10.355259075395594, "grad_norm": 0.8853834271430969, "learning_rate": 5.561724971663628e-06, "loss": 0.2388, "num_input_tokens_seen": 20354448, "step": 33375 }, { "epoch": 10.35681042506981, "grad_norm": 0.6038067936897278, "learning_rate": 5.560379713709919e-06, "loss": 0.2382, "num_input_tokens_seen": 20357296, "step": 33380 }, { "epoch": 10.358361774744028, "grad_norm": 0.5340945720672607, "learning_rate": 5.55903441467374e-06, "loss": 0.2245, "num_input_tokens_seen": 20360112, "step": 33385 }, { "epoch": 10.359913124418243, "grad_norm": 0.3928429186344147, "learning_rate": 5.557689074653719e-06, "loss": 0.2264, "num_input_tokens_seen": 20363024, "step": 33390 }, { "epoch": 10.361464474092461, "grad_norm": 0.8582980036735535, "learning_rate": 5.5563436937484845e-06, "loss": 0.2392, "num_input_tokens_seen": 20366320, "step": 33395 }, { "epoch": 10.363015823766677, "grad_norm": 0.3307948708534241, "learning_rate": 5.554998272056671e-06, "loss": 0.2326, "num_input_tokens_seen": 20369680, "step": 33400 }, { "epoch": 10.364567173440893, "grad_norm": 0.7461035251617432, "learning_rate": 5.553652809676911e-06, "loss": 0.2208, "num_input_tokens_seen": 20372912, "step": 33405 }, { "epoch": 10.36611852311511, "grad_norm": 0.7817564010620117, "learning_rate": 5.552307306707844e-06, "loss": 0.2311, "num_input_tokens_seen": 20376432, "step": 33410 }, { "epoch": 10.367669872789326, "grad_norm": 0.8292250037193298, "learning_rate": 5.5509617632481105e-06, "loss": 0.2315, "num_input_tokens_seen": 20379120, "step": 33415 }, { "epoch": 10.369221222463544, "grad_norm": 0.3735920190811157, "learning_rate": 5.549616179396355e-06, "loss": 0.2294, "num_input_tokens_seen": 20382288, "step": 33420 }, { "epoch": 10.37077257213776, "grad_norm": 0.37895721197128296, "learning_rate": 5.5482705552512265e-06, "loss": 0.2343, "num_input_tokens_seen": 20385008, "step": 33425 }, { "epoch": 10.372323921811976, "grad_norm": 0.43802016973495483, "learning_rate": 5.546924890911372e-06, "loss": 0.2312, "num_input_tokens_seen": 20387152, "step": 33430 }, { "epoch": 10.373875271486193, "grad_norm": 0.49718809127807617, "learning_rate": 5.5455791864754485e-06, "loss": 0.2337, "num_input_tokens_seen": 20389488, "step": 33435 }, { "epoch": 10.37542662116041, "grad_norm": 0.7281576991081238, "learning_rate": 5.54423344204211e-06, "loss": 0.2357, "num_input_tokens_seen": 20394128, "step": 33440 }, { "epoch": 10.376977970834627, "grad_norm": 0.6554040908813477, "learning_rate": 5.542887657710013e-06, "loss": 0.2333, "num_input_tokens_seen": 20397744, "step": 33445 }, { "epoch": 10.378529320508843, "grad_norm": 0.9006314277648926, "learning_rate": 5.541541833577823e-06, "loss": 0.2322, "num_input_tokens_seen": 20400112, "step": 33450 }, { "epoch": 10.380080670183059, "grad_norm": 0.3119596242904663, "learning_rate": 5.540195969744204e-06, "loss": 0.2256, "num_input_tokens_seen": 20402416, "step": 33455 }, { "epoch": 10.381632019857276, "grad_norm": 0.5615270733833313, "learning_rate": 5.538850066307824e-06, "loss": 0.2312, "num_input_tokens_seen": 20405200, "step": 33460 }, { "epoch": 10.383183369531492, "grad_norm": 0.5466188192367554, "learning_rate": 5.537504123367353e-06, "loss": 0.2319, "num_input_tokens_seen": 20408592, "step": 33465 }, { "epoch": 10.384734719205708, "grad_norm": 0.6302344799041748, "learning_rate": 5.536158141021465e-06, "loss": 0.2246, "num_input_tokens_seen": 20410928, "step": 33470 }, { "epoch": 10.386286068879926, "grad_norm": 0.4851839542388916, "learning_rate": 5.534812119368837e-06, "loss": 0.2371, "num_input_tokens_seen": 20413328, "step": 33475 }, { "epoch": 10.387837418554142, "grad_norm": 0.7626573443412781, "learning_rate": 5.533466058508146e-06, "loss": 0.2199, "num_input_tokens_seen": 20416240, "step": 33480 }, { "epoch": 10.38938876822836, "grad_norm": 0.5061626434326172, "learning_rate": 5.532119958538076e-06, "loss": 0.2245, "num_input_tokens_seen": 20418352, "step": 33485 }, { "epoch": 10.390940117902575, "grad_norm": 0.5308452844619751, "learning_rate": 5.530773819557313e-06, "loss": 0.2214, "num_input_tokens_seen": 20421840, "step": 33490 }, { "epoch": 10.392491467576791, "grad_norm": 0.6176546216011047, "learning_rate": 5.529427641664542e-06, "loss": 0.2231, "num_input_tokens_seen": 20425840, "step": 33495 }, { "epoch": 10.394042817251009, "grad_norm": 0.8494701981544495, "learning_rate": 5.528081424958456e-06, "loss": 0.2459, "num_input_tokens_seen": 20429232, "step": 33500 }, { "epoch": 10.395594166925225, "grad_norm": 0.7495899796485901, "learning_rate": 5.526735169537749e-06, "loss": 0.2292, "num_input_tokens_seen": 20431568, "step": 33505 }, { "epoch": 10.39714551659944, "grad_norm": 1.0165690183639526, "learning_rate": 5.525388875501114e-06, "loss": 0.2192, "num_input_tokens_seen": 20435696, "step": 33510 }, { "epoch": 10.398696866273658, "grad_norm": 0.777092456817627, "learning_rate": 5.524042542947256e-06, "loss": 0.2357, "num_input_tokens_seen": 20437936, "step": 33515 }, { "epoch": 10.400248215947874, "grad_norm": 0.4971836805343628, "learning_rate": 5.5226961719748725e-06, "loss": 0.2362, "num_input_tokens_seen": 20440080, "step": 33520 }, { "epoch": 10.401799565622092, "grad_norm": 0.934904158115387, "learning_rate": 5.52134976268267e-06, "loss": 0.2286, "num_input_tokens_seen": 20442448, "step": 33525 }, { "epoch": 10.403350915296308, "grad_norm": 0.324186772108078, "learning_rate": 5.5200033151693566e-06, "loss": 0.2271, "num_input_tokens_seen": 20445328, "step": 33530 }, { "epoch": 10.404902264970524, "grad_norm": 0.7137240171432495, "learning_rate": 5.518656829533643e-06, "loss": 0.2202, "num_input_tokens_seen": 20447728, "step": 33535 }, { "epoch": 10.406453614644741, "grad_norm": 0.7332755327224731, "learning_rate": 5.5173103058742414e-06, "loss": 0.2187, "num_input_tokens_seen": 20451632, "step": 33540 }, { "epoch": 10.408004964318957, "grad_norm": 0.46678590774536133, "learning_rate": 5.515963744289868e-06, "loss": 0.2455, "num_input_tokens_seen": 20455952, "step": 33545 }, { "epoch": 10.409556313993175, "grad_norm": 0.5297486782073975, "learning_rate": 5.514617144879243e-06, "loss": 0.2286, "num_input_tokens_seen": 20458352, "step": 33550 }, { "epoch": 10.41110766366739, "grad_norm": 0.6296843886375427, "learning_rate": 5.513270507741086e-06, "loss": 0.2237, "num_input_tokens_seen": 20461392, "step": 33555 }, { "epoch": 10.412659013341607, "grad_norm": 0.4431554079055786, "learning_rate": 5.511923832974124e-06, "loss": 0.2249, "num_input_tokens_seen": 20464656, "step": 33560 }, { "epoch": 10.414210363015824, "grad_norm": 0.5510021448135376, "learning_rate": 5.510577120677083e-06, "loss": 0.2345, "num_input_tokens_seen": 20467920, "step": 33565 }, { "epoch": 10.41576171269004, "grad_norm": 0.5130434632301331, "learning_rate": 5.509230370948692e-06, "loss": 0.2345, "num_input_tokens_seen": 20471440, "step": 33570 }, { "epoch": 10.417313062364258, "grad_norm": 0.538968026638031, "learning_rate": 5.507883583887685e-06, "loss": 0.214, "num_input_tokens_seen": 20474800, "step": 33575 }, { "epoch": 10.418864412038474, "grad_norm": 0.37910208106040955, "learning_rate": 5.5065367595927975e-06, "loss": 0.2358, "num_input_tokens_seen": 20477488, "step": 33580 }, { "epoch": 10.42041576171269, "grad_norm": 0.5191695094108582, "learning_rate": 5.505189898162767e-06, "loss": 0.2214, "num_input_tokens_seen": 20480496, "step": 33585 }, { "epoch": 10.421967111386907, "grad_norm": 0.436914324760437, "learning_rate": 5.503842999696335e-06, "loss": 0.2308, "num_input_tokens_seen": 20483632, "step": 33590 }, { "epoch": 10.423518461061123, "grad_norm": 0.44609618186950684, "learning_rate": 5.5024960642922445e-06, "loss": 0.2358, "num_input_tokens_seen": 20486192, "step": 33595 }, { "epoch": 10.425069810735339, "grad_norm": 0.5112714171409607, "learning_rate": 5.501149092049242e-06, "loss": 0.2195, "num_input_tokens_seen": 20489936, "step": 33600 }, { "epoch": 10.426621160409557, "grad_norm": 0.3303934931755066, "learning_rate": 5.499802083066076e-06, "loss": 0.2199, "num_input_tokens_seen": 20493136, "step": 33605 }, { "epoch": 10.428172510083773, "grad_norm": 0.38489043712615967, "learning_rate": 5.498455037441499e-06, "loss": 0.2208, "num_input_tokens_seen": 20495696, "step": 33610 }, { "epoch": 10.42972385975799, "grad_norm": 0.5697932243347168, "learning_rate": 5.4971079552742655e-06, "loss": 0.2218, "num_input_tokens_seen": 20499920, "step": 33615 }, { "epoch": 10.431275209432206, "grad_norm": 0.5280293822288513, "learning_rate": 5.495760836663132e-06, "loss": 0.2189, "num_input_tokens_seen": 20503472, "step": 33620 }, { "epoch": 10.432826559106422, "grad_norm": 0.6648653149604797, "learning_rate": 5.494413681706859e-06, "loss": 0.226, "num_input_tokens_seen": 20507536, "step": 33625 }, { "epoch": 10.43437790878064, "grad_norm": 0.5890928506851196, "learning_rate": 5.4930664905042076e-06, "loss": 0.2341, "num_input_tokens_seen": 20510096, "step": 33630 }, { "epoch": 10.435929258454856, "grad_norm": 0.6990231275558472, "learning_rate": 5.491719263153944e-06, "loss": 0.2254, "num_input_tokens_seen": 20514000, "step": 33635 }, { "epoch": 10.437480608129071, "grad_norm": 0.5051277279853821, "learning_rate": 5.490371999754834e-06, "loss": 0.2296, "num_input_tokens_seen": 20517680, "step": 33640 }, { "epoch": 10.439031957803289, "grad_norm": 0.8261251449584961, "learning_rate": 5.489024700405652e-06, "loss": 0.2264, "num_input_tokens_seen": 20520784, "step": 33645 }, { "epoch": 10.440583307477505, "grad_norm": 0.6915192604064941, "learning_rate": 5.487677365205167e-06, "loss": 0.225, "num_input_tokens_seen": 20523312, "step": 33650 }, { "epoch": 10.442134657151723, "grad_norm": 0.5822247266769409, "learning_rate": 5.486329994252157e-06, "loss": 0.2262, "num_input_tokens_seen": 20525648, "step": 33655 }, { "epoch": 10.443686006825939, "grad_norm": 0.492510050535202, "learning_rate": 5.4849825876454e-06, "loss": 0.2152, "num_input_tokens_seen": 20530032, "step": 33660 }, { "epoch": 10.445237356500154, "grad_norm": 0.8738226294517517, "learning_rate": 5.4836351454836755e-06, "loss": 0.2242, "num_input_tokens_seen": 20534352, "step": 33665 }, { "epoch": 10.446788706174372, "grad_norm": 0.5327644348144531, "learning_rate": 5.482287667865768e-06, "loss": 0.2305, "num_input_tokens_seen": 20537488, "step": 33670 }, { "epoch": 10.448340055848588, "grad_norm": 0.48589178919792175, "learning_rate": 5.480940154890463e-06, "loss": 0.2409, "num_input_tokens_seen": 20540080, "step": 33675 }, { "epoch": 10.449891405522806, "grad_norm": 0.7933342456817627, "learning_rate": 5.479592606656549e-06, "loss": 0.2419, "num_input_tokens_seen": 20543088, "step": 33680 }, { "epoch": 10.451442755197021, "grad_norm": 0.6391341686248779, "learning_rate": 5.478245023262819e-06, "loss": 0.2232, "num_input_tokens_seen": 20546256, "step": 33685 }, { "epoch": 10.452994104871237, "grad_norm": 0.5452823042869568, "learning_rate": 5.476897404808065e-06, "loss": 0.224, "num_input_tokens_seen": 20549616, "step": 33690 }, { "epoch": 10.454545454545455, "grad_norm": 0.8007177710533142, "learning_rate": 5.475549751391084e-06, "loss": 0.2199, "num_input_tokens_seen": 20554384, "step": 33695 }, { "epoch": 10.456096804219671, "grad_norm": 0.5714091658592224, "learning_rate": 5.474202063110673e-06, "loss": 0.2311, "num_input_tokens_seen": 20557232, "step": 33700 }, { "epoch": 10.457648153893889, "grad_norm": 0.7095460295677185, "learning_rate": 5.472854340065637e-06, "loss": 0.2246, "num_input_tokens_seen": 20560304, "step": 33705 }, { "epoch": 10.459199503568104, "grad_norm": 0.517509400844574, "learning_rate": 5.4715065823547785e-06, "loss": 0.2335, "num_input_tokens_seen": 20563824, "step": 33710 }, { "epoch": 10.46075085324232, "grad_norm": 0.3214600384235382, "learning_rate": 5.470158790076903e-06, "loss": 0.2184, "num_input_tokens_seen": 20566608, "step": 33715 }, { "epoch": 10.462302202916538, "grad_norm": 0.44665563106536865, "learning_rate": 5.468810963330823e-06, "loss": 0.2284, "num_input_tokens_seen": 20569360, "step": 33720 }, { "epoch": 10.463853552590754, "grad_norm": 0.5948283076286316, "learning_rate": 5.467463102215347e-06, "loss": 0.2323, "num_input_tokens_seen": 20572656, "step": 33725 }, { "epoch": 10.46540490226497, "grad_norm": 0.587365984916687, "learning_rate": 5.466115206829289e-06, "loss": 0.2266, "num_input_tokens_seen": 20575376, "step": 33730 }, { "epoch": 10.466956251939187, "grad_norm": 0.6698169112205505, "learning_rate": 5.464767277271468e-06, "loss": 0.2367, "num_input_tokens_seen": 20579056, "step": 33735 }, { "epoch": 10.468507601613403, "grad_norm": 0.2761770188808441, "learning_rate": 5.463419313640701e-06, "loss": 0.2312, "num_input_tokens_seen": 20581392, "step": 33740 }, { "epoch": 10.470058951287621, "grad_norm": 0.933348536491394, "learning_rate": 5.462071316035811e-06, "loss": 0.2087, "num_input_tokens_seen": 20584880, "step": 33745 }, { "epoch": 10.471610300961837, "grad_norm": 0.5556358695030212, "learning_rate": 5.460723284555622e-06, "loss": 0.2311, "num_input_tokens_seen": 20587920, "step": 33750 }, { "epoch": 10.473161650636053, "grad_norm": 0.6413889527320862, "learning_rate": 5.459375219298959e-06, "loss": 0.2301, "num_input_tokens_seen": 20590544, "step": 33755 }, { "epoch": 10.47471300031027, "grad_norm": 0.6766295433044434, "learning_rate": 5.458027120364653e-06, "loss": 0.2221, "num_input_tokens_seen": 20592816, "step": 33760 }, { "epoch": 10.476264349984486, "grad_norm": 0.7055283188819885, "learning_rate": 5.456678987851536e-06, "loss": 0.2318, "num_input_tokens_seen": 20595952, "step": 33765 }, { "epoch": 10.477815699658702, "grad_norm": 0.5474059581756592, "learning_rate": 5.45533082185844e-06, "loss": 0.2236, "num_input_tokens_seen": 20598704, "step": 33770 }, { "epoch": 10.47936704933292, "grad_norm": 0.48577430844306946, "learning_rate": 5.453982622484204e-06, "loss": 0.2313, "num_input_tokens_seen": 20601616, "step": 33775 }, { "epoch": 10.480918399007136, "grad_norm": 0.5314834713935852, "learning_rate": 5.452634389827667e-06, "loss": 0.2217, "num_input_tokens_seen": 20604688, "step": 33780 }, { "epoch": 10.482469748681353, "grad_norm": 0.5154004693031311, "learning_rate": 5.451286123987667e-06, "loss": 0.2318, "num_input_tokens_seen": 20607760, "step": 33785 }, { "epoch": 10.48402109835557, "grad_norm": 0.5514765977859497, "learning_rate": 5.449937825063052e-06, "loss": 0.2254, "num_input_tokens_seen": 20610512, "step": 33790 }, { "epoch": 10.485572448029785, "grad_norm": 0.5870792865753174, "learning_rate": 5.448589493152665e-06, "loss": 0.2199, "num_input_tokens_seen": 20613488, "step": 33795 }, { "epoch": 10.487123797704003, "grad_norm": 0.670647919178009, "learning_rate": 5.447241128355356e-06, "loss": 0.2198, "num_input_tokens_seen": 20616784, "step": 33800 }, { "epoch": 10.488675147378219, "grad_norm": 0.6616703271865845, "learning_rate": 5.445892730769977e-06, "loss": 0.2356, "num_input_tokens_seen": 20619408, "step": 33805 }, { "epoch": 10.490226497052436, "grad_norm": 0.6901629567146301, "learning_rate": 5.44454430049538e-06, "loss": 0.2325, "num_input_tokens_seen": 20622608, "step": 33810 }, { "epoch": 10.491777846726652, "grad_norm": 0.7515869736671448, "learning_rate": 5.443195837630423e-06, "loss": 0.2251, "num_input_tokens_seen": 20626448, "step": 33815 }, { "epoch": 10.493329196400868, "grad_norm": 0.5526338219642639, "learning_rate": 5.44184734227396e-06, "loss": 0.2275, "num_input_tokens_seen": 20628976, "step": 33820 }, { "epoch": 10.494880546075086, "grad_norm": 0.5918685793876648, "learning_rate": 5.440498814524857e-06, "loss": 0.2024, "num_input_tokens_seen": 20633040, "step": 33825 }, { "epoch": 10.496431895749302, "grad_norm": 0.6462839245796204, "learning_rate": 5.439150254481974e-06, "loss": 0.2238, "num_input_tokens_seen": 20637232, "step": 33830 }, { "epoch": 10.49798324542352, "grad_norm": 1.2048405408859253, "learning_rate": 5.437801662244177e-06, "loss": 0.2276, "num_input_tokens_seen": 20639760, "step": 33835 }, { "epoch": 10.499534595097735, "grad_norm": 0.8205052614212036, "learning_rate": 5.4364530379103344e-06, "loss": 0.2247, "num_input_tokens_seen": 20642064, "step": 33840 }, { "epoch": 10.501085944771951, "grad_norm": 0.681046187877655, "learning_rate": 5.435104381579315e-06, "loss": 0.2458, "num_input_tokens_seen": 20644144, "step": 33845 }, { "epoch": 10.502637294446169, "grad_norm": 0.9654083251953125, "learning_rate": 5.433755693349991e-06, "loss": 0.2243, "num_input_tokens_seen": 20647248, "step": 33850 }, { "epoch": 10.504188644120385, "grad_norm": 0.6665304899215698, "learning_rate": 5.43240697332124e-06, "loss": 0.2176, "num_input_tokens_seen": 20650032, "step": 33855 }, { "epoch": 10.5057399937946, "grad_norm": 0.6053235530853271, "learning_rate": 5.4310582215919375e-06, "loss": 0.2241, "num_input_tokens_seen": 20653232, "step": 33860 }, { "epoch": 10.507291343468818, "grad_norm": 0.9951511025428772, "learning_rate": 5.429709438260962e-06, "loss": 0.2302, "num_input_tokens_seen": 20656528, "step": 33865 }, { "epoch": 10.508842693143034, "grad_norm": 0.752460777759552, "learning_rate": 5.4283606234271955e-06, "loss": 0.2206, "num_input_tokens_seen": 20658768, "step": 33870 }, { "epoch": 10.510394042817252, "grad_norm": 1.0640850067138672, "learning_rate": 5.427011777189525e-06, "loss": 0.2285, "num_input_tokens_seen": 20661328, "step": 33875 }, { "epoch": 10.511945392491468, "grad_norm": 0.5314187407493591, "learning_rate": 5.425662899646832e-06, "loss": 0.2365, "num_input_tokens_seen": 20663568, "step": 33880 }, { "epoch": 10.513496742165684, "grad_norm": 1.1029306650161743, "learning_rate": 5.424313990898009e-06, "loss": 0.2261, "num_input_tokens_seen": 20666416, "step": 33885 }, { "epoch": 10.515048091839901, "grad_norm": 0.7604422569274902, "learning_rate": 5.422965051041945e-06, "loss": 0.2383, "num_input_tokens_seen": 20668848, "step": 33890 }, { "epoch": 10.516599441514117, "grad_norm": 0.8175315856933594, "learning_rate": 5.421616080177535e-06, "loss": 0.23, "num_input_tokens_seen": 20671824, "step": 33895 }, { "epoch": 10.518150791188333, "grad_norm": 0.8651338815689087, "learning_rate": 5.4202670784036745e-06, "loss": 0.2463, "num_input_tokens_seen": 20674288, "step": 33900 }, { "epoch": 10.51970214086255, "grad_norm": 0.7297122478485107, "learning_rate": 5.418918045819259e-06, "loss": 0.2317, "num_input_tokens_seen": 20677168, "step": 33905 }, { "epoch": 10.521253490536767, "grad_norm": 0.8062557578086853, "learning_rate": 5.417568982523191e-06, "loss": 0.2208, "num_input_tokens_seen": 20680336, "step": 33910 }, { "epoch": 10.522804840210984, "grad_norm": 1.1904866695404053, "learning_rate": 5.416219888614372e-06, "loss": 0.2266, "num_input_tokens_seen": 20683760, "step": 33915 }, { "epoch": 10.5243561898852, "grad_norm": 0.6824865341186523, "learning_rate": 5.4148707641917054e-06, "loss": 0.2354, "num_input_tokens_seen": 20686832, "step": 33920 }, { "epoch": 10.525907539559416, "grad_norm": 0.5892631411552429, "learning_rate": 5.413521609354099e-06, "loss": 0.2145, "num_input_tokens_seen": 20689680, "step": 33925 }, { "epoch": 10.527458889233634, "grad_norm": 0.8771490454673767, "learning_rate": 5.412172424200462e-06, "loss": 0.2229, "num_input_tokens_seen": 20692528, "step": 33930 }, { "epoch": 10.52901023890785, "grad_norm": 0.8206179141998291, "learning_rate": 5.4108232088297055e-06, "loss": 0.2191, "num_input_tokens_seen": 20695568, "step": 33935 }, { "epoch": 10.530561588582067, "grad_norm": 1.2330433130264282, "learning_rate": 5.409473963340744e-06, "loss": 0.22, "num_input_tokens_seen": 20698032, "step": 33940 }, { "epoch": 10.532112938256283, "grad_norm": 1.6696439981460571, "learning_rate": 5.40812468783249e-06, "loss": 0.2346, "num_input_tokens_seen": 20701360, "step": 33945 }, { "epoch": 10.533664287930499, "grad_norm": 0.9850446581840515, "learning_rate": 5.406775382403865e-06, "loss": 0.2409, "num_input_tokens_seen": 20703856, "step": 33950 }, { "epoch": 10.535215637604717, "grad_norm": 1.0522966384887695, "learning_rate": 5.4054260471537866e-06, "loss": 0.2277, "num_input_tokens_seen": 20706768, "step": 33955 }, { "epoch": 10.536766987278932, "grad_norm": 1.2313599586486816, "learning_rate": 5.404076682181178e-06, "loss": 0.2318, "num_input_tokens_seen": 20709648, "step": 33960 }, { "epoch": 10.53831833695315, "grad_norm": 0.9408629536628723, "learning_rate": 5.402727287584965e-06, "loss": 0.2205, "num_input_tokens_seen": 20712176, "step": 33965 }, { "epoch": 10.539869686627366, "grad_norm": 0.7483890056610107, "learning_rate": 5.401377863464071e-06, "loss": 0.2172, "num_input_tokens_seen": 20714608, "step": 33970 }, { "epoch": 10.541421036301582, "grad_norm": 1.4461264610290527, "learning_rate": 5.400028409917428e-06, "loss": 0.2456, "num_input_tokens_seen": 20717296, "step": 33975 }, { "epoch": 10.5429723859758, "grad_norm": 0.4299512505531311, "learning_rate": 5.398678927043965e-06, "loss": 0.2254, "num_input_tokens_seen": 20720080, "step": 33980 }, { "epoch": 10.544523735650015, "grad_norm": 0.7070675492286682, "learning_rate": 5.397329414942616e-06, "loss": 0.2044, "num_input_tokens_seen": 20723504, "step": 33985 }, { "epoch": 10.546075085324231, "grad_norm": 0.9494454860687256, "learning_rate": 5.3959798737123156e-06, "loss": 0.233, "num_input_tokens_seen": 20726096, "step": 33990 }, { "epoch": 10.547626434998449, "grad_norm": 0.6734031438827515, "learning_rate": 5.394630303452001e-06, "loss": 0.2378, "num_input_tokens_seen": 20728752, "step": 33995 }, { "epoch": 10.549177784672665, "grad_norm": 0.5626466274261475, "learning_rate": 5.3932807042606125e-06, "loss": 0.2421, "num_input_tokens_seen": 20731664, "step": 34000 }, { "epoch": 10.550729134346883, "grad_norm": 0.6958081126213074, "learning_rate": 5.391931076237091e-06, "loss": 0.2248, "num_input_tokens_seen": 20734800, "step": 34005 }, { "epoch": 10.552280484021098, "grad_norm": 1.0398404598236084, "learning_rate": 5.3905814194803805e-06, "loss": 0.2237, "num_input_tokens_seen": 20738800, "step": 34010 }, { "epoch": 10.553831833695314, "grad_norm": 0.7778611779212952, "learning_rate": 5.3892317340894265e-06, "loss": 0.2245, "num_input_tokens_seen": 20741232, "step": 34015 }, { "epoch": 10.555383183369532, "grad_norm": 0.4870353639125824, "learning_rate": 5.387882020163177e-06, "loss": 0.2177, "num_input_tokens_seen": 20744304, "step": 34020 }, { "epoch": 10.556934533043748, "grad_norm": 0.8439188003540039, "learning_rate": 5.386532277800583e-06, "loss": 0.2219, "num_input_tokens_seen": 20747696, "step": 34025 }, { "epoch": 10.558485882717964, "grad_norm": 0.5736650824546814, "learning_rate": 5.385182507100594e-06, "loss": 0.2196, "num_input_tokens_seen": 20749648, "step": 34030 }, { "epoch": 10.560037232392181, "grad_norm": 0.8013983964920044, "learning_rate": 5.383832708162169e-06, "loss": 0.2125, "num_input_tokens_seen": 20753200, "step": 34035 }, { "epoch": 10.561588582066397, "grad_norm": 0.9581590294837952, "learning_rate": 5.382482881084259e-06, "loss": 0.2292, "num_input_tokens_seen": 20757904, "step": 34040 }, { "epoch": 10.563139931740615, "grad_norm": 0.9005267024040222, "learning_rate": 5.381133025965826e-06, "loss": 0.2208, "num_input_tokens_seen": 20761168, "step": 34045 }, { "epoch": 10.56469128141483, "grad_norm": 0.8253068923950195, "learning_rate": 5.379783142905828e-06, "loss": 0.2308, "num_input_tokens_seen": 20765968, "step": 34050 }, { "epoch": 10.566242631089047, "grad_norm": 0.7131008505821228, "learning_rate": 5.378433232003228e-06, "loss": 0.2328, "num_input_tokens_seen": 20769584, "step": 34055 }, { "epoch": 10.567793980763264, "grad_norm": 0.6988634467124939, "learning_rate": 5.377083293356991e-06, "loss": 0.2195, "num_input_tokens_seen": 20772624, "step": 34060 }, { "epoch": 10.56934533043748, "grad_norm": 0.9125916957855225, "learning_rate": 5.375733327066084e-06, "loss": 0.2279, "num_input_tokens_seen": 20776144, "step": 34065 }, { "epoch": 10.570896680111698, "grad_norm": 0.9222729802131653, "learning_rate": 5.374383333229474e-06, "loss": 0.2376, "num_input_tokens_seen": 20779344, "step": 34070 }, { "epoch": 10.572448029785914, "grad_norm": 0.868998646736145, "learning_rate": 5.373033311946132e-06, "loss": 0.2307, "num_input_tokens_seen": 20782320, "step": 34075 }, { "epoch": 10.57399937946013, "grad_norm": 1.0767319202423096, "learning_rate": 5.371683263315031e-06, "loss": 0.2129, "num_input_tokens_seen": 20784976, "step": 34080 }, { "epoch": 10.575550729134347, "grad_norm": 1.200042963027954, "learning_rate": 5.370333187435146e-06, "loss": 0.2338, "num_input_tokens_seen": 20788176, "step": 34085 }, { "epoch": 10.577102078808563, "grad_norm": 0.5613723397254944, "learning_rate": 5.3689830844054525e-06, "loss": 0.2378, "num_input_tokens_seen": 20792208, "step": 34090 }, { "epoch": 10.578653428482781, "grad_norm": 0.9578118920326233, "learning_rate": 5.36763295432493e-06, "loss": 0.2379, "num_input_tokens_seen": 20795248, "step": 34095 }, { "epoch": 10.580204778156997, "grad_norm": 0.7361347675323486, "learning_rate": 5.366282797292558e-06, "loss": 0.2363, "num_input_tokens_seen": 20797424, "step": 34100 }, { "epoch": 10.581756127831213, "grad_norm": 1.503147006034851, "learning_rate": 5.36493261340732e-06, "loss": 0.2481, "num_input_tokens_seen": 20800464, "step": 34105 }, { "epoch": 10.58330747750543, "grad_norm": 1.443557858467102, "learning_rate": 5.3635824027682005e-06, "loss": 0.2381, "num_input_tokens_seen": 20805104, "step": 34110 }, { "epoch": 10.584858827179646, "grad_norm": 0.6850069761276245, "learning_rate": 5.362232165474185e-06, "loss": 0.2272, "num_input_tokens_seen": 20807440, "step": 34115 }, { "epoch": 10.586410176853862, "grad_norm": 0.960291862487793, "learning_rate": 5.360881901624262e-06, "loss": 0.2226, "num_input_tokens_seen": 20811568, "step": 34120 }, { "epoch": 10.58796152652808, "grad_norm": 0.5721745491027832, "learning_rate": 5.359531611317421e-06, "loss": 0.2258, "num_input_tokens_seen": 20813808, "step": 34125 }, { "epoch": 10.589512876202296, "grad_norm": 0.4133159816265106, "learning_rate": 5.358181294652658e-06, "loss": 0.2316, "num_input_tokens_seen": 20816432, "step": 34130 }, { "epoch": 10.591064225876513, "grad_norm": 0.7945817112922668, "learning_rate": 5.356830951728962e-06, "loss": 0.2321, "num_input_tokens_seen": 20819856, "step": 34135 }, { "epoch": 10.59261557555073, "grad_norm": 0.4494301974773407, "learning_rate": 5.355480582645332e-06, "loss": 0.21, "num_input_tokens_seen": 20822672, "step": 34140 }, { "epoch": 10.594166925224945, "grad_norm": 0.717157244682312, "learning_rate": 5.354130187500766e-06, "loss": 0.2083, "num_input_tokens_seen": 20826608, "step": 34145 }, { "epoch": 10.595718274899163, "grad_norm": 0.8161299824714661, "learning_rate": 5.352779766394266e-06, "loss": 0.2112, "num_input_tokens_seen": 20829552, "step": 34150 }, { "epoch": 10.597269624573379, "grad_norm": 0.8843263983726501, "learning_rate": 5.3514293194248305e-06, "loss": 0.2252, "num_input_tokens_seen": 20833104, "step": 34155 }, { "epoch": 10.598820974247595, "grad_norm": 0.9715953469276428, "learning_rate": 5.350078846691463e-06, "loss": 0.2265, "num_input_tokens_seen": 20835536, "step": 34160 }, { "epoch": 10.600372323921812, "grad_norm": 0.8379912972450256, "learning_rate": 5.348728348293172e-06, "loss": 0.234, "num_input_tokens_seen": 20838352, "step": 34165 }, { "epoch": 10.601923673596028, "grad_norm": 0.9679178595542908, "learning_rate": 5.347377824328962e-06, "loss": 0.2184, "num_input_tokens_seen": 20840880, "step": 34170 }, { "epoch": 10.603475023270246, "grad_norm": 0.6905739307403564, "learning_rate": 5.3460272748978445e-06, "loss": 0.2309, "num_input_tokens_seen": 20844304, "step": 34175 }, { "epoch": 10.605026372944462, "grad_norm": 0.9311623573303223, "learning_rate": 5.3446767000988306e-06, "loss": 0.2299, "num_input_tokens_seen": 20847920, "step": 34180 }, { "epoch": 10.606577722618677, "grad_norm": 0.8738739490509033, "learning_rate": 5.343326100030933e-06, "loss": 0.2349, "num_input_tokens_seen": 20850896, "step": 34185 }, { "epoch": 10.608129072292895, "grad_norm": 0.801896870136261, "learning_rate": 5.341975474793166e-06, "loss": 0.2142, "num_input_tokens_seen": 20853488, "step": 34190 }, { "epoch": 10.609680421967111, "grad_norm": 0.6851862668991089, "learning_rate": 5.340624824484547e-06, "loss": 0.2371, "num_input_tokens_seen": 20856304, "step": 34195 }, { "epoch": 10.611231771641329, "grad_norm": 1.539109468460083, "learning_rate": 5.339274149204096e-06, "loss": 0.2134, "num_input_tokens_seen": 20858384, "step": 34200 }, { "epoch": 10.612783121315545, "grad_norm": 1.5491548776626587, "learning_rate": 5.337923449050831e-06, "loss": 0.2116, "num_input_tokens_seen": 20861808, "step": 34205 }, { "epoch": 10.61433447098976, "grad_norm": 0.8870126605033875, "learning_rate": 5.3365727241237765e-06, "loss": 0.2494, "num_input_tokens_seen": 20864496, "step": 34210 }, { "epoch": 10.615885820663978, "grad_norm": 1.0879019498825073, "learning_rate": 5.335221974521953e-06, "loss": 0.2139, "num_input_tokens_seen": 20867632, "step": 34215 }, { "epoch": 10.617437170338194, "grad_norm": 0.649893045425415, "learning_rate": 5.333871200344393e-06, "loss": 0.2481, "num_input_tokens_seen": 20870928, "step": 34220 }, { "epoch": 10.618988520012412, "grad_norm": 0.7667312622070312, "learning_rate": 5.332520401690119e-06, "loss": 0.229, "num_input_tokens_seen": 20874256, "step": 34225 }, { "epoch": 10.620539869686628, "grad_norm": 1.6979938745498657, "learning_rate": 5.331169578658162e-06, "loss": 0.2298, "num_input_tokens_seen": 20877648, "step": 34230 }, { "epoch": 10.622091219360843, "grad_norm": 1.3815590143203735, "learning_rate": 5.3298187313475525e-06, "loss": 0.2244, "num_input_tokens_seen": 20881392, "step": 34235 }, { "epoch": 10.623642569035061, "grad_norm": 0.9293661713600159, "learning_rate": 5.328467859857325e-06, "loss": 0.2343, "num_input_tokens_seen": 20884592, "step": 34240 }, { "epoch": 10.625193918709277, "grad_norm": 0.7070670127868652, "learning_rate": 5.327116964286514e-06, "loss": 0.2185, "num_input_tokens_seen": 20887440, "step": 34245 }, { "epoch": 10.626745268383493, "grad_norm": 0.581173300743103, "learning_rate": 5.325766044734156e-06, "loss": 0.2214, "num_input_tokens_seen": 20890896, "step": 34250 }, { "epoch": 10.62829661805771, "grad_norm": 0.8915934562683105, "learning_rate": 5.324415101299289e-06, "loss": 0.2369, "num_input_tokens_seen": 20893840, "step": 34255 }, { "epoch": 10.629847967731926, "grad_norm": 0.6021586656570435, "learning_rate": 5.323064134080953e-06, "loss": 0.2411, "num_input_tokens_seen": 20896688, "step": 34260 }, { "epoch": 10.631399317406144, "grad_norm": 0.6715231537818909, "learning_rate": 5.32171314317819e-06, "loss": 0.2288, "num_input_tokens_seen": 20900176, "step": 34265 }, { "epoch": 10.63295066708036, "grad_norm": 1.2314785718917847, "learning_rate": 5.3203621286900445e-06, "loss": 0.2255, "num_input_tokens_seen": 20905616, "step": 34270 }, { "epoch": 10.634502016754576, "grad_norm": 0.8333666920661926, "learning_rate": 5.31901109071556e-06, "loss": 0.2392, "num_input_tokens_seen": 20908208, "step": 34275 }, { "epoch": 10.636053366428794, "grad_norm": 0.6861802935600281, "learning_rate": 5.3176600293537875e-06, "loss": 0.2472, "num_input_tokens_seen": 20910480, "step": 34280 }, { "epoch": 10.63760471610301, "grad_norm": 0.733771562576294, "learning_rate": 5.3163089447037705e-06, "loss": 0.2321, "num_input_tokens_seen": 20913168, "step": 34285 }, { "epoch": 10.639156065777225, "grad_norm": 0.7044857740402222, "learning_rate": 5.314957836864565e-06, "loss": 0.2347, "num_input_tokens_seen": 20917936, "step": 34290 }, { "epoch": 10.640707415451443, "grad_norm": 0.952197790145874, "learning_rate": 5.313606705935218e-06, "loss": 0.2198, "num_input_tokens_seen": 20920688, "step": 34295 }, { "epoch": 10.642258765125659, "grad_norm": 0.7002281546592712, "learning_rate": 5.312255552014786e-06, "loss": 0.2355, "num_input_tokens_seen": 20923280, "step": 34300 }, { "epoch": 10.643810114799876, "grad_norm": 0.7137023210525513, "learning_rate": 5.310904375202326e-06, "loss": 0.2161, "num_input_tokens_seen": 20925840, "step": 34305 }, { "epoch": 10.645361464474092, "grad_norm": 0.4805293679237366, "learning_rate": 5.309553175596893e-06, "loss": 0.2334, "num_input_tokens_seen": 20928976, "step": 34310 }, { "epoch": 10.646912814148308, "grad_norm": 0.547558069229126, "learning_rate": 5.3082019532975475e-06, "loss": 0.2245, "num_input_tokens_seen": 20931664, "step": 34315 }, { "epoch": 10.648464163822526, "grad_norm": 0.7841506004333496, "learning_rate": 5.306850708403347e-06, "loss": 0.2146, "num_input_tokens_seen": 20933744, "step": 34320 }, { "epoch": 10.650015513496742, "grad_norm": 0.6370704770088196, "learning_rate": 5.305499441013358e-06, "loss": 0.2319, "num_input_tokens_seen": 20937232, "step": 34325 }, { "epoch": 10.65156686317096, "grad_norm": 0.5287631750106812, "learning_rate": 5.30414815122664e-06, "loss": 0.2215, "num_input_tokens_seen": 20940208, "step": 34330 }, { "epoch": 10.653118212845175, "grad_norm": 1.246646761894226, "learning_rate": 5.302796839142261e-06, "loss": 0.2403, "num_input_tokens_seen": 20942640, "step": 34335 }, { "epoch": 10.654669562519391, "grad_norm": 0.6794112324714661, "learning_rate": 5.301445504859291e-06, "loss": 0.2312, "num_input_tokens_seen": 20945680, "step": 34340 }, { "epoch": 10.656220912193609, "grad_norm": 0.63516765832901, "learning_rate": 5.300094148476793e-06, "loss": 0.2316, "num_input_tokens_seen": 20948528, "step": 34345 }, { "epoch": 10.657772261867825, "grad_norm": 0.7219563126564026, "learning_rate": 5.298742770093841e-06, "loss": 0.2385, "num_input_tokens_seen": 20951472, "step": 34350 }, { "epoch": 10.659323611542042, "grad_norm": 1.4615132808685303, "learning_rate": 5.297391369809507e-06, "loss": 0.2135, "num_input_tokens_seen": 20954384, "step": 34355 }, { "epoch": 10.660874961216258, "grad_norm": 0.43577736616134644, "learning_rate": 5.296039947722864e-06, "loss": 0.233, "num_input_tokens_seen": 20956752, "step": 34360 }, { "epoch": 10.662426310890474, "grad_norm": 0.5821143388748169, "learning_rate": 5.294688503932986e-06, "loss": 0.2045, "num_input_tokens_seen": 20960240, "step": 34365 }, { "epoch": 10.663977660564692, "grad_norm": 0.5740541219711304, "learning_rate": 5.293337038538952e-06, "loss": 0.21, "num_input_tokens_seen": 20963504, "step": 34370 }, { "epoch": 10.665529010238908, "grad_norm": 1.2776970863342285, "learning_rate": 5.29198555163984e-06, "loss": 0.2201, "num_input_tokens_seen": 20967152, "step": 34375 }, { "epoch": 10.667080359913124, "grad_norm": 1.0603113174438477, "learning_rate": 5.290634043334728e-06, "loss": 0.2137, "num_input_tokens_seen": 20971248, "step": 34380 }, { "epoch": 10.668631709587341, "grad_norm": 1.059391736984253, "learning_rate": 5.2892825137227e-06, "loss": 0.246, "num_input_tokens_seen": 20974640, "step": 34385 }, { "epoch": 10.670183059261557, "grad_norm": 0.7815641164779663, "learning_rate": 5.287930962902836e-06, "loss": 0.2225, "num_input_tokens_seen": 20977552, "step": 34390 }, { "epoch": 10.671734408935775, "grad_norm": 0.887148916721344, "learning_rate": 5.2865793909742254e-06, "loss": 0.2281, "num_input_tokens_seen": 20980176, "step": 34395 }, { "epoch": 10.67328575860999, "grad_norm": 1.033233642578125, "learning_rate": 5.2852277980359515e-06, "loss": 0.2148, "num_input_tokens_seen": 20982896, "step": 34400 }, { "epoch": 10.674837108284207, "grad_norm": 0.7444972395896912, "learning_rate": 5.283876184187102e-06, "loss": 0.2323, "num_input_tokens_seen": 20986256, "step": 34405 }, { "epoch": 10.676388457958424, "grad_norm": 1.5282703638076782, "learning_rate": 5.282524549526766e-06, "loss": 0.2296, "num_input_tokens_seen": 20989104, "step": 34410 }, { "epoch": 10.67793980763264, "grad_norm": 1.3506611585617065, "learning_rate": 5.281172894154036e-06, "loss": 0.2262, "num_input_tokens_seen": 20992336, "step": 34415 }, { "epoch": 10.679491157306856, "grad_norm": 0.905705451965332, "learning_rate": 5.279821218168003e-06, "loss": 0.2206, "num_input_tokens_seen": 20994640, "step": 34420 }, { "epoch": 10.681042506981074, "grad_norm": 1.204440712928772, "learning_rate": 5.27846952166776e-06, "loss": 0.2463, "num_input_tokens_seen": 20997712, "step": 34425 }, { "epoch": 10.68259385665529, "grad_norm": 0.654065728187561, "learning_rate": 5.2771178047524054e-06, "loss": 0.2331, "num_input_tokens_seen": 21000880, "step": 34430 }, { "epoch": 10.684145206329507, "grad_norm": 0.6762789487838745, "learning_rate": 5.275766067521033e-06, "loss": 0.2066, "num_input_tokens_seen": 21003760, "step": 34435 }, { "epoch": 10.685696556003723, "grad_norm": 1.3461568355560303, "learning_rate": 5.274414310072742e-06, "loss": 0.2459, "num_input_tokens_seen": 21006864, "step": 34440 }, { "epoch": 10.687247905677939, "grad_norm": 0.9745102524757385, "learning_rate": 5.273062532506633e-06, "loss": 0.2083, "num_input_tokens_seen": 21009488, "step": 34445 }, { "epoch": 10.688799255352157, "grad_norm": 1.0554479360580444, "learning_rate": 5.271710734921806e-06, "loss": 0.2377, "num_input_tokens_seen": 21012272, "step": 34450 }, { "epoch": 10.690350605026373, "grad_norm": 0.5707553625106812, "learning_rate": 5.270358917417366e-06, "loss": 0.2132, "num_input_tokens_seen": 21014800, "step": 34455 }, { "epoch": 10.69190195470059, "grad_norm": 0.7463486194610596, "learning_rate": 5.269007080092414e-06, "loss": 0.2193, "num_input_tokens_seen": 21018608, "step": 34460 }, { "epoch": 10.693453304374806, "grad_norm": 0.9496745467185974, "learning_rate": 5.2676552230460586e-06, "loss": 0.2344, "num_input_tokens_seen": 21021360, "step": 34465 }, { "epoch": 10.695004654049022, "grad_norm": 1.2082154750823975, "learning_rate": 5.266303346377406e-06, "loss": 0.239, "num_input_tokens_seen": 21025232, "step": 34470 }, { "epoch": 10.69655600372324, "grad_norm": 0.986280083656311, "learning_rate": 5.264951450185565e-06, "loss": 0.2185, "num_input_tokens_seen": 21027760, "step": 34475 }, { "epoch": 10.698107353397456, "grad_norm": 0.7569303512573242, "learning_rate": 5.263599534569644e-06, "loss": 0.2158, "num_input_tokens_seen": 21031312, "step": 34480 }, { "epoch": 10.699658703071673, "grad_norm": 1.220332384109497, "learning_rate": 5.262247599628757e-06, "loss": 0.2389, "num_input_tokens_seen": 21034640, "step": 34485 }, { "epoch": 10.701210052745889, "grad_norm": 0.7036747932434082, "learning_rate": 5.260895645462015e-06, "loss": 0.2248, "num_input_tokens_seen": 21037616, "step": 34490 }, { "epoch": 10.702761402420105, "grad_norm": 0.8525975346565247, "learning_rate": 5.259543672168532e-06, "loss": 0.2069, "num_input_tokens_seen": 21040272, "step": 34495 }, { "epoch": 10.704312752094323, "grad_norm": 1.512665867805481, "learning_rate": 5.258191679847425e-06, "loss": 0.271, "num_input_tokens_seen": 21042896, "step": 34500 }, { "epoch": 10.705864101768539, "grad_norm": 0.3981988728046417, "learning_rate": 5.256839668597809e-06, "loss": 0.2196, "num_input_tokens_seen": 21045328, "step": 34505 }, { "epoch": 10.707415451442754, "grad_norm": 1.2713596820831299, "learning_rate": 5.255487638518806e-06, "loss": 0.2405, "num_input_tokens_seen": 21048720, "step": 34510 }, { "epoch": 10.708966801116972, "grad_norm": 0.8476005792617798, "learning_rate": 5.254135589709533e-06, "loss": 0.2225, "num_input_tokens_seen": 21051472, "step": 34515 }, { "epoch": 10.710518150791188, "grad_norm": 0.9357578158378601, "learning_rate": 5.252783522269112e-06, "loss": 0.2134, "num_input_tokens_seen": 21054096, "step": 34520 }, { "epoch": 10.712069500465406, "grad_norm": 1.0386724472045898, "learning_rate": 5.2514314362966635e-06, "loss": 0.2346, "num_input_tokens_seen": 21057456, "step": 34525 }, { "epoch": 10.713620850139622, "grad_norm": 0.7908459901809692, "learning_rate": 5.2500793318913154e-06, "loss": 0.2287, "num_input_tokens_seen": 21060208, "step": 34530 }, { "epoch": 10.715172199813837, "grad_norm": 0.5551729202270508, "learning_rate": 5.24872720915219e-06, "loss": 0.2315, "num_input_tokens_seen": 21063024, "step": 34535 }, { "epoch": 10.716723549488055, "grad_norm": 1.216078758239746, "learning_rate": 5.247375068178415e-06, "loss": 0.2017, "num_input_tokens_seen": 21067152, "step": 34540 }, { "epoch": 10.718274899162271, "grad_norm": 0.9167473912239075, "learning_rate": 5.24602290906912e-06, "loss": 0.223, "num_input_tokens_seen": 21070352, "step": 34545 }, { "epoch": 10.719826248836487, "grad_norm": 0.5689379572868347, "learning_rate": 5.24467073192343e-06, "loss": 0.2177, "num_input_tokens_seen": 21073264, "step": 34550 }, { "epoch": 10.721377598510704, "grad_norm": 0.7962693572044373, "learning_rate": 5.243318536840479e-06, "loss": 0.2352, "num_input_tokens_seen": 21075888, "step": 34555 }, { "epoch": 10.72292894818492, "grad_norm": 1.318468451499939, "learning_rate": 5.241966323919397e-06, "loss": 0.2133, "num_input_tokens_seen": 21078608, "step": 34560 }, { "epoch": 10.724480297859138, "grad_norm": 1.106472373008728, "learning_rate": 5.24061409325932e-06, "loss": 0.2271, "num_input_tokens_seen": 21081136, "step": 34565 }, { "epoch": 10.726031647533354, "grad_norm": 1.027491569519043, "learning_rate": 5.239261844959379e-06, "loss": 0.2289, "num_input_tokens_seen": 21083856, "step": 34570 }, { "epoch": 10.72758299720757, "grad_norm": 0.6577162146568298, "learning_rate": 5.237909579118713e-06, "loss": 0.2272, "num_input_tokens_seen": 21086352, "step": 34575 }, { "epoch": 10.729134346881787, "grad_norm": 1.3618563413619995, "learning_rate": 5.236557295836457e-06, "loss": 0.2236, "num_input_tokens_seen": 21090128, "step": 34580 }, { "epoch": 10.730685696556003, "grad_norm": 0.6965073347091675, "learning_rate": 5.235204995211748e-06, "loss": 0.2066, "num_input_tokens_seen": 21093840, "step": 34585 }, { "epoch": 10.732237046230221, "grad_norm": 0.7820295691490173, "learning_rate": 5.2338526773437295e-06, "loss": 0.229, "num_input_tokens_seen": 21096784, "step": 34590 }, { "epoch": 10.733788395904437, "grad_norm": 0.9680221080780029, "learning_rate": 5.23250034233154e-06, "loss": 0.2223, "num_input_tokens_seen": 21099632, "step": 34595 }, { "epoch": 10.735339745578653, "grad_norm": 0.853100061416626, "learning_rate": 5.231147990274322e-06, "loss": 0.2233, "num_input_tokens_seen": 21103696, "step": 34600 }, { "epoch": 10.73689109525287, "grad_norm": 1.0432146787643433, "learning_rate": 5.2297956212712195e-06, "loss": 0.2346, "num_input_tokens_seen": 21106768, "step": 34605 }, { "epoch": 10.738442444927086, "grad_norm": 1.796832799911499, "learning_rate": 5.228443235421377e-06, "loss": 0.2444, "num_input_tokens_seen": 21110032, "step": 34610 }, { "epoch": 10.739993794601304, "grad_norm": 0.5475175380706787, "learning_rate": 5.22709083282394e-06, "loss": 0.2266, "num_input_tokens_seen": 21114512, "step": 34615 }, { "epoch": 10.74154514427552, "grad_norm": 0.7398710250854492, "learning_rate": 5.225738413578055e-06, "loss": 0.2251, "num_input_tokens_seen": 21117136, "step": 34620 }, { "epoch": 10.743096493949736, "grad_norm": 0.8084455728530884, "learning_rate": 5.224385977782873e-06, "loss": 0.2236, "num_input_tokens_seen": 21120496, "step": 34625 }, { "epoch": 10.744647843623953, "grad_norm": 0.8061413764953613, "learning_rate": 5.223033525537541e-06, "loss": 0.2284, "num_input_tokens_seen": 21122832, "step": 34630 }, { "epoch": 10.74619919329817, "grad_norm": 1.3808025121688843, "learning_rate": 5.221681056941209e-06, "loss": 0.2221, "num_input_tokens_seen": 21125968, "step": 34635 }, { "epoch": 10.747750542972385, "grad_norm": 1.283785343170166, "learning_rate": 5.220328572093032e-06, "loss": 0.203, "num_input_tokens_seen": 21128752, "step": 34640 }, { "epoch": 10.749301892646603, "grad_norm": 0.6837317943572998, "learning_rate": 5.21897607109216e-06, "loss": 0.2306, "num_input_tokens_seen": 21131664, "step": 34645 }, { "epoch": 10.750853242320819, "grad_norm": 1.1280568838119507, "learning_rate": 5.217623554037751e-06, "loss": 0.2343, "num_input_tokens_seen": 21135056, "step": 34650 }, { "epoch": 10.752404591995036, "grad_norm": 1.7847447395324707, "learning_rate": 5.216271021028957e-06, "loss": 0.2301, "num_input_tokens_seen": 21138288, "step": 34655 }, { "epoch": 10.753955941669252, "grad_norm": 0.7258649468421936, "learning_rate": 5.214918472164936e-06, "loss": 0.2466, "num_input_tokens_seen": 21141264, "step": 34660 }, { "epoch": 10.755507291343468, "grad_norm": 0.9268198609352112, "learning_rate": 5.213565907544847e-06, "loss": 0.2237, "num_input_tokens_seen": 21144368, "step": 34665 }, { "epoch": 10.757058641017686, "grad_norm": 0.7992119193077087, "learning_rate": 5.2122133272678475e-06, "loss": 0.2215, "num_input_tokens_seen": 21147152, "step": 34670 }, { "epoch": 10.758609990691902, "grad_norm": 0.8503202199935913, "learning_rate": 5.2108607314330984e-06, "loss": 0.2112, "num_input_tokens_seen": 21149808, "step": 34675 }, { "epoch": 10.760161340366118, "grad_norm": 0.9638184309005737, "learning_rate": 5.20950812013976e-06, "loss": 0.2442, "num_input_tokens_seen": 21154192, "step": 34680 }, { "epoch": 10.761712690040335, "grad_norm": 0.6126213073730469, "learning_rate": 5.208155493486996e-06, "loss": 0.2375, "num_input_tokens_seen": 21156656, "step": 34685 }, { "epoch": 10.763264039714551, "grad_norm": 0.5649996995925903, "learning_rate": 5.20680285157397e-06, "loss": 0.2323, "num_input_tokens_seen": 21159344, "step": 34690 }, { "epoch": 10.764815389388769, "grad_norm": 1.1800953149795532, "learning_rate": 5.205450194499845e-06, "loss": 0.2502, "num_input_tokens_seen": 21162224, "step": 34695 }, { "epoch": 10.766366739062985, "grad_norm": 0.5974743366241455, "learning_rate": 5.204097522363788e-06, "loss": 0.2366, "num_input_tokens_seen": 21164336, "step": 34700 }, { "epoch": 10.7679180887372, "grad_norm": 0.9898948073387146, "learning_rate": 5.202744835264966e-06, "loss": 0.2101, "num_input_tokens_seen": 21167440, "step": 34705 }, { "epoch": 10.769469438411418, "grad_norm": 0.7176990509033203, "learning_rate": 5.201392133302547e-06, "loss": 0.2448, "num_input_tokens_seen": 21169872, "step": 34710 }, { "epoch": 10.771020788085634, "grad_norm": 0.9529001712799072, "learning_rate": 5.200039416575699e-06, "loss": 0.2306, "num_input_tokens_seen": 21173808, "step": 34715 }, { "epoch": 10.772572137759852, "grad_norm": 0.8124091625213623, "learning_rate": 5.1986866851835925e-06, "loss": 0.2164, "num_input_tokens_seen": 21177360, "step": 34720 }, { "epoch": 10.774123487434068, "grad_norm": 0.868831992149353, "learning_rate": 5.197333939225401e-06, "loss": 0.2373, "num_input_tokens_seen": 21180304, "step": 34725 }, { "epoch": 10.775674837108284, "grad_norm": 1.6700348854064941, "learning_rate": 5.195981178800294e-06, "loss": 0.2412, "num_input_tokens_seen": 21183920, "step": 34730 }, { "epoch": 10.777226186782501, "grad_norm": 0.5362790822982788, "learning_rate": 5.194628404007446e-06, "loss": 0.2373, "num_input_tokens_seen": 21187984, "step": 34735 }, { "epoch": 10.778777536456717, "grad_norm": 0.7494991421699524, "learning_rate": 5.1932756149460305e-06, "loss": 0.2449, "num_input_tokens_seen": 21190832, "step": 34740 }, { "epoch": 10.780328886130935, "grad_norm": 0.4799570143222809, "learning_rate": 5.1919228117152245e-06, "loss": 0.2203, "num_input_tokens_seen": 21193488, "step": 34745 }, { "epoch": 10.78188023580515, "grad_norm": 0.48027846217155457, "learning_rate": 5.190569994414204e-06, "loss": 0.234, "num_input_tokens_seen": 21196112, "step": 34750 }, { "epoch": 10.783431585479367, "grad_norm": 0.4720142185688019, "learning_rate": 5.1892171631421464e-06, "loss": 0.2316, "num_input_tokens_seen": 21198672, "step": 34755 }, { "epoch": 10.784982935153584, "grad_norm": 0.8185999393463135, "learning_rate": 5.187864317998229e-06, "loss": 0.2189, "num_input_tokens_seen": 21201552, "step": 34760 }, { "epoch": 10.7865342848278, "grad_norm": 0.5657881498336792, "learning_rate": 5.1865114590816315e-06, "loss": 0.2034, "num_input_tokens_seen": 21204528, "step": 34765 }, { "epoch": 10.788085634502016, "grad_norm": 0.7713239789009094, "learning_rate": 5.185158586491537e-06, "loss": 0.2318, "num_input_tokens_seen": 21208368, "step": 34770 }, { "epoch": 10.789636984176234, "grad_norm": 0.4799084961414337, "learning_rate": 5.183805700327124e-06, "loss": 0.2134, "num_input_tokens_seen": 21211504, "step": 34775 }, { "epoch": 10.79118833385045, "grad_norm": 0.6521451473236084, "learning_rate": 5.18245280068758e-06, "loss": 0.209, "num_input_tokens_seen": 21214672, "step": 34780 }, { "epoch": 10.792739683524667, "grad_norm": 0.7992465496063232, "learning_rate": 5.181099887672082e-06, "loss": 0.2298, "num_input_tokens_seen": 21217584, "step": 34785 }, { "epoch": 10.794291033198883, "grad_norm": 0.8253694176673889, "learning_rate": 5.179746961379818e-06, "loss": 0.2294, "num_input_tokens_seen": 21220272, "step": 34790 }, { "epoch": 10.795842382873099, "grad_norm": 0.6196645498275757, "learning_rate": 5.178394021909974e-06, "loss": 0.2131, "num_input_tokens_seen": 21223248, "step": 34795 }, { "epoch": 10.797393732547317, "grad_norm": 1.0022802352905273, "learning_rate": 5.177041069361736e-06, "loss": 0.2378, "num_input_tokens_seen": 21225328, "step": 34800 }, { "epoch": 10.798945082221532, "grad_norm": 0.9352325797080994, "learning_rate": 5.17568810383429e-06, "loss": 0.2084, "num_input_tokens_seen": 21228528, "step": 34805 }, { "epoch": 10.800496431895748, "grad_norm": 0.665459930896759, "learning_rate": 5.174335125426827e-06, "loss": 0.2193, "num_input_tokens_seen": 21231920, "step": 34810 }, { "epoch": 10.802047781569966, "grad_norm": 0.7947020530700684, "learning_rate": 5.172982134238533e-06, "loss": 0.2169, "num_input_tokens_seen": 21234672, "step": 34815 }, { "epoch": 10.803599131244182, "grad_norm": 1.091883659362793, "learning_rate": 5.171629130368602e-06, "loss": 0.2303, "num_input_tokens_seen": 21237776, "step": 34820 }, { "epoch": 10.8051504809184, "grad_norm": 0.7598965167999268, "learning_rate": 5.170276113916222e-06, "loss": 0.2254, "num_input_tokens_seen": 21241936, "step": 34825 }, { "epoch": 10.806701830592615, "grad_norm": 0.5930742621421814, "learning_rate": 5.168923084980588e-06, "loss": 0.2302, "num_input_tokens_seen": 21244720, "step": 34830 }, { "epoch": 10.808253180266831, "grad_norm": 0.9167813658714294, "learning_rate": 5.16757004366089e-06, "loss": 0.2378, "num_input_tokens_seen": 21247312, "step": 34835 }, { "epoch": 10.809804529941049, "grad_norm": 0.8340013027191162, "learning_rate": 5.166216990056324e-06, "loss": 0.2181, "num_input_tokens_seen": 21250320, "step": 34840 }, { "epoch": 10.811355879615265, "grad_norm": 1.1179393529891968, "learning_rate": 5.164863924266085e-06, "loss": 0.2136, "num_input_tokens_seen": 21253616, "step": 34845 }, { "epoch": 10.812907229289483, "grad_norm": 1.1877671480178833, "learning_rate": 5.163510846389367e-06, "loss": 0.2321, "num_input_tokens_seen": 21256720, "step": 34850 }, { "epoch": 10.814458578963698, "grad_norm": 1.004471778869629, "learning_rate": 5.162157756525368e-06, "loss": 0.1973, "num_input_tokens_seen": 21259664, "step": 34855 }, { "epoch": 10.816009928637914, "grad_norm": 0.9353479146957397, "learning_rate": 5.160804654773286e-06, "loss": 0.2283, "num_input_tokens_seen": 21262480, "step": 34860 }, { "epoch": 10.817561278312132, "grad_norm": 0.9383805990219116, "learning_rate": 5.159451541232318e-06, "loss": 0.224, "num_input_tokens_seen": 21267120, "step": 34865 }, { "epoch": 10.819112627986348, "grad_norm": 0.8064733743667603, "learning_rate": 5.1580984160016635e-06, "loss": 0.2119, "num_input_tokens_seen": 21269232, "step": 34870 }, { "epoch": 10.820663977660566, "grad_norm": 0.6794940233230591, "learning_rate": 5.156745279180524e-06, "loss": 0.2452, "num_input_tokens_seen": 21271696, "step": 34875 }, { "epoch": 10.822215327334781, "grad_norm": 1.278812050819397, "learning_rate": 5.1553921308680985e-06, "loss": 0.2272, "num_input_tokens_seen": 21275056, "step": 34880 }, { "epoch": 10.823766677008997, "grad_norm": 0.9022728204727173, "learning_rate": 5.154038971163589e-06, "loss": 0.2443, "num_input_tokens_seen": 21277680, "step": 34885 }, { "epoch": 10.825318026683215, "grad_norm": 0.8133646249771118, "learning_rate": 5.1526858001662005e-06, "loss": 0.2327, "num_input_tokens_seen": 21280336, "step": 34890 }, { "epoch": 10.82686937635743, "grad_norm": 0.7874846458435059, "learning_rate": 5.151332617975133e-06, "loss": 0.2296, "num_input_tokens_seen": 21283152, "step": 34895 }, { "epoch": 10.828420726031647, "grad_norm": 1.1061116456985474, "learning_rate": 5.149979424689594e-06, "loss": 0.2373, "num_input_tokens_seen": 21285712, "step": 34900 }, { "epoch": 10.829972075705864, "grad_norm": 0.8984960913658142, "learning_rate": 5.1486262204087865e-06, "loss": 0.2362, "num_input_tokens_seen": 21287952, "step": 34905 }, { "epoch": 10.83152342538008, "grad_norm": 0.8961347341537476, "learning_rate": 5.147273005231918e-06, "loss": 0.2185, "num_input_tokens_seen": 21290576, "step": 34910 }, { "epoch": 10.833074775054298, "grad_norm": 0.6316424608230591, "learning_rate": 5.145919779258193e-06, "loss": 0.224, "num_input_tokens_seen": 21292848, "step": 34915 }, { "epoch": 10.834626124728514, "grad_norm": 0.6173596382141113, "learning_rate": 5.144566542586821e-06, "loss": 0.2314, "num_input_tokens_seen": 21294896, "step": 34920 }, { "epoch": 10.83617747440273, "grad_norm": 0.6530225872993469, "learning_rate": 5.143213295317009e-06, "loss": 0.2162, "num_input_tokens_seen": 21297808, "step": 34925 }, { "epoch": 10.837728824076947, "grad_norm": 0.7194841504096985, "learning_rate": 5.141860037547966e-06, "loss": 0.2309, "num_input_tokens_seen": 21300208, "step": 34930 }, { "epoch": 10.839280173751163, "grad_norm": 0.8302609920501709, "learning_rate": 5.140506769378904e-06, "loss": 0.2145, "num_input_tokens_seen": 21303440, "step": 34935 }, { "epoch": 10.84083152342538, "grad_norm": 0.6761049032211304, "learning_rate": 5.139153490909031e-06, "loss": 0.2205, "num_input_tokens_seen": 21306992, "step": 34940 }, { "epoch": 10.842382873099597, "grad_norm": 0.5562831163406372, "learning_rate": 5.1378002022375596e-06, "loss": 0.231, "num_input_tokens_seen": 21311024, "step": 34945 }, { "epoch": 10.843934222773813, "grad_norm": 0.7849498987197876, "learning_rate": 5.136446903463701e-06, "loss": 0.2364, "num_input_tokens_seen": 21313520, "step": 34950 }, { "epoch": 10.84548557244803, "grad_norm": 1.0431147813796997, "learning_rate": 5.135093594686669e-06, "loss": 0.2326, "num_input_tokens_seen": 21316592, "step": 34955 }, { "epoch": 10.847036922122246, "grad_norm": 1.1335601806640625, "learning_rate": 5.133740276005676e-06, "loss": 0.234, "num_input_tokens_seen": 21319760, "step": 34960 }, { "epoch": 10.848588271796462, "grad_norm": 0.7532634139060974, "learning_rate": 5.132386947519937e-06, "loss": 0.2328, "num_input_tokens_seen": 21323088, "step": 34965 }, { "epoch": 10.85013962147068, "grad_norm": 0.5751296281814575, "learning_rate": 5.131033609328668e-06, "loss": 0.2142, "num_input_tokens_seen": 21326288, "step": 34970 }, { "epoch": 10.851690971144896, "grad_norm": 0.5588636994361877, "learning_rate": 5.129680261531083e-06, "loss": 0.2215, "num_input_tokens_seen": 21330032, "step": 34975 }, { "epoch": 10.853242320819113, "grad_norm": 1.3743959665298462, "learning_rate": 5.128326904226401e-06, "loss": 0.2302, "num_input_tokens_seen": 21333264, "step": 34980 }, { "epoch": 10.85479367049333, "grad_norm": 0.7514774203300476, "learning_rate": 5.126973537513837e-06, "loss": 0.2346, "num_input_tokens_seen": 21335504, "step": 34985 }, { "epoch": 10.856345020167545, "grad_norm": 0.47643211483955383, "learning_rate": 5.125620161492607e-06, "loss": 0.2334, "num_input_tokens_seen": 21338576, "step": 34990 }, { "epoch": 10.857896369841763, "grad_norm": 0.6722144484519958, "learning_rate": 5.124266776261934e-06, "loss": 0.2322, "num_input_tokens_seen": 21341808, "step": 34995 }, { "epoch": 10.859447719515979, "grad_norm": 0.7440456748008728, "learning_rate": 5.122913381921033e-06, "loss": 0.2292, "num_input_tokens_seen": 21344592, "step": 35000 }, { "epoch": 10.860999069190196, "grad_norm": 1.5276098251342773, "learning_rate": 5.121559978569126e-06, "loss": 0.2259, "num_input_tokens_seen": 21347824, "step": 35005 }, { "epoch": 10.862550418864412, "grad_norm": 0.5184440612792969, "learning_rate": 5.120206566305433e-06, "loss": 0.2371, "num_input_tokens_seen": 21349808, "step": 35010 }, { "epoch": 10.864101768538628, "grad_norm": 0.7758476138114929, "learning_rate": 5.118853145229176e-06, "loss": 0.2408, "num_input_tokens_seen": 21352272, "step": 35015 }, { "epoch": 10.865653118212846, "grad_norm": 0.613418459892273, "learning_rate": 5.117499715439574e-06, "loss": 0.2178, "num_input_tokens_seen": 21354800, "step": 35020 }, { "epoch": 10.867204467887062, "grad_norm": 0.6005262136459351, "learning_rate": 5.1161462770358524e-06, "loss": 0.2117, "num_input_tokens_seen": 21358256, "step": 35025 }, { "epoch": 10.868755817561278, "grad_norm": 0.5595966577529907, "learning_rate": 5.114792830117235e-06, "loss": 0.2358, "num_input_tokens_seen": 21360912, "step": 35030 }, { "epoch": 10.870307167235495, "grad_norm": 0.6721777319908142, "learning_rate": 5.113439374782942e-06, "loss": 0.2152, "num_input_tokens_seen": 21362960, "step": 35035 }, { "epoch": 10.871858516909711, "grad_norm": 0.8532884120941162, "learning_rate": 5.112085911132201e-06, "loss": 0.2208, "num_input_tokens_seen": 21367792, "step": 35040 }, { "epoch": 10.873409866583929, "grad_norm": 0.6495610475540161, "learning_rate": 5.110732439264234e-06, "loss": 0.227, "num_input_tokens_seen": 21370576, "step": 35045 }, { "epoch": 10.874961216258145, "grad_norm": 0.7379007339477539, "learning_rate": 5.109378959278268e-06, "loss": 0.2294, "num_input_tokens_seen": 21373424, "step": 35050 }, { "epoch": 10.87651256593236, "grad_norm": 0.7256932258605957, "learning_rate": 5.10802547127353e-06, "loss": 0.2273, "num_input_tokens_seen": 21375856, "step": 35055 }, { "epoch": 10.878063915606578, "grad_norm": 0.7657381892204285, "learning_rate": 5.1066719753492455e-06, "loss": 0.2261, "num_input_tokens_seen": 21379184, "step": 35060 }, { "epoch": 10.879615265280794, "grad_norm": 0.6584805846214294, "learning_rate": 5.1053184716046424e-06, "loss": 0.2356, "num_input_tokens_seen": 21381488, "step": 35065 }, { "epoch": 10.88116661495501, "grad_norm": 0.8613855242729187, "learning_rate": 5.103964960138947e-06, "loss": 0.2306, "num_input_tokens_seen": 21384688, "step": 35070 }, { "epoch": 10.882717964629228, "grad_norm": 0.6614765524864197, "learning_rate": 5.102611441051389e-06, "loss": 0.212, "num_input_tokens_seen": 21387344, "step": 35075 }, { "epoch": 10.884269314303443, "grad_norm": 1.2069756984710693, "learning_rate": 5.101257914441198e-06, "loss": 0.2202, "num_input_tokens_seen": 21389712, "step": 35080 }, { "epoch": 10.885820663977661, "grad_norm": 0.8173563480377197, "learning_rate": 5.099904380407602e-06, "loss": 0.2203, "num_input_tokens_seen": 21393136, "step": 35085 }, { "epoch": 10.887372013651877, "grad_norm": 1.2052638530731201, "learning_rate": 5.098550839049832e-06, "loss": 0.2349, "num_input_tokens_seen": 21395664, "step": 35090 }, { "epoch": 10.888923363326093, "grad_norm": 1.4340248107910156, "learning_rate": 5.0971972904671186e-06, "loss": 0.2403, "num_input_tokens_seen": 21399280, "step": 35095 }, { "epoch": 10.89047471300031, "grad_norm": 1.0264285802841187, "learning_rate": 5.095843734758693e-06, "loss": 0.2042, "num_input_tokens_seen": 21402448, "step": 35100 }, { "epoch": 10.892026062674526, "grad_norm": 1.178818941116333, "learning_rate": 5.094490172023787e-06, "loss": 0.2384, "num_input_tokens_seen": 21405712, "step": 35105 }, { "epoch": 10.893577412348744, "grad_norm": 0.44559818506240845, "learning_rate": 5.093136602361632e-06, "loss": 0.2205, "num_input_tokens_seen": 21408144, "step": 35110 }, { "epoch": 10.89512876202296, "grad_norm": 0.7300053834915161, "learning_rate": 5.091783025871462e-06, "loss": 0.2181, "num_input_tokens_seen": 21410608, "step": 35115 }, { "epoch": 10.896680111697176, "grad_norm": 0.663643479347229, "learning_rate": 5.090429442652508e-06, "loss": 0.2118, "num_input_tokens_seen": 21413552, "step": 35120 }, { "epoch": 10.898231461371394, "grad_norm": 1.1410316228866577, "learning_rate": 5.089075852804006e-06, "loss": 0.2487, "num_input_tokens_seen": 21416368, "step": 35125 }, { "epoch": 10.89978281104561, "grad_norm": 0.9907917976379395, "learning_rate": 5.087722256425189e-06, "loss": 0.232, "num_input_tokens_seen": 21419632, "step": 35130 }, { "epoch": 10.901334160719827, "grad_norm": 0.7946397662162781, "learning_rate": 5.086368653615292e-06, "loss": 0.2248, "num_input_tokens_seen": 21422256, "step": 35135 }, { "epoch": 10.902885510394043, "grad_norm": 0.7750883102416992, "learning_rate": 5.08501504447355e-06, "loss": 0.2477, "num_input_tokens_seen": 21424880, "step": 35140 }, { "epoch": 10.904436860068259, "grad_norm": 0.4760209619998932, "learning_rate": 5.0836614290991984e-06, "loss": 0.2336, "num_input_tokens_seen": 21427888, "step": 35145 }, { "epoch": 10.905988209742477, "grad_norm": 0.7901722192764282, "learning_rate": 5.082307807591473e-06, "loss": 0.2165, "num_input_tokens_seen": 21430704, "step": 35150 }, { "epoch": 10.907539559416692, "grad_norm": 0.7403099536895752, "learning_rate": 5.08095418004961e-06, "loss": 0.2211, "num_input_tokens_seen": 21433520, "step": 35155 }, { "epoch": 10.909090909090908, "grad_norm": 0.939414381980896, "learning_rate": 5.079600546572847e-06, "loss": 0.2224, "num_input_tokens_seen": 21436496, "step": 35160 }, { "epoch": 10.910642258765126, "grad_norm": 0.5273895859718323, "learning_rate": 5.078246907260423e-06, "loss": 0.2191, "num_input_tokens_seen": 21439216, "step": 35165 }, { "epoch": 10.912193608439342, "grad_norm": 1.0206108093261719, "learning_rate": 5.076893262211573e-06, "loss": 0.2401, "num_input_tokens_seen": 21442928, "step": 35170 }, { "epoch": 10.91374495811356, "grad_norm": 0.9847701191902161, "learning_rate": 5.075539611525536e-06, "loss": 0.2148, "num_input_tokens_seen": 21445680, "step": 35175 }, { "epoch": 10.915296307787775, "grad_norm": 0.9415803551673889, "learning_rate": 5.074185955301552e-06, "loss": 0.2237, "num_input_tokens_seen": 21449168, "step": 35180 }, { "epoch": 10.916847657461991, "grad_norm": 0.8391850590705872, "learning_rate": 5.072832293638859e-06, "loss": 0.2141, "num_input_tokens_seen": 21453296, "step": 35185 }, { "epoch": 10.918399007136209, "grad_norm": 0.5911765694618225, "learning_rate": 5.071478626636696e-06, "loss": 0.2216, "num_input_tokens_seen": 21458288, "step": 35190 }, { "epoch": 10.919950356810425, "grad_norm": 1.0243806838989258, "learning_rate": 5.070124954394302e-06, "loss": 0.22, "num_input_tokens_seen": 21461936, "step": 35195 }, { "epoch": 10.921501706484642, "grad_norm": 1.0771228075027466, "learning_rate": 5.0687712770109195e-06, "loss": 0.2259, "num_input_tokens_seen": 21464624, "step": 35200 }, { "epoch": 10.923053056158858, "grad_norm": 0.9202032089233398, "learning_rate": 5.0674175945857886e-06, "loss": 0.249, "num_input_tokens_seen": 21467472, "step": 35205 }, { "epoch": 10.924604405833074, "grad_norm": 0.6820980310440063, "learning_rate": 5.066063907218148e-06, "loss": 0.2333, "num_input_tokens_seen": 21470288, "step": 35210 }, { "epoch": 10.926155755507292, "grad_norm": 0.9229246973991394, "learning_rate": 5.0647102150072416e-06, "loss": 0.2305, "num_input_tokens_seen": 21472976, "step": 35215 }, { "epoch": 10.927707105181508, "grad_norm": 1.1623560190200806, "learning_rate": 5.06335651805231e-06, "loss": 0.23, "num_input_tokens_seen": 21475696, "step": 35220 }, { "epoch": 10.929258454855724, "grad_norm": 0.9551717042922974, "learning_rate": 5.062002816452596e-06, "loss": 0.2082, "num_input_tokens_seen": 21478960, "step": 35225 }, { "epoch": 10.930809804529941, "grad_norm": 0.7878990173339844, "learning_rate": 5.060649110307342e-06, "loss": 0.2107, "num_input_tokens_seen": 21481360, "step": 35230 }, { "epoch": 10.932361154204157, "grad_norm": 1.0808924436569214, "learning_rate": 5.059295399715789e-06, "loss": 0.2254, "num_input_tokens_seen": 21483600, "step": 35235 }, { "epoch": 10.933912503878375, "grad_norm": 0.983778715133667, "learning_rate": 5.057941684777181e-06, "loss": 0.256, "num_input_tokens_seen": 21486416, "step": 35240 }, { "epoch": 10.93546385355259, "grad_norm": 1.2938653230667114, "learning_rate": 5.056587965590761e-06, "loss": 0.2032, "num_input_tokens_seen": 21489360, "step": 35245 }, { "epoch": 10.937015203226807, "grad_norm": 0.9835000038146973, "learning_rate": 5.055234242255774e-06, "loss": 0.2396, "num_input_tokens_seen": 21491952, "step": 35250 }, { "epoch": 10.938566552901024, "grad_norm": 0.7887395620346069, "learning_rate": 5.053880514871462e-06, "loss": 0.2386, "num_input_tokens_seen": 21495536, "step": 35255 }, { "epoch": 10.94011790257524, "grad_norm": 1.358534336090088, "learning_rate": 5.052526783537072e-06, "loss": 0.2381, "num_input_tokens_seen": 21500752, "step": 35260 }, { "epoch": 10.941669252249458, "grad_norm": 1.2104674577713013, "learning_rate": 5.051173048351845e-06, "loss": 0.2267, "num_input_tokens_seen": 21503408, "step": 35265 }, { "epoch": 10.943220601923674, "grad_norm": 1.2759946584701538, "learning_rate": 5.0498193094150275e-06, "loss": 0.2414, "num_input_tokens_seen": 21506480, "step": 35270 }, { "epoch": 10.94477195159789, "grad_norm": 0.8380562663078308, "learning_rate": 5.048465566825865e-06, "loss": 0.2581, "num_input_tokens_seen": 21509328, "step": 35275 }, { "epoch": 10.946323301272107, "grad_norm": 1.2432153224945068, "learning_rate": 5.047111820683603e-06, "loss": 0.2355, "num_input_tokens_seen": 21511984, "step": 35280 }, { "epoch": 10.947874650946323, "grad_norm": 0.9236565828323364, "learning_rate": 5.045758071087485e-06, "loss": 0.2226, "num_input_tokens_seen": 21515248, "step": 35285 }, { "epoch": 10.949426000620539, "grad_norm": 0.7683069705963135, "learning_rate": 5.04440431813676e-06, "loss": 0.2244, "num_input_tokens_seen": 21518064, "step": 35290 }, { "epoch": 10.950977350294757, "grad_norm": 0.9359526038169861, "learning_rate": 5.043050561930674e-06, "loss": 0.2317, "num_input_tokens_seen": 21520336, "step": 35295 }, { "epoch": 10.952528699968973, "grad_norm": 0.6715806126594543, "learning_rate": 5.04169680256847e-06, "loss": 0.1951, "num_input_tokens_seen": 21523216, "step": 35300 }, { "epoch": 10.95408004964319, "grad_norm": 1.0161104202270508, "learning_rate": 5.040343040149397e-06, "loss": 0.2448, "num_input_tokens_seen": 21525648, "step": 35305 }, { "epoch": 10.955631399317406, "grad_norm": 0.6841640472412109, "learning_rate": 5.038989274772702e-06, "loss": 0.239, "num_input_tokens_seen": 21528240, "step": 35310 }, { "epoch": 10.957182748991622, "grad_norm": 0.7686044573783875, "learning_rate": 5.037635506537632e-06, "loss": 0.2315, "num_input_tokens_seen": 21531536, "step": 35315 }, { "epoch": 10.95873409866584, "grad_norm": 0.9978060126304626, "learning_rate": 5.0362817355434325e-06, "loss": 0.237, "num_input_tokens_seen": 21535280, "step": 35320 }, { "epoch": 10.960285448340056, "grad_norm": 0.7958008050918579, "learning_rate": 5.034927961889352e-06, "loss": 0.2169, "num_input_tokens_seen": 21537712, "step": 35325 }, { "epoch": 10.961836798014273, "grad_norm": 1.1121596097946167, "learning_rate": 5.033574185674638e-06, "loss": 0.2247, "num_input_tokens_seen": 21540880, "step": 35330 }, { "epoch": 10.96338814768849, "grad_norm": 0.9536173939704895, "learning_rate": 5.03222040699854e-06, "loss": 0.2354, "num_input_tokens_seen": 21543152, "step": 35335 }, { "epoch": 10.964939497362705, "grad_norm": 0.6662417054176331, "learning_rate": 5.030866625960303e-06, "loss": 0.2284, "num_input_tokens_seen": 21546512, "step": 35340 }, { "epoch": 10.966490847036923, "grad_norm": 0.6615304350852966, "learning_rate": 5.029512842659178e-06, "loss": 0.2379, "num_input_tokens_seen": 21550128, "step": 35345 }, { "epoch": 10.968042196711139, "grad_norm": 0.8659899830818176, "learning_rate": 5.0281590571944114e-06, "loss": 0.2306, "num_input_tokens_seen": 21552848, "step": 35350 }, { "epoch": 10.969593546385354, "grad_norm": 0.6680498719215393, "learning_rate": 5.026805269665254e-06, "loss": 0.212, "num_input_tokens_seen": 21555472, "step": 35355 }, { "epoch": 10.971144896059572, "grad_norm": 0.7156359553337097, "learning_rate": 5.0254514801709505e-06, "loss": 0.2467, "num_input_tokens_seen": 21557776, "step": 35360 }, { "epoch": 10.972696245733788, "grad_norm": 0.9644806385040283, "learning_rate": 5.024097688810754e-06, "loss": 0.2111, "num_input_tokens_seen": 21560784, "step": 35365 }, { "epoch": 10.974247595408006, "grad_norm": 1.3431802988052368, "learning_rate": 5.02274389568391e-06, "loss": 0.2242, "num_input_tokens_seen": 21563696, "step": 35370 }, { "epoch": 10.975798945082222, "grad_norm": 0.8167270421981812, "learning_rate": 5.02139010088967e-06, "loss": 0.2328, "num_input_tokens_seen": 21565904, "step": 35375 }, { "epoch": 10.977350294756437, "grad_norm": 0.856732964515686, "learning_rate": 5.020036304527283e-06, "loss": 0.2189, "num_input_tokens_seen": 21569552, "step": 35380 }, { "epoch": 10.978901644430655, "grad_norm": 0.910778820514679, "learning_rate": 5.018682506695997e-06, "loss": 0.2234, "num_input_tokens_seen": 21572400, "step": 35385 }, { "epoch": 10.980452994104871, "grad_norm": 0.44759148359298706, "learning_rate": 5.017328707495063e-06, "loss": 0.2344, "num_input_tokens_seen": 21575312, "step": 35390 }, { "epoch": 10.982004343779089, "grad_norm": 0.7294283509254456, "learning_rate": 5.015974907023728e-06, "loss": 0.2267, "num_input_tokens_seen": 21578512, "step": 35395 }, { "epoch": 10.983555693453305, "grad_norm": 0.7101041674613953, "learning_rate": 5.0146211053812445e-06, "loss": 0.2066, "num_input_tokens_seen": 21581712, "step": 35400 }, { "epoch": 10.98510704312752, "grad_norm": 1.1474876403808594, "learning_rate": 5.013267302666859e-06, "loss": 0.2394, "num_input_tokens_seen": 21584784, "step": 35405 }, { "epoch": 10.986658392801738, "grad_norm": 1.0067652463912964, "learning_rate": 5.011913498979824e-06, "loss": 0.2327, "num_input_tokens_seen": 21589168, "step": 35410 }, { "epoch": 10.988209742475954, "grad_norm": 1.0382896661758423, "learning_rate": 5.010559694419388e-06, "loss": 0.2276, "num_input_tokens_seen": 21591856, "step": 35415 }, { "epoch": 10.98976109215017, "grad_norm": 0.6851138472557068, "learning_rate": 5.009205889084803e-06, "loss": 0.2218, "num_input_tokens_seen": 21595280, "step": 35420 }, { "epoch": 10.991312441824387, "grad_norm": 0.8246068358421326, "learning_rate": 5.007852083075316e-06, "loss": 0.2356, "num_input_tokens_seen": 21598096, "step": 35425 }, { "epoch": 10.992863791498603, "grad_norm": 0.9347643852233887, "learning_rate": 5.006498276490178e-06, "loss": 0.2487, "num_input_tokens_seen": 21602960, "step": 35430 }, { "epoch": 10.994415141172821, "grad_norm": 0.6114970445632935, "learning_rate": 5.005144469428641e-06, "loss": 0.2173, "num_input_tokens_seen": 21605840, "step": 35435 }, { "epoch": 10.995966490847037, "grad_norm": 0.6021684408187866, "learning_rate": 5.003790661989952e-06, "loss": 0.2351, "num_input_tokens_seen": 21608880, "step": 35440 }, { "epoch": 10.997517840521253, "grad_norm": 1.1312836408615112, "learning_rate": 5.002436854273364e-06, "loss": 0.2053, "num_input_tokens_seen": 21614064, "step": 35445 }, { "epoch": 10.99906919019547, "grad_norm": 0.7141917943954468, "learning_rate": 5.001083046378125e-06, "loss": 0.234, "num_input_tokens_seen": 21617040, "step": 35450 }, { "epoch": 11.000620539869686, "grad_norm": 1.497766375541687, "learning_rate": 4.999729238403485e-06, "loss": 0.2223, "num_input_tokens_seen": 21619664, "step": 35455 }, { "epoch": 11.002171889543904, "grad_norm": 1.710387945175171, "learning_rate": 4.998375430448696e-06, "loss": 0.2255, "num_input_tokens_seen": 21622640, "step": 35460 }, { "epoch": 11.00372323921812, "grad_norm": 1.1222153902053833, "learning_rate": 4.997021622613007e-06, "loss": 0.2101, "num_input_tokens_seen": 21625808, "step": 35465 }, { "epoch": 11.005274588892336, "grad_norm": 1.372483253479004, "learning_rate": 4.995667814995668e-06, "loss": 0.2221, "num_input_tokens_seen": 21629104, "step": 35470 }, { "epoch": 11.006825938566553, "grad_norm": 1.0659191608428955, "learning_rate": 4.99431400769593e-06, "loss": 0.225, "num_input_tokens_seen": 21631440, "step": 35475 }, { "epoch": 11.00837728824077, "grad_norm": 1.3980423212051392, "learning_rate": 4.992960200813042e-06, "loss": 0.2374, "num_input_tokens_seen": 21634608, "step": 35480 }, { "epoch": 11.009928637914985, "grad_norm": 1.6417157649993896, "learning_rate": 4.991606394446255e-06, "loss": 0.2197, "num_input_tokens_seen": 21637968, "step": 35485 }, { "epoch": 11.011479987589203, "grad_norm": 0.9892444014549255, "learning_rate": 4.990252588694819e-06, "loss": 0.2267, "num_input_tokens_seen": 21641424, "step": 35490 }, { "epoch": 11.013031337263419, "grad_norm": 0.9027343392372131, "learning_rate": 4.988898783657983e-06, "loss": 0.2262, "num_input_tokens_seen": 21644624, "step": 35495 }, { "epoch": 11.014582686937636, "grad_norm": 0.8376688361167908, "learning_rate": 4.987544979434999e-06, "loss": 0.2255, "num_input_tokens_seen": 21647280, "step": 35500 }, { "epoch": 11.016134036611852, "grad_norm": 1.257570505142212, "learning_rate": 4.9861911761251135e-06, "loss": 0.2238, "num_input_tokens_seen": 21650224, "step": 35505 }, { "epoch": 11.017685386286068, "grad_norm": 0.9292053580284119, "learning_rate": 4.984837373827578e-06, "loss": 0.2211, "num_input_tokens_seen": 21652848, "step": 35510 }, { "epoch": 11.019236735960286, "grad_norm": 0.749905526638031, "learning_rate": 4.9834835726416435e-06, "loss": 0.2312, "num_input_tokens_seen": 21655824, "step": 35515 }, { "epoch": 11.020788085634502, "grad_norm": 0.8669265508651733, "learning_rate": 4.982129772666558e-06, "loss": 0.2298, "num_input_tokens_seen": 21658064, "step": 35520 }, { "epoch": 11.02233943530872, "grad_norm": 0.7318376302719116, "learning_rate": 4.980775974001573e-06, "loss": 0.2315, "num_input_tokens_seen": 21660176, "step": 35525 }, { "epoch": 11.023890784982935, "grad_norm": 1.1807732582092285, "learning_rate": 4.979422176745937e-06, "loss": 0.2234, "num_input_tokens_seen": 21663344, "step": 35530 }, { "epoch": 11.025442134657151, "grad_norm": 1.0006593465805054, "learning_rate": 4.978068380998899e-06, "loss": 0.2282, "num_input_tokens_seen": 21666736, "step": 35535 }, { "epoch": 11.026993484331369, "grad_norm": 1.2242348194122314, "learning_rate": 4.976714586859707e-06, "loss": 0.2316, "num_input_tokens_seen": 21670928, "step": 35540 }, { "epoch": 11.028544834005585, "grad_norm": 0.91468346118927, "learning_rate": 4.9753607944276124e-06, "loss": 0.2113, "num_input_tokens_seen": 21674192, "step": 35545 }, { "epoch": 11.0300961836798, "grad_norm": 1.5184428691864014, "learning_rate": 4.9740070038018635e-06, "loss": 0.2276, "num_input_tokens_seen": 21677648, "step": 35550 }, { "epoch": 11.031647533354018, "grad_norm": 0.9784799218177795, "learning_rate": 4.972653215081708e-06, "loss": 0.2261, "num_input_tokens_seen": 21680368, "step": 35555 }, { "epoch": 11.033198883028234, "grad_norm": 1.4228529930114746, "learning_rate": 4.971299428366397e-06, "loss": 0.2205, "num_input_tokens_seen": 21683216, "step": 35560 }, { "epoch": 11.034750232702452, "grad_norm": 1.738731861114502, "learning_rate": 4.969945643755178e-06, "loss": 0.2222, "num_input_tokens_seen": 21685936, "step": 35565 }, { "epoch": 11.036301582376668, "grad_norm": 1.7128294706344604, "learning_rate": 4.968591861347299e-06, "loss": 0.2208, "num_input_tokens_seen": 21689008, "step": 35570 }, { "epoch": 11.037852932050884, "grad_norm": 1.2804654836654663, "learning_rate": 4.9672380812420075e-06, "loss": 0.2308, "num_input_tokens_seen": 21691952, "step": 35575 }, { "epoch": 11.039404281725101, "grad_norm": 0.8135132789611816, "learning_rate": 4.965884303538553e-06, "loss": 0.2197, "num_input_tokens_seen": 21695824, "step": 35580 }, { "epoch": 11.040955631399317, "grad_norm": 0.9150648713111877, "learning_rate": 4.964530528336184e-06, "loss": 0.195, "num_input_tokens_seen": 21698800, "step": 35585 }, { "epoch": 11.042506981073535, "grad_norm": 1.5533210039138794, "learning_rate": 4.963176755734146e-06, "loss": 0.207, "num_input_tokens_seen": 21702064, "step": 35590 }, { "epoch": 11.04405833074775, "grad_norm": 0.7831680178642273, "learning_rate": 4.961822985831689e-06, "loss": 0.2428, "num_input_tokens_seen": 21704144, "step": 35595 }, { "epoch": 11.045609680421967, "grad_norm": 0.9147799611091614, "learning_rate": 4.960469218728058e-06, "loss": 0.2316, "num_input_tokens_seen": 21707056, "step": 35600 }, { "epoch": 11.047161030096184, "grad_norm": 1.445067048072815, "learning_rate": 4.959115454522502e-06, "loss": 0.2314, "num_input_tokens_seen": 21709328, "step": 35605 }, { "epoch": 11.0487123797704, "grad_norm": 0.903978705406189, "learning_rate": 4.957761693314268e-06, "loss": 0.2171, "num_input_tokens_seen": 21712624, "step": 35610 }, { "epoch": 11.050263729444616, "grad_norm": 1.02224600315094, "learning_rate": 4.9564079352026e-06, "loss": 0.2307, "num_input_tokens_seen": 21715632, "step": 35615 }, { "epoch": 11.051815079118834, "grad_norm": 1.0222814083099365, "learning_rate": 4.955054180286748e-06, "loss": 0.1969, "num_input_tokens_seen": 21718960, "step": 35620 }, { "epoch": 11.05336642879305, "grad_norm": 1.2596769332885742, "learning_rate": 4.953700428665958e-06, "loss": 0.2122, "num_input_tokens_seen": 21721712, "step": 35625 }, { "epoch": 11.054917778467267, "grad_norm": 0.8011853694915771, "learning_rate": 4.952346680439473e-06, "loss": 0.2209, "num_input_tokens_seen": 21724944, "step": 35630 }, { "epoch": 11.056469128141483, "grad_norm": 1.2218557596206665, "learning_rate": 4.9509929357065415e-06, "loss": 0.2297, "num_input_tokens_seen": 21727536, "step": 35635 }, { "epoch": 11.058020477815699, "grad_norm": 1.100190281867981, "learning_rate": 4.949639194566408e-06, "loss": 0.2201, "num_input_tokens_seen": 21730896, "step": 35640 }, { "epoch": 11.059571827489917, "grad_norm": 0.7758424878120422, "learning_rate": 4.9482854571183184e-06, "loss": 0.2354, "num_input_tokens_seen": 21733616, "step": 35645 }, { "epoch": 11.061123177164133, "grad_norm": 1.509753942489624, "learning_rate": 4.946931723461516e-06, "loss": 0.2198, "num_input_tokens_seen": 21736240, "step": 35650 }, { "epoch": 11.06267452683835, "grad_norm": 0.8125829696655273, "learning_rate": 4.945577993695248e-06, "loss": 0.202, "num_input_tokens_seen": 21738640, "step": 35655 }, { "epoch": 11.064225876512566, "grad_norm": 1.4405790567398071, "learning_rate": 4.9442242679187555e-06, "loss": 0.2212, "num_input_tokens_seen": 21741680, "step": 35660 }, { "epoch": 11.065777226186782, "grad_norm": 1.2767536640167236, "learning_rate": 4.942870546231286e-06, "loss": 0.2307, "num_input_tokens_seen": 21744848, "step": 35665 }, { "epoch": 11.067328575861, "grad_norm": 1.0345841646194458, "learning_rate": 4.941516828732082e-06, "loss": 0.2117, "num_input_tokens_seen": 21747312, "step": 35670 }, { "epoch": 11.068879925535215, "grad_norm": 0.9935332536697388, "learning_rate": 4.940163115520386e-06, "loss": 0.2256, "num_input_tokens_seen": 21749616, "step": 35675 }, { "epoch": 11.070431275209431, "grad_norm": 2.4612090587615967, "learning_rate": 4.938809406695444e-06, "loss": 0.2495, "num_input_tokens_seen": 21752176, "step": 35680 }, { "epoch": 11.071982624883649, "grad_norm": 1.2615084648132324, "learning_rate": 4.937455702356496e-06, "loss": 0.2299, "num_input_tokens_seen": 21756048, "step": 35685 }, { "epoch": 11.073533974557865, "grad_norm": 0.7953455448150635, "learning_rate": 4.936102002602786e-06, "loss": 0.2176, "num_input_tokens_seen": 21759088, "step": 35690 }, { "epoch": 11.075085324232083, "grad_norm": 2.055528163909912, "learning_rate": 4.934748307533556e-06, "loss": 0.2277, "num_input_tokens_seen": 21762864, "step": 35695 }, { "epoch": 11.076636673906298, "grad_norm": 2.684281349182129, "learning_rate": 4.933394617248047e-06, "loss": 0.2306, "num_input_tokens_seen": 21767376, "step": 35700 }, { "epoch": 11.078188023580514, "grad_norm": 1.7561765909194946, "learning_rate": 4.932040931845502e-06, "loss": 0.2425, "num_input_tokens_seen": 21769744, "step": 35705 }, { "epoch": 11.079739373254732, "grad_norm": 1.3749297857284546, "learning_rate": 4.930687251425162e-06, "loss": 0.208, "num_input_tokens_seen": 21773168, "step": 35710 }, { "epoch": 11.081290722928948, "grad_norm": 1.5325547456741333, "learning_rate": 4.929333576086266e-06, "loss": 0.21, "num_input_tokens_seen": 21775984, "step": 35715 }, { "epoch": 11.082842072603166, "grad_norm": 0.9854452013969421, "learning_rate": 4.927979905928055e-06, "loss": 0.2158, "num_input_tokens_seen": 21778864, "step": 35720 }, { "epoch": 11.084393422277381, "grad_norm": 1.4183717966079712, "learning_rate": 4.926626241049771e-06, "loss": 0.2108, "num_input_tokens_seen": 21782064, "step": 35725 }, { "epoch": 11.085944771951597, "grad_norm": 0.8711589574813843, "learning_rate": 4.925272581550652e-06, "loss": 0.2071, "num_input_tokens_seen": 21785104, "step": 35730 }, { "epoch": 11.087496121625815, "grad_norm": 1.0726372003555298, "learning_rate": 4.923918927529939e-06, "loss": 0.2071, "num_input_tokens_seen": 21788272, "step": 35735 }, { "epoch": 11.08904747130003, "grad_norm": 1.753678321838379, "learning_rate": 4.922565279086868e-06, "loss": 0.2205, "num_input_tokens_seen": 21791248, "step": 35740 }, { "epoch": 11.090598820974247, "grad_norm": 1.1337037086486816, "learning_rate": 4.921211636320681e-06, "loss": 0.2211, "num_input_tokens_seen": 21795664, "step": 35745 }, { "epoch": 11.092150170648464, "grad_norm": 0.9369949102401733, "learning_rate": 4.919857999330612e-06, "loss": 0.2298, "num_input_tokens_seen": 21798384, "step": 35750 }, { "epoch": 11.09370152032268, "grad_norm": 1.2519794702529907, "learning_rate": 4.918504368215901e-06, "loss": 0.2237, "num_input_tokens_seen": 21800784, "step": 35755 }, { "epoch": 11.095252869996898, "grad_norm": 2.071589231491089, "learning_rate": 4.9171507430757856e-06, "loss": 0.2523, "num_input_tokens_seen": 21803600, "step": 35760 }, { "epoch": 11.096804219671114, "grad_norm": 0.7853974103927612, "learning_rate": 4.915797124009501e-06, "loss": 0.2265, "num_input_tokens_seen": 21806288, "step": 35765 }, { "epoch": 11.09835556934533, "grad_norm": 1.20115327835083, "learning_rate": 4.914443511116285e-06, "loss": 0.2044, "num_input_tokens_seen": 21808816, "step": 35770 }, { "epoch": 11.099906919019547, "grad_norm": 1.4659396409988403, "learning_rate": 4.913089904495372e-06, "loss": 0.2455, "num_input_tokens_seen": 21811792, "step": 35775 }, { "epoch": 11.101458268693763, "grad_norm": 0.9865427017211914, "learning_rate": 4.911736304245997e-06, "loss": 0.2154, "num_input_tokens_seen": 21814768, "step": 35780 }, { "epoch": 11.103009618367981, "grad_norm": 1.8677250146865845, "learning_rate": 4.9103827104673985e-06, "loss": 0.2356, "num_input_tokens_seen": 21819568, "step": 35785 }, { "epoch": 11.104560968042197, "grad_norm": 1.2048187255859375, "learning_rate": 4.909029123258808e-06, "loss": 0.2164, "num_input_tokens_seen": 21822192, "step": 35790 }, { "epoch": 11.106112317716413, "grad_norm": 2.260153293609619, "learning_rate": 4.90767554271946e-06, "loss": 0.2268, "num_input_tokens_seen": 21825008, "step": 35795 }, { "epoch": 11.10766366739063, "grad_norm": 0.910679817199707, "learning_rate": 4.906321968948587e-06, "loss": 0.2242, "num_input_tokens_seen": 21827088, "step": 35800 }, { "epoch": 11.109215017064846, "grad_norm": 1.3520312309265137, "learning_rate": 4.904968402045425e-06, "loss": 0.2323, "num_input_tokens_seen": 21830640, "step": 35805 }, { "epoch": 11.110766366739062, "grad_norm": 1.299412488937378, "learning_rate": 4.903614842109203e-06, "loss": 0.2198, "num_input_tokens_seen": 21833040, "step": 35810 }, { "epoch": 11.11231771641328, "grad_norm": 1.1183587312698364, "learning_rate": 4.902261289239155e-06, "loss": 0.2359, "num_input_tokens_seen": 21835888, "step": 35815 }, { "epoch": 11.113869066087496, "grad_norm": 1.0978145599365234, "learning_rate": 4.9009077435345114e-06, "loss": 0.2269, "num_input_tokens_seen": 21839056, "step": 35820 }, { "epoch": 11.115420415761713, "grad_norm": 1.4036098718643188, "learning_rate": 4.899554205094504e-06, "loss": 0.2053, "num_input_tokens_seen": 21842832, "step": 35825 }, { "epoch": 11.11697176543593, "grad_norm": 3.931699514389038, "learning_rate": 4.8982006740183625e-06, "loss": 0.2526, "num_input_tokens_seen": 21845776, "step": 35830 }, { "epoch": 11.118523115110145, "grad_norm": 1.6654313802719116, "learning_rate": 4.8968471504053176e-06, "loss": 0.2349, "num_input_tokens_seen": 21849008, "step": 35835 }, { "epoch": 11.120074464784363, "grad_norm": 1.3021621704101562, "learning_rate": 4.895493634354596e-06, "loss": 0.2163, "num_input_tokens_seen": 21852240, "step": 35840 }, { "epoch": 11.121625814458579, "grad_norm": 0.927590548992157, "learning_rate": 4.894140125965431e-06, "loss": 0.2317, "num_input_tokens_seen": 21854480, "step": 35845 }, { "epoch": 11.123177164132796, "grad_norm": 2.743927240371704, "learning_rate": 4.892786625337047e-06, "loss": 0.2164, "num_input_tokens_seen": 21857456, "step": 35850 }, { "epoch": 11.124728513807012, "grad_norm": 1.463777780532837, "learning_rate": 4.891433132568674e-06, "loss": 0.2009, "num_input_tokens_seen": 21860112, "step": 35855 }, { "epoch": 11.126279863481228, "grad_norm": 1.3144071102142334, "learning_rate": 4.890079647759537e-06, "loss": 0.2008, "num_input_tokens_seen": 21862992, "step": 35860 }, { "epoch": 11.127831213155446, "grad_norm": 2.5061447620391846, "learning_rate": 4.888726171008865e-06, "loss": 0.2214, "num_input_tokens_seen": 21866672, "step": 35865 }, { "epoch": 11.129382562829662, "grad_norm": 1.9498814344406128, "learning_rate": 4.887372702415882e-06, "loss": 0.2507, "num_input_tokens_seen": 21869360, "step": 35870 }, { "epoch": 11.130933912503878, "grad_norm": 3.450279712677002, "learning_rate": 4.886019242079812e-06, "loss": 0.2568, "num_input_tokens_seen": 21873360, "step": 35875 }, { "epoch": 11.132485262178095, "grad_norm": 1.4676709175109863, "learning_rate": 4.884665790099882e-06, "loss": 0.2118, "num_input_tokens_seen": 21876432, "step": 35880 }, { "epoch": 11.134036611852311, "grad_norm": 0.961374819278717, "learning_rate": 4.883312346575317e-06, "loss": 0.2175, "num_input_tokens_seen": 21879504, "step": 35885 }, { "epoch": 11.135587961526529, "grad_norm": 1.4021681547164917, "learning_rate": 4.881958911605337e-06, "loss": 0.2456, "num_input_tokens_seen": 21881488, "step": 35890 }, { "epoch": 11.137139311200745, "grad_norm": 1.6083521842956543, "learning_rate": 4.880605485289167e-06, "loss": 0.2452, "num_input_tokens_seen": 21885360, "step": 35895 }, { "epoch": 11.13869066087496, "grad_norm": 1.439817190170288, "learning_rate": 4.879252067726027e-06, "loss": 0.1917, "num_input_tokens_seen": 21888336, "step": 35900 }, { "epoch": 11.140242010549178, "grad_norm": 1.317132592201233, "learning_rate": 4.877898659015143e-06, "loss": 0.2499, "num_input_tokens_seen": 21891248, "step": 35905 }, { "epoch": 11.141793360223394, "grad_norm": 2.1547305583953857, "learning_rate": 4.876545259255732e-06, "loss": 0.219, "num_input_tokens_seen": 21894064, "step": 35910 }, { "epoch": 11.143344709897612, "grad_norm": 4.264551162719727, "learning_rate": 4.875191868547016e-06, "loss": 0.2451, "num_input_tokens_seen": 21896976, "step": 35915 }, { "epoch": 11.144896059571828, "grad_norm": 3.842881441116333, "learning_rate": 4.873838486988213e-06, "loss": 0.2437, "num_input_tokens_seen": 21899792, "step": 35920 }, { "epoch": 11.146447409246043, "grad_norm": 0.9930594563484192, "learning_rate": 4.872485114678544e-06, "loss": 0.2278, "num_input_tokens_seen": 21902256, "step": 35925 }, { "epoch": 11.147998758920261, "grad_norm": 1.711102843284607, "learning_rate": 4.8711317517172255e-06, "loss": 0.2254, "num_input_tokens_seen": 21905488, "step": 35930 }, { "epoch": 11.149550108594477, "grad_norm": 2.2154548168182373, "learning_rate": 4.869778398203476e-06, "loss": 0.2042, "num_input_tokens_seen": 21908336, "step": 35935 }, { "epoch": 11.151101458268693, "grad_norm": 1.902449607849121, "learning_rate": 4.868425054236512e-06, "loss": 0.2135, "num_input_tokens_seen": 21911760, "step": 35940 }, { "epoch": 11.15265280794291, "grad_norm": 1.2211990356445312, "learning_rate": 4.867071719915549e-06, "loss": 0.2093, "num_input_tokens_seen": 21914352, "step": 35945 }, { "epoch": 11.154204157617126, "grad_norm": 2.7171497344970703, "learning_rate": 4.865718395339803e-06, "loss": 0.2313, "num_input_tokens_seen": 21916880, "step": 35950 }, { "epoch": 11.155755507291344, "grad_norm": 1.6281613111495972, "learning_rate": 4.864365080608491e-06, "loss": 0.2036, "num_input_tokens_seen": 21919344, "step": 35955 }, { "epoch": 11.15730685696556, "grad_norm": 1.0373653173446655, "learning_rate": 4.8630117758208206e-06, "loss": 0.1818, "num_input_tokens_seen": 21922608, "step": 35960 }, { "epoch": 11.158858206639776, "grad_norm": 2.105515480041504, "learning_rate": 4.861658481076012e-06, "loss": 0.198, "num_input_tokens_seen": 21925968, "step": 35965 }, { "epoch": 11.160409556313994, "grad_norm": 3.1990461349487305, "learning_rate": 4.860305196473274e-06, "loss": 0.2477, "num_input_tokens_seen": 21928656, "step": 35970 }, { "epoch": 11.16196090598821, "grad_norm": 1.2512534856796265, "learning_rate": 4.8589519221118205e-06, "loss": 0.2294, "num_input_tokens_seen": 21933040, "step": 35975 }, { "epoch": 11.163512255662427, "grad_norm": 1.5453757047653198, "learning_rate": 4.85759865809086e-06, "loss": 0.2007, "num_input_tokens_seen": 21936080, "step": 35980 }, { "epoch": 11.165063605336643, "grad_norm": 2.343843698501587, "learning_rate": 4.856245404509606e-06, "loss": 0.235, "num_input_tokens_seen": 21938768, "step": 35985 }, { "epoch": 11.166614955010859, "grad_norm": 2.0346133708953857, "learning_rate": 4.854892161467265e-06, "loss": 0.2317, "num_input_tokens_seen": 21941136, "step": 35990 }, { "epoch": 11.168166304685077, "grad_norm": 1.5357248783111572, "learning_rate": 4.853538929063047e-06, "loss": 0.205, "num_input_tokens_seen": 21944272, "step": 35995 }, { "epoch": 11.169717654359292, "grad_norm": 1.8437851667404175, "learning_rate": 4.852185707396159e-06, "loss": 0.2771, "num_input_tokens_seen": 21948624, "step": 36000 }, { "epoch": 11.171269004033508, "grad_norm": 1.698386549949646, "learning_rate": 4.85083249656581e-06, "loss": 0.2494, "num_input_tokens_seen": 21951792, "step": 36005 }, { "epoch": 11.172820353707726, "grad_norm": 1.8883869647979736, "learning_rate": 4.849479296671205e-06, "loss": 0.2148, "num_input_tokens_seen": 21955440, "step": 36010 }, { "epoch": 11.174371703381942, "grad_norm": 1.6071573495864868, "learning_rate": 4.84812610781155e-06, "loss": 0.2317, "num_input_tokens_seen": 21958096, "step": 36015 }, { "epoch": 11.17592305305616, "grad_norm": 3.0138754844665527, "learning_rate": 4.846772930086049e-06, "loss": 0.2042, "num_input_tokens_seen": 21961552, "step": 36020 }, { "epoch": 11.177474402730375, "grad_norm": 1.6175334453582764, "learning_rate": 4.845419763593907e-06, "loss": 0.2182, "num_input_tokens_seen": 21964016, "step": 36025 }, { "epoch": 11.179025752404591, "grad_norm": 1.3217116594314575, "learning_rate": 4.844066608434328e-06, "loss": 0.201, "num_input_tokens_seen": 21966832, "step": 36030 }, { "epoch": 11.180577102078809, "grad_norm": 4.723447799682617, "learning_rate": 4.842713464706513e-06, "loss": 0.2235, "num_input_tokens_seen": 21969904, "step": 36035 }, { "epoch": 11.182128451753025, "grad_norm": 3.054623603820801, "learning_rate": 4.841360332509663e-06, "loss": 0.2151, "num_input_tokens_seen": 21972464, "step": 36040 }, { "epoch": 11.183679801427242, "grad_norm": 2.0694870948791504, "learning_rate": 4.84000721194298e-06, "loss": 0.2254, "num_input_tokens_seen": 21974832, "step": 36045 }, { "epoch": 11.185231151101458, "grad_norm": 2.8667736053466797, "learning_rate": 4.8386541031056614e-06, "loss": 0.2015, "num_input_tokens_seen": 21977456, "step": 36050 }, { "epoch": 11.186782500775674, "grad_norm": 2.362186908721924, "learning_rate": 4.837301006096909e-06, "loss": 0.2288, "num_input_tokens_seen": 21979888, "step": 36055 }, { "epoch": 11.188333850449892, "grad_norm": 2.550200939178467, "learning_rate": 4.835947921015919e-06, "loss": 0.225, "num_input_tokens_seen": 21982672, "step": 36060 }, { "epoch": 11.189885200124108, "grad_norm": 1.592275857925415, "learning_rate": 4.83459484796189e-06, "loss": 0.2106, "num_input_tokens_seen": 21986960, "step": 36065 }, { "epoch": 11.191436549798324, "grad_norm": 2.1184744834899902, "learning_rate": 4.833241787034016e-06, "loss": 0.2183, "num_input_tokens_seen": 21989200, "step": 36070 }, { "epoch": 11.192987899472541, "grad_norm": 3.116204261779785, "learning_rate": 4.831888738331495e-06, "loss": 0.2312, "num_input_tokens_seen": 21992496, "step": 36075 }, { "epoch": 11.194539249146757, "grad_norm": 4.774474143981934, "learning_rate": 4.830535701953517e-06, "loss": 0.2378, "num_input_tokens_seen": 21995440, "step": 36080 }, { "epoch": 11.196090598820975, "grad_norm": 2.4526000022888184, "learning_rate": 4.829182677999282e-06, "loss": 0.2245, "num_input_tokens_seen": 21998640, "step": 36085 }, { "epoch": 11.19764194849519, "grad_norm": 1.6430082321166992, "learning_rate": 4.8278296665679775e-06, "loss": 0.2024, "num_input_tokens_seen": 22001808, "step": 36090 }, { "epoch": 11.199193298169407, "grad_norm": 1.680679202079773, "learning_rate": 4.826476667758798e-06, "loss": 0.2136, "num_input_tokens_seen": 22004464, "step": 36095 }, { "epoch": 11.200744647843624, "grad_norm": 2.2550904750823975, "learning_rate": 4.825123681670934e-06, "loss": 0.2093, "num_input_tokens_seen": 22006704, "step": 36100 }, { "epoch": 11.20229599751784, "grad_norm": 2.5733797550201416, "learning_rate": 4.823770708403574e-06, "loss": 0.2401, "num_input_tokens_seen": 22009712, "step": 36105 }, { "epoch": 11.203847347192058, "grad_norm": 2.396376371383667, "learning_rate": 4.8224177480559095e-06, "loss": 0.2745, "num_input_tokens_seen": 22012272, "step": 36110 }, { "epoch": 11.205398696866274, "grad_norm": 1.854323387145996, "learning_rate": 4.821064800727127e-06, "loss": 0.2342, "num_input_tokens_seen": 22015664, "step": 36115 }, { "epoch": 11.20695004654049, "grad_norm": 2.213453531265259, "learning_rate": 4.8197118665164125e-06, "loss": 0.2497, "num_input_tokens_seen": 22018736, "step": 36120 }, { "epoch": 11.208501396214707, "grad_norm": 1.6927757263183594, "learning_rate": 4.818358945522954e-06, "loss": 0.2426, "num_input_tokens_seen": 22021968, "step": 36125 }, { "epoch": 11.210052745888923, "grad_norm": 1.0645420551300049, "learning_rate": 4.817006037845935e-06, "loss": 0.2151, "num_input_tokens_seen": 22025360, "step": 36130 }, { "epoch": 11.211604095563139, "grad_norm": 1.695491909980774, "learning_rate": 4.81565314358454e-06, "loss": 0.2431, "num_input_tokens_seen": 22029200, "step": 36135 }, { "epoch": 11.213155445237357, "grad_norm": 1.1575860977172852, "learning_rate": 4.814300262837954e-06, "loss": 0.2253, "num_input_tokens_seen": 22032048, "step": 36140 }, { "epoch": 11.214706794911573, "grad_norm": 0.7632845044136047, "learning_rate": 4.812947395705357e-06, "loss": 0.2113, "num_input_tokens_seen": 22035152, "step": 36145 }, { "epoch": 11.21625814458579, "grad_norm": 1.118897557258606, "learning_rate": 4.811594542285931e-06, "loss": 0.2058, "num_input_tokens_seen": 22038288, "step": 36150 }, { "epoch": 11.217809494260006, "grad_norm": 1.7756270170211792, "learning_rate": 4.810241702678856e-06, "loss": 0.208, "num_input_tokens_seen": 22041072, "step": 36155 }, { "epoch": 11.219360843934222, "grad_norm": 0.8818144202232361, "learning_rate": 4.808888876983312e-06, "loss": 0.2313, "num_input_tokens_seen": 22043632, "step": 36160 }, { "epoch": 11.22091219360844, "grad_norm": 1.2027772665023804, "learning_rate": 4.807536065298474e-06, "loss": 0.2249, "num_input_tokens_seen": 22045904, "step": 36165 }, { "epoch": 11.222463543282656, "grad_norm": 1.6423704624176025, "learning_rate": 4.806183267723523e-06, "loss": 0.2039, "num_input_tokens_seen": 22049488, "step": 36170 }, { "epoch": 11.224014892956873, "grad_norm": 2.0879640579223633, "learning_rate": 4.8048304843576344e-06, "loss": 0.2121, "num_input_tokens_seen": 22053008, "step": 36175 }, { "epoch": 11.22556624263109, "grad_norm": 1.5464609861373901, "learning_rate": 4.803477715299981e-06, "loss": 0.2436, "num_input_tokens_seen": 22055856, "step": 36180 }, { "epoch": 11.227117592305305, "grad_norm": 2.157649040222168, "learning_rate": 4.80212496064974e-06, "loss": 0.218, "num_input_tokens_seen": 22058416, "step": 36185 }, { "epoch": 11.228668941979523, "grad_norm": 1.6433517932891846, "learning_rate": 4.800772220506081e-06, "loss": 0.2242, "num_input_tokens_seen": 22062096, "step": 36190 }, { "epoch": 11.230220291653739, "grad_norm": 1.178573727607727, "learning_rate": 4.799419494968179e-06, "loss": 0.2393, "num_input_tokens_seen": 22064432, "step": 36195 }, { "epoch": 11.231771641327954, "grad_norm": 1.3239492177963257, "learning_rate": 4.798066784135202e-06, "loss": 0.2419, "num_input_tokens_seen": 22067824, "step": 36200 }, { "epoch": 11.233322991002172, "grad_norm": 1.4940738677978516, "learning_rate": 4.796714088106322e-06, "loss": 0.1986, "num_input_tokens_seen": 22070864, "step": 36205 }, { "epoch": 11.234874340676388, "grad_norm": 1.797001600265503, "learning_rate": 4.795361406980706e-06, "loss": 0.2104, "num_input_tokens_seen": 22073936, "step": 36210 }, { "epoch": 11.236425690350606, "grad_norm": 2.190523624420166, "learning_rate": 4.794008740857522e-06, "loss": 0.2106, "num_input_tokens_seen": 22078128, "step": 36215 }, { "epoch": 11.237977040024822, "grad_norm": 1.574707269668579, "learning_rate": 4.792656089835936e-06, "loss": 0.2251, "num_input_tokens_seen": 22081456, "step": 36220 }, { "epoch": 11.239528389699037, "grad_norm": 1.3719311952590942, "learning_rate": 4.791303454015115e-06, "loss": 0.2349, "num_input_tokens_seen": 22085616, "step": 36225 }, { "epoch": 11.241079739373255, "grad_norm": 3.9757535457611084, "learning_rate": 4.789950833494221e-06, "loss": 0.2262, "num_input_tokens_seen": 22088432, "step": 36230 }, { "epoch": 11.242631089047471, "grad_norm": 0.6630039811134338, "learning_rate": 4.788598228372419e-06, "loss": 0.1863, "num_input_tokens_seen": 22093104, "step": 36235 }, { "epoch": 11.244182438721689, "grad_norm": 0.8761210441589355, "learning_rate": 4.7872456387488706e-06, "loss": 0.2022, "num_input_tokens_seen": 22097232, "step": 36240 }, { "epoch": 11.245733788395905, "grad_norm": 1.4649285078048706, "learning_rate": 4.7858930647227356e-06, "loss": 0.218, "num_input_tokens_seen": 22100560, "step": 36245 }, { "epoch": 11.24728513807012, "grad_norm": 1.5554629564285278, "learning_rate": 4.784540506393175e-06, "loss": 0.1978, "num_input_tokens_seen": 22103312, "step": 36250 }, { "epoch": 11.248836487744338, "grad_norm": 1.6054359674453735, "learning_rate": 4.783187963859346e-06, "loss": 0.2385, "num_input_tokens_seen": 22105488, "step": 36255 }, { "epoch": 11.250387837418554, "grad_norm": 3.5227723121643066, "learning_rate": 4.7818354372204075e-06, "loss": 0.2168, "num_input_tokens_seen": 22108272, "step": 36260 }, { "epoch": 11.25193918709277, "grad_norm": 1.273362159729004, "learning_rate": 4.780482926575515e-06, "loss": 0.2374, "num_input_tokens_seen": 22110768, "step": 36265 }, { "epoch": 11.253490536766988, "grad_norm": 1.411949634552002, "learning_rate": 4.779130432023823e-06, "loss": 0.2137, "num_input_tokens_seen": 22114160, "step": 36270 }, { "epoch": 11.255041886441203, "grad_norm": 2.868762254714966, "learning_rate": 4.777777953664486e-06, "loss": 0.2559, "num_input_tokens_seen": 22117328, "step": 36275 }, { "epoch": 11.256593236115421, "grad_norm": 1.531093955039978, "learning_rate": 4.776425491596656e-06, "loss": 0.2242, "num_input_tokens_seen": 22120720, "step": 36280 }, { "epoch": 11.258144585789637, "grad_norm": 1.391156554222107, "learning_rate": 4.7750730459194845e-06, "loss": 0.2348, "num_input_tokens_seen": 22123152, "step": 36285 }, { "epoch": 11.259695935463853, "grad_norm": 1.1492280960083008, "learning_rate": 4.7737206167321236e-06, "loss": 0.2125, "num_input_tokens_seen": 22125840, "step": 36290 }, { "epoch": 11.26124728513807, "grad_norm": 2.5068094730377197, "learning_rate": 4.77236820413372e-06, "loss": 0.2314, "num_input_tokens_seen": 22128944, "step": 36295 }, { "epoch": 11.262798634812286, "grad_norm": 1.0946427583694458, "learning_rate": 4.771015808223423e-06, "loss": 0.2164, "num_input_tokens_seen": 22131600, "step": 36300 }, { "epoch": 11.264349984486504, "grad_norm": 1.2741267681121826, "learning_rate": 4.769663429100379e-06, "loss": 0.2102, "num_input_tokens_seen": 22134160, "step": 36305 }, { "epoch": 11.26590133416072, "grad_norm": 2.1452553272247314, "learning_rate": 4.768311066863734e-06, "loss": 0.2259, "num_input_tokens_seen": 22136752, "step": 36310 }, { "epoch": 11.267452683834936, "grad_norm": 2.528801441192627, "learning_rate": 4.76695872161263e-06, "loss": 0.1961, "num_input_tokens_seen": 22139472, "step": 36315 }, { "epoch": 11.269004033509153, "grad_norm": 1.689884901046753, "learning_rate": 4.765606393446212e-06, "loss": 0.1965, "num_input_tokens_seen": 22141776, "step": 36320 }, { "epoch": 11.27055538318337, "grad_norm": 1.667924404144287, "learning_rate": 4.7642540824636205e-06, "loss": 0.2455, "num_input_tokens_seen": 22144784, "step": 36325 }, { "epoch": 11.272106732857585, "grad_norm": 1.4416307210922241, "learning_rate": 4.762901788763997e-06, "loss": 0.2447, "num_input_tokens_seen": 22151312, "step": 36330 }, { "epoch": 11.273658082531803, "grad_norm": 1.1415705680847168, "learning_rate": 4.76154951244648e-06, "loss": 0.2104, "num_input_tokens_seen": 22153904, "step": 36335 }, { "epoch": 11.275209432206019, "grad_norm": 1.6208171844482422, "learning_rate": 4.7601972536102075e-06, "loss": 0.1965, "num_input_tokens_seen": 22156816, "step": 36340 }, { "epoch": 11.276760781880236, "grad_norm": 1.5571300983428955, "learning_rate": 4.758845012354314e-06, "loss": 0.2138, "num_input_tokens_seen": 22161968, "step": 36345 }, { "epoch": 11.278312131554452, "grad_norm": 2.4849367141723633, "learning_rate": 4.757492788777939e-06, "loss": 0.2139, "num_input_tokens_seen": 22164688, "step": 36350 }, { "epoch": 11.279863481228668, "grad_norm": 1.6779288053512573, "learning_rate": 4.756140582980215e-06, "loss": 0.2681, "num_input_tokens_seen": 22167440, "step": 36355 }, { "epoch": 11.281414830902886, "grad_norm": 1.418426275253296, "learning_rate": 4.7547883950602725e-06, "loss": 0.1856, "num_input_tokens_seen": 22171824, "step": 36360 }, { "epoch": 11.282966180577102, "grad_norm": 1.2965120077133179, "learning_rate": 4.753436225117244e-06, "loss": 0.2226, "num_input_tokens_seen": 22173744, "step": 36365 }, { "epoch": 11.28451753025132, "grad_norm": 2.530505418777466, "learning_rate": 4.752084073250262e-06, "loss": 0.1891, "num_input_tokens_seen": 22177680, "step": 36370 }, { "epoch": 11.286068879925535, "grad_norm": 1.888022541999817, "learning_rate": 4.750731939558451e-06, "loss": 0.2234, "num_input_tokens_seen": 22180816, "step": 36375 }, { "epoch": 11.287620229599751, "grad_norm": 2.060976505279541, "learning_rate": 4.7493798241409415e-06, "loss": 0.2547, "num_input_tokens_seen": 22183792, "step": 36380 }, { "epoch": 11.289171579273969, "grad_norm": 1.8204177618026733, "learning_rate": 4.748027727096859e-06, "loss": 0.2028, "num_input_tokens_seen": 22186576, "step": 36385 }, { "epoch": 11.290722928948185, "grad_norm": 1.4564568996429443, "learning_rate": 4.746675648525328e-06, "loss": 0.212, "num_input_tokens_seen": 22188912, "step": 36390 }, { "epoch": 11.2922742786224, "grad_norm": 2.2328896522521973, "learning_rate": 4.745323588525471e-06, "loss": 0.2838, "num_input_tokens_seen": 22191312, "step": 36395 }, { "epoch": 11.293825628296618, "grad_norm": 1.5676807165145874, "learning_rate": 4.743971547196411e-06, "loss": 0.2141, "num_input_tokens_seen": 22194064, "step": 36400 }, { "epoch": 11.295376977970834, "grad_norm": 1.6857414245605469, "learning_rate": 4.742619524637267e-06, "loss": 0.2283, "num_input_tokens_seen": 22198480, "step": 36405 }, { "epoch": 11.296928327645052, "grad_norm": 1.310167670249939, "learning_rate": 4.741267520947162e-06, "loss": 0.2133, "num_input_tokens_seen": 22201232, "step": 36410 }, { "epoch": 11.298479677319268, "grad_norm": 2.8458330631256104, "learning_rate": 4.7399155362252106e-06, "loss": 0.2111, "num_input_tokens_seen": 22204432, "step": 36415 }, { "epoch": 11.300031026993484, "grad_norm": 1.3504868745803833, "learning_rate": 4.7385635705705305e-06, "loss": 0.2183, "num_input_tokens_seen": 22208176, "step": 36420 }, { "epoch": 11.301582376667701, "grad_norm": 2.3648602962493896, "learning_rate": 4.737211624082237e-06, "loss": 0.2529, "num_input_tokens_seen": 22211504, "step": 36425 }, { "epoch": 11.303133726341917, "grad_norm": 1.539581060409546, "learning_rate": 4.735859696859443e-06, "loss": 0.1918, "num_input_tokens_seen": 22214160, "step": 36430 }, { "epoch": 11.304685076016135, "grad_norm": 2.3307361602783203, "learning_rate": 4.734507789001261e-06, "loss": 0.2092, "num_input_tokens_seen": 22216976, "step": 36435 }, { "epoch": 11.30623642569035, "grad_norm": 2.2026185989379883, "learning_rate": 4.733155900606802e-06, "loss": 0.2481, "num_input_tokens_seen": 22219376, "step": 36440 }, { "epoch": 11.307787775364567, "grad_norm": 1.3471791744232178, "learning_rate": 4.731804031775175e-06, "loss": 0.216, "num_input_tokens_seen": 22224176, "step": 36445 }, { "epoch": 11.309339125038784, "grad_norm": 1.2611949443817139, "learning_rate": 4.730452182605489e-06, "loss": 0.2369, "num_input_tokens_seen": 22226896, "step": 36450 }, { "epoch": 11.310890474713, "grad_norm": 1.7515580654144287, "learning_rate": 4.72910035319685e-06, "loss": 0.2031, "num_input_tokens_seen": 22229808, "step": 36455 }, { "epoch": 11.312441824387218, "grad_norm": 2.1166272163391113, "learning_rate": 4.727748543648362e-06, "loss": 0.2456, "num_input_tokens_seen": 22235024, "step": 36460 }, { "epoch": 11.313993174061434, "grad_norm": 1.8699432611465454, "learning_rate": 4.72639675405913e-06, "loss": 0.2205, "num_input_tokens_seen": 22237872, "step": 36465 }, { "epoch": 11.31554452373565, "grad_norm": 4.335262298583984, "learning_rate": 4.725044984528256e-06, "loss": 0.2351, "num_input_tokens_seen": 22240784, "step": 36470 }, { "epoch": 11.317095873409867, "grad_norm": 1.717966079711914, "learning_rate": 4.7236932351548405e-06, "loss": 0.2294, "num_input_tokens_seen": 22243504, "step": 36475 }, { "epoch": 11.318647223084083, "grad_norm": 1.4348812103271484, "learning_rate": 4.722341506037984e-06, "loss": 0.2099, "num_input_tokens_seen": 22246320, "step": 36480 }, { "epoch": 11.320198572758299, "grad_norm": 1.7542001008987427, "learning_rate": 4.720989797276782e-06, "loss": 0.2073, "num_input_tokens_seen": 22249680, "step": 36485 }, { "epoch": 11.321749922432517, "grad_norm": 2.222536563873291, "learning_rate": 4.7196381089703316e-06, "loss": 0.2133, "num_input_tokens_seen": 22252848, "step": 36490 }, { "epoch": 11.323301272106733, "grad_norm": 2.237933874130249, "learning_rate": 4.718286441217728e-06, "loss": 0.2046, "num_input_tokens_seen": 22256112, "step": 36495 }, { "epoch": 11.32485262178095, "grad_norm": 2.650869131088257, "learning_rate": 4.716934794118065e-06, "loss": 0.2126, "num_input_tokens_seen": 22258896, "step": 36500 }, { "epoch": 11.326403971455166, "grad_norm": 2.1355631351470947, "learning_rate": 4.715583167770433e-06, "loss": 0.2407, "num_input_tokens_seen": 22261712, "step": 36505 }, { "epoch": 11.327955321129382, "grad_norm": 1.8081748485565186, "learning_rate": 4.714231562273923e-06, "loss": 0.2289, "num_input_tokens_seen": 22264592, "step": 36510 }, { "epoch": 11.3295066708036, "grad_norm": 2.4136126041412354, "learning_rate": 4.712879977727623e-06, "loss": 0.218, "num_input_tokens_seen": 22268464, "step": 36515 }, { "epoch": 11.331058020477816, "grad_norm": 2.6707777976989746, "learning_rate": 4.711528414230621e-06, "loss": 0.2289, "num_input_tokens_seen": 22271792, "step": 36520 }, { "epoch": 11.332609370152031, "grad_norm": 1.954111933708191, "learning_rate": 4.710176871882002e-06, "loss": 0.2483, "num_input_tokens_seen": 22274320, "step": 36525 }, { "epoch": 11.334160719826249, "grad_norm": 1.7828222513198853, "learning_rate": 4.7088253507808504e-06, "loss": 0.2593, "num_input_tokens_seen": 22277648, "step": 36530 }, { "epoch": 11.335712069500465, "grad_norm": 2.3786582946777344, "learning_rate": 4.707473851026248e-06, "loss": 0.2146, "num_input_tokens_seen": 22280336, "step": 36535 }, { "epoch": 11.337263419174683, "grad_norm": 2.8063323497772217, "learning_rate": 4.706122372717278e-06, "loss": 0.2523, "num_input_tokens_seen": 22283344, "step": 36540 }, { "epoch": 11.338814768848898, "grad_norm": 2.3383595943450928, "learning_rate": 4.704770915953018e-06, "loss": 0.2146, "num_input_tokens_seen": 22286512, "step": 36545 }, { "epoch": 11.340366118523114, "grad_norm": 2.1980016231536865, "learning_rate": 4.7034194808325445e-06, "loss": 0.2077, "num_input_tokens_seen": 22289456, "step": 36550 }, { "epoch": 11.341917468197332, "grad_norm": 1.361922025680542, "learning_rate": 4.702068067454936e-06, "loss": 0.2118, "num_input_tokens_seen": 22292432, "step": 36555 }, { "epoch": 11.343468817871548, "grad_norm": 1.2954046726226807, "learning_rate": 4.7007166759192656e-06, "loss": 0.1837, "num_input_tokens_seen": 22296176, "step": 36560 }, { "epoch": 11.345020167545766, "grad_norm": 1.311081051826477, "learning_rate": 4.699365306324607e-06, "loss": 0.2278, "num_input_tokens_seen": 22299024, "step": 36565 }, { "epoch": 11.346571517219981, "grad_norm": 2.231537103652954, "learning_rate": 4.698013958770032e-06, "loss": 0.2368, "num_input_tokens_seen": 22302352, "step": 36570 }, { "epoch": 11.348122866894197, "grad_norm": 1.8818436861038208, "learning_rate": 4.69666263335461e-06, "loss": 0.2497, "num_input_tokens_seen": 22305264, "step": 36575 }, { "epoch": 11.349674216568415, "grad_norm": 1.5873854160308838, "learning_rate": 4.6953113301774085e-06, "loss": 0.1939, "num_input_tokens_seen": 22307504, "step": 36580 }, { "epoch": 11.351225566242631, "grad_norm": 2.216604232788086, "learning_rate": 4.693960049337495e-06, "loss": 0.2205, "num_input_tokens_seen": 22310256, "step": 36585 }, { "epoch": 11.352776915916849, "grad_norm": 1.9364715814590454, "learning_rate": 4.692608790933932e-06, "loss": 0.227, "num_input_tokens_seen": 22313776, "step": 36590 }, { "epoch": 11.354328265591064, "grad_norm": 1.2791361808776855, "learning_rate": 4.691257555065787e-06, "loss": 0.1942, "num_input_tokens_seen": 22316432, "step": 36595 }, { "epoch": 11.35587961526528, "grad_norm": 1.8118692636489868, "learning_rate": 4.689906341832118e-06, "loss": 0.2324, "num_input_tokens_seen": 22319888, "step": 36600 }, { "epoch": 11.357430964939498, "grad_norm": 3.0307352542877197, "learning_rate": 4.688555151331987e-06, "loss": 0.2109, "num_input_tokens_seen": 22322832, "step": 36605 }, { "epoch": 11.358982314613714, "grad_norm": 1.398008942604065, "learning_rate": 4.687203983664451e-06, "loss": 0.1851, "num_input_tokens_seen": 22326416, "step": 36610 }, { "epoch": 11.36053366428793, "grad_norm": 2.229574203491211, "learning_rate": 4.685852838928568e-06, "loss": 0.2437, "num_input_tokens_seen": 22329232, "step": 36615 }, { "epoch": 11.362085013962147, "grad_norm": 2.234067440032959, "learning_rate": 4.6845017172233905e-06, "loss": 0.2223, "num_input_tokens_seen": 22331856, "step": 36620 }, { "epoch": 11.363636363636363, "grad_norm": 2.338209867477417, "learning_rate": 4.683150618647975e-06, "loss": 0.2531, "num_input_tokens_seen": 22334224, "step": 36625 }, { "epoch": 11.365187713310581, "grad_norm": 1.8563967943191528, "learning_rate": 4.68179954330137e-06, "loss": 0.2488, "num_input_tokens_seen": 22336816, "step": 36630 }, { "epoch": 11.366739062984797, "grad_norm": 2.5474231243133545, "learning_rate": 4.680448491282627e-06, "loss": 0.2185, "num_input_tokens_seen": 22341264, "step": 36635 }, { "epoch": 11.368290412659013, "grad_norm": 1.6352860927581787, "learning_rate": 4.679097462690793e-06, "loss": 0.2161, "num_input_tokens_seen": 22343536, "step": 36640 }, { "epoch": 11.36984176233323, "grad_norm": 1.769062876701355, "learning_rate": 4.677746457624915e-06, "loss": 0.2222, "num_input_tokens_seen": 22346416, "step": 36645 }, { "epoch": 11.371393112007446, "grad_norm": 2.7398784160614014, "learning_rate": 4.676395476184037e-06, "loss": 0.2036, "num_input_tokens_seen": 22349360, "step": 36650 }, { "epoch": 11.372944461681662, "grad_norm": 1.9454975128173828, "learning_rate": 4.675044518467205e-06, "loss": 0.2381, "num_input_tokens_seen": 22352368, "step": 36655 }, { "epoch": 11.37449581135588, "grad_norm": 1.8193483352661133, "learning_rate": 4.673693584573456e-06, "loss": 0.2112, "num_input_tokens_seen": 22355856, "step": 36660 }, { "epoch": 11.376047161030096, "grad_norm": 1.7547876834869385, "learning_rate": 4.672342674601832e-06, "loss": 0.2179, "num_input_tokens_seen": 22360432, "step": 36665 }, { "epoch": 11.377598510704313, "grad_norm": 1.1674245595932007, "learning_rate": 4.6709917886513684e-06, "loss": 0.194, "num_input_tokens_seen": 22363600, "step": 36670 }, { "epoch": 11.37914986037853, "grad_norm": 2.412649393081665, "learning_rate": 4.669640926821103e-06, "loss": 0.2343, "num_input_tokens_seen": 22366000, "step": 36675 }, { "epoch": 11.380701210052745, "grad_norm": 1.5718860626220703, "learning_rate": 4.6682900892100716e-06, "loss": 0.2015, "num_input_tokens_seen": 22368368, "step": 36680 }, { "epoch": 11.382252559726963, "grad_norm": 2.447248935699463, "learning_rate": 4.666939275917304e-06, "loss": 0.2467, "num_input_tokens_seen": 22371376, "step": 36685 }, { "epoch": 11.383803909401179, "grad_norm": 1.4974148273468018, "learning_rate": 4.665588487041831e-06, "loss": 0.2039, "num_input_tokens_seen": 22373872, "step": 36690 }, { "epoch": 11.385355259075396, "grad_norm": 3.0581655502319336, "learning_rate": 4.664237722682683e-06, "loss": 0.1976, "num_input_tokens_seen": 22377040, "step": 36695 }, { "epoch": 11.386906608749612, "grad_norm": 4.406325340270996, "learning_rate": 4.6628869829388855e-06, "loss": 0.2485, "num_input_tokens_seen": 22380368, "step": 36700 }, { "epoch": 11.388457958423828, "grad_norm": 2.938480854034424, "learning_rate": 4.661536267909464e-06, "loss": 0.2731, "num_input_tokens_seen": 22382896, "step": 36705 }, { "epoch": 11.390009308098046, "grad_norm": 1.9176418781280518, "learning_rate": 4.660185577693442e-06, "loss": 0.2631, "num_input_tokens_seen": 22385552, "step": 36710 }, { "epoch": 11.391560657772262, "grad_norm": 2.035534381866455, "learning_rate": 4.658834912389842e-06, "loss": 0.2224, "num_input_tokens_seen": 22388112, "step": 36715 }, { "epoch": 11.39311200744648, "grad_norm": 2.355288028717041, "learning_rate": 4.657484272097681e-06, "loss": 0.2548, "num_input_tokens_seen": 22390544, "step": 36720 }, { "epoch": 11.394663357120695, "grad_norm": 2.400792360305786, "learning_rate": 4.65613365691598e-06, "loss": 0.249, "num_input_tokens_seen": 22393040, "step": 36725 }, { "epoch": 11.396214706794911, "grad_norm": 1.9700363874435425, "learning_rate": 4.654783066943754e-06, "loss": 0.2124, "num_input_tokens_seen": 22395792, "step": 36730 }, { "epoch": 11.397766056469129, "grad_norm": 2.4228932857513428, "learning_rate": 4.653432502280016e-06, "loss": 0.2209, "num_input_tokens_seen": 22399184, "step": 36735 }, { "epoch": 11.399317406143345, "grad_norm": 1.8421272039413452, "learning_rate": 4.652081963023781e-06, "loss": 0.2203, "num_input_tokens_seen": 22401904, "step": 36740 }, { "epoch": 11.40086875581756, "grad_norm": 2.260763168334961, "learning_rate": 4.650731449274059e-06, "loss": 0.2245, "num_input_tokens_seen": 22407120, "step": 36745 }, { "epoch": 11.402420105491778, "grad_norm": 2.8055648803710938, "learning_rate": 4.649380961129856e-06, "loss": 0.2413, "num_input_tokens_seen": 22410672, "step": 36750 }, { "epoch": 11.403971455165994, "grad_norm": 2.0728037357330322, "learning_rate": 4.648030498690181e-06, "loss": 0.206, "num_input_tokens_seen": 22413104, "step": 36755 }, { "epoch": 11.405522804840212, "grad_norm": 1.0447582006454468, "learning_rate": 4.646680062054038e-06, "loss": 0.2538, "num_input_tokens_seen": 22415600, "step": 36760 }, { "epoch": 11.407074154514428, "grad_norm": 3.7489123344421387, "learning_rate": 4.645329651320431e-06, "loss": 0.2361, "num_input_tokens_seen": 22418320, "step": 36765 }, { "epoch": 11.408625504188644, "grad_norm": 1.7843579053878784, "learning_rate": 4.64397926658836e-06, "loss": 0.2359, "num_input_tokens_seen": 22421104, "step": 36770 }, { "epoch": 11.410176853862861, "grad_norm": 2.6448256969451904, "learning_rate": 4.6426289079568255e-06, "loss": 0.2237, "num_input_tokens_seen": 22423696, "step": 36775 }, { "epoch": 11.411728203537077, "grad_norm": 2.1822879314422607, "learning_rate": 4.641278575524823e-06, "loss": 0.2269, "num_input_tokens_seen": 22427600, "step": 36780 }, { "epoch": 11.413279553211293, "grad_norm": 1.459847092628479, "learning_rate": 4.63992826939135e-06, "loss": 0.2366, "num_input_tokens_seen": 22429840, "step": 36785 }, { "epoch": 11.41483090288551, "grad_norm": 2.1123414039611816, "learning_rate": 4.638577989655398e-06, "loss": 0.2374, "num_input_tokens_seen": 22433104, "step": 36790 }, { "epoch": 11.416382252559726, "grad_norm": 1.9669212102890015, "learning_rate": 4.637227736415958e-06, "loss": 0.2196, "num_input_tokens_seen": 22435440, "step": 36795 }, { "epoch": 11.417933602233944, "grad_norm": 1.1279215812683105, "learning_rate": 4.635877509772022e-06, "loss": 0.2173, "num_input_tokens_seen": 22438448, "step": 36800 }, { "epoch": 11.41948495190816, "grad_norm": 1.7207012176513672, "learning_rate": 4.634527309822576e-06, "loss": 0.2193, "num_input_tokens_seen": 22441040, "step": 36805 }, { "epoch": 11.421036301582376, "grad_norm": 2.3061459064483643, "learning_rate": 4.6331771366666065e-06, "loss": 0.2258, "num_input_tokens_seen": 22445488, "step": 36810 }, { "epoch": 11.422587651256594, "grad_norm": 3.189690113067627, "learning_rate": 4.631826990403096e-06, "loss": 0.2468, "num_input_tokens_seen": 22448176, "step": 36815 }, { "epoch": 11.42413900093081, "grad_norm": 1.5744147300720215, "learning_rate": 4.630476871131028e-06, "loss": 0.2236, "num_input_tokens_seen": 22450800, "step": 36820 }, { "epoch": 11.425690350605027, "grad_norm": 2.264425039291382, "learning_rate": 4.629126778949381e-06, "loss": 0.216, "num_input_tokens_seen": 22454000, "step": 36825 }, { "epoch": 11.427241700279243, "grad_norm": 1.7249219417572021, "learning_rate": 4.6277767139571325e-06, "loss": 0.2183, "num_input_tokens_seen": 22457648, "step": 36830 }, { "epoch": 11.428793049953459, "grad_norm": 2.2674684524536133, "learning_rate": 4.626426676253259e-06, "loss": 0.2222, "num_input_tokens_seen": 22460368, "step": 36835 }, { "epoch": 11.430344399627677, "grad_norm": 2.1888465881347656, "learning_rate": 4.625076665936733e-06, "loss": 0.2154, "num_input_tokens_seen": 22463344, "step": 36840 }, { "epoch": 11.431895749301892, "grad_norm": 2.1749935150146484, "learning_rate": 4.623726683106529e-06, "loss": 0.217, "num_input_tokens_seen": 22466064, "step": 36845 }, { "epoch": 11.43344709897611, "grad_norm": 1.7910500764846802, "learning_rate": 4.622376727861613e-06, "loss": 0.2281, "num_input_tokens_seen": 22469456, "step": 36850 }, { "epoch": 11.434998448650326, "grad_norm": 1.1481057405471802, "learning_rate": 4.621026800300954e-06, "loss": 0.2121, "num_input_tokens_seen": 22472272, "step": 36855 }, { "epoch": 11.436549798324542, "grad_norm": 2.1612815856933594, "learning_rate": 4.61967690052352e-06, "loss": 0.2241, "num_input_tokens_seen": 22474576, "step": 36860 }, { "epoch": 11.43810114799876, "grad_norm": 3.2132034301757812, "learning_rate": 4.618327028628272e-06, "loss": 0.2386, "num_input_tokens_seen": 22477904, "step": 36865 }, { "epoch": 11.439652497672975, "grad_norm": 2.6204237937927246, "learning_rate": 4.616977184714173e-06, "loss": 0.221, "num_input_tokens_seen": 22480336, "step": 36870 }, { "epoch": 11.441203847347191, "grad_norm": 1.4858994483947754, "learning_rate": 4.615627368880182e-06, "loss": 0.1973, "num_input_tokens_seen": 22483888, "step": 36875 }, { "epoch": 11.442755197021409, "grad_norm": 1.8225842714309692, "learning_rate": 4.6142775812252565e-06, "loss": 0.2288, "num_input_tokens_seen": 22486704, "step": 36880 }, { "epoch": 11.444306546695625, "grad_norm": 3.181269884109497, "learning_rate": 4.612927821848352e-06, "loss": 0.2348, "num_input_tokens_seen": 22489712, "step": 36885 }, { "epoch": 11.445857896369843, "grad_norm": 2.0329976081848145, "learning_rate": 4.611578090848422e-06, "loss": 0.2404, "num_input_tokens_seen": 22492144, "step": 36890 }, { "epoch": 11.447409246044058, "grad_norm": 2.640293598175049, "learning_rate": 4.6102283883244166e-06, "loss": 0.2264, "num_input_tokens_seen": 22494416, "step": 36895 }, { "epoch": 11.448960595718274, "grad_norm": 1.7971464395523071, "learning_rate": 4.608878714375287e-06, "loss": 0.1903, "num_input_tokens_seen": 22497008, "step": 36900 }, { "epoch": 11.450511945392492, "grad_norm": 1.5262051820755005, "learning_rate": 4.6075290690999785e-06, "loss": 0.2304, "num_input_tokens_seen": 22499600, "step": 36905 }, { "epoch": 11.452063295066708, "grad_norm": 3.3215792179107666, "learning_rate": 4.6061794525974365e-06, "loss": 0.2191, "num_input_tokens_seen": 22502416, "step": 36910 }, { "epoch": 11.453614644740924, "grad_norm": 2.7120423316955566, "learning_rate": 4.604829864966604e-06, "loss": 0.219, "num_input_tokens_seen": 22505520, "step": 36915 }, { "epoch": 11.455165994415141, "grad_norm": 3.327101469039917, "learning_rate": 4.603480306306423e-06, "loss": 0.2359, "num_input_tokens_seen": 22508400, "step": 36920 }, { "epoch": 11.456717344089357, "grad_norm": 2.9258484840393066, "learning_rate": 4.60213077671583e-06, "loss": 0.2725, "num_input_tokens_seen": 22511792, "step": 36925 }, { "epoch": 11.458268693763575, "grad_norm": 1.5976327657699585, "learning_rate": 4.600781276293764e-06, "loss": 0.2174, "num_input_tokens_seen": 22515824, "step": 36930 }, { "epoch": 11.45982004343779, "grad_norm": 2.6894466876983643, "learning_rate": 4.599431805139157e-06, "loss": 0.2347, "num_input_tokens_seen": 22518448, "step": 36935 }, { "epoch": 11.461371393112007, "grad_norm": 1.6177878379821777, "learning_rate": 4.5980823633509416e-06, "loss": 0.222, "num_input_tokens_seen": 22520752, "step": 36940 }, { "epoch": 11.462922742786224, "grad_norm": 1.71908438205719, "learning_rate": 4.596732951028049e-06, "loss": 0.2202, "num_input_tokens_seen": 22523504, "step": 36945 }, { "epoch": 11.46447409246044, "grad_norm": 2.075066328048706, "learning_rate": 4.595383568269406e-06, "loss": 0.2284, "num_input_tokens_seen": 22526416, "step": 36950 }, { "epoch": 11.466025442134658, "grad_norm": 2.218520402908325, "learning_rate": 4.59403421517394e-06, "loss": 0.232, "num_input_tokens_seen": 22528976, "step": 36955 }, { "epoch": 11.467576791808874, "grad_norm": 1.8872783184051514, "learning_rate": 4.592684891840572e-06, "loss": 0.2868, "num_input_tokens_seen": 22532176, "step": 36960 }, { "epoch": 11.46912814148309, "grad_norm": 2.3193562030792236, "learning_rate": 4.5913355983682266e-06, "loss": 0.2194, "num_input_tokens_seen": 22535280, "step": 36965 }, { "epoch": 11.470679491157307, "grad_norm": 3.5351321697235107, "learning_rate": 4.589986334855819e-06, "loss": 0.2491, "num_input_tokens_seen": 22538288, "step": 36970 }, { "epoch": 11.472230840831523, "grad_norm": 1.9120694398880005, "learning_rate": 4.588637101402269e-06, "loss": 0.2328, "num_input_tokens_seen": 22541328, "step": 36975 }, { "epoch": 11.47378219050574, "grad_norm": 1.7318732738494873, "learning_rate": 4.587287898106491e-06, "loss": 0.2265, "num_input_tokens_seen": 22544016, "step": 36980 }, { "epoch": 11.475333540179957, "grad_norm": 2.4610774517059326, "learning_rate": 4.585938725067397e-06, "loss": 0.2162, "num_input_tokens_seen": 22547216, "step": 36985 }, { "epoch": 11.476884889854173, "grad_norm": 1.8541258573532104, "learning_rate": 4.584589582383898e-06, "loss": 0.2114, "num_input_tokens_seen": 22549552, "step": 36990 }, { "epoch": 11.47843623952839, "grad_norm": 1.3147449493408203, "learning_rate": 4.583240470154902e-06, "loss": 0.2093, "num_input_tokens_seen": 22551920, "step": 36995 }, { "epoch": 11.479987589202606, "grad_norm": 1.3397926092147827, "learning_rate": 4.5818913884793146e-06, "loss": 0.2046, "num_input_tokens_seen": 22554736, "step": 37000 }, { "epoch": 11.481538938876822, "grad_norm": 0.887849748134613, "learning_rate": 4.580542337456039e-06, "loss": 0.2362, "num_input_tokens_seen": 22557520, "step": 37005 }, { "epoch": 11.48309028855104, "grad_norm": 2.7263243198394775, "learning_rate": 4.5791933171839784e-06, "loss": 0.2455, "num_input_tokens_seen": 22560528, "step": 37010 }, { "epoch": 11.484641638225256, "grad_norm": 1.2098913192749023, "learning_rate": 4.57784432776203e-06, "loss": 0.231, "num_input_tokens_seen": 22562864, "step": 37015 }, { "epoch": 11.486192987899473, "grad_norm": 1.6132378578186035, "learning_rate": 4.576495369289091e-06, "loss": 0.2291, "num_input_tokens_seen": 22565680, "step": 37020 }, { "epoch": 11.48774433757369, "grad_norm": 1.3905683755874634, "learning_rate": 4.575146441864057e-06, "loss": 0.2549, "num_input_tokens_seen": 22568176, "step": 37025 }, { "epoch": 11.489295687247905, "grad_norm": 0.9968926310539246, "learning_rate": 4.57379754558582e-06, "loss": 0.2081, "num_input_tokens_seen": 22571440, "step": 37030 }, { "epoch": 11.490847036922123, "grad_norm": 0.910910427570343, "learning_rate": 4.572448680553268e-06, "loss": 0.1916, "num_input_tokens_seen": 22574832, "step": 37035 }, { "epoch": 11.492398386596339, "grad_norm": 0.9860670566558838, "learning_rate": 4.571099846865293e-06, "loss": 0.2172, "num_input_tokens_seen": 22577520, "step": 37040 }, { "epoch": 11.493949736270554, "grad_norm": 1.3640624284744263, "learning_rate": 4.5697510446207775e-06, "loss": 0.2177, "num_input_tokens_seen": 22580752, "step": 37045 }, { "epoch": 11.495501085944772, "grad_norm": 1.2242525815963745, "learning_rate": 4.568402273918606e-06, "loss": 0.2212, "num_input_tokens_seen": 22583344, "step": 37050 }, { "epoch": 11.497052435618988, "grad_norm": 1.438359022140503, "learning_rate": 4.567053534857658e-06, "loss": 0.2287, "num_input_tokens_seen": 22585872, "step": 37055 }, { "epoch": 11.498603785293206, "grad_norm": 1.4666731357574463, "learning_rate": 4.565704827536813e-06, "loss": 0.254, "num_input_tokens_seen": 22589776, "step": 37060 }, { "epoch": 11.500155134967422, "grad_norm": 1.4708791971206665, "learning_rate": 4.564356152054946e-06, "loss": 0.2019, "num_input_tokens_seen": 22593488, "step": 37065 }, { "epoch": 11.501706484641637, "grad_norm": 3.167440891265869, "learning_rate": 4.5630075085109336e-06, "loss": 0.2658, "num_input_tokens_seen": 22596848, "step": 37070 }, { "epoch": 11.503257834315855, "grad_norm": 1.3964157104492188, "learning_rate": 4.561658897003644e-06, "loss": 0.2259, "num_input_tokens_seen": 22599792, "step": 37075 }, { "epoch": 11.504809183990071, "grad_norm": 0.8204829692840576, "learning_rate": 4.560310317631949e-06, "loss": 0.2275, "num_input_tokens_seen": 22602416, "step": 37080 }, { "epoch": 11.506360533664289, "grad_norm": 1.0390335321426392, "learning_rate": 4.558961770494714e-06, "loss": 0.2168, "num_input_tokens_seen": 22605392, "step": 37085 }, { "epoch": 11.507911883338505, "grad_norm": 1.0219210386276245, "learning_rate": 4.557613255690803e-06, "loss": 0.2194, "num_input_tokens_seen": 22608208, "step": 37090 }, { "epoch": 11.50946323301272, "grad_norm": 0.9174310564994812, "learning_rate": 4.55626477331908e-06, "loss": 0.2185, "num_input_tokens_seen": 22611056, "step": 37095 }, { "epoch": 11.511014582686938, "grad_norm": 1.5136058330535889, "learning_rate": 4.554916323478404e-06, "loss": 0.2171, "num_input_tokens_seen": 22613552, "step": 37100 }, { "epoch": 11.512565932361154, "grad_norm": 1.4352805614471436, "learning_rate": 4.55356790626763e-06, "loss": 0.227, "num_input_tokens_seen": 22617520, "step": 37105 }, { "epoch": 11.514117282035372, "grad_norm": 0.9766514301300049, "learning_rate": 4.552219521785618e-06, "loss": 0.215, "num_input_tokens_seen": 22621584, "step": 37110 }, { "epoch": 11.515668631709588, "grad_norm": 1.0626896619796753, "learning_rate": 4.550871170131216e-06, "loss": 0.2295, "num_input_tokens_seen": 22624848, "step": 37115 }, { "epoch": 11.517219981383803, "grad_norm": 2.701280355453491, "learning_rate": 4.549522851403276e-06, "loss": 0.2261, "num_input_tokens_seen": 22627024, "step": 37120 }, { "epoch": 11.518771331058021, "grad_norm": 1.9169641733169556, "learning_rate": 4.548174565700645e-06, "loss": 0.2127, "num_input_tokens_seen": 22632816, "step": 37125 }, { "epoch": 11.520322680732237, "grad_norm": 1.4691752195358276, "learning_rate": 4.54682631312217e-06, "loss": 0.2246, "num_input_tokens_seen": 22635632, "step": 37130 }, { "epoch": 11.521874030406453, "grad_norm": 2.016200065612793, "learning_rate": 4.545478093766692e-06, "loss": 0.2334, "num_input_tokens_seen": 22639568, "step": 37135 }, { "epoch": 11.52342538008067, "grad_norm": 3.7986600399017334, "learning_rate": 4.5441299077330515e-06, "loss": 0.2304, "num_input_tokens_seen": 22643408, "step": 37140 }, { "epoch": 11.524976729754886, "grad_norm": 2.026876926422119, "learning_rate": 4.542781755120087e-06, "loss": 0.239, "num_input_tokens_seen": 22645840, "step": 37145 }, { "epoch": 11.526528079429104, "grad_norm": 0.9796133637428284, "learning_rate": 4.541433636026636e-06, "loss": 0.2186, "num_input_tokens_seen": 22648688, "step": 37150 }, { "epoch": 11.52807942910332, "grad_norm": 0.931355893611908, "learning_rate": 4.540085550551527e-06, "loss": 0.2144, "num_input_tokens_seen": 22651632, "step": 37155 }, { "epoch": 11.529630778777536, "grad_norm": 1.7316898107528687, "learning_rate": 4.538737498793594e-06, "loss": 0.2176, "num_input_tokens_seen": 22654800, "step": 37160 }, { "epoch": 11.531182128451753, "grad_norm": 2.601966381072998, "learning_rate": 4.537389480851665e-06, "loss": 0.2063, "num_input_tokens_seen": 22657840, "step": 37165 }, { "epoch": 11.53273347812597, "grad_norm": 2.248762607574463, "learning_rate": 4.5360414968245654e-06, "loss": 0.227, "num_input_tokens_seen": 22660656, "step": 37170 }, { "epoch": 11.534284827800185, "grad_norm": 2.3327362537384033, "learning_rate": 4.534693546811119e-06, "loss": 0.2328, "num_input_tokens_seen": 22663920, "step": 37175 }, { "epoch": 11.535836177474403, "grad_norm": 1.348888874053955, "learning_rate": 4.5333456309101445e-06, "loss": 0.2245, "num_input_tokens_seen": 22666960, "step": 37180 }, { "epoch": 11.537387527148619, "grad_norm": 1.7424206733703613, "learning_rate": 4.531997749220462e-06, "loss": 0.219, "num_input_tokens_seen": 22669552, "step": 37185 }, { "epoch": 11.538938876822836, "grad_norm": 0.8107620477676392, "learning_rate": 4.530649901840886e-06, "loss": 0.217, "num_input_tokens_seen": 22672112, "step": 37190 }, { "epoch": 11.540490226497052, "grad_norm": 1.0222468376159668, "learning_rate": 4.52930208887023e-06, "loss": 0.2216, "num_input_tokens_seen": 22675184, "step": 37195 }, { "epoch": 11.542041576171268, "grad_norm": 3.1802122592926025, "learning_rate": 4.5279543104073044e-06, "loss": 0.2331, "num_input_tokens_seen": 22680560, "step": 37200 }, { "epoch": 11.543592925845486, "grad_norm": 1.5430891513824463, "learning_rate": 4.526606566550918e-06, "loss": 0.2522, "num_input_tokens_seen": 22682992, "step": 37205 }, { "epoch": 11.545144275519702, "grad_norm": 1.408046841621399, "learning_rate": 4.5252588573998765e-06, "loss": 0.2369, "num_input_tokens_seen": 22685808, "step": 37210 }, { "epoch": 11.54669562519392, "grad_norm": 2.3690738677978516, "learning_rate": 4.5239111830529815e-06, "loss": 0.2354, "num_input_tokens_seen": 22689136, "step": 37215 }, { "epoch": 11.548246974868135, "grad_norm": 3.9737191200256348, "learning_rate": 4.5225635436090354e-06, "loss": 0.2279, "num_input_tokens_seen": 22691472, "step": 37220 }, { "epoch": 11.549798324542351, "grad_norm": 0.9919840693473816, "learning_rate": 4.521215939166835e-06, "loss": 0.2389, "num_input_tokens_seen": 22694064, "step": 37225 }, { "epoch": 11.551349674216569, "grad_norm": 1.767237901687622, "learning_rate": 4.519868369825177e-06, "loss": 0.2201, "num_input_tokens_seen": 22696816, "step": 37230 }, { "epoch": 11.552901023890785, "grad_norm": 1.2275205850601196, "learning_rate": 4.518520835682852e-06, "loss": 0.2555, "num_input_tokens_seen": 22699664, "step": 37235 }, { "epoch": 11.554452373565002, "grad_norm": 2.2032294273376465, "learning_rate": 4.517173336838652e-06, "loss": 0.2332, "num_input_tokens_seen": 22703600, "step": 37240 }, { "epoch": 11.556003723239218, "grad_norm": 1.117558479309082, "learning_rate": 4.515825873391363e-06, "loss": 0.2063, "num_input_tokens_seen": 22706128, "step": 37245 }, { "epoch": 11.557555072913434, "grad_norm": 1.093548059463501, "learning_rate": 4.5144784454397695e-06, "loss": 0.2216, "num_input_tokens_seen": 22709232, "step": 37250 }, { "epoch": 11.559106422587652, "grad_norm": 1.4372403621673584, "learning_rate": 4.513131053082657e-06, "loss": 0.2307, "num_input_tokens_seen": 22712720, "step": 37255 }, { "epoch": 11.560657772261868, "grad_norm": 1.0170727968215942, "learning_rate": 4.511783696418804e-06, "loss": 0.1993, "num_input_tokens_seen": 22717264, "step": 37260 }, { "epoch": 11.562209121936084, "grad_norm": 1.0163882970809937, "learning_rate": 4.510436375546988e-06, "loss": 0.1989, "num_input_tokens_seen": 22720368, "step": 37265 }, { "epoch": 11.563760471610301, "grad_norm": 2.1119582653045654, "learning_rate": 4.509089090565982e-06, "loss": 0.2215, "num_input_tokens_seen": 22725232, "step": 37270 }, { "epoch": 11.565311821284517, "grad_norm": 1.7553834915161133, "learning_rate": 4.507741841574559e-06, "loss": 0.2308, "num_input_tokens_seen": 22728368, "step": 37275 }, { "epoch": 11.566863170958735, "grad_norm": 1.4500136375427246, "learning_rate": 4.506394628671489e-06, "loss": 0.2503, "num_input_tokens_seen": 22730704, "step": 37280 }, { "epoch": 11.56841452063295, "grad_norm": 1.4106179475784302, "learning_rate": 4.505047451955537e-06, "loss": 0.219, "num_input_tokens_seen": 22734416, "step": 37285 }, { "epoch": 11.569965870307167, "grad_norm": 1.2334407567977905, "learning_rate": 4.5037003115254685e-06, "loss": 0.2164, "num_input_tokens_seen": 22737360, "step": 37290 }, { "epoch": 11.571517219981384, "grad_norm": 1.448184609413147, "learning_rate": 4.502353207480044e-06, "loss": 0.2364, "num_input_tokens_seen": 22740816, "step": 37295 }, { "epoch": 11.5730685696556, "grad_norm": 1.3056772947311401, "learning_rate": 4.501006139918021e-06, "loss": 0.238, "num_input_tokens_seen": 22744592, "step": 37300 }, { "epoch": 11.574619919329816, "grad_norm": 1.0776687860488892, "learning_rate": 4.499659108938159e-06, "loss": 0.1981, "num_input_tokens_seen": 22747824, "step": 37305 }, { "epoch": 11.576171269004034, "grad_norm": 2.021156072616577, "learning_rate": 4.498312114639209e-06, "loss": 0.2092, "num_input_tokens_seen": 22753296, "step": 37310 }, { "epoch": 11.57772261867825, "grad_norm": 2.3037214279174805, "learning_rate": 4.496965157119921e-06, "loss": 0.2166, "num_input_tokens_seen": 22755760, "step": 37315 }, { "epoch": 11.579273968352467, "grad_norm": 3.4790725708007812, "learning_rate": 4.495618236479045e-06, "loss": 0.24, "num_input_tokens_seen": 22758320, "step": 37320 }, { "epoch": 11.580825318026683, "grad_norm": 2.8133513927459717, "learning_rate": 4.494271352815324e-06, "loss": 0.2206, "num_input_tokens_seen": 22761360, "step": 37325 }, { "epoch": 11.582376667700899, "grad_norm": 1.3128060102462769, "learning_rate": 4.4929245062275016e-06, "loss": 0.2989, "num_input_tokens_seen": 22764816, "step": 37330 }, { "epoch": 11.583928017375117, "grad_norm": 2.5385360717773438, "learning_rate": 4.491577696814318e-06, "loss": 0.2119, "num_input_tokens_seen": 22768080, "step": 37335 }, { "epoch": 11.585479367049333, "grad_norm": 2.1771228313446045, "learning_rate": 4.49023092467451e-06, "loss": 0.2434, "num_input_tokens_seen": 22773360, "step": 37340 }, { "epoch": 11.58703071672355, "grad_norm": 1.5582118034362793, "learning_rate": 4.488884189906811e-06, "loss": 0.2164, "num_input_tokens_seen": 22776208, "step": 37345 }, { "epoch": 11.588582066397766, "grad_norm": 1.8349692821502686, "learning_rate": 4.487537492609954e-06, "loss": 0.2405, "num_input_tokens_seen": 22778960, "step": 37350 }, { "epoch": 11.590133416071982, "grad_norm": 2.1132447719573975, "learning_rate": 4.486190832882667e-06, "loss": 0.2267, "num_input_tokens_seen": 22781328, "step": 37355 }, { "epoch": 11.5916847657462, "grad_norm": 1.3469724655151367, "learning_rate": 4.484844210823676e-06, "loss": 0.2404, "num_input_tokens_seen": 22783824, "step": 37360 }, { "epoch": 11.593236115420416, "grad_norm": 1.4323855638504028, "learning_rate": 4.483497626531705e-06, "loss": 0.2505, "num_input_tokens_seen": 22786384, "step": 37365 }, { "epoch": 11.594787465094633, "grad_norm": 1.5720868110656738, "learning_rate": 4.482151080105475e-06, "loss": 0.2407, "num_input_tokens_seen": 22789232, "step": 37370 }, { "epoch": 11.596338814768849, "grad_norm": 2.265519142150879, "learning_rate": 4.480804571643702e-06, "loss": 0.2478, "num_input_tokens_seen": 22792304, "step": 37375 }, { "epoch": 11.597890164443065, "grad_norm": 2.019698143005371, "learning_rate": 4.479458101245103e-06, "loss": 0.1947, "num_input_tokens_seen": 22796464, "step": 37380 }, { "epoch": 11.599441514117283, "grad_norm": 2.6417555809020996, "learning_rate": 4.478111669008388e-06, "loss": 0.2123, "num_input_tokens_seen": 22799248, "step": 37385 }, { "epoch": 11.600992863791499, "grad_norm": 1.2647830247879028, "learning_rate": 4.476765275032269e-06, "loss": 0.2537, "num_input_tokens_seen": 22801904, "step": 37390 }, { "epoch": 11.602544213465714, "grad_norm": 1.131502389907837, "learning_rate": 4.475418919415451e-06, "loss": 0.2391, "num_input_tokens_seen": 22804656, "step": 37395 }, { "epoch": 11.604095563139932, "grad_norm": 1.7882544994354248, "learning_rate": 4.474072602256639e-06, "loss": 0.2116, "num_input_tokens_seen": 22808432, "step": 37400 }, { "epoch": 11.605646912814148, "grad_norm": 0.8851901292800903, "learning_rate": 4.472726323654532e-06, "loss": 0.2242, "num_input_tokens_seen": 22811024, "step": 37405 }, { "epoch": 11.607198262488366, "grad_norm": 3.0861713886260986, "learning_rate": 4.47138008370783e-06, "loss": 0.2378, "num_input_tokens_seen": 22814032, "step": 37410 }, { "epoch": 11.608749612162581, "grad_norm": 1.506056785583496, "learning_rate": 4.4700338825152275e-06, "loss": 0.2317, "num_input_tokens_seen": 22817072, "step": 37415 }, { "epoch": 11.610300961836797, "grad_norm": 1.0068100690841675, "learning_rate": 4.468687720175416e-06, "loss": 0.2133, "num_input_tokens_seen": 22819472, "step": 37420 }, { "epoch": 11.611852311511015, "grad_norm": 2.1727726459503174, "learning_rate": 4.467341596787087e-06, "loss": 0.2392, "num_input_tokens_seen": 22822416, "step": 37425 }, { "epoch": 11.613403661185231, "grad_norm": 2.055208444595337, "learning_rate": 4.4659955124489265e-06, "loss": 0.2184, "num_input_tokens_seen": 22825072, "step": 37430 }, { "epoch": 11.614955010859447, "grad_norm": 2.287276268005371, "learning_rate": 4.464649467259619e-06, "loss": 0.2361, "num_input_tokens_seen": 22828144, "step": 37435 }, { "epoch": 11.616506360533664, "grad_norm": 1.6405234336853027, "learning_rate": 4.463303461317846e-06, "loss": 0.2221, "num_input_tokens_seen": 22831024, "step": 37440 }, { "epoch": 11.61805771020788, "grad_norm": 1.7077549695968628, "learning_rate": 4.461957494722284e-06, "loss": 0.2356, "num_input_tokens_seen": 22834768, "step": 37445 }, { "epoch": 11.619609059882098, "grad_norm": 1.8903392553329468, "learning_rate": 4.460611567571608e-06, "loss": 0.234, "num_input_tokens_seen": 22838096, "step": 37450 }, { "epoch": 11.621160409556314, "grad_norm": 1.7907649278640747, "learning_rate": 4.459265679964491e-06, "loss": 0.2284, "num_input_tokens_seen": 22842384, "step": 37455 }, { "epoch": 11.62271175923053, "grad_norm": 2.5374927520751953, "learning_rate": 4.457919831999605e-06, "loss": 0.2345, "num_input_tokens_seen": 22845392, "step": 37460 }, { "epoch": 11.624263108904747, "grad_norm": 1.1672357320785522, "learning_rate": 4.456574023775614e-06, "loss": 0.2094, "num_input_tokens_seen": 22848464, "step": 37465 }, { "epoch": 11.625814458578963, "grad_norm": 2.745572566986084, "learning_rate": 4.455228255391182e-06, "loss": 0.2222, "num_input_tokens_seen": 22851376, "step": 37470 }, { "epoch": 11.627365808253181, "grad_norm": 1.3259621858596802, "learning_rate": 4.45388252694497e-06, "loss": 0.2179, "num_input_tokens_seen": 22853520, "step": 37475 }, { "epoch": 11.628917157927397, "grad_norm": 0.9886924028396606, "learning_rate": 4.452536838535633e-06, "loss": 0.2165, "num_input_tokens_seen": 22856560, "step": 37480 }, { "epoch": 11.630468507601613, "grad_norm": 1.583170771598816, "learning_rate": 4.451191190261832e-06, "loss": 0.229, "num_input_tokens_seen": 22860272, "step": 37485 }, { "epoch": 11.63201985727583, "grad_norm": 1.76775062084198, "learning_rate": 4.4498455822222146e-06, "loss": 0.2137, "num_input_tokens_seen": 22863248, "step": 37490 }, { "epoch": 11.633571206950046, "grad_norm": 2.2064616680145264, "learning_rate": 4.448500014515431e-06, "loss": 0.2586, "num_input_tokens_seen": 22866384, "step": 37495 }, { "epoch": 11.635122556624264, "grad_norm": 0.9743055701255798, "learning_rate": 4.447154487240126e-06, "loss": 0.2027, "num_input_tokens_seen": 22870384, "step": 37500 }, { "epoch": 11.63667390629848, "grad_norm": 0.9394690990447998, "learning_rate": 4.445809000494945e-06, "loss": 0.2308, "num_input_tokens_seen": 22872848, "step": 37505 }, { "epoch": 11.638225255972696, "grad_norm": 1.3304427862167358, "learning_rate": 4.444463554378527e-06, "loss": 0.2512, "num_input_tokens_seen": 22875568, "step": 37510 }, { "epoch": 11.639776605646913, "grad_norm": 2.2355473041534424, "learning_rate": 4.443118148989509e-06, "loss": 0.2173, "num_input_tokens_seen": 22879536, "step": 37515 }, { "epoch": 11.64132795532113, "grad_norm": 2.5450408458709717, "learning_rate": 4.441772784426524e-06, "loss": 0.2421, "num_input_tokens_seen": 22882480, "step": 37520 }, { "epoch": 11.642879304995345, "grad_norm": 1.5624322891235352, "learning_rate": 4.440427460788206e-06, "loss": 0.21, "num_input_tokens_seen": 22885200, "step": 37525 }, { "epoch": 11.644430654669563, "grad_norm": 1.1461735963821411, "learning_rate": 4.43908217817318e-06, "loss": 0.2104, "num_input_tokens_seen": 22887696, "step": 37530 }, { "epoch": 11.645982004343779, "grad_norm": 2.099801540374756, "learning_rate": 4.4377369366800735e-06, "loss": 0.2055, "num_input_tokens_seen": 22890768, "step": 37535 }, { "epoch": 11.647533354017996, "grad_norm": 1.9794518947601318, "learning_rate": 4.436391736407507e-06, "loss": 0.2199, "num_input_tokens_seen": 22893744, "step": 37540 }, { "epoch": 11.649084703692212, "grad_norm": 3.277553081512451, "learning_rate": 4.4350465774541e-06, "loss": 0.2156, "num_input_tokens_seen": 22896784, "step": 37545 }, { "epoch": 11.650636053366428, "grad_norm": 2.0959222316741943, "learning_rate": 4.43370145991847e-06, "loss": 0.1928, "num_input_tokens_seen": 22899376, "step": 37550 }, { "epoch": 11.652187403040646, "grad_norm": 1.2957680225372314, "learning_rate": 4.432356383899229e-06, "loss": 0.226, "num_input_tokens_seen": 22901680, "step": 37555 }, { "epoch": 11.653738752714862, "grad_norm": 1.5642428398132324, "learning_rate": 4.431011349494986e-06, "loss": 0.2262, "num_input_tokens_seen": 22904816, "step": 37560 }, { "epoch": 11.655290102389078, "grad_norm": 1.5025911331176758, "learning_rate": 4.42966635680435e-06, "loss": 0.2252, "num_input_tokens_seen": 22907504, "step": 37565 }, { "epoch": 11.656841452063295, "grad_norm": 1.8187168836593628, "learning_rate": 4.428321405925923e-06, "loss": 0.2422, "num_input_tokens_seen": 22910736, "step": 37570 }, { "epoch": 11.658392801737511, "grad_norm": 2.294965982437134, "learning_rate": 4.426976496958306e-06, "loss": 0.2273, "num_input_tokens_seen": 22913424, "step": 37575 }, { "epoch": 11.659944151411729, "grad_norm": 2.389626979827881, "learning_rate": 4.425631630000099e-06, "loss": 0.2057, "num_input_tokens_seen": 22916624, "step": 37580 }, { "epoch": 11.661495501085945, "grad_norm": 2.106283664703369, "learning_rate": 4.424286805149895e-06, "loss": 0.2285, "num_input_tokens_seen": 22919472, "step": 37585 }, { "epoch": 11.66304685076016, "grad_norm": 1.339020848274231, "learning_rate": 4.422942022506285e-06, "loss": 0.2098, "num_input_tokens_seen": 22922192, "step": 37590 }, { "epoch": 11.664598200434378, "grad_norm": 1.4112440347671509, "learning_rate": 4.421597282167858e-06, "loss": 0.2616, "num_input_tokens_seen": 22925392, "step": 37595 }, { "epoch": 11.666149550108594, "grad_norm": 0.7276375889778137, "learning_rate": 4.4202525842332e-06, "loss": 0.2298, "num_input_tokens_seen": 22927600, "step": 37600 }, { "epoch": 11.667700899782812, "grad_norm": 1.5332300662994385, "learning_rate": 4.418907928800891e-06, "loss": 0.2164, "num_input_tokens_seen": 22930608, "step": 37605 }, { "epoch": 11.669252249457028, "grad_norm": 2.5021698474884033, "learning_rate": 4.417563315969513e-06, "loss": 0.2269, "num_input_tokens_seen": 22933776, "step": 37610 }, { "epoch": 11.670803599131244, "grad_norm": 1.4877550601959229, "learning_rate": 4.416218745837642e-06, "loss": 0.2057, "num_input_tokens_seen": 22937136, "step": 37615 }, { "epoch": 11.672354948805461, "grad_norm": 1.4358279705047607, "learning_rate": 4.414874218503849e-06, "loss": 0.2437, "num_input_tokens_seen": 22940304, "step": 37620 }, { "epoch": 11.673906298479677, "grad_norm": 1.5861196517944336, "learning_rate": 4.413529734066705e-06, "loss": 0.1588, "num_input_tokens_seen": 22943152, "step": 37625 }, { "epoch": 11.675457648153895, "grad_norm": 1.7038654088974, "learning_rate": 4.412185292624777e-06, "loss": 0.225, "num_input_tokens_seen": 22947344, "step": 37630 }, { "epoch": 11.67700899782811, "grad_norm": 1.956652045249939, "learning_rate": 4.410840894276627e-06, "loss": 0.2622, "num_input_tokens_seen": 22950832, "step": 37635 }, { "epoch": 11.678560347502327, "grad_norm": 2.1046855449676514, "learning_rate": 4.409496539120817e-06, "loss": 0.2123, "num_input_tokens_seen": 22953904, "step": 37640 }, { "epoch": 11.680111697176544, "grad_norm": 1.3521207571029663, "learning_rate": 4.408152227255902e-06, "loss": 0.2539, "num_input_tokens_seen": 22957232, "step": 37645 }, { "epoch": 11.68166304685076, "grad_norm": 1.3073186874389648, "learning_rate": 4.4068079587804385e-06, "loss": 0.2517, "num_input_tokens_seen": 22960336, "step": 37650 }, { "epoch": 11.683214396524976, "grad_norm": 2.3395955562591553, "learning_rate": 4.405463733792975e-06, "loss": 0.2716, "num_input_tokens_seen": 22964016, "step": 37655 }, { "epoch": 11.684765746199194, "grad_norm": 1.735650658607483, "learning_rate": 4.404119552392061e-06, "loss": 0.2196, "num_input_tokens_seen": 22967792, "step": 37660 }, { "epoch": 11.68631709587341, "grad_norm": 2.157459020614624, "learning_rate": 4.402775414676239e-06, "loss": 0.2389, "num_input_tokens_seen": 22969904, "step": 37665 }, { "epoch": 11.687868445547627, "grad_norm": 1.525075912475586, "learning_rate": 4.401431320744052e-06, "loss": 0.2075, "num_input_tokens_seen": 22972208, "step": 37670 }, { "epoch": 11.689419795221843, "grad_norm": 1.4496365785598755, "learning_rate": 4.400087270694038e-06, "loss": 0.2349, "num_input_tokens_seen": 22975920, "step": 37675 }, { "epoch": 11.690971144896059, "grad_norm": 1.8678650856018066, "learning_rate": 4.39874326462473e-06, "loss": 0.2235, "num_input_tokens_seen": 22978864, "step": 37680 }, { "epoch": 11.692522494570277, "grad_norm": 0.9343425631523132, "learning_rate": 4.397399302634661e-06, "loss": 0.2089, "num_input_tokens_seen": 22981808, "step": 37685 }, { "epoch": 11.694073844244492, "grad_norm": 1.3071057796478271, "learning_rate": 4.39605538482236e-06, "loss": 0.1924, "num_input_tokens_seen": 22986416, "step": 37690 }, { "epoch": 11.695625193918708, "grad_norm": 3.027722120285034, "learning_rate": 4.39471151128635e-06, "loss": 0.2303, "num_input_tokens_seen": 22991952, "step": 37695 }, { "epoch": 11.697176543592926, "grad_norm": 1.2855634689331055, "learning_rate": 4.393367682125154e-06, "loss": 0.2291, "num_input_tokens_seen": 22995472, "step": 37700 }, { "epoch": 11.698727893267142, "grad_norm": 1.4652190208435059, "learning_rate": 4.392023897437291e-06, "loss": 0.235, "num_input_tokens_seen": 22998736, "step": 37705 }, { "epoch": 11.70027924294136, "grad_norm": 3.625295639038086, "learning_rate": 4.390680157321275e-06, "loss": 0.2438, "num_input_tokens_seen": 23001360, "step": 37710 }, { "epoch": 11.701830592615575, "grad_norm": 1.6174900531768799, "learning_rate": 4.38933646187562e-06, "loss": 0.2269, "num_input_tokens_seen": 23003792, "step": 37715 }, { "epoch": 11.703381942289791, "grad_norm": 1.5138401985168457, "learning_rate": 4.387992811198832e-06, "loss": 0.2276, "num_input_tokens_seen": 23007696, "step": 37720 }, { "epoch": 11.704933291964009, "grad_norm": 1.3306790590286255, "learning_rate": 4.3866492053894185e-06, "loss": 0.2318, "num_input_tokens_seen": 23010448, "step": 37725 }, { "epoch": 11.706484641638225, "grad_norm": 1.1506816148757935, "learning_rate": 4.385305644545882e-06, "loss": 0.1904, "num_input_tokens_seen": 23012688, "step": 37730 }, { "epoch": 11.708035991312443, "grad_norm": 1.1470741033554077, "learning_rate": 4.383962128766721e-06, "loss": 0.2293, "num_input_tokens_seen": 23015376, "step": 37735 }, { "epoch": 11.709587340986658, "grad_norm": 1.4160668849945068, "learning_rate": 4.382618658150431e-06, "loss": 0.2171, "num_input_tokens_seen": 23017712, "step": 37740 }, { "epoch": 11.711138690660874, "grad_norm": 1.0976755619049072, "learning_rate": 4.381275232795504e-06, "loss": 0.2297, "num_input_tokens_seen": 23020848, "step": 37745 }, { "epoch": 11.712690040335092, "grad_norm": 1.819267749786377, "learning_rate": 4.37993185280043e-06, "loss": 0.2002, "num_input_tokens_seen": 23023632, "step": 37750 }, { "epoch": 11.714241390009308, "grad_norm": 3.1219847202301025, "learning_rate": 4.378588518263693e-06, "loss": 0.2308, "num_input_tokens_seen": 23027664, "step": 37755 }, { "epoch": 11.715792739683526, "grad_norm": 1.5148682594299316, "learning_rate": 4.377245229283776e-06, "loss": 0.227, "num_input_tokens_seen": 23031312, "step": 37760 }, { "epoch": 11.717344089357741, "grad_norm": 1.8115037679672241, "learning_rate": 4.375901985959159e-06, "loss": 0.2161, "num_input_tokens_seen": 23034800, "step": 37765 }, { "epoch": 11.718895439031957, "grad_norm": 1.3938748836517334, "learning_rate": 4.374558788388317e-06, "loss": 0.2147, "num_input_tokens_seen": 23037968, "step": 37770 }, { "epoch": 11.720446788706175, "grad_norm": 1.1909046173095703, "learning_rate": 4.373215636669721e-06, "loss": 0.1971, "num_input_tokens_seen": 23041264, "step": 37775 }, { "epoch": 11.72199813838039, "grad_norm": 2.162353277206421, "learning_rate": 4.3718725309018424e-06, "loss": 0.2423, "num_input_tokens_seen": 23044240, "step": 37780 }, { "epoch": 11.723549488054607, "grad_norm": 1.1552284955978394, "learning_rate": 4.3705294711831435e-06, "loss": 0.2302, "num_input_tokens_seen": 23046832, "step": 37785 }, { "epoch": 11.725100837728824, "grad_norm": 1.6661626100540161, "learning_rate": 4.3691864576120905e-06, "loss": 0.2174, "num_input_tokens_seen": 23049040, "step": 37790 }, { "epoch": 11.72665218740304, "grad_norm": 1.4892768859863281, "learning_rate": 4.367843490287139e-06, "loss": 0.212, "num_input_tokens_seen": 23051600, "step": 37795 }, { "epoch": 11.728203537077258, "grad_norm": 1.2401522397994995, "learning_rate": 4.366500569306746e-06, "loss": 0.2096, "num_input_tokens_seen": 23054640, "step": 37800 }, { "epoch": 11.729754886751474, "grad_norm": 3.1708507537841797, "learning_rate": 4.365157694769363e-06, "loss": 0.2347, "num_input_tokens_seen": 23057488, "step": 37805 }, { "epoch": 11.73130623642569, "grad_norm": 3.263737678527832, "learning_rate": 4.3638148667734385e-06, "loss": 0.2288, "num_input_tokens_seen": 23060816, "step": 37810 }, { "epoch": 11.732857586099907, "grad_norm": 1.5987228155136108, "learning_rate": 4.3624720854174174e-06, "loss": 0.2416, "num_input_tokens_seen": 23064720, "step": 37815 }, { "epoch": 11.734408935774123, "grad_norm": 1.4123061895370483, "learning_rate": 4.361129350799742e-06, "loss": 0.2293, "num_input_tokens_seen": 23067952, "step": 37820 }, { "epoch": 11.73596028544834, "grad_norm": 2.523573160171509, "learning_rate": 4.359786663018849e-06, "loss": 0.2171, "num_input_tokens_seen": 23070800, "step": 37825 }, { "epoch": 11.737511635122557, "grad_norm": 2.3137595653533936, "learning_rate": 4.358444022173177e-06, "loss": 0.2438, "num_input_tokens_seen": 23073520, "step": 37830 }, { "epoch": 11.739062984796773, "grad_norm": 1.403848648071289, "learning_rate": 4.357101428361154e-06, "loss": 0.2129, "num_input_tokens_seen": 23076656, "step": 37835 }, { "epoch": 11.74061433447099, "grad_norm": 3.2131145000457764, "learning_rate": 4.355758881681211e-06, "loss": 0.217, "num_input_tokens_seen": 23079824, "step": 37840 }, { "epoch": 11.742165684145206, "grad_norm": 1.5517375469207764, "learning_rate": 4.354416382231771e-06, "loss": 0.1949, "num_input_tokens_seen": 23082768, "step": 37845 }, { "epoch": 11.743717033819422, "grad_norm": 1.334079384803772, "learning_rate": 4.353073930111254e-06, "loss": 0.215, "num_input_tokens_seen": 23085552, "step": 37850 }, { "epoch": 11.74526838349364, "grad_norm": 2.4531240463256836, "learning_rate": 4.3517315254180795e-06, "loss": 0.2067, "num_input_tokens_seen": 23087792, "step": 37855 }, { "epoch": 11.746819733167856, "grad_norm": 1.8849161863327026, "learning_rate": 4.35038916825066e-06, "loss": 0.2316, "num_input_tokens_seen": 23090992, "step": 37860 }, { "epoch": 11.748371082842073, "grad_norm": 1.5266170501708984, "learning_rate": 4.349046858707408e-06, "loss": 0.1966, "num_input_tokens_seen": 23094448, "step": 37865 }, { "epoch": 11.74992243251629, "grad_norm": 2.127930164337158, "learning_rate": 4.3477045968867275e-06, "loss": 0.2106, "num_input_tokens_seen": 23097296, "step": 37870 }, { "epoch": 11.751473782190505, "grad_norm": 1.233254075050354, "learning_rate": 4.346362382887026e-06, "loss": 0.2044, "num_input_tokens_seen": 23101584, "step": 37875 }, { "epoch": 11.753025131864723, "grad_norm": 2.558112621307373, "learning_rate": 4.345020216806702e-06, "loss": 0.2381, "num_input_tokens_seen": 23105360, "step": 37880 }, { "epoch": 11.754576481538939, "grad_norm": 2.1054372787475586, "learning_rate": 4.343678098744153e-06, "loss": 0.2298, "num_input_tokens_seen": 23108432, "step": 37885 }, { "epoch": 11.756127831213156, "grad_norm": 2.7409191131591797, "learning_rate": 4.34233602879777e-06, "loss": 0.2236, "num_input_tokens_seen": 23110512, "step": 37890 }, { "epoch": 11.757679180887372, "grad_norm": 1.9330943822860718, "learning_rate": 4.3409940070659445e-06, "loss": 0.2147, "num_input_tokens_seen": 23112784, "step": 37895 }, { "epoch": 11.759230530561588, "grad_norm": 2.065654754638672, "learning_rate": 4.339652033647062e-06, "loss": 0.1875, "num_input_tokens_seen": 23116208, "step": 37900 }, { "epoch": 11.760781880235806, "grad_norm": 2.1433660984039307, "learning_rate": 4.338310108639506e-06, "loss": 0.1956, "num_input_tokens_seen": 23118768, "step": 37905 }, { "epoch": 11.762333229910022, "grad_norm": 3.8196773529052734, "learning_rate": 4.336968232141654e-06, "loss": 0.2083, "num_input_tokens_seen": 23122064, "step": 37910 }, { "epoch": 11.763884579584237, "grad_norm": 2.4323055744171143, "learning_rate": 4.3356264042518815e-06, "loss": 0.2627, "num_input_tokens_seen": 23125072, "step": 37915 }, { "epoch": 11.765435929258455, "grad_norm": 2.2374215126037598, "learning_rate": 4.334284625068562e-06, "loss": 0.2223, "num_input_tokens_seen": 23127408, "step": 37920 }, { "epoch": 11.766987278932671, "grad_norm": 1.3707430362701416, "learning_rate": 4.332942894690062e-06, "loss": 0.2018, "num_input_tokens_seen": 23130384, "step": 37925 }, { "epoch": 11.768538628606889, "grad_norm": 1.4534118175506592, "learning_rate": 4.331601213214746e-06, "loss": 0.1962, "num_input_tokens_seen": 23133552, "step": 37930 }, { "epoch": 11.770089978281105, "grad_norm": 1.9995310306549072, "learning_rate": 4.330259580740977e-06, "loss": 0.2377, "num_input_tokens_seen": 23136272, "step": 37935 }, { "epoch": 11.77164132795532, "grad_norm": 2.0770177841186523, "learning_rate": 4.328917997367112e-06, "loss": 0.1882, "num_input_tokens_seen": 23141008, "step": 37940 }, { "epoch": 11.773192677629538, "grad_norm": 4.291477203369141, "learning_rate": 4.3275764631915046e-06, "loss": 0.2097, "num_input_tokens_seen": 23143856, "step": 37945 }, { "epoch": 11.774744027303754, "grad_norm": 1.7847107648849487, "learning_rate": 4.326234978312505e-06, "loss": 0.2452, "num_input_tokens_seen": 23146736, "step": 37950 }, { "epoch": 11.77629537697797, "grad_norm": 1.621519684791565, "learning_rate": 4.32489354282846e-06, "loss": 0.2015, "num_input_tokens_seen": 23149200, "step": 37955 }, { "epoch": 11.777846726652188, "grad_norm": 2.1828837394714355, "learning_rate": 4.323552156837712e-06, "loss": 0.205, "num_input_tokens_seen": 23151824, "step": 37960 }, { "epoch": 11.779398076326403, "grad_norm": 1.7027220726013184, "learning_rate": 4.322210820438603e-06, "loss": 0.2278, "num_input_tokens_seen": 23154704, "step": 37965 }, { "epoch": 11.780949426000621, "grad_norm": 2.480226755142212, "learning_rate": 4.3208695337294655e-06, "loss": 0.2155, "num_input_tokens_seen": 23157936, "step": 37970 }, { "epoch": 11.782500775674837, "grad_norm": 1.1213619709014893, "learning_rate": 4.3195282968086344e-06, "loss": 0.2305, "num_input_tokens_seen": 23160784, "step": 37975 }, { "epoch": 11.784052125349053, "grad_norm": 2.657660484313965, "learning_rate": 4.3181871097744364e-06, "loss": 0.19, "num_input_tokens_seen": 23163120, "step": 37980 }, { "epoch": 11.78560347502327, "grad_norm": 1.3890833854675293, "learning_rate": 4.316845972725198e-06, "loss": 0.2339, "num_input_tokens_seen": 23165968, "step": 37985 }, { "epoch": 11.787154824697486, "grad_norm": 2.706979751586914, "learning_rate": 4.315504885759237e-06, "loss": 0.2225, "num_input_tokens_seen": 23168880, "step": 37990 }, { "epoch": 11.788706174371704, "grad_norm": 1.8166427612304688, "learning_rate": 4.314163848974876e-06, "loss": 0.2213, "num_input_tokens_seen": 23171568, "step": 37995 }, { "epoch": 11.79025752404592, "grad_norm": 1.3993686437606812, "learning_rate": 4.312822862470426e-06, "loss": 0.2353, "num_input_tokens_seen": 23173584, "step": 38000 }, { "epoch": 11.791808873720136, "grad_norm": 2.915376901626587, "learning_rate": 4.311481926344198e-06, "loss": 0.246, "num_input_tokens_seen": 23176304, "step": 38005 }, { "epoch": 11.793360223394354, "grad_norm": 3.3959031105041504, "learning_rate": 4.310141040694497e-06, "loss": 0.2179, "num_input_tokens_seen": 23179280, "step": 38010 }, { "epoch": 11.79491157306857, "grad_norm": 1.5510083436965942, "learning_rate": 4.308800205619627e-06, "loss": 0.2623, "num_input_tokens_seen": 23182544, "step": 38015 }, { "epoch": 11.796462922742787, "grad_norm": 2.570601224899292, "learning_rate": 4.307459421217888e-06, "loss": 0.2208, "num_input_tokens_seen": 23185360, "step": 38020 }, { "epoch": 11.798014272417003, "grad_norm": 1.0508373975753784, "learning_rate": 4.306118687587574e-06, "loss": 0.1952, "num_input_tokens_seen": 23188688, "step": 38025 }, { "epoch": 11.799565622091219, "grad_norm": 1.9514198303222656, "learning_rate": 4.304778004826976e-06, "loss": 0.2189, "num_input_tokens_seen": 23191536, "step": 38030 }, { "epoch": 11.801116971765437, "grad_norm": 3.4417943954467773, "learning_rate": 4.303437373034384e-06, "loss": 0.2706, "num_input_tokens_seen": 23194928, "step": 38035 }, { "epoch": 11.802668321439652, "grad_norm": 2.304649591445923, "learning_rate": 4.30209679230808e-06, "loss": 0.2241, "num_input_tokens_seen": 23199056, "step": 38040 }, { "epoch": 11.804219671113868, "grad_norm": 1.8793491125106812, "learning_rate": 4.3007562627463465e-06, "loss": 0.2254, "num_input_tokens_seen": 23201808, "step": 38045 }, { "epoch": 11.805771020788086, "grad_norm": 3.489361524581909, "learning_rate": 4.299415784447457e-06, "loss": 0.218, "num_input_tokens_seen": 23204112, "step": 38050 }, { "epoch": 11.807322370462302, "grad_norm": 1.4947516918182373, "learning_rate": 4.298075357509689e-06, "loss": 0.214, "num_input_tokens_seen": 23206512, "step": 38055 }, { "epoch": 11.80887372013652, "grad_norm": 4.062426567077637, "learning_rate": 4.29673498203131e-06, "loss": 0.2053, "num_input_tokens_seen": 23209360, "step": 38060 }, { "epoch": 11.810425069810735, "grad_norm": 2.043553113937378, "learning_rate": 4.295394658110583e-06, "loss": 0.2219, "num_input_tokens_seen": 23211696, "step": 38065 }, { "epoch": 11.811976419484951, "grad_norm": 1.7954363822937012, "learning_rate": 4.294054385845772e-06, "loss": 0.1796, "num_input_tokens_seen": 23214544, "step": 38070 }, { "epoch": 11.813527769159169, "grad_norm": 2.5866215229034424, "learning_rate": 4.292714165335134e-06, "loss": 0.214, "num_input_tokens_seen": 23218096, "step": 38075 }, { "epoch": 11.815079118833385, "grad_norm": 1.8857080936431885, "learning_rate": 4.2913739966769235e-06, "loss": 0.2043, "num_input_tokens_seen": 23221040, "step": 38080 }, { "epoch": 11.8166304685076, "grad_norm": 3.310025215148926, "learning_rate": 4.2900338799693904e-06, "loss": 0.248, "num_input_tokens_seen": 23224144, "step": 38085 }, { "epoch": 11.818181818181818, "grad_norm": 2.0710973739624023, "learning_rate": 4.288693815310781e-06, "loss": 0.2324, "num_input_tokens_seen": 23226512, "step": 38090 }, { "epoch": 11.819733167856034, "grad_norm": 2.6597743034362793, "learning_rate": 4.287353802799338e-06, "loss": 0.1766, "num_input_tokens_seen": 23229744, "step": 38095 }, { "epoch": 11.821284517530252, "grad_norm": 2.113435745239258, "learning_rate": 4.2860138425333006e-06, "loss": 0.2586, "num_input_tokens_seen": 23232336, "step": 38100 }, { "epoch": 11.822835867204468, "grad_norm": 3.2113053798675537, "learning_rate": 4.2846739346109025e-06, "loss": 0.2477, "num_input_tokens_seen": 23235280, "step": 38105 }, { "epoch": 11.824387216878684, "grad_norm": 2.7255959510803223, "learning_rate": 4.283334079130376e-06, "loss": 0.2496, "num_input_tokens_seen": 23237680, "step": 38110 }, { "epoch": 11.825938566552901, "grad_norm": 2.699875831604004, "learning_rate": 4.281994276189948e-06, "loss": 0.2319, "num_input_tokens_seen": 23240016, "step": 38115 }, { "epoch": 11.827489916227117, "grad_norm": 3.8029727935791016, "learning_rate": 4.280654525887843e-06, "loss": 0.253, "num_input_tokens_seen": 23242608, "step": 38120 }, { "epoch": 11.829041265901335, "grad_norm": 3.7931699752807617, "learning_rate": 4.279314828322279e-06, "loss": 0.2755, "num_input_tokens_seen": 23245680, "step": 38125 }, { "epoch": 11.83059261557555, "grad_norm": 2.1394457817077637, "learning_rate": 4.277975183591472e-06, "loss": 0.2247, "num_input_tokens_seen": 23248208, "step": 38130 }, { "epoch": 11.832143965249767, "grad_norm": 2.2023558616638184, "learning_rate": 4.276635591793634e-06, "loss": 0.1939, "num_input_tokens_seen": 23251632, "step": 38135 }, { "epoch": 11.833695314923984, "grad_norm": 4.599534511566162, "learning_rate": 4.275296053026974e-06, "loss": 0.2223, "num_input_tokens_seen": 23254192, "step": 38140 }, { "epoch": 11.8352466645982, "grad_norm": 2.3594255447387695, "learning_rate": 4.273956567389694e-06, "loss": 0.2177, "num_input_tokens_seen": 23257104, "step": 38145 }, { "epoch": 11.836798014272418, "grad_norm": 2.1229982376098633, "learning_rate": 4.272617134979996e-06, "loss": 0.2668, "num_input_tokens_seen": 23259632, "step": 38150 }, { "epoch": 11.838349363946634, "grad_norm": 1.717698335647583, "learning_rate": 4.271277755896075e-06, "loss": 0.2219, "num_input_tokens_seen": 23262672, "step": 38155 }, { "epoch": 11.83990071362085, "grad_norm": 2.7100000381469727, "learning_rate": 4.269938430236124e-06, "loss": 0.2582, "num_input_tokens_seen": 23266832, "step": 38160 }, { "epoch": 11.841452063295067, "grad_norm": 7.060083389282227, "learning_rate": 4.268599158098332e-06, "loss": 0.2398, "num_input_tokens_seen": 23270384, "step": 38165 }, { "epoch": 11.843003412969283, "grad_norm": 1.4952244758605957, "learning_rate": 4.26725993958088e-06, "loss": 0.2179, "num_input_tokens_seen": 23272848, "step": 38170 }, { "epoch": 11.844554762643499, "grad_norm": 2.1702754497528076, "learning_rate": 4.265920774781954e-06, "loss": 0.2132, "num_input_tokens_seen": 23275344, "step": 38175 }, { "epoch": 11.846106112317717, "grad_norm": 2.1090173721313477, "learning_rate": 4.264581663799728e-06, "loss": 0.2277, "num_input_tokens_seen": 23278288, "step": 38180 }, { "epoch": 11.847657461991933, "grad_norm": 2.7872722148895264, "learning_rate": 4.263242606732374e-06, "loss": 0.1991, "num_input_tokens_seen": 23281392, "step": 38185 }, { "epoch": 11.84920881166615, "grad_norm": 1.453048586845398, "learning_rate": 4.261903603678062e-06, "loss": 0.2195, "num_input_tokens_seen": 23285424, "step": 38190 }, { "epoch": 11.850760161340366, "grad_norm": 1.7884581089019775, "learning_rate": 4.260564654734956e-06, "loss": 0.2167, "num_input_tokens_seen": 23289744, "step": 38195 }, { "epoch": 11.852311511014582, "grad_norm": 1.203413724899292, "learning_rate": 4.259225760001217e-06, "loss": 0.209, "num_input_tokens_seen": 23293424, "step": 38200 }, { "epoch": 11.8538628606888, "grad_norm": 1.5816479921340942, "learning_rate": 4.2578869195750016e-06, "loss": 0.2128, "num_input_tokens_seen": 23295792, "step": 38205 }, { "epoch": 11.855414210363016, "grad_norm": 3.3256287574768066, "learning_rate": 4.256548133554463e-06, "loss": 0.2473, "num_input_tokens_seen": 23298928, "step": 38210 }, { "epoch": 11.856965560037231, "grad_norm": 1.1592180728912354, "learning_rate": 4.25520940203775e-06, "loss": 0.1946, "num_input_tokens_seen": 23301584, "step": 38215 }, { "epoch": 11.85851690971145, "grad_norm": 1.330947756767273, "learning_rate": 4.253870725123008e-06, "loss": 0.2443, "num_input_tokens_seen": 23304912, "step": 38220 }, { "epoch": 11.860068259385665, "grad_norm": 2.4454829692840576, "learning_rate": 4.252532102908377e-06, "loss": 0.2794, "num_input_tokens_seen": 23307408, "step": 38225 }, { "epoch": 11.861619609059883, "grad_norm": 1.675432801246643, "learning_rate": 4.251193535491993e-06, "loss": 0.224, "num_input_tokens_seen": 23309872, "step": 38230 }, { "epoch": 11.863170958734099, "grad_norm": 1.822349190711975, "learning_rate": 4.249855022971992e-06, "loss": 0.2489, "num_input_tokens_seen": 23313264, "step": 38235 }, { "epoch": 11.864722308408314, "grad_norm": 1.445507526397705, "learning_rate": 4.2485165654465e-06, "loss": 0.2147, "num_input_tokens_seen": 23316656, "step": 38240 }, { "epoch": 11.866273658082532, "grad_norm": 2.5913736820220947, "learning_rate": 4.247178163013643e-06, "loss": 0.2011, "num_input_tokens_seen": 23319664, "step": 38245 }, { "epoch": 11.867825007756748, "grad_norm": 2.0531280040740967, "learning_rate": 4.2458398157715405e-06, "loss": 0.213, "num_input_tokens_seen": 23323184, "step": 38250 }, { "epoch": 11.869376357430966, "grad_norm": 1.6729072332382202, "learning_rate": 4.24450152381831e-06, "loss": 0.218, "num_input_tokens_seen": 23325520, "step": 38255 }, { "epoch": 11.870927707105182, "grad_norm": 2.711273431777954, "learning_rate": 4.243163287252066e-06, "loss": 0.2155, "num_input_tokens_seen": 23328496, "step": 38260 }, { "epoch": 11.872479056779397, "grad_norm": 2.088491678237915, "learning_rate": 4.241825106170914e-06, "loss": 0.2087, "num_input_tokens_seen": 23331280, "step": 38265 }, { "epoch": 11.874030406453615, "grad_norm": 1.3640216588974, "learning_rate": 4.24048698067296e-06, "loss": 0.2229, "num_input_tokens_seen": 23333840, "step": 38270 }, { "epoch": 11.875581756127831, "grad_norm": 1.8452837467193604, "learning_rate": 4.239148910856305e-06, "loss": 0.2093, "num_input_tokens_seen": 23336208, "step": 38275 }, { "epoch": 11.877133105802049, "grad_norm": 1.8096853494644165, "learning_rate": 4.237810896819044e-06, "loss": 0.2424, "num_input_tokens_seen": 23338480, "step": 38280 }, { "epoch": 11.878684455476265, "grad_norm": 2.2348101139068604, "learning_rate": 4.23647293865927e-06, "loss": 0.2056, "num_input_tokens_seen": 23341904, "step": 38285 }, { "epoch": 11.88023580515048, "grad_norm": 2.0161778926849365, "learning_rate": 4.23513503647507e-06, "loss": 0.2375, "num_input_tokens_seen": 23344176, "step": 38290 }, { "epoch": 11.881787154824698, "grad_norm": 2.2322170734405518, "learning_rate": 4.23379719036453e-06, "loss": 0.2246, "num_input_tokens_seen": 23346768, "step": 38295 }, { "epoch": 11.883338504498914, "grad_norm": 1.3267565965652466, "learning_rate": 4.2324594004257304e-06, "loss": 0.2471, "num_input_tokens_seen": 23349968, "step": 38300 }, { "epoch": 11.88488985417313, "grad_norm": 1.5618152618408203, "learning_rate": 4.231121666756745e-06, "loss": 0.2308, "num_input_tokens_seen": 23352400, "step": 38305 }, { "epoch": 11.886441203847347, "grad_norm": 3.194056749343872, "learning_rate": 4.229783989455646e-06, "loss": 0.2042, "num_input_tokens_seen": 23356176, "step": 38310 }, { "epoch": 11.887992553521563, "grad_norm": 2.0749728679656982, "learning_rate": 4.228446368620501e-06, "loss": 0.2063, "num_input_tokens_seen": 23358800, "step": 38315 }, { "epoch": 11.889543903195781, "grad_norm": 1.3302757740020752, "learning_rate": 4.227108804349375e-06, "loss": 0.2041, "num_input_tokens_seen": 23361712, "step": 38320 }, { "epoch": 11.891095252869997, "grad_norm": 4.4567694664001465, "learning_rate": 4.225771296740325e-06, "loss": 0.2552, "num_input_tokens_seen": 23364048, "step": 38325 }, { "epoch": 11.892646602544213, "grad_norm": 2.219975709915161, "learning_rate": 4.224433845891407e-06, "loss": 0.2225, "num_input_tokens_seen": 23367120, "step": 38330 }, { "epoch": 11.89419795221843, "grad_norm": 3.0316333770751953, "learning_rate": 4.223096451900673e-06, "loss": 0.2429, "num_input_tokens_seen": 23370288, "step": 38335 }, { "epoch": 11.895749301892646, "grad_norm": 4.232367992401123, "learning_rate": 4.221759114866169e-06, "loss": 0.2076, "num_input_tokens_seen": 23374096, "step": 38340 }, { "epoch": 11.897300651566862, "grad_norm": 1.7980563640594482, "learning_rate": 4.220421834885937e-06, "loss": 0.2634, "num_input_tokens_seen": 23379984, "step": 38345 }, { "epoch": 11.89885200124108, "grad_norm": 1.5917129516601562, "learning_rate": 4.219084612058015e-06, "loss": 0.2007, "num_input_tokens_seen": 23382640, "step": 38350 }, { "epoch": 11.900403350915296, "grad_norm": 1.8518294095993042, "learning_rate": 4.2177474464804394e-06, "loss": 0.2257, "num_input_tokens_seen": 23385904, "step": 38355 }, { "epoch": 11.901954700589513, "grad_norm": 2.6024532318115234, "learning_rate": 4.216410338251239e-06, "loss": 0.2143, "num_input_tokens_seen": 23388816, "step": 38360 }, { "epoch": 11.90350605026373, "grad_norm": 1.8734188079833984, "learning_rate": 4.2150732874684404e-06, "loss": 0.2303, "num_input_tokens_seen": 23391216, "step": 38365 }, { "epoch": 11.905057399937945, "grad_norm": 2.293268918991089, "learning_rate": 4.213736294230065e-06, "loss": 0.2099, "num_input_tokens_seen": 23394480, "step": 38370 }, { "epoch": 11.906608749612163, "grad_norm": 2.07705020904541, "learning_rate": 4.2123993586341305e-06, "loss": 0.2296, "num_input_tokens_seen": 23396880, "step": 38375 }, { "epoch": 11.908160099286379, "grad_norm": 1.6632198095321655, "learning_rate": 4.211062480778649e-06, "loss": 0.199, "num_input_tokens_seen": 23399760, "step": 38380 }, { "epoch": 11.909711448960596, "grad_norm": 3.501072883605957, "learning_rate": 4.209725660761629e-06, "loss": 0.211, "num_input_tokens_seen": 23402384, "step": 38385 }, { "epoch": 11.911262798634812, "grad_norm": 1.9950876235961914, "learning_rate": 4.2083888986810776e-06, "loss": 0.2202, "num_input_tokens_seen": 23405744, "step": 38390 }, { "epoch": 11.912814148309028, "grad_norm": 2.792772054672241, "learning_rate": 4.207052194634994e-06, "loss": 0.2098, "num_input_tokens_seen": 23408368, "step": 38395 }, { "epoch": 11.914365497983246, "grad_norm": 1.5976017713546753, "learning_rate": 4.205715548721374e-06, "loss": 0.2314, "num_input_tokens_seen": 23411664, "step": 38400 }, { "epoch": 11.915916847657462, "grad_norm": 4.445842266082764, "learning_rate": 4.204378961038208e-06, "loss": 0.2016, "num_input_tokens_seen": 23414576, "step": 38405 }, { "epoch": 11.91746819733168, "grad_norm": 1.3502435684204102, "learning_rate": 4.2030424316834875e-06, "loss": 0.2202, "num_input_tokens_seen": 23417520, "step": 38410 }, { "epoch": 11.919019547005895, "grad_norm": 2.578361988067627, "learning_rate": 4.201705960755195e-06, "loss": 0.2394, "num_input_tokens_seen": 23419952, "step": 38415 }, { "epoch": 11.920570896680111, "grad_norm": 3.0599939823150635, "learning_rate": 4.200369548351308e-06, "loss": 0.2087, "num_input_tokens_seen": 23423376, "step": 38420 }, { "epoch": 11.922122246354329, "grad_norm": 2.065647602081299, "learning_rate": 4.199033194569802e-06, "loss": 0.246, "num_input_tokens_seen": 23426000, "step": 38425 }, { "epoch": 11.923673596028545, "grad_norm": 3.768017292022705, "learning_rate": 4.197696899508649e-06, "loss": 0.2555, "num_input_tokens_seen": 23429360, "step": 38430 }, { "epoch": 11.92522494570276, "grad_norm": 1.1059439182281494, "learning_rate": 4.196360663265812e-06, "loss": 0.2125, "num_input_tokens_seen": 23432016, "step": 38435 }, { "epoch": 11.926776295376978, "grad_norm": 2.2467689514160156, "learning_rate": 4.195024485939256e-06, "loss": 0.2126, "num_input_tokens_seen": 23434704, "step": 38440 }, { "epoch": 11.928327645051194, "grad_norm": 2.270707607269287, "learning_rate": 4.1936883676269365e-06, "loss": 0.2099, "num_input_tokens_seen": 23439856, "step": 38445 }, { "epoch": 11.929878994725412, "grad_norm": 1.686253309249878, "learning_rate": 4.192352308426809e-06, "loss": 0.2003, "num_input_tokens_seen": 23442896, "step": 38450 }, { "epoch": 11.931430344399628, "grad_norm": 2.0223007202148438, "learning_rate": 4.191016308436821e-06, "loss": 0.2188, "num_input_tokens_seen": 23446192, "step": 38455 }, { "epoch": 11.932981694073844, "grad_norm": 1.9520593881607056, "learning_rate": 4.1896803677549185e-06, "loss": 0.2111, "num_input_tokens_seen": 23448720, "step": 38460 }, { "epoch": 11.934533043748061, "grad_norm": 2.7569785118103027, "learning_rate": 4.188344486479039e-06, "loss": 0.2263, "num_input_tokens_seen": 23452208, "step": 38465 }, { "epoch": 11.936084393422277, "grad_norm": 2.9122116565704346, "learning_rate": 4.187008664707121e-06, "loss": 0.2516, "num_input_tokens_seen": 23455312, "step": 38470 }, { "epoch": 11.937635743096495, "grad_norm": 3.0929503440856934, "learning_rate": 4.185672902537095e-06, "loss": 0.2005, "num_input_tokens_seen": 23458224, "step": 38475 }, { "epoch": 11.93918709277071, "grad_norm": 1.3850735425949097, "learning_rate": 4.184337200066888e-06, "loss": 0.2256, "num_input_tokens_seen": 23460240, "step": 38480 }, { "epoch": 11.940738442444927, "grad_norm": 1.1039254665374756, "learning_rate": 4.183001557394423e-06, "loss": 0.2033, "num_input_tokens_seen": 23462576, "step": 38485 }, { "epoch": 11.942289792119144, "grad_norm": 1.723051905632019, "learning_rate": 4.181665974617619e-06, "loss": 0.2419, "num_input_tokens_seen": 23464784, "step": 38490 }, { "epoch": 11.94384114179336, "grad_norm": 4.476855754852295, "learning_rate": 4.180330451834388e-06, "loss": 0.2512, "num_input_tokens_seen": 23467312, "step": 38495 }, { "epoch": 11.945392491467576, "grad_norm": 1.651090383529663, "learning_rate": 4.1789949891426425e-06, "loss": 0.228, "num_input_tokens_seen": 23469936, "step": 38500 }, { "epoch": 11.946943841141794, "grad_norm": 1.3047022819519043, "learning_rate": 4.177659586640287e-06, "loss": 0.228, "num_input_tokens_seen": 23473744, "step": 38505 }, { "epoch": 11.94849519081601, "grad_norm": 1.6132774353027344, "learning_rate": 4.176324244425222e-06, "loss": 0.2023, "num_input_tokens_seen": 23476304, "step": 38510 }, { "epoch": 11.950046540490227, "grad_norm": 2.128777027130127, "learning_rate": 4.1749889625953424e-06, "loss": 0.2437, "num_input_tokens_seen": 23478640, "step": 38515 }, { "epoch": 11.951597890164443, "grad_norm": 2.4380393028259277, "learning_rate": 4.1736537412485425e-06, "loss": 0.2189, "num_input_tokens_seen": 23481200, "step": 38520 }, { "epoch": 11.953149239838659, "grad_norm": 1.8709434270858765, "learning_rate": 4.172318580482708e-06, "loss": 0.2002, "num_input_tokens_seen": 23484048, "step": 38525 }, { "epoch": 11.954700589512877, "grad_norm": 2.6386377811431885, "learning_rate": 4.1709834803957225e-06, "loss": 0.2374, "num_input_tokens_seen": 23486832, "step": 38530 }, { "epoch": 11.956251939187093, "grad_norm": 2.201117992401123, "learning_rate": 4.169648441085466e-06, "loss": 0.2229, "num_input_tokens_seen": 23489200, "step": 38535 }, { "epoch": 11.95780328886131, "grad_norm": 1.6742730140686035, "learning_rate": 4.168313462649811e-06, "loss": 0.1981, "num_input_tokens_seen": 23491920, "step": 38540 }, { "epoch": 11.959354638535526, "grad_norm": 1.3723124265670776, "learning_rate": 4.166978545186628e-06, "loss": 0.2246, "num_input_tokens_seen": 23494640, "step": 38545 }, { "epoch": 11.960905988209742, "grad_norm": 2.8784501552581787, "learning_rate": 4.165643688793782e-06, "loss": 0.2831, "num_input_tokens_seen": 23498576, "step": 38550 }, { "epoch": 11.96245733788396, "grad_norm": 2.26300311088562, "learning_rate": 4.164308893569134e-06, "loss": 0.2249, "num_input_tokens_seen": 23501552, "step": 38555 }, { "epoch": 11.964008687558175, "grad_norm": 1.8315180540084839, "learning_rate": 4.1629741596105386e-06, "loss": 0.2336, "num_input_tokens_seen": 23504176, "step": 38560 }, { "epoch": 11.965560037232391, "grad_norm": 1.8178880214691162, "learning_rate": 4.161639487015851e-06, "loss": 0.2197, "num_input_tokens_seen": 23506512, "step": 38565 }, { "epoch": 11.967111386906609, "grad_norm": 2.508695125579834, "learning_rate": 4.1603048758829164e-06, "loss": 0.2551, "num_input_tokens_seen": 23509392, "step": 38570 }, { "epoch": 11.968662736580825, "grad_norm": 4.847362041473389, "learning_rate": 4.1589703263095775e-06, "loss": 0.232, "num_input_tokens_seen": 23514000, "step": 38575 }, { "epoch": 11.970214086255043, "grad_norm": 2.4047205448150635, "learning_rate": 4.157635838393673e-06, "loss": 0.2241, "num_input_tokens_seen": 23517168, "step": 38580 }, { "epoch": 11.971765435929258, "grad_norm": 2.8969526290893555, "learning_rate": 4.156301412233037e-06, "loss": 0.2173, "num_input_tokens_seen": 23519984, "step": 38585 }, { "epoch": 11.973316785603474, "grad_norm": 1.8918882608413696, "learning_rate": 4.154967047925499e-06, "loss": 0.2039, "num_input_tokens_seen": 23524112, "step": 38590 }, { "epoch": 11.974868135277692, "grad_norm": 0.9407044053077698, "learning_rate": 4.153632745568882e-06, "loss": 0.2306, "num_input_tokens_seen": 23527152, "step": 38595 }, { "epoch": 11.976419484951908, "grad_norm": 1.6427490711212158, "learning_rate": 4.152298505261007e-06, "loss": 0.2134, "num_input_tokens_seen": 23530640, "step": 38600 }, { "epoch": 11.977970834626126, "grad_norm": 3.670170307159424, "learning_rate": 4.150964327099691e-06, "loss": 0.2557, "num_input_tokens_seen": 23534160, "step": 38605 }, { "epoch": 11.979522184300341, "grad_norm": 4.166205883026123, "learning_rate": 4.149630211182743e-06, "loss": 0.202, "num_input_tokens_seen": 23537648, "step": 38610 }, { "epoch": 11.981073533974557, "grad_norm": 2.2801215648651123, "learning_rate": 4.148296157607971e-06, "loss": 0.2241, "num_input_tokens_seen": 23540976, "step": 38615 }, { "epoch": 11.982624883648775, "grad_norm": 1.645377278327942, "learning_rate": 4.146962166473175e-06, "loss": 0.1964, "num_input_tokens_seen": 23543632, "step": 38620 }, { "epoch": 11.98417623332299, "grad_norm": 1.379456877708435, "learning_rate": 4.145628237876154e-06, "loss": 0.2177, "num_input_tokens_seen": 23545680, "step": 38625 }, { "epoch": 11.985727582997207, "grad_norm": 1.6007308959960938, "learning_rate": 4.144294371914702e-06, "loss": 0.214, "num_input_tokens_seen": 23550288, "step": 38630 }, { "epoch": 11.987278932671424, "grad_norm": 3.0046939849853516, "learning_rate": 4.142960568686605e-06, "loss": 0.227, "num_input_tokens_seen": 23553040, "step": 38635 }, { "epoch": 11.98883028234564, "grad_norm": 1.2617360353469849, "learning_rate": 4.141626828289647e-06, "loss": 0.225, "num_input_tokens_seen": 23555376, "step": 38640 }, { "epoch": 11.990381632019858, "grad_norm": 2.8078954219818115, "learning_rate": 4.140293150821607e-06, "loss": 0.2036, "num_input_tokens_seen": 23559568, "step": 38645 }, { "epoch": 11.991932981694074, "grad_norm": 2.112004518508911, "learning_rate": 4.13895953638026e-06, "loss": 0.1837, "num_input_tokens_seen": 23562288, "step": 38650 }, { "epoch": 11.99348433136829, "grad_norm": 1.9251009225845337, "learning_rate": 4.137625985063374e-06, "loss": 0.2193, "num_input_tokens_seen": 23565040, "step": 38655 }, { "epoch": 11.995035681042507, "grad_norm": 3.7161865234375, "learning_rate": 4.136292496968716e-06, "loss": 0.2453, "num_input_tokens_seen": 23567504, "step": 38660 }, { "epoch": 11.996587030716723, "grad_norm": 1.8575786352157593, "learning_rate": 4.134959072194047e-06, "loss": 0.2091, "num_input_tokens_seen": 23569968, "step": 38665 }, { "epoch": 11.998138380390941, "grad_norm": 2.8427815437316895, "learning_rate": 4.13362571083712e-06, "loss": 0.2238, "num_input_tokens_seen": 23571984, "step": 38670 }, { "epoch": 11.999689730065157, "grad_norm": 2.613544225692749, "learning_rate": 4.132292412995688e-06, "loss": 0.1983, "num_input_tokens_seen": 23574704, "step": 38675 }, { "epoch": 12.0, "eval_loss": 0.24983832240104675, "eval_runtime": 34.329, "eval_samples_per_second": 93.886, "eval_steps_per_second": 23.479, "num_input_tokens_seen": 23574992, "step": 38676 }, { "epoch": 12.001241079739373, "grad_norm": 4.092585563659668, "learning_rate": 4.130959178767497e-06, "loss": 0.2425, "num_input_tokens_seen": 23579472, "step": 38680 }, { "epoch": 12.00279242941359, "grad_norm": 1.4669891595840454, "learning_rate": 4.1296260082502895e-06, "loss": 0.2026, "num_input_tokens_seen": 23582512, "step": 38685 }, { "epoch": 12.004343779087806, "grad_norm": 2.145840644836426, "learning_rate": 4.128292901541802e-06, "loss": 0.2309, "num_input_tokens_seen": 23585680, "step": 38690 }, { "epoch": 12.005895128762022, "grad_norm": 2.51236629486084, "learning_rate": 4.126959858739769e-06, "loss": 0.1927, "num_input_tokens_seen": 23589296, "step": 38695 }, { "epoch": 12.00744647843624, "grad_norm": 1.9931679964065552, "learning_rate": 4.125626879941916e-06, "loss": 0.2008, "num_input_tokens_seen": 23591856, "step": 38700 }, { "epoch": 12.008997828110456, "grad_norm": 1.8392053842544556, "learning_rate": 4.124293965245967e-06, "loss": 0.2444, "num_input_tokens_seen": 23594928, "step": 38705 }, { "epoch": 12.010549177784673, "grad_norm": 3.6768312454223633, "learning_rate": 4.12296111474964e-06, "loss": 0.2639, "num_input_tokens_seen": 23597776, "step": 38710 }, { "epoch": 12.01210052745889, "grad_norm": 2.376335620880127, "learning_rate": 4.121628328550648e-06, "loss": 0.2069, "num_input_tokens_seen": 23600112, "step": 38715 }, { "epoch": 12.013651877133105, "grad_norm": 1.7220401763916016, "learning_rate": 4.1202956067467024e-06, "loss": 0.2025, "num_input_tokens_seen": 23602768, "step": 38720 }, { "epoch": 12.015203226807323, "grad_norm": 2.3571009635925293, "learning_rate": 4.118962949435505e-06, "loss": 0.2092, "num_input_tokens_seen": 23605968, "step": 38725 }, { "epoch": 12.016754576481539, "grad_norm": 5.83427619934082, "learning_rate": 4.1176303567147575e-06, "loss": 0.2196, "num_input_tokens_seen": 23608688, "step": 38730 }, { "epoch": 12.018305926155756, "grad_norm": 1.6392019987106323, "learning_rate": 4.116297828682153e-06, "loss": 0.1943, "num_input_tokens_seen": 23611856, "step": 38735 }, { "epoch": 12.019857275829972, "grad_norm": 1.696603536605835, "learning_rate": 4.114965365435382e-06, "loss": 0.1758, "num_input_tokens_seen": 23614864, "step": 38740 }, { "epoch": 12.021408625504188, "grad_norm": 3.214946746826172, "learning_rate": 4.11363296707213e-06, "loss": 0.2293, "num_input_tokens_seen": 23617168, "step": 38745 }, { "epoch": 12.022959975178406, "grad_norm": 1.4325004816055298, "learning_rate": 4.112300633690078e-06, "loss": 0.2212, "num_input_tokens_seen": 23620176, "step": 38750 }, { "epoch": 12.024511324852622, "grad_norm": 1.8485952615737915, "learning_rate": 4.110968365386902e-06, "loss": 0.178, "num_input_tokens_seen": 23623920, "step": 38755 }, { "epoch": 12.026062674526838, "grad_norm": 3.1912283897399902, "learning_rate": 4.109636162260272e-06, "loss": 0.2114, "num_input_tokens_seen": 23627984, "step": 38760 }, { "epoch": 12.027614024201055, "grad_norm": 1.9150241613388062, "learning_rate": 4.108304024407855e-06, "loss": 0.2007, "num_input_tokens_seen": 23631312, "step": 38765 }, { "epoch": 12.029165373875271, "grad_norm": 2.144953966140747, "learning_rate": 4.1069719519273124e-06, "loss": 0.2191, "num_input_tokens_seen": 23633904, "step": 38770 }, { "epoch": 12.030716723549489, "grad_norm": 2.3500213623046875, "learning_rate": 4.105639944916301e-06, "loss": 0.23, "num_input_tokens_seen": 23636752, "step": 38775 }, { "epoch": 12.032268073223705, "grad_norm": 7.765133857727051, "learning_rate": 4.104308003472472e-06, "loss": 0.2617, "num_input_tokens_seen": 23639984, "step": 38780 }, { "epoch": 12.03381942289792, "grad_norm": 3.0571091175079346, "learning_rate": 4.1029761276934725e-06, "loss": 0.1906, "num_input_tokens_seen": 23642512, "step": 38785 }, { "epoch": 12.035370772572138, "grad_norm": 2.0779869556427, "learning_rate": 4.101644317676946e-06, "loss": 0.2048, "num_input_tokens_seen": 23645776, "step": 38790 }, { "epoch": 12.036922122246354, "grad_norm": 5.302789211273193, "learning_rate": 4.100312573520529e-06, "loss": 0.2402, "num_input_tokens_seen": 23648944, "step": 38795 }, { "epoch": 12.038473471920572, "grad_norm": 2.9087438583374023, "learning_rate": 4.098980895321853e-06, "loss": 0.2277, "num_input_tokens_seen": 23651760, "step": 38800 }, { "epoch": 12.040024821594788, "grad_norm": 3.680853843688965, "learning_rate": 4.097649283178548e-06, "loss": 0.2121, "num_input_tokens_seen": 23656080, "step": 38805 }, { "epoch": 12.041576171269003, "grad_norm": 2.0839438438415527, "learning_rate": 4.0963177371882365e-06, "loss": 0.2243, "num_input_tokens_seen": 23659536, "step": 38810 }, { "epoch": 12.043127520943221, "grad_norm": 3.5096707344055176, "learning_rate": 4.0949862574485355e-06, "loss": 0.2272, "num_input_tokens_seen": 23662160, "step": 38815 }, { "epoch": 12.044678870617437, "grad_norm": 1.880513072013855, "learning_rate": 4.093654844057059e-06, "loss": 0.2219, "num_input_tokens_seen": 23665008, "step": 38820 }, { "epoch": 12.046230220291653, "grad_norm": 2.8844075202941895, "learning_rate": 4.092323497111414e-06, "loss": 0.1608, "num_input_tokens_seen": 23668816, "step": 38825 }, { "epoch": 12.04778156996587, "grad_norm": 4.023535251617432, "learning_rate": 4.090992216709207e-06, "loss": 0.257, "num_input_tokens_seen": 23671312, "step": 38830 }, { "epoch": 12.049332919640086, "grad_norm": 3.8818116188049316, "learning_rate": 4.089661002948035e-06, "loss": 0.2388, "num_input_tokens_seen": 23675216, "step": 38835 }, { "epoch": 12.050884269314304, "grad_norm": 1.7755396366119385, "learning_rate": 4.08832985592549e-06, "loss": 0.2219, "num_input_tokens_seen": 23677776, "step": 38840 }, { "epoch": 12.05243561898852, "grad_norm": 2.906419515609741, "learning_rate": 4.086998775739163e-06, "loss": 0.2514, "num_input_tokens_seen": 23680304, "step": 38845 }, { "epoch": 12.053986968662736, "grad_norm": 3.530092477798462, "learning_rate": 4.085667762486637e-06, "loss": 0.1961, "num_input_tokens_seen": 23683664, "step": 38850 }, { "epoch": 12.055538318336954, "grad_norm": 2.8998496532440186, "learning_rate": 4.084336816265492e-06, "loss": 0.2318, "num_input_tokens_seen": 23686800, "step": 38855 }, { "epoch": 12.05708966801117, "grad_norm": 5.558475971221924, "learning_rate": 4.083005937173299e-06, "loss": 0.2091, "num_input_tokens_seen": 23691664, "step": 38860 }, { "epoch": 12.058641017685387, "grad_norm": 2.1265881061553955, "learning_rate": 4.081675125307632e-06, "loss": 0.2467, "num_input_tokens_seen": 23694608, "step": 38865 }, { "epoch": 12.060192367359603, "grad_norm": 3.1716761589050293, "learning_rate": 4.0803443807660525e-06, "loss": 0.2296, "num_input_tokens_seen": 23697424, "step": 38870 }, { "epoch": 12.061743717033819, "grad_norm": 1.4176669120788574, "learning_rate": 4.079013703646121e-06, "loss": 0.1963, "num_input_tokens_seen": 23699856, "step": 38875 }, { "epoch": 12.063295066708037, "grad_norm": 2.418307065963745, "learning_rate": 4.077683094045389e-06, "loss": 0.2274, "num_input_tokens_seen": 23703472, "step": 38880 }, { "epoch": 12.064846416382252, "grad_norm": 2.6646952629089355, "learning_rate": 4.07635255206141e-06, "loss": 0.1803, "num_input_tokens_seen": 23707024, "step": 38885 }, { "epoch": 12.066397766056468, "grad_norm": 1.8988572359085083, "learning_rate": 4.075022077791726e-06, "loss": 0.1906, "num_input_tokens_seen": 23709904, "step": 38890 }, { "epoch": 12.067949115730686, "grad_norm": 2.7440402507781982, "learning_rate": 4.073691671333877e-06, "loss": 0.2097, "num_input_tokens_seen": 23712752, "step": 38895 }, { "epoch": 12.069500465404902, "grad_norm": 2.2298223972320557, "learning_rate": 4.072361332785398e-06, "loss": 0.1888, "num_input_tokens_seen": 23716208, "step": 38900 }, { "epoch": 12.07105181507912, "grad_norm": 2.6397788524627686, "learning_rate": 4.0710310622438175e-06, "loss": 0.2203, "num_input_tokens_seen": 23718288, "step": 38905 }, { "epoch": 12.072603164753335, "grad_norm": 3.4613564014434814, "learning_rate": 4.069700859806661e-06, "loss": 0.2135, "num_input_tokens_seen": 23721136, "step": 38910 }, { "epoch": 12.074154514427551, "grad_norm": 2.512404680252075, "learning_rate": 4.0683707255714485e-06, "loss": 0.2355, "num_input_tokens_seen": 23723664, "step": 38915 }, { "epoch": 12.075705864101769, "grad_norm": 2.502959728240967, "learning_rate": 4.067040659635693e-06, "loss": 0.1901, "num_input_tokens_seen": 23726352, "step": 38920 }, { "epoch": 12.077257213775985, "grad_norm": 2.8935582637786865, "learning_rate": 4.065710662096905e-06, "loss": 0.2489, "num_input_tokens_seen": 23729264, "step": 38925 }, { "epoch": 12.078808563450202, "grad_norm": 1.5768235921859741, "learning_rate": 4.06438073305259e-06, "loss": 0.2688, "num_input_tokens_seen": 23731984, "step": 38930 }, { "epoch": 12.080359913124418, "grad_norm": 2.4182701110839844, "learning_rate": 4.063050872600246e-06, "loss": 0.1965, "num_input_tokens_seen": 23734544, "step": 38935 }, { "epoch": 12.081911262798634, "grad_norm": 3.547659397125244, "learning_rate": 4.061721080837369e-06, "loss": 0.2083, "num_input_tokens_seen": 23737648, "step": 38940 }, { "epoch": 12.083462612472852, "grad_norm": 2.0407769680023193, "learning_rate": 4.060391357861447e-06, "loss": 0.229, "num_input_tokens_seen": 23740400, "step": 38945 }, { "epoch": 12.085013962147068, "grad_norm": 2.5765879154205322, "learning_rate": 4.059061703769965e-06, "loss": 0.2006, "num_input_tokens_seen": 23744208, "step": 38950 }, { "epoch": 12.086565311821284, "grad_norm": 4.673179626464844, "learning_rate": 4.057732118660403e-06, "loss": 0.241, "num_input_tokens_seen": 23746928, "step": 38955 }, { "epoch": 12.088116661495501, "grad_norm": 3.7410202026367188, "learning_rate": 4.056402602630235e-06, "loss": 0.2141, "num_input_tokens_seen": 23749584, "step": 38960 }, { "epoch": 12.089668011169717, "grad_norm": 2.9252662658691406, "learning_rate": 4.055073155776929e-06, "loss": 0.2405, "num_input_tokens_seen": 23752240, "step": 38965 }, { "epoch": 12.091219360843935, "grad_norm": 2.695188283920288, "learning_rate": 4.053743778197951e-06, "loss": 0.179, "num_input_tokens_seen": 23755312, "step": 38970 }, { "epoch": 12.09277071051815, "grad_norm": 2.6006298065185547, "learning_rate": 4.0524144699907594e-06, "loss": 0.2056, "num_input_tokens_seen": 23757648, "step": 38975 }, { "epoch": 12.094322060192367, "grad_norm": 1.6799051761627197, "learning_rate": 4.051085231252806e-06, "loss": 0.1978, "num_input_tokens_seen": 23760656, "step": 38980 }, { "epoch": 12.095873409866584, "grad_norm": 1.9911224842071533, "learning_rate": 4.049756062081544e-06, "loss": 0.2016, "num_input_tokens_seen": 23762928, "step": 38985 }, { "epoch": 12.0974247595408, "grad_norm": 2.9728894233703613, "learning_rate": 4.048426962574416e-06, "loss": 0.2519, "num_input_tokens_seen": 23765520, "step": 38990 }, { "epoch": 12.098976109215018, "grad_norm": 2.0728580951690674, "learning_rate": 4.047097932828859e-06, "loss": 0.2087, "num_input_tokens_seen": 23768432, "step": 38995 }, { "epoch": 12.100527458889234, "grad_norm": 3.885723829269409, "learning_rate": 4.045768972942308e-06, "loss": 0.2193, "num_input_tokens_seen": 23771408, "step": 39000 }, { "epoch": 12.10207880856345, "grad_norm": 1.6554772853851318, "learning_rate": 4.04444008301219e-06, "loss": 0.19, "num_input_tokens_seen": 23775792, "step": 39005 }, { "epoch": 12.103630158237667, "grad_norm": 5.228519916534424, "learning_rate": 4.04311126313593e-06, "loss": 0.227, "num_input_tokens_seen": 23778832, "step": 39010 }, { "epoch": 12.105181507911883, "grad_norm": 2.8645739555358887, "learning_rate": 4.041782513410946e-06, "loss": 0.2109, "num_input_tokens_seen": 23782192, "step": 39015 }, { "epoch": 12.106732857586099, "grad_norm": 7.092597007751465, "learning_rate": 4.040453833934651e-06, "loss": 0.2601, "num_input_tokens_seen": 23785904, "step": 39020 }, { "epoch": 12.108284207260317, "grad_norm": 3.675554037094116, "learning_rate": 4.039125224804453e-06, "loss": 0.2099, "num_input_tokens_seen": 23788752, "step": 39025 }, { "epoch": 12.109835556934533, "grad_norm": 0.9722921848297119, "learning_rate": 4.037796686117753e-06, "loss": 0.2003, "num_input_tokens_seen": 23791440, "step": 39030 }, { "epoch": 12.11138690660875, "grad_norm": 5.95539665222168, "learning_rate": 4.036468217971951e-06, "loss": 0.2181, "num_input_tokens_seen": 23794192, "step": 39035 }, { "epoch": 12.112938256282966, "grad_norm": 2.080366373062134, "learning_rate": 4.03513982046444e-06, "loss": 0.2225, "num_input_tokens_seen": 23796432, "step": 39040 }, { "epoch": 12.114489605957182, "grad_norm": 3.558182954788208, "learning_rate": 4.033811493692604e-06, "loss": 0.2193, "num_input_tokens_seen": 23800304, "step": 39045 }, { "epoch": 12.1160409556314, "grad_norm": 6.697009086608887, "learning_rate": 4.032483237753827e-06, "loss": 0.2087, "num_input_tokens_seen": 23803280, "step": 39050 }, { "epoch": 12.117592305305616, "grad_norm": 3.446885824203491, "learning_rate": 4.031155052745487e-06, "loss": 0.2281, "num_input_tokens_seen": 23805360, "step": 39055 }, { "epoch": 12.119143654979833, "grad_norm": 5.433247089385986, "learning_rate": 4.029826938764954e-06, "loss": 0.2244, "num_input_tokens_seen": 23808272, "step": 39060 }, { "epoch": 12.12069500465405, "grad_norm": 3.9446914196014404, "learning_rate": 4.028498895909593e-06, "loss": 0.2795, "num_input_tokens_seen": 23811024, "step": 39065 }, { "epoch": 12.122246354328265, "grad_norm": 3.7656304836273193, "learning_rate": 4.027170924276769e-06, "loss": 0.2214, "num_input_tokens_seen": 23813744, "step": 39070 }, { "epoch": 12.123797704002483, "grad_norm": 3.2088406085968018, "learning_rate": 4.025843023963836e-06, "loss": 0.1973, "num_input_tokens_seen": 23816368, "step": 39075 }, { "epoch": 12.125349053676699, "grad_norm": 4.377580165863037, "learning_rate": 4.024515195068145e-06, "loss": 0.2348, "num_input_tokens_seen": 23818416, "step": 39080 }, { "epoch": 12.126900403350914, "grad_norm": 2.462834596633911, "learning_rate": 4.023187437687042e-06, "loss": 0.2035, "num_input_tokens_seen": 23821040, "step": 39085 }, { "epoch": 12.128451753025132, "grad_norm": 1.5285552740097046, "learning_rate": 4.021859751917867e-06, "loss": 0.2161, "num_input_tokens_seen": 23823248, "step": 39090 }, { "epoch": 12.130003102699348, "grad_norm": 2.3304378986358643, "learning_rate": 4.0205321378579545e-06, "loss": 0.2669, "num_input_tokens_seen": 23826416, "step": 39095 }, { "epoch": 12.131554452373566, "grad_norm": 5.074496269226074, "learning_rate": 4.019204595604635e-06, "loss": 0.2115, "num_input_tokens_seen": 23831472, "step": 39100 }, { "epoch": 12.133105802047782, "grad_norm": 2.9736878871917725, "learning_rate": 4.017877125255234e-06, "loss": 0.1943, "num_input_tokens_seen": 23834320, "step": 39105 }, { "epoch": 12.134657151721997, "grad_norm": 2.365694284439087, "learning_rate": 4.0165497269070695e-06, "loss": 0.2114, "num_input_tokens_seen": 23837488, "step": 39110 }, { "epoch": 12.136208501396215, "grad_norm": 3.8729636669158936, "learning_rate": 4.015222400657455e-06, "loss": 0.236, "num_input_tokens_seen": 23840368, "step": 39115 }, { "epoch": 12.137759851070431, "grad_norm": 1.2778000831604004, "learning_rate": 4.0138951466037005e-06, "loss": 0.2553, "num_input_tokens_seen": 23843408, "step": 39120 }, { "epoch": 12.139311200744649, "grad_norm": 3.6675426959991455, "learning_rate": 4.012567964843108e-06, "loss": 0.1912, "num_input_tokens_seen": 23846544, "step": 39125 }, { "epoch": 12.140862550418865, "grad_norm": 4.1240715980529785, "learning_rate": 4.011240855472978e-06, "loss": 0.2062, "num_input_tokens_seen": 23849488, "step": 39130 }, { "epoch": 12.14241390009308, "grad_norm": 2.9792962074279785, "learning_rate": 4.009913818590602e-06, "loss": 0.2155, "num_input_tokens_seen": 23852240, "step": 39135 }, { "epoch": 12.143965249767298, "grad_norm": 3.9372541904449463, "learning_rate": 4.0085868542932665e-06, "loss": 0.2092, "num_input_tokens_seen": 23856144, "step": 39140 }, { "epoch": 12.145516599441514, "grad_norm": 1.787709355354309, "learning_rate": 4.007259962678256e-06, "loss": 0.2061, "num_input_tokens_seen": 23859632, "step": 39145 }, { "epoch": 12.14706794911573, "grad_norm": 5.313149452209473, "learning_rate": 4.005933143842845e-06, "loss": 0.1978, "num_input_tokens_seen": 23862448, "step": 39150 }, { "epoch": 12.148619298789948, "grad_norm": 2.6865768432617188, "learning_rate": 4.004606397884306e-06, "loss": 0.2262, "num_input_tokens_seen": 23865200, "step": 39155 }, { "epoch": 12.150170648464163, "grad_norm": 2.8547067642211914, "learning_rate": 4.003279724899906e-06, "loss": 0.2543, "num_input_tokens_seen": 23868240, "step": 39160 }, { "epoch": 12.151721998138381, "grad_norm": 3.9213268756866455, "learning_rate": 4.001953124986904e-06, "loss": 0.2131, "num_input_tokens_seen": 23870864, "step": 39165 }, { "epoch": 12.153273347812597, "grad_norm": 3.650942802429199, "learning_rate": 4.000626598242558e-06, "loss": 0.193, "num_input_tokens_seen": 23875376, "step": 39170 }, { "epoch": 12.154824697486813, "grad_norm": 1.313740849494934, "learning_rate": 3.999300144764116e-06, "loss": 0.2528, "num_input_tokens_seen": 23878192, "step": 39175 }, { "epoch": 12.15637604716103, "grad_norm": 1.556894063949585, "learning_rate": 3.997973764648823e-06, "loss": 0.224, "num_input_tokens_seen": 23880656, "step": 39180 }, { "epoch": 12.157927396835246, "grad_norm": 2.0994319915771484, "learning_rate": 3.996647457993918e-06, "loss": 0.1943, "num_input_tokens_seen": 23883280, "step": 39185 }, { "epoch": 12.159478746509464, "grad_norm": 3.9678192138671875, "learning_rate": 3.995321224896637e-06, "loss": 0.1783, "num_input_tokens_seen": 23886768, "step": 39190 }, { "epoch": 12.16103009618368, "grad_norm": 3.0160539150238037, "learning_rate": 3.9939950654542074e-06, "loss": 0.1907, "num_input_tokens_seen": 23890192, "step": 39195 }, { "epoch": 12.162581445857896, "grad_norm": 3.598696708679199, "learning_rate": 3.992668979763853e-06, "loss": 0.1993, "num_input_tokens_seen": 23894096, "step": 39200 }, { "epoch": 12.164132795532113, "grad_norm": 2.1773569583892822, "learning_rate": 3.99134296792279e-06, "loss": 0.1886, "num_input_tokens_seen": 23897424, "step": 39205 }, { "epoch": 12.16568414520633, "grad_norm": 4.732848167419434, "learning_rate": 3.990017030028232e-06, "loss": 0.2239, "num_input_tokens_seen": 23901200, "step": 39210 }, { "epoch": 12.167235494880545, "grad_norm": 6.096643924713135, "learning_rate": 3.9886911661773864e-06, "loss": 0.2367, "num_input_tokens_seen": 23903760, "step": 39215 }, { "epoch": 12.168786844554763, "grad_norm": 3.405076742172241, "learning_rate": 3.987365376467453e-06, "loss": 0.236, "num_input_tokens_seen": 23906864, "step": 39220 }, { "epoch": 12.170338194228979, "grad_norm": 2.3003854751586914, "learning_rate": 3.9860396609956295e-06, "loss": 0.2122, "num_input_tokens_seen": 23909264, "step": 39225 }, { "epoch": 12.171889543903196, "grad_norm": 4.031977653503418, "learning_rate": 3.984714019859105e-06, "loss": 0.1807, "num_input_tokens_seen": 23912432, "step": 39230 }, { "epoch": 12.173440893577412, "grad_norm": 2.196558952331543, "learning_rate": 3.983388453155067e-06, "loss": 0.2595, "num_input_tokens_seen": 23915408, "step": 39235 }, { "epoch": 12.174992243251628, "grad_norm": 3.1572296619415283, "learning_rate": 3.982062960980693e-06, "loss": 0.232, "num_input_tokens_seen": 23917968, "step": 39240 }, { "epoch": 12.176543592925846, "grad_norm": 3.081437587738037, "learning_rate": 3.980737543433158e-06, "loss": 0.1984, "num_input_tokens_seen": 23920272, "step": 39245 }, { "epoch": 12.178094942600062, "grad_norm": 2.568190336227417, "learning_rate": 3.9794122006096305e-06, "loss": 0.2023, "num_input_tokens_seen": 23922800, "step": 39250 }, { "epoch": 12.17964629227428, "grad_norm": 4.039050102233887, "learning_rate": 3.978086932607276e-06, "loss": 0.1836, "num_input_tokens_seen": 23925680, "step": 39255 }, { "epoch": 12.181197641948495, "grad_norm": 2.9702956676483154, "learning_rate": 3.9767617395232495e-06, "loss": 0.2003, "num_input_tokens_seen": 23929616, "step": 39260 }, { "epoch": 12.182748991622711, "grad_norm": 12.515615463256836, "learning_rate": 3.975436621454705e-06, "loss": 0.2202, "num_input_tokens_seen": 23932688, "step": 39265 }, { "epoch": 12.184300341296929, "grad_norm": 4.5122904777526855, "learning_rate": 3.974111578498788e-06, "loss": 0.1911, "num_input_tokens_seen": 23935056, "step": 39270 }, { "epoch": 12.185851690971145, "grad_norm": 3.797401189804077, "learning_rate": 3.972786610752641e-06, "loss": 0.2071, "num_input_tokens_seen": 23938832, "step": 39275 }, { "epoch": 12.18740304064536, "grad_norm": 2.0114166736602783, "learning_rate": 3.9714617183134e-06, "loss": 0.1365, "num_input_tokens_seen": 23941808, "step": 39280 }, { "epoch": 12.188954390319578, "grad_norm": 4.708165168762207, "learning_rate": 3.970136901278194e-06, "loss": 0.2114, "num_input_tokens_seen": 23945200, "step": 39285 }, { "epoch": 12.190505739993794, "grad_norm": 3.4205636978149414, "learning_rate": 3.968812159744149e-06, "loss": 0.1934, "num_input_tokens_seen": 23947600, "step": 39290 }, { "epoch": 12.192057089668012, "grad_norm": 3.714823007583618, "learning_rate": 3.967487493808383e-06, "loss": 0.2251, "num_input_tokens_seen": 23950928, "step": 39295 }, { "epoch": 12.193608439342228, "grad_norm": 3.5454840660095215, "learning_rate": 3.966162903568012e-06, "loss": 0.1996, "num_input_tokens_seen": 23953296, "step": 39300 }, { "epoch": 12.195159789016444, "grad_norm": 3.1942481994628906, "learning_rate": 3.96483838912014e-06, "loss": 0.236, "num_input_tokens_seen": 23955824, "step": 39305 }, { "epoch": 12.196711138690661, "grad_norm": 5.984424591064453, "learning_rate": 3.963513950561874e-06, "loss": 0.1933, "num_input_tokens_seen": 23959024, "step": 39310 }, { "epoch": 12.198262488364877, "grad_norm": 3.494440793991089, "learning_rate": 3.96218958799031e-06, "loss": 0.1926, "num_input_tokens_seen": 23962672, "step": 39315 }, { "epoch": 12.199813838039095, "grad_norm": 5.614019393920898, "learning_rate": 3.960865301502537e-06, "loss": 0.3275, "num_input_tokens_seen": 23964880, "step": 39320 }, { "epoch": 12.20136518771331, "grad_norm": 5.842646598815918, "learning_rate": 3.9595410911956435e-06, "loss": 0.2111, "num_input_tokens_seen": 23967632, "step": 39325 }, { "epoch": 12.202916537387527, "grad_norm": 5.2096476554870605, "learning_rate": 3.958216957166709e-06, "loss": 0.2014, "num_input_tokens_seen": 23972944, "step": 39330 }, { "epoch": 12.204467887061744, "grad_norm": 7.290415287017822, "learning_rate": 3.956892899512808e-06, "loss": 0.2266, "num_input_tokens_seen": 23976944, "step": 39335 }, { "epoch": 12.20601923673596, "grad_norm": 10.191421508789062, "learning_rate": 3.9555689183310095e-06, "loss": 0.2905, "num_input_tokens_seen": 23979952, "step": 39340 }, { "epoch": 12.207570586410176, "grad_norm": 3.1615540981292725, "learning_rate": 3.954245013718378e-06, "loss": 0.2128, "num_input_tokens_seen": 23983088, "step": 39345 }, { "epoch": 12.209121936084394, "grad_norm": 2.841038227081299, "learning_rate": 3.95292118577197e-06, "loss": 0.1961, "num_input_tokens_seen": 23986128, "step": 39350 }, { "epoch": 12.21067328575861, "grad_norm": 1.5021294355392456, "learning_rate": 3.951597434588837e-06, "loss": 0.1425, "num_input_tokens_seen": 23990768, "step": 39355 }, { "epoch": 12.212224635432827, "grad_norm": 3.1399307250976562, "learning_rate": 3.950273760266029e-06, "loss": 0.1841, "num_input_tokens_seen": 23993424, "step": 39360 }, { "epoch": 12.213775985107043, "grad_norm": 4.903319358825684, "learning_rate": 3.948950162900583e-06, "loss": 0.2291, "num_input_tokens_seen": 23996048, "step": 39365 }, { "epoch": 12.215327334781259, "grad_norm": 9.151830673217773, "learning_rate": 3.947626642589538e-06, "loss": 0.2991, "num_input_tokens_seen": 23998928, "step": 39370 }, { "epoch": 12.216878684455477, "grad_norm": 2.2338333129882812, "learning_rate": 3.946303199429922e-06, "loss": 0.1824, "num_input_tokens_seen": 24001456, "step": 39375 }, { "epoch": 12.218430034129693, "grad_norm": 4.276037216186523, "learning_rate": 3.944979833518758e-06, "loss": 0.2347, "num_input_tokens_seen": 24004304, "step": 39380 }, { "epoch": 12.21998138380391, "grad_norm": 3.6960184574127197, "learning_rate": 3.943656544953067e-06, "loss": 0.2071, "num_input_tokens_seen": 24007088, "step": 39385 }, { "epoch": 12.221532733478126, "grad_norm": 2.6744632720947266, "learning_rate": 3.942333333829859e-06, "loss": 0.1848, "num_input_tokens_seen": 24009616, "step": 39390 }, { "epoch": 12.223084083152342, "grad_norm": 3.6117658615112305, "learning_rate": 3.941010200246143e-06, "loss": 0.2637, "num_input_tokens_seen": 24014032, "step": 39395 }, { "epoch": 12.22463543282656, "grad_norm": 7.967000961303711, "learning_rate": 3.93968714429892e-06, "loss": 0.2424, "num_input_tokens_seen": 24017872, "step": 39400 }, { "epoch": 12.226186782500776, "grad_norm": 2.8021671772003174, "learning_rate": 3.9383641660851845e-06, "loss": 0.1782, "num_input_tokens_seen": 24020336, "step": 39405 }, { "epoch": 12.227738132174991, "grad_norm": 3.4707140922546387, "learning_rate": 3.937041265701928e-06, "loss": 0.1832, "num_input_tokens_seen": 24023600, "step": 39410 }, { "epoch": 12.229289481849209, "grad_norm": 3.1173579692840576, "learning_rate": 3.935718443246134e-06, "loss": 0.2495, "num_input_tokens_seen": 24026064, "step": 39415 }, { "epoch": 12.230840831523425, "grad_norm": 3.7950785160064697, "learning_rate": 3.93439569881478e-06, "loss": 0.1771, "num_input_tokens_seen": 24030416, "step": 39420 }, { "epoch": 12.232392181197643, "grad_norm": 8.513856887817383, "learning_rate": 3.93307303250484e-06, "loss": 0.227, "num_input_tokens_seen": 24034192, "step": 39425 }, { "epoch": 12.233943530871858, "grad_norm": 6.165989875793457, "learning_rate": 3.931750444413281e-06, "loss": 0.1958, "num_input_tokens_seen": 24036912, "step": 39430 }, { "epoch": 12.235494880546074, "grad_norm": 3.1020309925079346, "learning_rate": 3.930427934637066e-06, "loss": 0.2768, "num_input_tokens_seen": 24039952, "step": 39435 }, { "epoch": 12.237046230220292, "grad_norm": 4.360699653625488, "learning_rate": 3.929105503273147e-06, "loss": 0.2115, "num_input_tokens_seen": 24042064, "step": 39440 }, { "epoch": 12.238597579894508, "grad_norm": 3.04087233543396, "learning_rate": 3.9277831504184774e-06, "loss": 0.2265, "num_input_tokens_seen": 24046352, "step": 39445 }, { "epoch": 12.240148929568726, "grad_norm": 2.1697864532470703, "learning_rate": 3.92646087617e-06, "loss": 0.2182, "num_input_tokens_seen": 24048688, "step": 39450 }, { "epoch": 12.241700279242941, "grad_norm": 5.715051651000977, "learning_rate": 3.925138680624652e-06, "loss": 0.2367, "num_input_tokens_seen": 24051408, "step": 39455 }, { "epoch": 12.243251628917157, "grad_norm": 2.9820122718811035, "learning_rate": 3.923816563879367e-06, "loss": 0.2004, "num_input_tokens_seen": 24054512, "step": 39460 }, { "epoch": 12.244802978591375, "grad_norm": 3.0404787063598633, "learning_rate": 3.9224945260310725e-06, "loss": 0.1647, "num_input_tokens_seen": 24057968, "step": 39465 }, { "epoch": 12.246354328265591, "grad_norm": 2.708597421646118, "learning_rate": 3.9211725671766885e-06, "loss": 0.1993, "num_input_tokens_seen": 24062832, "step": 39470 }, { "epoch": 12.247905677939807, "grad_norm": 4.081568241119385, "learning_rate": 3.91985068741313e-06, "loss": 0.2106, "num_input_tokens_seen": 24065840, "step": 39475 }, { "epoch": 12.249457027614024, "grad_norm": 4.616288661956787, "learning_rate": 3.918528886837308e-06, "loss": 0.2275, "num_input_tokens_seen": 24068688, "step": 39480 }, { "epoch": 12.25100837728824, "grad_norm": 4.8138933181762695, "learning_rate": 3.917207165546124e-06, "loss": 0.2401, "num_input_tokens_seen": 24071376, "step": 39485 }, { "epoch": 12.252559726962458, "grad_norm": 1.8971563577651978, "learning_rate": 3.915885523636477e-06, "loss": 0.2726, "num_input_tokens_seen": 24074384, "step": 39490 }, { "epoch": 12.254111076636674, "grad_norm": 4.3710103034973145, "learning_rate": 3.91456396120526e-06, "loss": 0.1787, "num_input_tokens_seen": 24078352, "step": 39495 }, { "epoch": 12.25566242631089, "grad_norm": 4.9363250732421875, "learning_rate": 3.913242478349357e-06, "loss": 0.2509, "num_input_tokens_seen": 24080784, "step": 39500 }, { "epoch": 12.257213775985107, "grad_norm": 3.5528948307037354, "learning_rate": 3.91192107516565e-06, "loss": 0.2369, "num_input_tokens_seen": 24083536, "step": 39505 }, { "epoch": 12.258765125659323, "grad_norm": 1.9544492959976196, "learning_rate": 3.910599751751013e-06, "loss": 0.232, "num_input_tokens_seen": 24086512, "step": 39510 }, { "epoch": 12.260316475333541, "grad_norm": 2.975578546524048, "learning_rate": 3.909278508202315e-06, "loss": 0.2129, "num_input_tokens_seen": 24089200, "step": 39515 }, { "epoch": 12.261867825007757, "grad_norm": 2.4576709270477295, "learning_rate": 3.907957344616418e-06, "loss": 0.1939, "num_input_tokens_seen": 24092688, "step": 39520 }, { "epoch": 12.263419174681973, "grad_norm": 3.3467445373535156, "learning_rate": 3.9066362610901795e-06, "loss": 0.1867, "num_input_tokens_seen": 24095120, "step": 39525 }, { "epoch": 12.26497052435619, "grad_norm": 2.4978456497192383, "learning_rate": 3.905315257720451e-06, "loss": 0.1858, "num_input_tokens_seen": 24097648, "step": 39530 }, { "epoch": 12.266521874030406, "grad_norm": 3.9647414684295654, "learning_rate": 3.903994334604076e-06, "loss": 0.2076, "num_input_tokens_seen": 24099952, "step": 39535 }, { "epoch": 12.268073223704622, "grad_norm": 4.48529577255249, "learning_rate": 3.9026734918378964e-06, "loss": 0.2092, "num_input_tokens_seen": 24102672, "step": 39540 }, { "epoch": 12.26962457337884, "grad_norm": 4.003339767456055, "learning_rate": 3.901352729518741e-06, "loss": 0.214, "num_input_tokens_seen": 24105200, "step": 39545 }, { "epoch": 12.271175923053056, "grad_norm": 2.8629236221313477, "learning_rate": 3.900032047743443e-06, "loss": 0.1921, "num_input_tokens_seen": 24108240, "step": 39550 }, { "epoch": 12.272727272727273, "grad_norm": 6.179111480712891, "learning_rate": 3.898711446608822e-06, "loss": 0.1723, "num_input_tokens_seen": 24112240, "step": 39555 }, { "epoch": 12.27427862240149, "grad_norm": 2.3990843296051025, "learning_rate": 3.89739092621169e-06, "loss": 0.2224, "num_input_tokens_seen": 24115248, "step": 39560 }, { "epoch": 12.275829972075705, "grad_norm": 3.9611775875091553, "learning_rate": 3.896070486648863e-06, "loss": 0.1808, "num_input_tokens_seen": 24117968, "step": 39565 }, { "epoch": 12.277381321749923, "grad_norm": 5.425154685974121, "learning_rate": 3.8947501280171405e-06, "loss": 0.2186, "num_input_tokens_seen": 24121264, "step": 39570 }, { "epoch": 12.278932671424139, "grad_norm": 4.307236194610596, "learning_rate": 3.893429850413322e-06, "loss": 0.2264, "num_input_tokens_seen": 24124848, "step": 39575 }, { "epoch": 12.280484021098356, "grad_norm": 3.7279744148254395, "learning_rate": 3.892109653934199e-06, "loss": 0.1734, "num_input_tokens_seen": 24128016, "step": 39580 }, { "epoch": 12.282035370772572, "grad_norm": 3.374122381210327, "learning_rate": 3.890789538676558e-06, "loss": 0.2917, "num_input_tokens_seen": 24130768, "step": 39585 }, { "epoch": 12.283586720446788, "grad_norm": 6.546318054199219, "learning_rate": 3.88946950473718e-06, "loss": 0.2214, "num_input_tokens_seen": 24133616, "step": 39590 }, { "epoch": 12.285138070121006, "grad_norm": 2.2673332691192627, "learning_rate": 3.888149552212837e-06, "loss": 0.1892, "num_input_tokens_seen": 24136784, "step": 39595 }, { "epoch": 12.286689419795222, "grad_norm": 2.4905927181243896, "learning_rate": 3.886829681200297e-06, "loss": 0.2567, "num_input_tokens_seen": 24139696, "step": 39600 }, { "epoch": 12.288240769469438, "grad_norm": 2.224618911743164, "learning_rate": 3.885509891796325e-06, "loss": 0.2185, "num_input_tokens_seen": 24143088, "step": 39605 }, { "epoch": 12.289792119143655, "grad_norm": 7.221564769744873, "learning_rate": 3.884190184097675e-06, "loss": 0.2215, "num_input_tokens_seen": 24146576, "step": 39610 }, { "epoch": 12.291343468817871, "grad_norm": 1.8229938745498657, "learning_rate": 3.882870558201098e-06, "loss": 0.183, "num_input_tokens_seen": 24148848, "step": 39615 }, { "epoch": 12.292894818492089, "grad_norm": 2.0875182151794434, "learning_rate": 3.881551014203338e-06, "loss": 0.1924, "num_input_tokens_seen": 24150960, "step": 39620 }, { "epoch": 12.294446168166305, "grad_norm": 2.9286248683929443, "learning_rate": 3.880231552201131e-06, "loss": 0.2644, "num_input_tokens_seen": 24153584, "step": 39625 }, { "epoch": 12.29599751784052, "grad_norm": 1.7362192869186401, "learning_rate": 3.878912172291214e-06, "loss": 0.2127, "num_input_tokens_seen": 24156336, "step": 39630 }, { "epoch": 12.297548867514738, "grad_norm": 3.8004605770111084, "learning_rate": 3.877592874570307e-06, "loss": 0.2213, "num_input_tokens_seen": 24159952, "step": 39635 }, { "epoch": 12.299100217188954, "grad_norm": 7.403558254241943, "learning_rate": 3.876273659135136e-06, "loss": 0.1903, "num_input_tokens_seen": 24162352, "step": 39640 }, { "epoch": 12.300651566863172, "grad_norm": 5.380489826202393, "learning_rate": 3.874954526082413e-06, "loss": 0.2351, "num_input_tokens_seen": 24165648, "step": 39645 }, { "epoch": 12.302202916537388, "grad_norm": 3.3604719638824463, "learning_rate": 3.873635475508845e-06, "loss": 0.2067, "num_input_tokens_seen": 24168528, "step": 39650 }, { "epoch": 12.303754266211604, "grad_norm": 1.7828224897384644, "learning_rate": 3.872316507511135e-06, "loss": 0.239, "num_input_tokens_seen": 24173264, "step": 39655 }, { "epoch": 12.305305615885821, "grad_norm": 4.05665922164917, "learning_rate": 3.870997622185979e-06, "loss": 0.2056, "num_input_tokens_seen": 24175728, "step": 39660 }, { "epoch": 12.306856965560037, "grad_norm": 4.464418888092041, "learning_rate": 3.869678819630067e-06, "loss": 0.2073, "num_input_tokens_seen": 24179024, "step": 39665 }, { "epoch": 12.308408315234253, "grad_norm": 3.618457794189453, "learning_rate": 3.8683600999400815e-06, "loss": 0.2177, "num_input_tokens_seen": 24182352, "step": 39670 }, { "epoch": 12.30995966490847, "grad_norm": 2.134462833404541, "learning_rate": 3.8670414632127016e-06, "loss": 0.2118, "num_input_tokens_seen": 24185264, "step": 39675 }, { "epoch": 12.311511014582686, "grad_norm": 5.135840892791748, "learning_rate": 3.865722909544599e-06, "loss": 0.2173, "num_input_tokens_seen": 24188912, "step": 39680 }, { "epoch": 12.313062364256904, "grad_norm": 2.642188787460327, "learning_rate": 3.864404439032439e-06, "loss": 0.2405, "num_input_tokens_seen": 24192688, "step": 39685 }, { "epoch": 12.31461371393112, "grad_norm": 2.693084478378296, "learning_rate": 3.863086051772881e-06, "loss": 0.2325, "num_input_tokens_seen": 24195248, "step": 39690 }, { "epoch": 12.316165063605336, "grad_norm": 3.3102810382843018, "learning_rate": 3.861767747862578e-06, "loss": 0.2032, "num_input_tokens_seen": 24198864, "step": 39695 }, { "epoch": 12.317716413279554, "grad_norm": 1.6481637954711914, "learning_rate": 3.860449527398178e-06, "loss": 0.1879, "num_input_tokens_seen": 24201264, "step": 39700 }, { "epoch": 12.31926776295377, "grad_norm": 1.9274579286575317, "learning_rate": 3.859131390476322e-06, "loss": 0.2577, "num_input_tokens_seen": 24204368, "step": 39705 }, { "epoch": 12.320819112627987, "grad_norm": 2.5906710624694824, "learning_rate": 3.8578133371936454e-06, "loss": 0.2006, "num_input_tokens_seen": 24207536, "step": 39710 }, { "epoch": 12.322370462302203, "grad_norm": 3.0851387977600098, "learning_rate": 3.856495367646777e-06, "loss": 0.1958, "num_input_tokens_seen": 24209936, "step": 39715 }, { "epoch": 12.323921811976419, "grad_norm": 7.654709339141846, "learning_rate": 3.8551774819323375e-06, "loss": 0.1909, "num_input_tokens_seen": 24213040, "step": 39720 }, { "epoch": 12.325473161650637, "grad_norm": 3.7414679527282715, "learning_rate": 3.8538596801469474e-06, "loss": 0.2506, "num_input_tokens_seen": 24217072, "step": 39725 }, { "epoch": 12.327024511324852, "grad_norm": 3.575268030166626, "learning_rate": 3.852541962387214e-06, "loss": 0.214, "num_input_tokens_seen": 24219824, "step": 39730 }, { "epoch": 12.32857586099907, "grad_norm": 2.318294048309326, "learning_rate": 3.851224328749743e-06, "loss": 0.2072, "num_input_tokens_seen": 24222480, "step": 39735 }, { "epoch": 12.330127210673286, "grad_norm": 3.3497931957244873, "learning_rate": 3.849906779331132e-06, "loss": 0.2359, "num_input_tokens_seen": 24226672, "step": 39740 }, { "epoch": 12.331678560347502, "grad_norm": 6.056746006011963, "learning_rate": 3.848589314227974e-06, "loss": 0.2515, "num_input_tokens_seen": 24230384, "step": 39745 }, { "epoch": 12.33322991002172, "grad_norm": 3.6469082832336426, "learning_rate": 3.847271933536852e-06, "loss": 0.2174, "num_input_tokens_seen": 24233264, "step": 39750 }, { "epoch": 12.334781259695935, "grad_norm": 4.396012783050537, "learning_rate": 3.845954637354349e-06, "loss": 0.2308, "num_input_tokens_seen": 24237552, "step": 39755 }, { "epoch": 12.336332609370151, "grad_norm": 5.911734580993652, "learning_rate": 3.844637425777037e-06, "loss": 0.1916, "num_input_tokens_seen": 24240816, "step": 39760 }, { "epoch": 12.337883959044369, "grad_norm": 3.3757548332214355, "learning_rate": 3.843320298901483e-06, "loss": 0.2374, "num_input_tokens_seen": 24242704, "step": 39765 }, { "epoch": 12.339435308718585, "grad_norm": 3.976447343826294, "learning_rate": 3.842003256824248e-06, "loss": 0.1897, "num_input_tokens_seen": 24246096, "step": 39770 }, { "epoch": 12.340986658392803, "grad_norm": 4.368310451507568, "learning_rate": 3.840686299641888e-06, "loss": 0.2101, "num_input_tokens_seen": 24249040, "step": 39775 }, { "epoch": 12.342538008067018, "grad_norm": 8.343178749084473, "learning_rate": 3.8393694274509495e-06, "loss": 0.2356, "num_input_tokens_seen": 24251760, "step": 39780 }, { "epoch": 12.344089357741234, "grad_norm": 4.3510308265686035, "learning_rate": 3.838052640347977e-06, "loss": 0.2113, "num_input_tokens_seen": 24254032, "step": 39785 }, { "epoch": 12.345640707415452, "grad_norm": 3.2999751567840576, "learning_rate": 3.836735938429505e-06, "loss": 0.2262, "num_input_tokens_seen": 24256368, "step": 39790 }, { "epoch": 12.347192057089668, "grad_norm": 4.434691905975342, "learning_rate": 3.835419321792063e-06, "loss": 0.216, "num_input_tokens_seen": 24258992, "step": 39795 }, { "epoch": 12.348743406763884, "grad_norm": 2.960577964782715, "learning_rate": 3.834102790532177e-06, "loss": 0.1882, "num_input_tokens_seen": 24261584, "step": 39800 }, { "epoch": 12.350294756438101, "grad_norm": 3.77707839012146, "learning_rate": 3.832786344746362e-06, "loss": 0.2193, "num_input_tokens_seen": 24264208, "step": 39805 }, { "epoch": 12.351846106112317, "grad_norm": 5.0455241203308105, "learning_rate": 3.8314699845311295e-06, "loss": 0.1809, "num_input_tokens_seen": 24267216, "step": 39810 }, { "epoch": 12.353397455786535, "grad_norm": 3.110104560852051, "learning_rate": 3.830153709982983e-06, "loss": 0.215, "num_input_tokens_seen": 24269744, "step": 39815 }, { "epoch": 12.35494880546075, "grad_norm": 6.645728588104248, "learning_rate": 3.828837521198425e-06, "loss": 0.2112, "num_input_tokens_seen": 24273040, "step": 39820 }, { "epoch": 12.356500155134967, "grad_norm": 2.1399049758911133, "learning_rate": 3.827521418273945e-06, "loss": 0.2484, "num_input_tokens_seen": 24276048, "step": 39825 }, { "epoch": 12.358051504809184, "grad_norm": 2.3146135807037354, "learning_rate": 3.826205401306028e-06, "loss": 0.2062, "num_input_tokens_seen": 24279600, "step": 39830 }, { "epoch": 12.3596028544834, "grad_norm": 2.292814016342163, "learning_rate": 3.824889470391156e-06, "loss": 0.2323, "num_input_tokens_seen": 24282512, "step": 39835 }, { "epoch": 12.361154204157618, "grad_norm": 3.7841105461120605, "learning_rate": 3.823573625625802e-06, "loss": 0.2615, "num_input_tokens_seen": 24284848, "step": 39840 }, { "epoch": 12.362705553831834, "grad_norm": 2.6401913166046143, "learning_rate": 3.822257867106431e-06, "loss": 0.1953, "num_input_tokens_seen": 24287344, "step": 39845 }, { "epoch": 12.36425690350605, "grad_norm": 6.700867652893066, "learning_rate": 3.820942194929505e-06, "loss": 0.2569, "num_input_tokens_seen": 24290256, "step": 39850 }, { "epoch": 12.365808253180267, "grad_norm": 2.693347692489624, "learning_rate": 3.819626609191479e-06, "loss": 0.1779, "num_input_tokens_seen": 24293136, "step": 39855 }, { "epoch": 12.367359602854483, "grad_norm": 6.273287296295166, "learning_rate": 3.818311109988799e-06, "loss": 0.2298, "num_input_tokens_seen": 24296112, "step": 39860 }, { "epoch": 12.3689109525287, "grad_norm": 3.745203733444214, "learning_rate": 3.816995697417909e-06, "loss": 0.2354, "num_input_tokens_seen": 24299184, "step": 39865 }, { "epoch": 12.370462302202917, "grad_norm": 2.2106502056121826, "learning_rate": 3.815680371575243e-06, "loss": 0.2477, "num_input_tokens_seen": 24301744, "step": 39870 }, { "epoch": 12.372013651877133, "grad_norm": 6.474748611450195, "learning_rate": 3.8143651325572282e-06, "loss": 0.2365, "num_input_tokens_seen": 24303920, "step": 39875 }, { "epoch": 12.37356500155135, "grad_norm": 3.426644802093506, "learning_rate": 3.8130499804602915e-06, "loss": 0.1968, "num_input_tokens_seen": 24307376, "step": 39880 }, { "epoch": 12.375116351225566, "grad_norm": 5.262535572052002, "learning_rate": 3.8117349153808463e-06, "loss": 0.233, "num_input_tokens_seen": 24310000, "step": 39885 }, { "epoch": 12.376667700899782, "grad_norm": 4.519498348236084, "learning_rate": 3.8104199374153034e-06, "loss": 0.2339, "num_input_tokens_seen": 24314480, "step": 39890 }, { "epoch": 12.378219050574, "grad_norm": 4.76444149017334, "learning_rate": 3.8091050466600652e-06, "loss": 0.2211, "num_input_tokens_seen": 24317936, "step": 39895 }, { "epoch": 12.379770400248216, "grad_norm": 2.994051218032837, "learning_rate": 3.80779024321153e-06, "loss": 0.2118, "num_input_tokens_seen": 24320656, "step": 39900 }, { "epoch": 12.381321749922433, "grad_norm": 2.800926923751831, "learning_rate": 3.8064755271660877e-06, "loss": 0.1824, "num_input_tokens_seen": 24323600, "step": 39905 }, { "epoch": 12.38287309959665, "grad_norm": 2.6735877990722656, "learning_rate": 3.805160898620122e-06, "loss": 0.2564, "num_input_tokens_seen": 24326160, "step": 39910 }, { "epoch": 12.384424449270865, "grad_norm": 3.5227394104003906, "learning_rate": 3.8038463576700126e-06, "loss": 0.2211, "num_input_tokens_seen": 24328752, "step": 39915 }, { "epoch": 12.385975798945083, "grad_norm": 3.3762102127075195, "learning_rate": 3.802531904412129e-06, "loss": 0.2338, "num_input_tokens_seen": 24331248, "step": 39920 }, { "epoch": 12.387527148619299, "grad_norm": 4.482447147369385, "learning_rate": 3.801217538942837e-06, "loss": 0.2383, "num_input_tokens_seen": 24334224, "step": 39925 }, { "epoch": 12.389078498293514, "grad_norm": 3.2192294597625732, "learning_rate": 3.7999032613584954e-06, "loss": 0.2263, "num_input_tokens_seen": 24336848, "step": 39930 }, { "epoch": 12.390629847967732, "grad_norm": 2.3070764541625977, "learning_rate": 3.798589071755454e-06, "loss": 0.1941, "num_input_tokens_seen": 24339376, "step": 39935 }, { "epoch": 12.392181197641948, "grad_norm": 2.4609978199005127, "learning_rate": 3.7972749702300627e-06, "loss": 0.1992, "num_input_tokens_seen": 24341904, "step": 39940 }, { "epoch": 12.393732547316166, "grad_norm": 1.8937078714370728, "learning_rate": 3.795960956878658e-06, "loss": 0.176, "num_input_tokens_seen": 24345456, "step": 39945 }, { "epoch": 12.395283896990382, "grad_norm": 4.831125736236572, "learning_rate": 3.794647031797573e-06, "loss": 0.2662, "num_input_tokens_seen": 24348720, "step": 39950 }, { "epoch": 12.396835246664597, "grad_norm": 1.6790251731872559, "learning_rate": 3.793333195083134e-06, "loss": 0.1958, "num_input_tokens_seen": 24351376, "step": 39955 }, { "epoch": 12.398386596338815, "grad_norm": 2.856222629547119, "learning_rate": 3.7920194468316617e-06, "loss": 0.236, "num_input_tokens_seen": 24354000, "step": 39960 }, { "epoch": 12.399937946013031, "grad_norm": 2.560519218444824, "learning_rate": 3.790705787139468e-06, "loss": 0.1977, "num_input_tokens_seen": 24357456, "step": 39965 }, { "epoch": 12.401489295687249, "grad_norm": 4.38857364654541, "learning_rate": 3.78939221610286e-06, "loss": 0.2309, "num_input_tokens_seen": 24360368, "step": 39970 }, { "epoch": 12.403040645361465, "grad_norm": 2.521064519882202, "learning_rate": 3.788078733818139e-06, "loss": 0.2143, "num_input_tokens_seen": 24362672, "step": 39975 }, { "epoch": 12.40459199503568, "grad_norm": 3.9541244506835938, "learning_rate": 3.786765340381597e-06, "loss": 0.2186, "num_input_tokens_seen": 24365200, "step": 39980 }, { "epoch": 12.406143344709898, "grad_norm": 3.1371593475341797, "learning_rate": 3.7854520358895237e-06, "loss": 0.2057, "num_input_tokens_seen": 24369392, "step": 39985 }, { "epoch": 12.407694694384114, "grad_norm": 2.3575029373168945, "learning_rate": 3.7841388204381978e-06, "loss": 0.2101, "num_input_tokens_seen": 24372336, "step": 39990 }, { "epoch": 12.409246044058332, "grad_norm": 5.204128265380859, "learning_rate": 3.782825694123894e-06, "loss": 0.2093, "num_input_tokens_seen": 24374480, "step": 39995 }, { "epoch": 12.410797393732548, "grad_norm": 3.433492660522461, "learning_rate": 3.781512657042881e-06, "loss": 0.2272, "num_input_tokens_seen": 24376784, "step": 40000 }, { "epoch": 12.412348743406763, "grad_norm": 4.510307788848877, "learning_rate": 3.780199709291419e-06, "loss": 0.2173, "num_input_tokens_seen": 24380688, "step": 40005 }, { "epoch": 12.413900093080981, "grad_norm": 4.577780246734619, "learning_rate": 3.7788868509657627e-06, "loss": 0.2001, "num_input_tokens_seen": 24385488, "step": 40010 }, { "epoch": 12.415451442755197, "grad_norm": 5.60933256149292, "learning_rate": 3.7775740821621605e-06, "loss": 0.2489, "num_input_tokens_seen": 24388944, "step": 40015 }, { "epoch": 12.417002792429413, "grad_norm": 2.312777280807495, "learning_rate": 3.776261402976854e-06, "loss": 0.227, "num_input_tokens_seen": 24392080, "step": 40020 }, { "epoch": 12.41855414210363, "grad_norm": 3.0750842094421387, "learning_rate": 3.7749488135060776e-06, "loss": 0.2065, "num_input_tokens_seen": 24394608, "step": 40025 }, { "epoch": 12.420105491777846, "grad_norm": 4.97555685043335, "learning_rate": 3.77363631384606e-06, "loss": 0.2251, "num_input_tokens_seen": 24397744, "step": 40030 }, { "epoch": 12.421656841452064, "grad_norm": 2.0756630897521973, "learning_rate": 3.772323904093024e-06, "loss": 0.1791, "num_input_tokens_seen": 24400080, "step": 40035 }, { "epoch": 12.42320819112628, "grad_norm": 4.302972316741943, "learning_rate": 3.7710115843431822e-06, "loss": 0.2549, "num_input_tokens_seen": 24402736, "step": 40040 }, { "epoch": 12.424759540800496, "grad_norm": 2.7809371948242188, "learning_rate": 3.769699354692745e-06, "loss": 0.2009, "num_input_tokens_seen": 24405232, "step": 40045 }, { "epoch": 12.426310890474713, "grad_norm": 3.128709554672241, "learning_rate": 3.7683872152379147e-06, "loss": 0.1983, "num_input_tokens_seen": 24407856, "step": 40050 }, { "epoch": 12.42786224014893, "grad_norm": 3.635622262954712, "learning_rate": 3.7670751660748843e-06, "loss": 0.1911, "num_input_tokens_seen": 24410320, "step": 40055 }, { "epoch": 12.429413589823145, "grad_norm": 2.000354528427124, "learning_rate": 3.7657632072998455e-06, "loss": 0.2027, "num_input_tokens_seen": 24413552, "step": 40060 }, { "epoch": 12.430964939497363, "grad_norm": 5.146608352661133, "learning_rate": 3.7644513390089793e-06, "loss": 0.2303, "num_input_tokens_seen": 24415760, "step": 40065 }, { "epoch": 12.432516289171579, "grad_norm": 2.1498663425445557, "learning_rate": 3.7631395612984612e-06, "loss": 0.2476, "num_input_tokens_seen": 24418800, "step": 40070 }, { "epoch": 12.434067638845796, "grad_norm": 4.429993152618408, "learning_rate": 3.761827874264459e-06, "loss": 0.2186, "num_input_tokens_seen": 24421552, "step": 40075 }, { "epoch": 12.435618988520012, "grad_norm": 10.881349563598633, "learning_rate": 3.7605162780031368e-06, "loss": 0.2503, "num_input_tokens_seen": 24425744, "step": 40080 }, { "epoch": 12.437170338194228, "grad_norm": 4.657510280609131, "learning_rate": 3.7592047726106484e-06, "loss": 0.2273, "num_input_tokens_seen": 24428592, "step": 40085 }, { "epoch": 12.438721687868446, "grad_norm": 2.532351016998291, "learning_rate": 3.757893358183144e-06, "loss": 0.2731, "num_input_tokens_seen": 24433200, "step": 40090 }, { "epoch": 12.440273037542662, "grad_norm": 3.8753724098205566, "learning_rate": 3.7565820348167647e-06, "loss": 0.2455, "num_input_tokens_seen": 24436272, "step": 40095 }, { "epoch": 12.44182438721688, "grad_norm": 2.130608320236206, "learning_rate": 3.7552708026076466e-06, "loss": 0.2473, "num_input_tokens_seen": 24440784, "step": 40100 }, { "epoch": 12.443375736891095, "grad_norm": 1.7047609090805054, "learning_rate": 3.7539596616519186e-06, "loss": 0.2104, "num_input_tokens_seen": 24443792, "step": 40105 }, { "epoch": 12.444927086565311, "grad_norm": 4.768316268920898, "learning_rate": 3.752648612045703e-06, "loss": 0.2209, "num_input_tokens_seen": 24446832, "step": 40110 }, { "epoch": 12.446478436239529, "grad_norm": 2.1778359413146973, "learning_rate": 3.751337653885113e-06, "loss": 0.1936, "num_input_tokens_seen": 24449456, "step": 40115 }, { "epoch": 12.448029785913745, "grad_norm": 1.9264674186706543, "learning_rate": 3.750026787266261e-06, "loss": 0.2388, "num_input_tokens_seen": 24452080, "step": 40120 }, { "epoch": 12.449581135587962, "grad_norm": 8.130335807800293, "learning_rate": 3.748716012285247e-06, "loss": 0.2036, "num_input_tokens_seen": 24455760, "step": 40125 }, { "epoch": 12.451132485262178, "grad_norm": 3.4810030460357666, "learning_rate": 3.7474053290381674e-06, "loss": 0.2186, "num_input_tokens_seen": 24458480, "step": 40130 }, { "epoch": 12.452683834936394, "grad_norm": 3.856754779815674, "learning_rate": 3.746094737621109e-06, "loss": 0.2122, "num_input_tokens_seen": 24461968, "step": 40135 }, { "epoch": 12.454235184610612, "grad_norm": 2.3243486881256104, "learning_rate": 3.7447842381301546e-06, "loss": 0.2384, "num_input_tokens_seen": 24465584, "step": 40140 }, { "epoch": 12.455786534284828, "grad_norm": 5.399731159210205, "learning_rate": 3.7434738306613807e-06, "loss": 0.203, "num_input_tokens_seen": 24468816, "step": 40145 }, { "epoch": 12.457337883959044, "grad_norm": 4.026944637298584, "learning_rate": 3.7421635153108537e-06, "loss": 0.2085, "num_input_tokens_seen": 24471280, "step": 40150 }, { "epoch": 12.458889233633261, "grad_norm": 3.6317241191864014, "learning_rate": 3.740853292174637e-06, "loss": 0.2119, "num_input_tokens_seen": 24474096, "step": 40155 }, { "epoch": 12.460440583307477, "grad_norm": 3.1509850025177, "learning_rate": 3.739543161348784e-06, "loss": 0.2354, "num_input_tokens_seen": 24476336, "step": 40160 }, { "epoch": 12.461991932981695, "grad_norm": 2.2366671562194824, "learning_rate": 3.7382331229293435e-06, "loss": 0.203, "num_input_tokens_seen": 24478704, "step": 40165 }, { "epoch": 12.46354328265591, "grad_norm": 3.3941996097564697, "learning_rate": 3.7369231770123572e-06, "loss": 0.203, "num_input_tokens_seen": 24483344, "step": 40170 }, { "epoch": 12.465094632330127, "grad_norm": 1.966341495513916, "learning_rate": 3.7356133236938585e-06, "loss": 0.207, "num_input_tokens_seen": 24485936, "step": 40175 }, { "epoch": 12.466645982004344, "grad_norm": 2.5453758239746094, "learning_rate": 3.7343035630698754e-06, "loss": 0.2244, "num_input_tokens_seen": 24488080, "step": 40180 }, { "epoch": 12.46819733167856, "grad_norm": 3.1640501022338867, "learning_rate": 3.73299389523643e-06, "loss": 0.202, "num_input_tokens_seen": 24491792, "step": 40185 }, { "epoch": 12.469748681352776, "grad_norm": 2.863687515258789, "learning_rate": 3.7316843202895346e-06, "loss": 0.1915, "num_input_tokens_seen": 24494672, "step": 40190 }, { "epoch": 12.471300031026994, "grad_norm": 4.4513630867004395, "learning_rate": 3.7303748383251987e-06, "loss": 0.2487, "num_input_tokens_seen": 24497872, "step": 40195 }, { "epoch": 12.47285138070121, "grad_norm": 4.7565412521362305, "learning_rate": 3.7290654494394207e-06, "loss": 0.2436, "num_input_tokens_seen": 24501968, "step": 40200 }, { "epoch": 12.474402730375427, "grad_norm": 2.5589160919189453, "learning_rate": 3.7277561537281957e-06, "loss": 0.2, "num_input_tokens_seen": 24504944, "step": 40205 }, { "epoch": 12.475954080049643, "grad_norm": 5.285217761993408, "learning_rate": 3.7264469512875107e-06, "loss": 0.238, "num_input_tokens_seen": 24507824, "step": 40210 }, { "epoch": 12.477505429723859, "grad_norm": 4.379031658172607, "learning_rate": 3.7251378422133455e-06, "loss": 0.2261, "num_input_tokens_seen": 24510192, "step": 40215 }, { "epoch": 12.479056779398077, "grad_norm": 2.01051664352417, "learning_rate": 3.723828826601672e-06, "loss": 0.2002, "num_input_tokens_seen": 24512848, "step": 40220 }, { "epoch": 12.480608129072293, "grad_norm": 7.746349811553955, "learning_rate": 3.722519904548459e-06, "loss": 0.2267, "num_input_tokens_seen": 24515600, "step": 40225 }, { "epoch": 12.48215947874651, "grad_norm": 4.481527328491211, "learning_rate": 3.721211076149664e-06, "loss": 0.2158, "num_input_tokens_seen": 24518416, "step": 40230 }, { "epoch": 12.483710828420726, "grad_norm": 4.348636150360107, "learning_rate": 3.7199023415012403e-06, "loss": 0.2448, "num_input_tokens_seen": 24522000, "step": 40235 }, { "epoch": 12.485262178094942, "grad_norm": 2.42948842048645, "learning_rate": 3.7185937006991337e-06, "loss": 0.2134, "num_input_tokens_seen": 24528112, "step": 40240 }, { "epoch": 12.48681352776916, "grad_norm": 1.6540368795394897, "learning_rate": 3.717285153839283e-06, "loss": 0.2203, "num_input_tokens_seen": 24530352, "step": 40245 }, { "epoch": 12.488364877443376, "grad_norm": 4.9587297439575195, "learning_rate": 3.7159767010176197e-06, "loss": 0.2046, "num_input_tokens_seen": 24533200, "step": 40250 }, { "epoch": 12.489916227117593, "grad_norm": 1.728401780128479, "learning_rate": 3.7146683423300696e-06, "loss": 0.2104, "num_input_tokens_seen": 24536688, "step": 40255 }, { "epoch": 12.491467576791809, "grad_norm": 2.810697317123413, "learning_rate": 3.7133600778725496e-06, "loss": 0.2852, "num_input_tokens_seen": 24540880, "step": 40260 }, { "epoch": 12.493018926466025, "grad_norm": 2.9655330181121826, "learning_rate": 3.7120519077409727e-06, "loss": 0.1955, "num_input_tokens_seen": 24543440, "step": 40265 }, { "epoch": 12.494570276140243, "grad_norm": 3.184870719909668, "learning_rate": 3.710743832031243e-06, "loss": 0.2469, "num_input_tokens_seen": 24545776, "step": 40270 }, { "epoch": 12.496121625814459, "grad_norm": 3.63885498046875, "learning_rate": 3.709435850839257e-06, "loss": 0.2118, "num_input_tokens_seen": 24548304, "step": 40275 }, { "epoch": 12.497672975488674, "grad_norm": 3.371116876602173, "learning_rate": 3.7081279642609064e-06, "loss": 0.2415, "num_input_tokens_seen": 24551248, "step": 40280 }, { "epoch": 12.499224325162892, "grad_norm": 4.061430931091309, "learning_rate": 3.706820172392074e-06, "loss": 0.2136, "num_input_tokens_seen": 24554544, "step": 40285 }, { "epoch": 12.500775674837108, "grad_norm": 4.8022332191467285, "learning_rate": 3.705512475328636e-06, "loss": 0.2355, "num_input_tokens_seen": 24557456, "step": 40290 }, { "epoch": 12.502327024511326, "grad_norm": 3.035557270050049, "learning_rate": 3.7042048731664626e-06, "loss": 0.172, "num_input_tokens_seen": 24560464, "step": 40295 }, { "epoch": 12.503878374185541, "grad_norm": 4.770817756652832, "learning_rate": 3.702897366001417e-06, "loss": 0.2086, "num_input_tokens_seen": 24564240, "step": 40300 }, { "epoch": 12.505429723859757, "grad_norm": 7.039117813110352, "learning_rate": 3.701589953929354e-06, "loss": 0.2747, "num_input_tokens_seen": 24567344, "step": 40305 }, { "epoch": 12.506981073533975, "grad_norm": 1.6552866697311401, "learning_rate": 3.700282637046123e-06, "loss": 0.1974, "num_input_tokens_seen": 24569648, "step": 40310 }, { "epoch": 12.508532423208191, "grad_norm": 2.1812307834625244, "learning_rate": 3.6989754154475654e-06, "loss": 0.1885, "num_input_tokens_seen": 24572624, "step": 40315 }, { "epoch": 12.510083772882407, "grad_norm": 2.614044427871704, "learning_rate": 3.6976682892295157e-06, "loss": 0.2495, "num_input_tokens_seen": 24575824, "step": 40320 }, { "epoch": 12.511635122556624, "grad_norm": 3.4433040618896484, "learning_rate": 3.6963612584878035e-06, "loss": 0.1906, "num_input_tokens_seen": 24579728, "step": 40325 }, { "epoch": 12.51318647223084, "grad_norm": 3.9088187217712402, "learning_rate": 3.695054323318248e-06, "loss": 0.2347, "num_input_tokens_seen": 24583056, "step": 40330 }, { "epoch": 12.514737821905058, "grad_norm": 1.503030776977539, "learning_rate": 3.6937474838166637e-06, "loss": 0.1944, "num_input_tokens_seen": 24585904, "step": 40335 }, { "epoch": 12.516289171579274, "grad_norm": 5.060538291931152, "learning_rate": 3.692440740078857e-06, "loss": 0.1826, "num_input_tokens_seen": 24590128, "step": 40340 }, { "epoch": 12.51784052125349, "grad_norm": 3.7206339836120605, "learning_rate": 3.691134092200628e-06, "loss": 0.1987, "num_input_tokens_seen": 24594128, "step": 40345 }, { "epoch": 12.519391870927707, "grad_norm": 2.3934366703033447, "learning_rate": 3.6898275402777694e-06, "loss": 0.2306, "num_input_tokens_seen": 24596560, "step": 40350 }, { "epoch": 12.520943220601923, "grad_norm": 2.5633647441864014, "learning_rate": 3.688521084406067e-06, "loss": 0.1862, "num_input_tokens_seen": 24600272, "step": 40355 }, { "epoch": 12.522494570276141, "grad_norm": 3.054701805114746, "learning_rate": 3.6872147246812983e-06, "loss": 0.1833, "num_input_tokens_seen": 24603504, "step": 40360 }, { "epoch": 12.524045919950357, "grad_norm": 5.03877067565918, "learning_rate": 3.685908461199237e-06, "loss": 0.2039, "num_input_tokens_seen": 24607088, "step": 40365 }, { "epoch": 12.525597269624573, "grad_norm": 3.038703203201294, "learning_rate": 3.684602294055647e-06, "loss": 0.2075, "num_input_tokens_seen": 24609584, "step": 40370 }, { "epoch": 12.52714861929879, "grad_norm": 3.6477272510528564, "learning_rate": 3.6832962233462843e-06, "loss": 0.189, "num_input_tokens_seen": 24613008, "step": 40375 }, { "epoch": 12.528699968973006, "grad_norm": 1.8809863328933716, "learning_rate": 3.6819902491669004e-06, "loss": 0.1768, "num_input_tokens_seen": 24615152, "step": 40380 }, { "epoch": 12.530251318647224, "grad_norm": 5.932974815368652, "learning_rate": 3.6806843716132395e-06, "loss": 0.2144, "num_input_tokens_seen": 24618896, "step": 40385 }, { "epoch": 12.53180266832144, "grad_norm": 4.438814640045166, "learning_rate": 3.6793785907810376e-06, "loss": 0.1943, "num_input_tokens_seen": 24623152, "step": 40390 }, { "epoch": 12.533354017995656, "grad_norm": 5.869485855102539, "learning_rate": 3.6780729067660225e-06, "loss": 0.2277, "num_input_tokens_seen": 24626192, "step": 40395 }, { "epoch": 12.534905367669873, "grad_norm": 2.930996894836426, "learning_rate": 3.6767673196639186e-06, "loss": 0.2141, "num_input_tokens_seen": 24628336, "step": 40400 }, { "epoch": 12.53645671734409, "grad_norm": 6.326830863952637, "learning_rate": 3.6754618295704387e-06, "loss": 0.2471, "num_input_tokens_seen": 24630640, "step": 40405 }, { "epoch": 12.538008067018305, "grad_norm": 4.14627742767334, "learning_rate": 3.674156436581292e-06, "loss": 0.221, "num_input_tokens_seen": 24633104, "step": 40410 }, { "epoch": 12.539559416692523, "grad_norm": 7.405834197998047, "learning_rate": 3.6728511407921785e-06, "loss": 0.2397, "num_input_tokens_seen": 24635632, "step": 40415 }, { "epoch": 12.541110766366739, "grad_norm": 3.9832541942596436, "learning_rate": 3.671545942298792e-06, "loss": 0.2235, "num_input_tokens_seen": 24638800, "step": 40420 }, { "epoch": 12.542662116040956, "grad_norm": 5.175686836242676, "learning_rate": 3.67024084119682e-06, "loss": 0.2386, "num_input_tokens_seen": 24641200, "step": 40425 }, { "epoch": 12.544213465715172, "grad_norm": 3.328655242919922, "learning_rate": 3.6689358375819404e-06, "loss": 0.2626, "num_input_tokens_seen": 24644272, "step": 40430 }, { "epoch": 12.545764815389388, "grad_norm": 4.812005519866943, "learning_rate": 3.667630931549826e-06, "loss": 0.2692, "num_input_tokens_seen": 24648208, "step": 40435 }, { "epoch": 12.547316165063606, "grad_norm": 2.3193111419677734, "learning_rate": 3.666326123196141e-06, "loss": 0.2123, "num_input_tokens_seen": 24651024, "step": 40440 }, { "epoch": 12.548867514737822, "grad_norm": 4.5057902336120605, "learning_rate": 3.6650214126165458e-06, "loss": 0.177, "num_input_tokens_seen": 24654704, "step": 40445 }, { "epoch": 12.550418864412038, "grad_norm": 4.956503868103027, "learning_rate": 3.6637167999066893e-06, "loss": 0.2638, "num_input_tokens_seen": 24657136, "step": 40450 }, { "epoch": 12.551970214086255, "grad_norm": 5.313846588134766, "learning_rate": 3.6624122851622147e-06, "loss": 0.2334, "num_input_tokens_seen": 24660528, "step": 40455 }, { "epoch": 12.553521563760471, "grad_norm": 5.025276184082031, "learning_rate": 3.6611078684787593e-06, "loss": 0.229, "num_input_tokens_seen": 24662864, "step": 40460 }, { "epoch": 12.555072913434689, "grad_norm": 5.034143447875977, "learning_rate": 3.6598035499519525e-06, "loss": 0.255, "num_input_tokens_seen": 24665968, "step": 40465 }, { "epoch": 12.556624263108905, "grad_norm": 2.5454602241516113, "learning_rate": 3.6584993296774152e-06, "loss": 0.2261, "num_input_tokens_seen": 24668720, "step": 40470 }, { "epoch": 12.55817561278312, "grad_norm": 4.426941871643066, "learning_rate": 3.6571952077507634e-06, "loss": 0.2262, "num_input_tokens_seen": 24671760, "step": 40475 }, { "epoch": 12.559726962457338, "grad_norm": 4.297659873962402, "learning_rate": 3.6558911842676038e-06, "loss": 0.1816, "num_input_tokens_seen": 24675600, "step": 40480 }, { "epoch": 12.561278312131554, "grad_norm": 2.988826036453247, "learning_rate": 3.6545872593235367e-06, "loss": 0.209, "num_input_tokens_seen": 24677968, "step": 40485 }, { "epoch": 12.562829661805772, "grad_norm": 3.357696056365967, "learning_rate": 3.6532834330141554e-06, "loss": 0.1992, "num_input_tokens_seen": 24680720, "step": 40490 }, { "epoch": 12.564381011479988, "grad_norm": 2.417243003845215, "learning_rate": 3.651979705435047e-06, "loss": 0.1919, "num_input_tokens_seen": 24684528, "step": 40495 }, { "epoch": 12.565932361154204, "grad_norm": 4.877452850341797, "learning_rate": 3.650676076681787e-06, "loss": 0.231, "num_input_tokens_seen": 24686800, "step": 40500 }, { "epoch": 12.567483710828421, "grad_norm": 3.0406253337860107, "learning_rate": 3.649372546849951e-06, "loss": 0.2361, "num_input_tokens_seen": 24689776, "step": 40505 }, { "epoch": 12.569035060502637, "grad_norm": 5.653176307678223, "learning_rate": 3.6480691160351002e-06, "loss": 0.2299, "num_input_tokens_seen": 24692432, "step": 40510 }, { "epoch": 12.570586410176855, "grad_norm": 3.5503108501434326, "learning_rate": 3.6467657843327933e-06, "loss": 0.2269, "num_input_tokens_seen": 24695088, "step": 40515 }, { "epoch": 12.57213775985107, "grad_norm": 4.378805637359619, "learning_rate": 3.6454625518385777e-06, "loss": 0.2145, "num_input_tokens_seen": 24699216, "step": 40520 }, { "epoch": 12.573689109525287, "grad_norm": 2.2242045402526855, "learning_rate": 3.6441594186479986e-06, "loss": 0.2122, "num_input_tokens_seen": 24703888, "step": 40525 }, { "epoch": 12.575240459199504, "grad_norm": 4.199162483215332, "learning_rate": 3.642856384856589e-06, "loss": 0.2274, "num_input_tokens_seen": 24706640, "step": 40530 }, { "epoch": 12.57679180887372, "grad_norm": 5.8803815841674805, "learning_rate": 3.6415534505598777e-06, "loss": 0.2049, "num_input_tokens_seen": 24709744, "step": 40535 }, { "epoch": 12.578343158547936, "grad_norm": 2.8329572677612305, "learning_rate": 3.6402506158533846e-06, "loss": 0.1865, "num_input_tokens_seen": 24712080, "step": 40540 }, { "epoch": 12.579894508222154, "grad_norm": 3.218217372894287, "learning_rate": 3.6389478808326233e-06, "loss": 0.2641, "num_input_tokens_seen": 24715216, "step": 40545 }, { "epoch": 12.58144585789637, "grad_norm": 2.061728000640869, "learning_rate": 3.6376452455931e-06, "loss": 0.2218, "num_input_tokens_seen": 24717520, "step": 40550 }, { "epoch": 12.582997207570587, "grad_norm": 7.084788799285889, "learning_rate": 3.6363427102303124e-06, "loss": 0.2353, "num_input_tokens_seen": 24720304, "step": 40555 }, { "epoch": 12.584548557244803, "grad_norm": 3.9316463470458984, "learning_rate": 3.635040274839751e-06, "loss": 0.238, "num_input_tokens_seen": 24724240, "step": 40560 }, { "epoch": 12.586099906919019, "grad_norm": 3.596176862716675, "learning_rate": 3.6337379395169024e-06, "loss": 0.1706, "num_input_tokens_seen": 24727152, "step": 40565 }, { "epoch": 12.587651256593237, "grad_norm": 3.541428565979004, "learning_rate": 3.632435704357242e-06, "loss": 0.2172, "num_input_tokens_seen": 24729776, "step": 40570 }, { "epoch": 12.589202606267452, "grad_norm": 3.8208045959472656, "learning_rate": 3.631133569456239e-06, "loss": 0.1807, "num_input_tokens_seen": 24731920, "step": 40575 }, { "epoch": 12.590753955941668, "grad_norm": 2.7582030296325684, "learning_rate": 3.6298315349093545e-06, "loss": 0.1937, "num_input_tokens_seen": 24735056, "step": 40580 }, { "epoch": 12.592305305615886, "grad_norm": 2.6901679039001465, "learning_rate": 3.628529600812044e-06, "loss": 0.2092, "num_input_tokens_seen": 24737488, "step": 40585 }, { "epoch": 12.593856655290102, "grad_norm": 4.48542594909668, "learning_rate": 3.6272277672597543e-06, "loss": 0.2162, "num_input_tokens_seen": 24740400, "step": 40590 }, { "epoch": 12.59540800496432, "grad_norm": 5.156680583953857, "learning_rate": 3.6259260343479252e-06, "loss": 0.229, "num_input_tokens_seen": 24744176, "step": 40595 }, { "epoch": 12.596959354638535, "grad_norm": 3.4585886001586914, "learning_rate": 3.6246244021719902e-06, "loss": 0.2115, "num_input_tokens_seen": 24746608, "step": 40600 }, { "epoch": 12.598510704312751, "grad_norm": 2.3698840141296387, "learning_rate": 3.6233228708273723e-06, "loss": 0.1707, "num_input_tokens_seen": 24749904, "step": 40605 }, { "epoch": 12.600062053986969, "grad_norm": 6.450839042663574, "learning_rate": 3.6220214404094905e-06, "loss": 0.2563, "num_input_tokens_seen": 24753456, "step": 40610 }, { "epoch": 12.601613403661185, "grad_norm": 2.543131113052368, "learning_rate": 3.6207201110137547e-06, "loss": 0.2204, "num_input_tokens_seen": 24755920, "step": 40615 }, { "epoch": 12.603164753335403, "grad_norm": 3.1856532096862793, "learning_rate": 3.6194188827355673e-06, "loss": 0.2321, "num_input_tokens_seen": 24758448, "step": 40620 }, { "epoch": 12.604716103009618, "grad_norm": 2.543337345123291, "learning_rate": 3.618117755670325e-06, "loss": 0.2052, "num_input_tokens_seen": 24761584, "step": 40625 }, { "epoch": 12.606267452683834, "grad_norm": 3.860203742980957, "learning_rate": 3.616816729913416e-06, "loss": 0.179, "num_input_tokens_seen": 24766288, "step": 40630 }, { "epoch": 12.607818802358052, "grad_norm": 2.6131880283355713, "learning_rate": 3.615515805560219e-06, "loss": 0.256, "num_input_tokens_seen": 24769904, "step": 40635 }, { "epoch": 12.609370152032268, "grad_norm": 3.1976356506347656, "learning_rate": 3.6142149827061087e-06, "loss": 0.1893, "num_input_tokens_seen": 24773136, "step": 40640 }, { "epoch": 12.610921501706486, "grad_norm": 3.3995983600616455, "learning_rate": 3.6129142614464496e-06, "loss": 0.2017, "num_input_tokens_seen": 24775760, "step": 40645 }, { "epoch": 12.612472851380701, "grad_norm": 4.863757133483887, "learning_rate": 3.6116136418766017e-06, "loss": 0.2319, "num_input_tokens_seen": 24778192, "step": 40650 }, { "epoch": 12.614024201054917, "grad_norm": 3.644608974456787, "learning_rate": 3.6103131240919142e-06, "loss": 0.2132, "num_input_tokens_seen": 24781296, "step": 40655 }, { "epoch": 12.615575550729135, "grad_norm": 3.9098012447357178, "learning_rate": 3.6090127081877313e-06, "loss": 0.23, "num_input_tokens_seen": 24783792, "step": 40660 }, { "epoch": 12.61712690040335, "grad_norm": 2.522033452987671, "learning_rate": 3.6077123942593882e-06, "loss": 0.202, "num_input_tokens_seen": 24786256, "step": 40665 }, { "epoch": 12.618678250077567, "grad_norm": 2.3669240474700928, "learning_rate": 3.6064121824022147e-06, "loss": 0.2077, "num_input_tokens_seen": 24791216, "step": 40670 }, { "epoch": 12.620229599751784, "grad_norm": 3.537492275238037, "learning_rate": 3.6051120727115303e-06, "loss": 0.2071, "num_input_tokens_seen": 24793808, "step": 40675 }, { "epoch": 12.621780949426, "grad_norm": 4.655874252319336, "learning_rate": 3.6038120652826474e-06, "loss": 0.2456, "num_input_tokens_seen": 24796496, "step": 40680 }, { "epoch": 12.623332299100218, "grad_norm": 4.639011859893799, "learning_rate": 3.6025121602108747e-06, "loss": 0.2224, "num_input_tokens_seen": 24799344, "step": 40685 }, { "epoch": 12.624883648774434, "grad_norm": 2.8573849201202393, "learning_rate": 3.60121235759151e-06, "loss": 0.2065, "num_input_tokens_seen": 24803024, "step": 40690 }, { "epoch": 12.62643499844865, "grad_norm": 3.0851364135742188, "learning_rate": 3.599912657519843e-06, "loss": 0.2211, "num_input_tokens_seen": 24806672, "step": 40695 }, { "epoch": 12.627986348122867, "grad_norm": 3.4735500812530518, "learning_rate": 3.5986130600911578e-06, "loss": 0.2178, "num_input_tokens_seen": 24809168, "step": 40700 }, { "epoch": 12.629537697797083, "grad_norm": 2.9107143878936768, "learning_rate": 3.5973135654007307e-06, "loss": 0.1965, "num_input_tokens_seen": 24811760, "step": 40705 }, { "epoch": 12.6310890474713, "grad_norm": 2.759181261062622, "learning_rate": 3.596014173543829e-06, "loss": 0.2107, "num_input_tokens_seen": 24814192, "step": 40710 }, { "epoch": 12.632640397145517, "grad_norm": 3.4888081550598145, "learning_rate": 3.594714884615712e-06, "loss": 0.2466, "num_input_tokens_seen": 24817040, "step": 40715 }, { "epoch": 12.634191746819733, "grad_norm": 3.086841344833374, "learning_rate": 3.5934156987116374e-06, "loss": 0.2328, "num_input_tokens_seen": 24820080, "step": 40720 }, { "epoch": 12.63574309649395, "grad_norm": 5.066248893737793, "learning_rate": 3.5921166159268486e-06, "loss": 0.2425, "num_input_tokens_seen": 24822160, "step": 40725 }, { "epoch": 12.637294446168166, "grad_norm": 5.490870952606201, "learning_rate": 3.5908176363565827e-06, "loss": 0.216, "num_input_tokens_seen": 24825456, "step": 40730 }, { "epoch": 12.638845795842382, "grad_norm": 3.5664150714874268, "learning_rate": 3.5895187600960726e-06, "loss": 0.1976, "num_input_tokens_seen": 24828080, "step": 40735 }, { "epoch": 12.6403971455166, "grad_norm": 8.00304889678955, "learning_rate": 3.5882199872405387e-06, "loss": 0.2421, "num_input_tokens_seen": 24831600, "step": 40740 }, { "epoch": 12.641948495190816, "grad_norm": 3.1289963722229004, "learning_rate": 3.586921317885199e-06, "loss": 0.248, "num_input_tokens_seen": 24834640, "step": 40745 }, { "epoch": 12.643499844865033, "grad_norm": 3.965846538543701, "learning_rate": 3.5856227521252596e-06, "loss": 0.2132, "num_input_tokens_seen": 24837680, "step": 40750 }, { "epoch": 12.64505119453925, "grad_norm": 2.954643964767456, "learning_rate": 3.5843242900559206e-06, "loss": 0.2316, "num_input_tokens_seen": 24840048, "step": 40755 }, { "epoch": 12.646602544213465, "grad_norm": 4.463082313537598, "learning_rate": 3.583025931772376e-06, "loss": 0.2227, "num_input_tokens_seen": 24842288, "step": 40760 }, { "epoch": 12.648153893887683, "grad_norm": 3.4382786750793457, "learning_rate": 3.5817276773698094e-06, "loss": 0.2407, "num_input_tokens_seen": 24845520, "step": 40765 }, { "epoch": 12.649705243561899, "grad_norm": 4.371575832366943, "learning_rate": 3.5804295269433984e-06, "loss": 0.198, "num_input_tokens_seen": 24848944, "step": 40770 }, { "epoch": 12.651256593236116, "grad_norm": 4.429008960723877, "learning_rate": 3.5791314805883144e-06, "loss": 0.2337, "num_input_tokens_seen": 24852784, "step": 40775 }, { "epoch": 12.652807942910332, "grad_norm": 5.327347755432129, "learning_rate": 3.577833538399718e-06, "loss": 0.1931, "num_input_tokens_seen": 24858032, "step": 40780 }, { "epoch": 12.654359292584548, "grad_norm": 2.1109883785247803, "learning_rate": 3.5765357004727645e-06, "loss": 0.2111, "num_input_tokens_seen": 24860816, "step": 40785 }, { "epoch": 12.655910642258766, "grad_norm": 4.1452226638793945, "learning_rate": 3.5752379669026004e-06, "loss": 0.2119, "num_input_tokens_seen": 24863792, "step": 40790 }, { "epoch": 12.657461991932982, "grad_norm": 4.413462162017822, "learning_rate": 3.573940337784365e-06, "loss": 0.2357, "num_input_tokens_seen": 24866320, "step": 40795 }, { "epoch": 12.659013341607197, "grad_norm": 3.05694580078125, "learning_rate": 3.5726428132131902e-06, "loss": 0.2103, "num_input_tokens_seen": 24869616, "step": 40800 }, { "epoch": 12.660564691281415, "grad_norm": 4.041589736938477, "learning_rate": 3.5713453932841997e-06, "loss": 0.2249, "num_input_tokens_seen": 24872272, "step": 40805 }, { "epoch": 12.662116040955631, "grad_norm": 5.308594703674316, "learning_rate": 3.5700480780925094e-06, "loss": 0.2335, "num_input_tokens_seen": 24875536, "step": 40810 }, { "epoch": 12.663667390629849, "grad_norm": 1.6858450174331665, "learning_rate": 3.5687508677332284e-06, "loss": 0.1842, "num_input_tokens_seen": 24878288, "step": 40815 }, { "epoch": 12.665218740304065, "grad_norm": 1.0497338771820068, "learning_rate": 3.5674537623014564e-06, "loss": 0.19, "num_input_tokens_seen": 24882704, "step": 40820 }, { "epoch": 12.66677008997828, "grad_norm": 3.585184097290039, "learning_rate": 3.5661567618922887e-06, "loss": 0.1852, "num_input_tokens_seen": 24885296, "step": 40825 }, { "epoch": 12.668321439652498, "grad_norm": 1.5953291654586792, "learning_rate": 3.564859866600808e-06, "loss": 0.2175, "num_input_tokens_seen": 24888208, "step": 40830 }, { "epoch": 12.669872789326714, "grad_norm": 2.548473596572876, "learning_rate": 3.5635630765220945e-06, "loss": 0.1893, "num_input_tokens_seen": 24891248, "step": 40835 }, { "epoch": 12.67142413900093, "grad_norm": 4.323270797729492, "learning_rate": 3.5622663917512178e-06, "loss": 0.1991, "num_input_tokens_seen": 24894288, "step": 40840 }, { "epoch": 12.672975488675148, "grad_norm": 4.591459274291992, "learning_rate": 3.5609698123832397e-06, "loss": 0.2267, "num_input_tokens_seen": 24897200, "step": 40845 }, { "epoch": 12.674526838349363, "grad_norm": 3.8101465702056885, "learning_rate": 3.559673338513215e-06, "loss": 0.2723, "num_input_tokens_seen": 24900368, "step": 40850 }, { "epoch": 12.676078188023581, "grad_norm": 4.270156383514404, "learning_rate": 3.5583769702361907e-06, "loss": 0.2637, "num_input_tokens_seen": 24903664, "step": 40855 }, { "epoch": 12.677629537697797, "grad_norm": 4.396214008331299, "learning_rate": 3.557080707647206e-06, "loss": 0.2421, "num_input_tokens_seen": 24906896, "step": 40860 }, { "epoch": 12.679180887372013, "grad_norm": 2.92887282371521, "learning_rate": 3.5557845508412926e-06, "loss": 0.2146, "num_input_tokens_seen": 24909680, "step": 40865 }, { "epoch": 12.68073223704623, "grad_norm": 3.5829567909240723, "learning_rate": 3.554488499913473e-06, "loss": 0.1744, "num_input_tokens_seen": 24913264, "step": 40870 }, { "epoch": 12.682283586720446, "grad_norm": 1.7055261135101318, "learning_rate": 3.553192554958764e-06, "loss": 0.2265, "num_input_tokens_seen": 24916368, "step": 40875 }, { "epoch": 12.683834936394664, "grad_norm": 3.876735210418701, "learning_rate": 3.551896716072173e-06, "loss": 0.2298, "num_input_tokens_seen": 24919280, "step": 40880 }, { "epoch": 12.68538628606888, "grad_norm": 4.104383945465088, "learning_rate": 3.550600983348701e-06, "loss": 0.2499, "num_input_tokens_seen": 24922352, "step": 40885 }, { "epoch": 12.686937635743096, "grad_norm": 5.389847755432129, "learning_rate": 3.54930535688334e-06, "loss": 0.2577, "num_input_tokens_seen": 24925296, "step": 40890 }, { "epoch": 12.688488985417314, "grad_norm": 2.101299285888672, "learning_rate": 3.548009836771076e-06, "loss": 0.2005, "num_input_tokens_seen": 24927728, "step": 40895 }, { "epoch": 12.69004033509153, "grad_norm": 4.003356456756592, "learning_rate": 3.546714423106884e-06, "loss": 0.2546, "num_input_tokens_seen": 24930736, "step": 40900 }, { "epoch": 12.691591684765747, "grad_norm": 4.600131034851074, "learning_rate": 3.5454191159857354e-06, "loss": 0.2483, "num_input_tokens_seen": 24932912, "step": 40905 }, { "epoch": 12.693143034439963, "grad_norm": 6.300399303436279, "learning_rate": 3.54412391550259e-06, "loss": 0.2302, "num_input_tokens_seen": 24935920, "step": 40910 }, { "epoch": 12.694694384114179, "grad_norm": 3.366580009460449, "learning_rate": 3.542828821752402e-06, "loss": 0.2577, "num_input_tokens_seen": 24939152, "step": 40915 }, { "epoch": 12.696245733788396, "grad_norm": 1.4714512825012207, "learning_rate": 3.5415338348301164e-06, "loss": 0.2116, "num_input_tokens_seen": 24941040, "step": 40920 }, { "epoch": 12.697797083462612, "grad_norm": 4.4577956199646, "learning_rate": 3.540238954830672e-06, "loss": 0.2433, "num_input_tokens_seen": 24943632, "step": 40925 }, { "epoch": 12.699348433136828, "grad_norm": 5.20736026763916, "learning_rate": 3.5389441818489983e-06, "loss": 0.2287, "num_input_tokens_seen": 24946480, "step": 40930 }, { "epoch": 12.700899782811046, "grad_norm": 1.657042384147644, "learning_rate": 3.537649515980017e-06, "loss": 0.2228, "num_input_tokens_seen": 24949360, "step": 40935 }, { "epoch": 12.702451132485262, "grad_norm": 2.7627639770507812, "learning_rate": 3.536354957318644e-06, "loss": 0.1994, "num_input_tokens_seen": 24951536, "step": 40940 }, { "epoch": 12.70400248215948, "grad_norm": 3.459969997406006, "learning_rate": 3.535060505959784e-06, "loss": 0.2253, "num_input_tokens_seen": 24955024, "step": 40945 }, { "epoch": 12.705553831833695, "grad_norm": 3.839684247970581, "learning_rate": 3.5337661619983354e-06, "loss": 0.2078, "num_input_tokens_seen": 24957936, "step": 40950 }, { "epoch": 12.707105181507911, "grad_norm": 6.223212242126465, "learning_rate": 3.5324719255291916e-06, "loss": 0.2578, "num_input_tokens_seen": 24961264, "step": 40955 }, { "epoch": 12.708656531182129, "grad_norm": 2.341013193130493, "learning_rate": 3.5311777966472332e-06, "loss": 0.2303, "num_input_tokens_seen": 24963184, "step": 40960 }, { "epoch": 12.710207880856345, "grad_norm": 5.2121758460998535, "learning_rate": 3.529883775447336e-06, "loss": 0.2105, "num_input_tokens_seen": 24965648, "step": 40965 }, { "epoch": 12.71175923053056, "grad_norm": 5.764769554138184, "learning_rate": 3.5285898620243664e-06, "loss": 0.2647, "num_input_tokens_seen": 24967824, "step": 40970 }, { "epoch": 12.713310580204778, "grad_norm": 4.323554992675781, "learning_rate": 3.527296056473185e-06, "loss": 0.2087, "num_input_tokens_seen": 24970320, "step": 40975 }, { "epoch": 12.714861929878994, "grad_norm": 1.9147700071334839, "learning_rate": 3.526002358888641e-06, "loss": 0.2567, "num_input_tokens_seen": 24973072, "step": 40980 }, { "epoch": 12.716413279553212, "grad_norm": 5.397760391235352, "learning_rate": 3.524708769365579e-06, "loss": 0.2265, "num_input_tokens_seen": 24976048, "step": 40985 }, { "epoch": 12.717964629227428, "grad_norm": 4.351668357849121, "learning_rate": 3.523415287998835e-06, "loss": 0.2302, "num_input_tokens_seen": 24978864, "step": 40990 }, { "epoch": 12.719515978901644, "grad_norm": 2.48223614692688, "learning_rate": 3.5221219148832353e-06, "loss": 0.203, "num_input_tokens_seen": 24982096, "step": 40995 }, { "epoch": 12.721067328575861, "grad_norm": 3.2675974369049072, "learning_rate": 3.5208286501136e-06, "loss": 0.252, "num_input_tokens_seen": 24986512, "step": 41000 }, { "epoch": 12.722618678250077, "grad_norm": 4.740145206451416, "learning_rate": 3.5195354937847403e-06, "loss": 0.2868, "num_input_tokens_seen": 24989232, "step": 41005 }, { "epoch": 12.724170027924295, "grad_norm": 1.8945740461349487, "learning_rate": 3.51824244599146e-06, "loss": 0.2016, "num_input_tokens_seen": 24992112, "step": 41010 }, { "epoch": 12.72572137759851, "grad_norm": 2.4681200981140137, "learning_rate": 3.5169495068285553e-06, "loss": 0.1854, "num_input_tokens_seen": 24994896, "step": 41015 }, { "epoch": 12.727272727272727, "grad_norm": 3.6605446338653564, "learning_rate": 3.515656676390814e-06, "loss": 0.2501, "num_input_tokens_seen": 24997744, "step": 41020 }, { "epoch": 12.728824076946944, "grad_norm": 2.068227529525757, "learning_rate": 3.514363954773016e-06, "loss": 0.191, "num_input_tokens_seen": 25000176, "step": 41025 }, { "epoch": 12.73037542662116, "grad_norm": 4.4988694190979, "learning_rate": 3.5130713420699314e-06, "loss": 0.2158, "num_input_tokens_seen": 25002704, "step": 41030 }, { "epoch": 12.731926776295378, "grad_norm": 2.869694709777832, "learning_rate": 3.5117788383763262e-06, "loss": 0.209, "num_input_tokens_seen": 25006896, "step": 41035 }, { "epoch": 12.733478125969594, "grad_norm": 2.5534839630126953, "learning_rate": 3.5104864437869556e-06, "loss": 0.1961, "num_input_tokens_seen": 25009392, "step": 41040 }, { "epoch": 12.73502947564381, "grad_norm": 4.077064037322998, "learning_rate": 3.5091941583965673e-06, "loss": 0.2185, "num_input_tokens_seen": 25012848, "step": 41045 }, { "epoch": 12.736580825318027, "grad_norm": 3.068056106567383, "learning_rate": 3.5079019822999e-06, "loss": 0.1971, "num_input_tokens_seen": 25016144, "step": 41050 }, { "epoch": 12.738132174992243, "grad_norm": 2.523679733276367, "learning_rate": 3.5066099155916865e-06, "loss": 0.2006, "num_input_tokens_seen": 25019312, "step": 41055 }, { "epoch": 12.739683524666459, "grad_norm": 3.5743300914764404, "learning_rate": 3.505317958366651e-06, "loss": 0.1832, "num_input_tokens_seen": 25022544, "step": 41060 }, { "epoch": 12.741234874340677, "grad_norm": 1.7553352117538452, "learning_rate": 3.5040261107195095e-06, "loss": 0.2647, "num_input_tokens_seen": 25025648, "step": 41065 }, { "epoch": 12.742786224014893, "grad_norm": 6.215160846710205, "learning_rate": 3.502734372744967e-06, "loss": 0.2309, "num_input_tokens_seen": 25029616, "step": 41070 }, { "epoch": 12.74433757368911, "grad_norm": 1.9137818813323975, "learning_rate": 3.501442744537727e-06, "loss": 0.1819, "num_input_tokens_seen": 25032464, "step": 41075 }, { "epoch": 12.745888923363326, "grad_norm": 2.2972846031188965, "learning_rate": 3.5001512261924788e-06, "loss": 0.2039, "num_input_tokens_seen": 25035312, "step": 41080 }, { "epoch": 12.747440273037542, "grad_norm": 1.9462037086486816, "learning_rate": 3.498859817803907e-06, "loss": 0.2483, "num_input_tokens_seen": 25037456, "step": 41085 }, { "epoch": 12.74899162271176, "grad_norm": 1.9824259281158447, "learning_rate": 3.4975685194666864e-06, "loss": 0.2352, "num_input_tokens_seen": 25040240, "step": 41090 }, { "epoch": 12.750542972385976, "grad_norm": 4.033705711364746, "learning_rate": 3.496277331275485e-06, "loss": 0.2022, "num_input_tokens_seen": 25042704, "step": 41095 }, { "epoch": 12.752094322060191, "grad_norm": 1.945011019706726, "learning_rate": 3.4949862533249625e-06, "loss": 0.2412, "num_input_tokens_seen": 25045328, "step": 41100 }, { "epoch": 12.753645671734409, "grad_norm": 5.752238750457764, "learning_rate": 3.4936952857097686e-06, "loss": 0.2383, "num_input_tokens_seen": 25047824, "step": 41105 }, { "epoch": 12.755197021408625, "grad_norm": 3.3312456607818604, "learning_rate": 3.4924044285245482e-06, "loss": 0.2432, "num_input_tokens_seen": 25051344, "step": 41110 }, { "epoch": 12.756748371082843, "grad_norm": 1.389783263206482, "learning_rate": 3.491113681863936e-06, "loss": 0.1948, "num_input_tokens_seen": 25054160, "step": 41115 }, { "epoch": 12.758299720757059, "grad_norm": 4.045492172241211, "learning_rate": 3.489823045822558e-06, "loss": 0.2475, "num_input_tokens_seen": 25057104, "step": 41120 }, { "epoch": 12.759851070431274, "grad_norm": 2.6306087970733643, "learning_rate": 3.4885325204950344e-06, "loss": 0.2067, "num_input_tokens_seen": 25060720, "step": 41125 }, { "epoch": 12.761402420105492, "grad_norm": 5.647575378417969, "learning_rate": 3.4872421059759742e-06, "loss": 0.2395, "num_input_tokens_seen": 25063248, "step": 41130 }, { "epoch": 12.762953769779708, "grad_norm": 2.8496954441070557, "learning_rate": 3.4859518023599827e-06, "loss": 0.2607, "num_input_tokens_seen": 25066576, "step": 41135 }, { "epoch": 12.764505119453926, "grad_norm": 1.690847635269165, "learning_rate": 3.484661609741653e-06, "loss": 0.2055, "num_input_tokens_seen": 25069552, "step": 41140 }, { "epoch": 12.766056469128142, "grad_norm": 2.5044755935668945, "learning_rate": 3.4833715282155716e-06, "loss": 0.214, "num_input_tokens_seen": 25072336, "step": 41145 }, { "epoch": 12.767607818802357, "grad_norm": 4.943988800048828, "learning_rate": 3.4820815578763166e-06, "loss": 0.2174, "num_input_tokens_seen": 25074832, "step": 41150 }, { "epoch": 12.769159168476575, "grad_norm": 3.2819724082946777, "learning_rate": 3.4807916988184587e-06, "loss": 0.2031, "num_input_tokens_seen": 25077392, "step": 41155 }, { "epoch": 12.770710518150791, "grad_norm": 3.0501933097839355, "learning_rate": 3.479501951136559e-06, "loss": 0.2094, "num_input_tokens_seen": 25079824, "step": 41160 }, { "epoch": 12.772261867825009, "grad_norm": 6.988368034362793, "learning_rate": 3.4782123149251717e-06, "loss": 0.2409, "num_input_tokens_seen": 25082352, "step": 41165 }, { "epoch": 12.773813217499224, "grad_norm": 5.362554550170898, "learning_rate": 3.476922790278843e-06, "loss": 0.2145, "num_input_tokens_seen": 25085424, "step": 41170 }, { "epoch": 12.77536456717344, "grad_norm": 3.1842904090881348, "learning_rate": 3.475633377292109e-06, "loss": 0.1939, "num_input_tokens_seen": 25087728, "step": 41175 }, { "epoch": 12.776915916847658, "grad_norm": 4.975244998931885, "learning_rate": 3.474344076059499e-06, "loss": 0.2105, "num_input_tokens_seen": 25090960, "step": 41180 }, { "epoch": 12.778467266521874, "grad_norm": 2.1486668586730957, "learning_rate": 3.4730548866755366e-06, "loss": 0.2038, "num_input_tokens_seen": 25094832, "step": 41185 }, { "epoch": 12.78001861619609, "grad_norm": 2.9874093532562256, "learning_rate": 3.47176580923473e-06, "loss": 0.2894, "num_input_tokens_seen": 25097616, "step": 41190 }, { "epoch": 12.781569965870307, "grad_norm": 4.182459354400635, "learning_rate": 3.470476843831588e-06, "loss": 0.2305, "num_input_tokens_seen": 25099920, "step": 41195 }, { "epoch": 12.783121315544523, "grad_norm": 8.40274429321289, "learning_rate": 3.4691879905606062e-06, "loss": 0.2136, "num_input_tokens_seen": 25103728, "step": 41200 }, { "epoch": 12.784672665218741, "grad_norm": 4.414653778076172, "learning_rate": 3.467899249516272e-06, "loss": 0.2275, "num_input_tokens_seen": 25107280, "step": 41205 }, { "epoch": 12.786224014892957, "grad_norm": 4.524346351623535, "learning_rate": 3.466610620793065e-06, "loss": 0.1755, "num_input_tokens_seen": 25110544, "step": 41210 }, { "epoch": 12.787775364567173, "grad_norm": 2.800797700881958, "learning_rate": 3.465322104485458e-06, "loss": 0.2082, "num_input_tokens_seen": 25113264, "step": 41215 }, { "epoch": 12.78932671424139, "grad_norm": 4.460442543029785, "learning_rate": 3.4640337006879145e-06, "loss": 0.2412, "num_input_tokens_seen": 25116784, "step": 41220 }, { "epoch": 12.790878063915606, "grad_norm": 2.6690711975097656, "learning_rate": 3.4627454094948885e-06, "loss": 0.2199, "num_input_tokens_seen": 25119408, "step": 41225 }, { "epoch": 12.792429413589822, "grad_norm": 1.9462553262710571, "learning_rate": 3.4614572310008286e-06, "loss": 0.184, "num_input_tokens_seen": 25121968, "step": 41230 }, { "epoch": 12.79398076326404, "grad_norm": 7.438624858856201, "learning_rate": 3.460169165300172e-06, "loss": 0.2451, "num_input_tokens_seen": 25124944, "step": 41235 }, { "epoch": 12.795532112938256, "grad_norm": 6.367615222930908, "learning_rate": 3.4588812124873506e-06, "loss": 0.258, "num_input_tokens_seen": 25127728, "step": 41240 }, { "epoch": 12.797083462612473, "grad_norm": 5.673043251037598, "learning_rate": 3.457593372656785e-06, "loss": 0.2058, "num_input_tokens_seen": 25130960, "step": 41245 }, { "epoch": 12.79863481228669, "grad_norm": 4.016584396362305, "learning_rate": 3.45630564590289e-06, "loss": 0.2048, "num_input_tokens_seen": 25134672, "step": 41250 }, { "epoch": 12.800186161960905, "grad_norm": 4.613254070281982, "learning_rate": 3.455018032320071e-06, "loss": 0.2046, "num_input_tokens_seen": 25137360, "step": 41255 }, { "epoch": 12.801737511635123, "grad_norm": 4.8870625495910645, "learning_rate": 3.453730532002727e-06, "loss": 0.2497, "num_input_tokens_seen": 25140624, "step": 41260 }, { "epoch": 12.803288861309339, "grad_norm": 4.656395435333252, "learning_rate": 3.4524431450452446e-06, "loss": 0.2064, "num_input_tokens_seen": 25143152, "step": 41265 }, { "epoch": 12.804840210983556, "grad_norm": 5.239128589630127, "learning_rate": 3.4511558715420056e-06, "loss": 0.2106, "num_input_tokens_seen": 25145968, "step": 41270 }, { "epoch": 12.806391560657772, "grad_norm": 4.615206241607666, "learning_rate": 3.4498687115873825e-06, "loss": 0.2317, "num_input_tokens_seen": 25150128, "step": 41275 }, { "epoch": 12.807942910331988, "grad_norm": 1.845747709274292, "learning_rate": 3.448581665275739e-06, "loss": 0.1892, "num_input_tokens_seen": 25152912, "step": 41280 }, { "epoch": 12.809494260006206, "grad_norm": 4.0686774253845215, "learning_rate": 3.447294732701431e-06, "loss": 0.2482, "num_input_tokens_seen": 25155760, "step": 41285 }, { "epoch": 12.811045609680422, "grad_norm": 4.295971870422363, "learning_rate": 3.446007913958806e-06, "loss": 0.1859, "num_input_tokens_seen": 25158576, "step": 41290 }, { "epoch": 12.81259695935464, "grad_norm": 4.486368656158447, "learning_rate": 3.444721209142201e-06, "loss": 0.2066, "num_input_tokens_seen": 25162352, "step": 41295 }, { "epoch": 12.814148309028855, "grad_norm": 4.960416316986084, "learning_rate": 3.44343461834595e-06, "loss": 0.2427, "num_input_tokens_seen": 25168304, "step": 41300 }, { "epoch": 12.815699658703071, "grad_norm": 2.0609872341156006, "learning_rate": 3.442148141664375e-06, "loss": 0.242, "num_input_tokens_seen": 25171984, "step": 41305 }, { "epoch": 12.817251008377289, "grad_norm": 2.84997820854187, "learning_rate": 3.440861779191788e-06, "loss": 0.2305, "num_input_tokens_seen": 25175440, "step": 41310 }, { "epoch": 12.818802358051505, "grad_norm": 2.2203431129455566, "learning_rate": 3.439575531022496e-06, "loss": 0.182, "num_input_tokens_seen": 25178096, "step": 41315 }, { "epoch": 12.82035370772572, "grad_norm": 3.167144536972046, "learning_rate": 3.4382893972507956e-06, "loss": 0.1959, "num_input_tokens_seen": 25181200, "step": 41320 }, { "epoch": 12.821905057399938, "grad_norm": 3.144730806350708, "learning_rate": 3.437003377970976e-06, "loss": 0.2104, "num_input_tokens_seen": 25183504, "step": 41325 }, { "epoch": 12.823456407074154, "grad_norm": 5.147494792938232, "learning_rate": 3.4357174732773175e-06, "loss": 0.1927, "num_input_tokens_seen": 25186544, "step": 41330 }, { "epoch": 12.825007756748372, "grad_norm": 5.728090286254883, "learning_rate": 3.4344316832640913e-06, "loss": 0.2055, "num_input_tokens_seen": 25189904, "step": 41335 }, { "epoch": 12.826559106422588, "grad_norm": 16.709211349487305, "learning_rate": 3.433146008025562e-06, "loss": 0.228, "num_input_tokens_seen": 25194192, "step": 41340 }, { "epoch": 12.828110456096804, "grad_norm": 2.7421634197235107, "learning_rate": 3.4318604476559846e-06, "loss": 0.2085, "num_input_tokens_seen": 25197840, "step": 41345 }, { "epoch": 12.829661805771021, "grad_norm": 2.97640061378479, "learning_rate": 3.4305750022496066e-06, "loss": 0.1792, "num_input_tokens_seen": 25200304, "step": 41350 }, { "epoch": 12.831213155445237, "grad_norm": 3.161127805709839, "learning_rate": 3.4292896719006656e-06, "loss": 0.2214, "num_input_tokens_seen": 25203312, "step": 41355 }, { "epoch": 12.832764505119453, "grad_norm": 2.9527828693389893, "learning_rate": 3.428004456703392e-06, "loss": 0.2143, "num_input_tokens_seen": 25206000, "step": 41360 }, { "epoch": 12.83431585479367, "grad_norm": 2.565098762512207, "learning_rate": 3.426719356752006e-06, "loss": 0.2109, "num_input_tokens_seen": 25208816, "step": 41365 }, { "epoch": 12.835867204467887, "grad_norm": 5.198522090911865, "learning_rate": 3.4254343721407223e-06, "loss": 0.2373, "num_input_tokens_seen": 25212368, "step": 41370 }, { "epoch": 12.837418554142104, "grad_norm": 2.591963052749634, "learning_rate": 3.4241495029637445e-06, "loss": 0.2199, "num_input_tokens_seen": 25215696, "step": 41375 }, { "epoch": 12.83896990381632, "grad_norm": 2.8249030113220215, "learning_rate": 3.422864749315269e-06, "loss": 0.2553, "num_input_tokens_seen": 25219152, "step": 41380 }, { "epoch": 12.840521253490536, "grad_norm": 2.5858495235443115, "learning_rate": 3.4215801112894836e-06, "loss": 0.2267, "num_input_tokens_seen": 25221200, "step": 41385 }, { "epoch": 12.842072603164754, "grad_norm": 3.7874457836151123, "learning_rate": 3.420295588980567e-06, "loss": 0.2151, "num_input_tokens_seen": 25223920, "step": 41390 }, { "epoch": 12.84362395283897, "grad_norm": 4.811378002166748, "learning_rate": 3.41901118248269e-06, "loss": 0.2455, "num_input_tokens_seen": 25226512, "step": 41395 }, { "epoch": 12.845175302513187, "grad_norm": 2.284694194793701, "learning_rate": 3.4177268918900154e-06, "loss": 0.1937, "num_input_tokens_seen": 25229872, "step": 41400 }, { "epoch": 12.846726652187403, "grad_norm": 4.62371301651001, "learning_rate": 3.4164427172966964e-06, "loss": 0.1943, "num_input_tokens_seen": 25232944, "step": 41405 }, { "epoch": 12.848278001861619, "grad_norm": 5.9161152839660645, "learning_rate": 3.415158658796879e-06, "loss": 0.2512, "num_input_tokens_seen": 25235248, "step": 41410 }, { "epoch": 12.849829351535837, "grad_norm": 6.305355548858643, "learning_rate": 3.4138747164846987e-06, "loss": 0.1959, "num_input_tokens_seen": 25238928, "step": 41415 }, { "epoch": 12.851380701210052, "grad_norm": 5.370597839355469, "learning_rate": 3.4125908904542836e-06, "loss": 0.2179, "num_input_tokens_seen": 25241776, "step": 41420 }, { "epoch": 12.85293205088427, "grad_norm": 4.171870231628418, "learning_rate": 3.411307180799755e-06, "loss": 0.2348, "num_input_tokens_seen": 25243920, "step": 41425 }, { "epoch": 12.854483400558486, "grad_norm": 3.6939663887023926, "learning_rate": 3.4100235876152226e-06, "loss": 0.2142, "num_input_tokens_seen": 25245968, "step": 41430 }, { "epoch": 12.856034750232702, "grad_norm": 3.2417001724243164, "learning_rate": 3.4087401109947895e-06, "loss": 0.1603, "num_input_tokens_seen": 25249744, "step": 41435 }, { "epoch": 12.85758609990692, "grad_norm": 2.6832375526428223, "learning_rate": 3.4074567510325497e-06, "loss": 0.1962, "num_input_tokens_seen": 25251984, "step": 41440 }, { "epoch": 12.859137449581135, "grad_norm": 2.2591307163238525, "learning_rate": 3.4061735078225887e-06, "loss": 0.2402, "num_input_tokens_seen": 25255408, "step": 41445 }, { "epoch": 12.860688799255351, "grad_norm": 5.329441547393799, "learning_rate": 3.404890381458983e-06, "loss": 0.1696, "num_input_tokens_seen": 25258800, "step": 41450 }, { "epoch": 12.862240148929569, "grad_norm": 5.410491466522217, "learning_rate": 3.4036073720358e-06, "loss": 0.2323, "num_input_tokens_seen": 25261168, "step": 41455 }, { "epoch": 12.863791498603785, "grad_norm": 4.145399570465088, "learning_rate": 3.4023244796471022e-06, "loss": 0.1742, "num_input_tokens_seen": 25264112, "step": 41460 }, { "epoch": 12.865342848278003, "grad_norm": 4.529770374298096, "learning_rate": 3.4010417043869393e-06, "loss": 0.204, "num_input_tokens_seen": 25266352, "step": 41465 }, { "epoch": 12.866894197952218, "grad_norm": 4.589536190032959, "learning_rate": 3.3997590463493536e-06, "loss": 0.2233, "num_input_tokens_seen": 25268560, "step": 41470 }, { "epoch": 12.868445547626434, "grad_norm": 6.355828762054443, "learning_rate": 3.39847650562838e-06, "loss": 0.2717, "num_input_tokens_seen": 25271376, "step": 41475 }, { "epoch": 12.869996897300652, "grad_norm": 5.521440505981445, "learning_rate": 3.397194082318043e-06, "loss": 0.2378, "num_input_tokens_seen": 25273968, "step": 41480 }, { "epoch": 12.871548246974868, "grad_norm": 2.2775440216064453, "learning_rate": 3.39591177651236e-06, "loss": 0.1799, "num_input_tokens_seen": 25277616, "step": 41485 }, { "epoch": 12.873099596649084, "grad_norm": 10.156018257141113, "learning_rate": 3.394629588305339e-06, "loss": 0.1919, "num_input_tokens_seen": 25280816, "step": 41490 }, { "epoch": 12.874650946323301, "grad_norm": 4.806637287139893, "learning_rate": 3.3933475177909794e-06, "loss": 0.2293, "num_input_tokens_seen": 25285424, "step": 41495 }, { "epoch": 12.876202295997517, "grad_norm": 5.393820762634277, "learning_rate": 3.392065565063273e-06, "loss": 0.234, "num_input_tokens_seen": 25288112, "step": 41500 }, { "epoch": 12.877753645671735, "grad_norm": 2.09122633934021, "learning_rate": 3.3907837302162004e-06, "loss": 0.2331, "num_input_tokens_seen": 25291504, "step": 41505 }, { "epoch": 12.87930499534595, "grad_norm": 3.8716204166412354, "learning_rate": 3.3895020133437373e-06, "loss": 0.265, "num_input_tokens_seen": 25294704, "step": 41510 }, { "epoch": 12.880856345020167, "grad_norm": 2.0933074951171875, "learning_rate": 3.388220414539847e-06, "loss": 0.1867, "num_input_tokens_seen": 25298448, "step": 41515 }, { "epoch": 12.882407694694384, "grad_norm": 3.4596006870269775, "learning_rate": 3.386938933898487e-06, "loss": 0.2044, "num_input_tokens_seen": 25301136, "step": 41520 }, { "epoch": 12.8839590443686, "grad_norm": 1.882298231124878, "learning_rate": 3.3856575715136043e-06, "loss": 0.2274, "num_input_tokens_seen": 25304208, "step": 41525 }, { "epoch": 12.885510394042818, "grad_norm": 2.6734516620635986, "learning_rate": 3.3843763274791385e-06, "loss": 0.3087, "num_input_tokens_seen": 25306832, "step": 41530 }, { "epoch": 12.887061743717034, "grad_norm": 4.414937496185303, "learning_rate": 3.3830952018890205e-06, "loss": 0.2138, "num_input_tokens_seen": 25309840, "step": 41535 }, { "epoch": 12.88861309339125, "grad_norm": 5.077075004577637, "learning_rate": 3.381814194837171e-06, "loss": 0.2149, "num_input_tokens_seen": 25312336, "step": 41540 }, { "epoch": 12.890164443065467, "grad_norm": 3.746177911758423, "learning_rate": 3.380533306417503e-06, "loss": 0.1918, "num_input_tokens_seen": 25315408, "step": 41545 }, { "epoch": 12.891715792739683, "grad_norm": 2.1040797233581543, "learning_rate": 3.3792525367239203e-06, "loss": 0.2035, "num_input_tokens_seen": 25318192, "step": 41550 }, { "epoch": 12.893267142413901, "grad_norm": 5.319515228271484, "learning_rate": 3.3779718858503195e-06, "loss": 0.2446, "num_input_tokens_seen": 25322448, "step": 41555 }, { "epoch": 12.894818492088117, "grad_norm": 6.2232818603515625, "learning_rate": 3.3766913538905875e-06, "loss": 0.202, "num_input_tokens_seen": 25325808, "step": 41560 }, { "epoch": 12.896369841762333, "grad_norm": 3.5500171184539795, "learning_rate": 3.3754109409386025e-06, "loss": 0.1932, "num_input_tokens_seen": 25328400, "step": 41565 }, { "epoch": 12.89792119143655, "grad_norm": 1.7396129369735718, "learning_rate": 3.374130647088233e-06, "loss": 0.2506, "num_input_tokens_seen": 25331408, "step": 41570 }, { "epoch": 12.899472541110766, "grad_norm": 5.338269233703613, "learning_rate": 3.372850472433339e-06, "loss": 0.214, "num_input_tokens_seen": 25334512, "step": 41575 }, { "epoch": 12.901023890784982, "grad_norm": 3.205887794494629, "learning_rate": 3.3715704170677755e-06, "loss": 0.2192, "num_input_tokens_seen": 25337360, "step": 41580 }, { "epoch": 12.9025752404592, "grad_norm": 2.4966611862182617, "learning_rate": 3.3702904810853833e-06, "loss": 0.2401, "num_input_tokens_seen": 25340048, "step": 41585 }, { "epoch": 12.904126590133416, "grad_norm": 5.382419109344482, "learning_rate": 3.3690106645799973e-06, "loss": 0.2392, "num_input_tokens_seen": 25342992, "step": 41590 }, { "epoch": 12.905677939807633, "grad_norm": 3.7337841987609863, "learning_rate": 3.367730967645444e-06, "loss": 0.2268, "num_input_tokens_seen": 25345808, "step": 41595 }, { "epoch": 12.90722928948185, "grad_norm": 3.272453546524048, "learning_rate": 3.366451390375539e-06, "loss": 0.1581, "num_input_tokens_seen": 25348752, "step": 41600 }, { "epoch": 12.908780639156065, "grad_norm": 5.980327606201172, "learning_rate": 3.365171932864092e-06, "loss": 0.2255, "num_input_tokens_seen": 25352688, "step": 41605 }, { "epoch": 12.910331988830283, "grad_norm": 4.413150310516357, "learning_rate": 3.363892595204902e-06, "loss": 0.166, "num_input_tokens_seen": 25356272, "step": 41610 }, { "epoch": 12.911883338504499, "grad_norm": 3.825137138366699, "learning_rate": 3.362613377491758e-06, "loss": 0.2097, "num_input_tokens_seen": 25361840, "step": 41615 }, { "epoch": 12.913434688178716, "grad_norm": 3.087204694747925, "learning_rate": 3.3613342798184433e-06, "loss": 0.225, "num_input_tokens_seen": 25364880, "step": 41620 }, { "epoch": 12.914986037852932, "grad_norm": 4.497349262237549, "learning_rate": 3.3600553022787307e-06, "loss": 0.204, "num_input_tokens_seen": 25367600, "step": 41625 }, { "epoch": 12.916537387527148, "grad_norm": 3.7879366874694824, "learning_rate": 3.358776444966384e-06, "loss": 0.2057, "num_input_tokens_seen": 25371344, "step": 41630 }, { "epoch": 12.918088737201366, "grad_norm": 3.4690234661102295, "learning_rate": 3.357497707975158e-06, "loss": 0.1936, "num_input_tokens_seen": 25376432, "step": 41635 }, { "epoch": 12.919640086875582, "grad_norm": 10.648796081542969, "learning_rate": 3.3562190913988014e-06, "loss": 0.2479, "num_input_tokens_seen": 25379152, "step": 41640 }, { "epoch": 12.921191436549798, "grad_norm": 3.9797489643096924, "learning_rate": 3.354940595331051e-06, "loss": 0.2267, "num_input_tokens_seen": 25382384, "step": 41645 }, { "epoch": 12.922742786224015, "grad_norm": 2.069789409637451, "learning_rate": 3.353662219865635e-06, "loss": 0.2064, "num_input_tokens_seen": 25385552, "step": 41650 }, { "epoch": 12.924294135898231, "grad_norm": 5.341648101806641, "learning_rate": 3.352383965096274e-06, "loss": 0.2102, "num_input_tokens_seen": 25388336, "step": 41655 }, { "epoch": 12.925845485572449, "grad_norm": 3.417494297027588, "learning_rate": 3.351105831116679e-06, "loss": 0.2141, "num_input_tokens_seen": 25391216, "step": 41660 }, { "epoch": 12.927396835246665, "grad_norm": 2.480332612991333, "learning_rate": 3.349827818020552e-06, "loss": 0.2462, "num_input_tokens_seen": 25393296, "step": 41665 }, { "epoch": 12.92894818492088, "grad_norm": 5.73521089553833, "learning_rate": 3.348549925901587e-06, "loss": 0.2192, "num_input_tokens_seen": 25397008, "step": 41670 }, { "epoch": 12.930499534595098, "grad_norm": 2.733114242553711, "learning_rate": 3.347272154853469e-06, "loss": 0.2259, "num_input_tokens_seen": 25399376, "step": 41675 }, { "epoch": 12.932050884269314, "grad_norm": 1.8034133911132812, "learning_rate": 3.345994504969873e-06, "loss": 0.2164, "num_input_tokens_seen": 25402064, "step": 41680 }, { "epoch": 12.933602233943532, "grad_norm": 5.105835437774658, "learning_rate": 3.3447169763444655e-06, "loss": 0.236, "num_input_tokens_seen": 25405136, "step": 41685 }, { "epoch": 12.935153583617748, "grad_norm": 11.820592880249023, "learning_rate": 3.343439569070906e-06, "loss": 0.1818, "num_input_tokens_seen": 25408208, "step": 41690 }, { "epoch": 12.936704933291963, "grad_norm": 5.533421993255615, "learning_rate": 3.3421622832428414e-06, "loss": 0.2289, "num_input_tokens_seen": 25411024, "step": 41695 }, { "epoch": 12.938256282966181, "grad_norm": 3.9157865047454834, "learning_rate": 3.3408851189539138e-06, "loss": 0.2039, "num_input_tokens_seen": 25414192, "step": 41700 }, { "epoch": 12.939807632640397, "grad_norm": 4.387993335723877, "learning_rate": 3.3396080762977544e-06, "loss": 0.2632, "num_input_tokens_seen": 25416720, "step": 41705 }, { "epoch": 12.941358982314613, "grad_norm": 3.7853074073791504, "learning_rate": 3.3383311553679853e-06, "loss": 0.231, "num_input_tokens_seen": 25419632, "step": 41710 }, { "epoch": 12.94291033198883, "grad_norm": 3.1992685794830322, "learning_rate": 3.337054356258219e-06, "loss": 0.1885, "num_input_tokens_seen": 25422256, "step": 41715 }, { "epoch": 12.944461681663046, "grad_norm": 2.9032599925994873, "learning_rate": 3.3357776790620603e-06, "loss": 0.2058, "num_input_tokens_seen": 25425392, "step": 41720 }, { "epoch": 12.946013031337264, "grad_norm": 2.966914653778076, "learning_rate": 3.3345011238731063e-06, "loss": 0.1946, "num_input_tokens_seen": 25428080, "step": 41725 }, { "epoch": 12.94756438101148, "grad_norm": 5.078427314758301, "learning_rate": 3.333224690784941e-06, "loss": 0.2315, "num_input_tokens_seen": 25431184, "step": 41730 }, { "epoch": 12.949115730685696, "grad_norm": 1.7148971557617188, "learning_rate": 3.3319483798911444e-06, "loss": 0.2315, "num_input_tokens_seen": 25433488, "step": 41735 }, { "epoch": 12.950667080359914, "grad_norm": 1.439753532409668, "learning_rate": 3.3306721912852834e-06, "loss": 0.1952, "num_input_tokens_seen": 25436112, "step": 41740 }, { "epoch": 12.95221843003413, "grad_norm": 4.453672409057617, "learning_rate": 3.3293961250609195e-06, "loss": 0.2231, "num_input_tokens_seen": 25438864, "step": 41745 }, { "epoch": 12.953769779708347, "grad_norm": 4.182398796081543, "learning_rate": 3.3281201813116016e-06, "loss": 0.2429, "num_input_tokens_seen": 25442704, "step": 41750 }, { "epoch": 12.955321129382563, "grad_norm": 5.784118175506592, "learning_rate": 3.326844360130873e-06, "loss": 0.2514, "num_input_tokens_seen": 25445520, "step": 41755 }, { "epoch": 12.956872479056779, "grad_norm": 2.1393990516662598, "learning_rate": 3.325568661612265e-06, "loss": 0.2182, "num_input_tokens_seen": 25449904, "step": 41760 }, { "epoch": 12.958423828730997, "grad_norm": 3.866140127182007, "learning_rate": 3.3242930858493033e-06, "loss": 0.2121, "num_input_tokens_seen": 25454192, "step": 41765 }, { "epoch": 12.959975178405212, "grad_norm": 4.168385982513428, "learning_rate": 3.323017632935501e-06, "loss": 0.2081, "num_input_tokens_seen": 25457040, "step": 41770 }, { "epoch": 12.961526528079428, "grad_norm": 4.656042098999023, "learning_rate": 3.321742302964366e-06, "loss": 0.1749, "num_input_tokens_seen": 25460112, "step": 41775 }, { "epoch": 12.963077877753646, "grad_norm": 3.581427574157715, "learning_rate": 3.3204670960293928e-06, "loss": 0.2052, "num_input_tokens_seen": 25463152, "step": 41780 }, { "epoch": 12.964629227427862, "grad_norm": 2.419252634048462, "learning_rate": 3.3191920122240704e-06, "loss": 0.2025, "num_input_tokens_seen": 25466160, "step": 41785 }, { "epoch": 12.96618057710208, "grad_norm": 1.7759509086608887, "learning_rate": 3.3179170516418766e-06, "loss": 0.2185, "num_input_tokens_seen": 25470128, "step": 41790 }, { "epoch": 12.967731926776295, "grad_norm": 3.507322311401367, "learning_rate": 3.316642214376283e-06, "loss": 0.2098, "num_input_tokens_seen": 25472688, "step": 41795 }, { "epoch": 12.969283276450511, "grad_norm": 1.9077640771865845, "learning_rate": 3.3153675005207486e-06, "loss": 0.1797, "num_input_tokens_seen": 25475216, "step": 41800 }, { "epoch": 12.970834626124729, "grad_norm": 4.1537017822265625, "learning_rate": 3.314092910168725e-06, "loss": 0.1559, "num_input_tokens_seen": 25478832, "step": 41805 }, { "epoch": 12.972385975798945, "grad_norm": 3.346576452255249, "learning_rate": 3.3128184434136557e-06, "loss": 0.1831, "num_input_tokens_seen": 25483024, "step": 41810 }, { "epoch": 12.973937325473162, "grad_norm": 4.312479019165039, "learning_rate": 3.3115441003489736e-06, "loss": 0.2297, "num_input_tokens_seen": 25485168, "step": 41815 }, { "epoch": 12.975488675147378, "grad_norm": 3.611668825149536, "learning_rate": 3.3102698810681012e-06, "loss": 0.2268, "num_input_tokens_seen": 25488368, "step": 41820 }, { "epoch": 12.977040024821594, "grad_norm": 2.666982650756836, "learning_rate": 3.3089957856644582e-06, "loss": 0.2231, "num_input_tokens_seen": 25491344, "step": 41825 }, { "epoch": 12.978591374495812, "grad_norm": 3.511244297027588, "learning_rate": 3.307721814231448e-06, "loss": 0.2234, "num_input_tokens_seen": 25493648, "step": 41830 }, { "epoch": 12.980142724170028, "grad_norm": 4.927073001861572, "learning_rate": 3.3064479668624684e-06, "loss": 0.2308, "num_input_tokens_seen": 25496688, "step": 41835 }, { "epoch": 12.981694073844244, "grad_norm": 2.7091243267059326, "learning_rate": 3.3051742436509084e-06, "loss": 0.193, "num_input_tokens_seen": 25499760, "step": 41840 }, { "epoch": 12.983245423518461, "grad_norm": 4.384381294250488, "learning_rate": 3.3039006446901443e-06, "loss": 0.2147, "num_input_tokens_seen": 25502224, "step": 41845 }, { "epoch": 12.984796773192677, "grad_norm": 4.373982906341553, "learning_rate": 3.302627170073549e-06, "loss": 0.1915, "num_input_tokens_seen": 25505072, "step": 41850 }, { "epoch": 12.986348122866895, "grad_norm": 3.8925366401672363, "learning_rate": 3.301353819894482e-06, "loss": 0.1912, "num_input_tokens_seen": 25508016, "step": 41855 }, { "epoch": 12.98789947254111, "grad_norm": 5.451152801513672, "learning_rate": 3.3000805942462957e-06, "loss": 0.2405, "num_input_tokens_seen": 25510480, "step": 41860 }, { "epoch": 12.989450822215327, "grad_norm": 4.083855628967285, "learning_rate": 3.29880749322233e-06, "loss": 0.2189, "num_input_tokens_seen": 25513104, "step": 41865 }, { "epoch": 12.991002171889544, "grad_norm": 3.1318604946136475, "learning_rate": 3.2975345169159196e-06, "loss": 0.2538, "num_input_tokens_seen": 25516176, "step": 41870 }, { "epoch": 12.99255352156376, "grad_norm": 4.592020511627197, "learning_rate": 3.2962616654203903e-06, "loss": 0.2052, "num_input_tokens_seen": 25518480, "step": 41875 }, { "epoch": 12.994104871237978, "grad_norm": 2.0557861328125, "learning_rate": 3.2949889388290573e-06, "loss": 0.2381, "num_input_tokens_seen": 25521904, "step": 41880 }, { "epoch": 12.995656220912194, "grad_norm": 4.075932025909424, "learning_rate": 3.2937163372352255e-06, "loss": 0.1978, "num_input_tokens_seen": 25524144, "step": 41885 }, { "epoch": 12.99720757058641, "grad_norm": 8.704463958740234, "learning_rate": 3.292443860732191e-06, "loss": 0.2434, "num_input_tokens_seen": 25527664, "step": 41890 }, { "epoch": 12.998758920260627, "grad_norm": 7.260952949523926, "learning_rate": 3.2911715094132418e-06, "loss": 0.2633, "num_input_tokens_seen": 25531216, "step": 41895 }, { "epoch": 13.000310269934843, "grad_norm": 3.3503220081329346, "learning_rate": 3.289899283371657e-06, "loss": 0.213, "num_input_tokens_seen": 25533920, "step": 41900 }, { "epoch": 13.001861619609059, "grad_norm": 4.84415340423584, "learning_rate": 3.2886271827007036e-06, "loss": 0.1943, "num_input_tokens_seen": 25536544, "step": 41905 }, { "epoch": 13.003412969283277, "grad_norm": 5.652287483215332, "learning_rate": 3.2873552074936456e-06, "loss": 0.2194, "num_input_tokens_seen": 25539808, "step": 41910 }, { "epoch": 13.004964318957493, "grad_norm": 3.6921751499176025, "learning_rate": 3.28608335784373e-06, "loss": 0.2104, "num_input_tokens_seen": 25543424, "step": 41915 }, { "epoch": 13.00651566863171, "grad_norm": 2.150927782058716, "learning_rate": 3.2848116338442014e-06, "loss": 0.1676, "num_input_tokens_seen": 25545920, "step": 41920 }, { "epoch": 13.008067018305926, "grad_norm": 7.034223556518555, "learning_rate": 3.2835400355882907e-06, "loss": 0.2006, "num_input_tokens_seen": 25548384, "step": 41925 }, { "epoch": 13.009618367980142, "grad_norm": 1.9750653505325317, "learning_rate": 3.282268563169221e-06, "loss": 0.1944, "num_input_tokens_seen": 25551648, "step": 41930 }, { "epoch": 13.01116971765436, "grad_norm": 3.5046284198760986, "learning_rate": 3.2809972166802073e-06, "loss": 0.1806, "num_input_tokens_seen": 25555200, "step": 41935 }, { "epoch": 13.012721067328576, "grad_norm": 2.484121084213257, "learning_rate": 3.279725996214453e-06, "loss": 0.1892, "num_input_tokens_seen": 25557152, "step": 41940 }, { "epoch": 13.014272417002793, "grad_norm": 5.916642189025879, "learning_rate": 3.2784549018651546e-06, "loss": 0.1794, "num_input_tokens_seen": 25560032, "step": 41945 }, { "epoch": 13.01582376667701, "grad_norm": 5.631254196166992, "learning_rate": 3.2771839337254975e-06, "loss": 0.2401, "num_input_tokens_seen": 25562432, "step": 41950 }, { "epoch": 13.017375116351225, "grad_norm": 2.853172779083252, "learning_rate": 3.2759130918886595e-06, "loss": 0.229, "num_input_tokens_seen": 25565376, "step": 41955 }, { "epoch": 13.018926466025443, "grad_norm": 4.475468158721924, "learning_rate": 3.2746423764478085e-06, "loss": 0.184, "num_input_tokens_seen": 25568032, "step": 41960 }, { "epoch": 13.020477815699659, "grad_norm": 10.388801574707031, "learning_rate": 3.2733717874961014e-06, "loss": 0.1824, "num_input_tokens_seen": 25570624, "step": 41965 }, { "epoch": 13.022029165373874, "grad_norm": 3.0672106742858887, "learning_rate": 3.272101325126689e-06, "loss": 0.2311, "num_input_tokens_seen": 25572992, "step": 41970 }, { "epoch": 13.023580515048092, "grad_norm": 5.1759748458862305, "learning_rate": 3.2708309894327116e-06, "loss": 0.1998, "num_input_tokens_seen": 25575680, "step": 41975 }, { "epoch": 13.025131864722308, "grad_norm": 5.071780204772949, "learning_rate": 3.2695607805072987e-06, "loss": 0.2447, "num_input_tokens_seen": 25578656, "step": 41980 }, { "epoch": 13.026683214396526, "grad_norm": 4.837684631347656, "learning_rate": 3.2682906984435726e-06, "loss": 0.2424, "num_input_tokens_seen": 25582368, "step": 41985 }, { "epoch": 13.028234564070742, "grad_norm": 6.320407867431641, "learning_rate": 3.267020743334644e-06, "loss": 0.2026, "num_input_tokens_seen": 25584960, "step": 41990 }, { "epoch": 13.029785913744957, "grad_norm": 5.137502193450928, "learning_rate": 3.2657509152736167e-06, "loss": 0.1844, "num_input_tokens_seen": 25587872, "step": 41995 }, { "epoch": 13.031337263419175, "grad_norm": 5.765028953552246, "learning_rate": 3.2644812143535842e-06, "loss": 0.1916, "num_input_tokens_seen": 25590208, "step": 42000 }, { "epoch": 13.032888613093391, "grad_norm": 12.908441543579102, "learning_rate": 3.2632116406676308e-06, "loss": 0.189, "num_input_tokens_seen": 25593696, "step": 42005 }, { "epoch": 13.034439962767609, "grad_norm": 9.12013053894043, "learning_rate": 3.26194219430883e-06, "loss": 0.1908, "num_input_tokens_seen": 25596288, "step": 42010 }, { "epoch": 13.035991312441825, "grad_norm": 6.145091533660889, "learning_rate": 3.2606728753702478e-06, "loss": 0.1828, "num_input_tokens_seen": 25599488, "step": 42015 }, { "epoch": 13.03754266211604, "grad_norm": 8.919574737548828, "learning_rate": 3.259403683944941e-06, "loss": 0.2537, "num_input_tokens_seen": 25603296, "step": 42020 }, { "epoch": 13.039094011790258, "grad_norm": 4.35010290145874, "learning_rate": 3.258134620125955e-06, "loss": 0.2052, "num_input_tokens_seen": 25606592, "step": 42025 }, { "epoch": 13.040645361464474, "grad_norm": 8.926353454589844, "learning_rate": 3.256865684006329e-06, "loss": 0.2384, "num_input_tokens_seen": 25609984, "step": 42030 }, { "epoch": 13.04219671113869, "grad_norm": 4.584693908691406, "learning_rate": 3.25559687567909e-06, "loss": 0.2334, "num_input_tokens_seen": 25612768, "step": 42035 }, { "epoch": 13.043748060812907, "grad_norm": 4.682307243347168, "learning_rate": 3.2543281952372575e-06, "loss": 0.1944, "num_input_tokens_seen": 25616192, "step": 42040 }, { "epoch": 13.045299410487123, "grad_norm": 5.363640785217285, "learning_rate": 3.2530596427738393e-06, "loss": 0.2282, "num_input_tokens_seen": 25618848, "step": 42045 }, { "epoch": 13.046850760161341, "grad_norm": 3.489898681640625, "learning_rate": 3.2517912183818366e-06, "loss": 0.2064, "num_input_tokens_seen": 25622784, "step": 42050 }, { "epoch": 13.048402109835557, "grad_norm": 2.726144313812256, "learning_rate": 3.2505229221542387e-06, "loss": 0.186, "num_input_tokens_seen": 25627360, "step": 42055 }, { "epoch": 13.049953459509773, "grad_norm": 4.375701427459717, "learning_rate": 3.2492547541840284e-06, "loss": 0.2027, "num_input_tokens_seen": 25629952, "step": 42060 }, { "epoch": 13.05150480918399, "grad_norm": 2.9545681476593018, "learning_rate": 3.2479867145641764e-06, "loss": 0.2025, "num_input_tokens_seen": 25632224, "step": 42065 }, { "epoch": 13.053056158858206, "grad_norm": 3.8174045085906982, "learning_rate": 3.246718803387645e-06, "loss": 0.1761, "num_input_tokens_seen": 25635776, "step": 42070 }, { "epoch": 13.054607508532424, "grad_norm": 11.631696701049805, "learning_rate": 3.245451020747388e-06, "loss": 0.2645, "num_input_tokens_seen": 25638368, "step": 42075 }, { "epoch": 13.05615885820664, "grad_norm": 2.7448034286499023, "learning_rate": 3.244183366736347e-06, "loss": 0.1967, "num_input_tokens_seen": 25641984, "step": 42080 }, { "epoch": 13.057710207880856, "grad_norm": 3.1330361366271973, "learning_rate": 3.242915841447457e-06, "loss": 0.1773, "num_input_tokens_seen": 25645600, "step": 42085 }, { "epoch": 13.059261557555073, "grad_norm": 7.827442169189453, "learning_rate": 3.2416484449736428e-06, "loss": 0.2019, "num_input_tokens_seen": 25648608, "step": 42090 }, { "epoch": 13.06081290722929, "grad_norm": 4.707604885101318, "learning_rate": 3.2403811774078207e-06, "loss": 0.2026, "num_input_tokens_seen": 25651072, "step": 42095 }, { "epoch": 13.062364256903505, "grad_norm": 7.140949249267578, "learning_rate": 3.2391140388428947e-06, "loss": 0.2113, "num_input_tokens_seen": 25654048, "step": 42100 }, { "epoch": 13.063915606577723, "grad_norm": 9.042706489562988, "learning_rate": 3.2378470293717623e-06, "loss": 0.208, "num_input_tokens_seen": 25656512, "step": 42105 }, { "epoch": 13.065466956251939, "grad_norm": 5.638269901275635, "learning_rate": 3.2365801490873087e-06, "loss": 0.1949, "num_input_tokens_seen": 25661376, "step": 42110 }, { "epoch": 13.067018305926156, "grad_norm": 14.4812650680542, "learning_rate": 3.2353133980824124e-06, "loss": 0.2187, "num_input_tokens_seen": 25664768, "step": 42115 }, { "epoch": 13.068569655600372, "grad_norm": 4.051340103149414, "learning_rate": 3.2340467764499415e-06, "loss": 0.2054, "num_input_tokens_seen": 25668096, "step": 42120 }, { "epoch": 13.070121005274588, "grad_norm": 3.607776641845703, "learning_rate": 3.232780284282754e-06, "loss": 0.1759, "num_input_tokens_seen": 25670464, "step": 42125 }, { "epoch": 13.071672354948806, "grad_norm": 10.054834365844727, "learning_rate": 3.2315139216736986e-06, "loss": 0.183, "num_input_tokens_seen": 25673440, "step": 42130 }, { "epoch": 13.073223704623022, "grad_norm": 3.447935104370117, "learning_rate": 3.2302476887156143e-06, "loss": 0.1828, "num_input_tokens_seen": 25676480, "step": 42135 }, { "epoch": 13.07477505429724, "grad_norm": 3.703657388687134, "learning_rate": 3.2289815855013316e-06, "loss": 0.1589, "num_input_tokens_seen": 25678944, "step": 42140 }, { "epoch": 13.076326403971455, "grad_norm": 8.87662410736084, "learning_rate": 3.2277156121236697e-06, "loss": 0.2024, "num_input_tokens_seen": 25681088, "step": 42145 }, { "epoch": 13.077877753645671, "grad_norm": 5.615598678588867, "learning_rate": 3.2264497686754413e-06, "loss": 0.175, "num_input_tokens_seen": 25685440, "step": 42150 }, { "epoch": 13.079429103319889, "grad_norm": 5.583339214324951, "learning_rate": 3.2251840552494473e-06, "loss": 0.2183, "num_input_tokens_seen": 25688608, "step": 42155 }, { "epoch": 13.080980452994105, "grad_norm": 5.108481407165527, "learning_rate": 3.2239184719384786e-06, "loss": 0.2022, "num_input_tokens_seen": 25690912, "step": 42160 }, { "epoch": 13.08253180266832, "grad_norm": 3.702458143234253, "learning_rate": 3.222653018835318e-06, "loss": 0.2707, "num_input_tokens_seen": 25693504, "step": 42165 }, { "epoch": 13.084083152342538, "grad_norm": 5.744553565979004, "learning_rate": 3.2213876960327383e-06, "loss": 0.2188, "num_input_tokens_seen": 25696512, "step": 42170 }, { "epoch": 13.085634502016754, "grad_norm": 5.437784194946289, "learning_rate": 3.2201225036235017e-06, "loss": 0.2012, "num_input_tokens_seen": 25699072, "step": 42175 }, { "epoch": 13.087185851690972, "grad_norm": 6.6950578689575195, "learning_rate": 3.218857441700363e-06, "loss": 0.1787, "num_input_tokens_seen": 25702400, "step": 42180 }, { "epoch": 13.088737201365188, "grad_norm": 3.395726442337036, "learning_rate": 3.2175925103560656e-06, "loss": 0.2156, "num_input_tokens_seen": 25704832, "step": 42185 }, { "epoch": 13.090288551039404, "grad_norm": 3.5061683654785156, "learning_rate": 3.2163277096833443e-06, "loss": 0.2294, "num_input_tokens_seen": 25707328, "step": 42190 }, { "epoch": 13.091839900713621, "grad_norm": 6.923413276672363, "learning_rate": 3.215063039774924e-06, "loss": 0.2774, "num_input_tokens_seen": 25709920, "step": 42195 }, { "epoch": 13.093391250387837, "grad_norm": 8.646105766296387, "learning_rate": 3.213798500723519e-06, "loss": 0.2256, "num_input_tokens_seen": 25712704, "step": 42200 }, { "epoch": 13.094942600062055, "grad_norm": 3.276369094848633, "learning_rate": 3.2125340926218346e-06, "loss": 0.2485, "num_input_tokens_seen": 25715008, "step": 42205 }, { "epoch": 13.09649394973627, "grad_norm": 11.80050277709961, "learning_rate": 3.211269815562569e-06, "loss": 0.1994, "num_input_tokens_seen": 25717856, "step": 42210 }, { "epoch": 13.098045299410487, "grad_norm": 2.9276154041290283, "learning_rate": 3.2100056696384086e-06, "loss": 0.1645, "num_input_tokens_seen": 25720352, "step": 42215 }, { "epoch": 13.099596649084704, "grad_norm": 5.477230072021484, "learning_rate": 3.208741654942028e-06, "loss": 0.2327, "num_input_tokens_seen": 25723328, "step": 42220 }, { "epoch": 13.10114799875892, "grad_norm": 2.471944570541382, "learning_rate": 3.2074777715660964e-06, "loss": 0.194, "num_input_tokens_seen": 25726336, "step": 42225 }, { "epoch": 13.102699348433136, "grad_norm": 1.3568265438079834, "learning_rate": 3.206214019603271e-06, "loss": 0.1488, "num_input_tokens_seen": 25729440, "step": 42230 }, { "epoch": 13.104250698107354, "grad_norm": 3.172558307647705, "learning_rate": 3.2049503991461996e-06, "loss": 0.1607, "num_input_tokens_seen": 25731968, "step": 42235 }, { "epoch": 13.10580204778157, "grad_norm": 6.417389869689941, "learning_rate": 3.20368691028752e-06, "loss": 0.2011, "num_input_tokens_seen": 25734784, "step": 42240 }, { "epoch": 13.107353397455787, "grad_norm": 6.253495216369629, "learning_rate": 3.2024235531198618e-06, "loss": 0.2765, "num_input_tokens_seen": 25738112, "step": 42245 }, { "epoch": 13.108904747130003, "grad_norm": 4.494475364685059, "learning_rate": 3.201160327735844e-06, "loss": 0.2645, "num_input_tokens_seen": 25740896, "step": 42250 }, { "epoch": 13.110456096804219, "grad_norm": 7.557804584503174, "learning_rate": 3.199897234228074e-06, "loss": 0.1945, "num_input_tokens_seen": 25743168, "step": 42255 }, { "epoch": 13.112007446478437, "grad_norm": 3.4338462352752686, "learning_rate": 3.198634272689155e-06, "loss": 0.1823, "num_input_tokens_seen": 25745472, "step": 42260 }, { "epoch": 13.113558796152653, "grad_norm": 5.3801069259643555, "learning_rate": 3.197371443211673e-06, "loss": 0.1962, "num_input_tokens_seen": 25748544, "step": 42265 }, { "epoch": 13.11511014582687, "grad_norm": 2.8283908367156982, "learning_rate": 3.1961087458882124e-06, "loss": 0.1749, "num_input_tokens_seen": 25751648, "step": 42270 }, { "epoch": 13.116661495501086, "grad_norm": 3.49814510345459, "learning_rate": 3.1948461808113416e-06, "loss": 0.1837, "num_input_tokens_seen": 25755008, "step": 42275 }, { "epoch": 13.118212845175302, "grad_norm": 4.697169780731201, "learning_rate": 3.193583748073622e-06, "loss": 0.1414, "num_input_tokens_seen": 25757600, "step": 42280 }, { "epoch": 13.11976419484952, "grad_norm": 6.1690754890441895, "learning_rate": 3.1923214477676044e-06, "loss": 0.2506, "num_input_tokens_seen": 25759840, "step": 42285 }, { "epoch": 13.121315544523735, "grad_norm": 4.957058906555176, "learning_rate": 3.1910592799858316e-06, "loss": 0.2095, "num_input_tokens_seen": 25763392, "step": 42290 }, { "epoch": 13.122866894197951, "grad_norm": 6.020869731903076, "learning_rate": 3.1897972448208335e-06, "loss": 0.2436, "num_input_tokens_seen": 25766336, "step": 42295 }, { "epoch": 13.124418243872169, "grad_norm": 3.782780647277832, "learning_rate": 3.1885353423651344e-06, "loss": 0.2528, "num_input_tokens_seen": 25769312, "step": 42300 }, { "epoch": 13.125969593546385, "grad_norm": 9.753225326538086, "learning_rate": 3.1872735727112457e-06, "loss": 0.2521, "num_input_tokens_seen": 25772032, "step": 42305 }, { "epoch": 13.127520943220603, "grad_norm": 12.736105918884277, "learning_rate": 3.1860119359516693e-06, "loss": 0.2035, "num_input_tokens_seen": 25774592, "step": 42310 }, { "epoch": 13.129072292894818, "grad_norm": 7.04803466796875, "learning_rate": 3.184750432178899e-06, "loss": 0.2474, "num_input_tokens_seen": 25777312, "step": 42315 }, { "epoch": 13.130623642569034, "grad_norm": 6.410436153411865, "learning_rate": 3.1834890614854186e-06, "loss": 0.2247, "num_input_tokens_seen": 25779456, "step": 42320 }, { "epoch": 13.132174992243252, "grad_norm": 3.61020827293396, "learning_rate": 3.182227823963698e-06, "loss": 0.1772, "num_input_tokens_seen": 25782144, "step": 42325 }, { "epoch": 13.133726341917468, "grad_norm": 3.7619118690490723, "learning_rate": 3.1809667197062067e-06, "loss": 0.2008, "num_input_tokens_seen": 25784768, "step": 42330 }, { "epoch": 13.135277691591686, "grad_norm": 7.288636684417725, "learning_rate": 3.179705748805394e-06, "loss": 0.2017, "num_input_tokens_seen": 25786688, "step": 42335 }, { "epoch": 13.136829041265901, "grad_norm": 1.9044431447982788, "learning_rate": 3.178444911353705e-06, "loss": 0.2136, "num_input_tokens_seen": 25789120, "step": 42340 }, { "epoch": 13.138380390940117, "grad_norm": 10.915130615234375, "learning_rate": 3.1771842074435753e-06, "loss": 0.2349, "num_input_tokens_seen": 25792032, "step": 42345 }, { "epoch": 13.139931740614335, "grad_norm": 4.312489986419678, "learning_rate": 3.1759236371674292e-06, "loss": 0.2368, "num_input_tokens_seen": 25794464, "step": 42350 }, { "epoch": 13.14148309028855, "grad_norm": 5.832931041717529, "learning_rate": 3.17466320061768e-06, "loss": 0.2029, "num_input_tokens_seen": 25798432, "step": 42355 }, { "epoch": 13.143034439962767, "grad_norm": 3.2133920192718506, "learning_rate": 3.173402897886733e-06, "loss": 0.2262, "num_input_tokens_seen": 25801504, "step": 42360 }, { "epoch": 13.144585789636984, "grad_norm": 8.461755752563477, "learning_rate": 3.172142729066984e-06, "loss": 0.2251, "num_input_tokens_seen": 25804736, "step": 42365 }, { "epoch": 13.1461371393112, "grad_norm": 2.4046859741210938, "learning_rate": 3.1708826942508184e-06, "loss": 0.1747, "num_input_tokens_seen": 25807040, "step": 42370 }, { "epoch": 13.147688488985418, "grad_norm": 4.4278669357299805, "learning_rate": 3.169622793530611e-06, "loss": 0.2123, "num_input_tokens_seen": 25809472, "step": 42375 }, { "epoch": 13.149239838659634, "grad_norm": 4.07051944732666, "learning_rate": 3.168363026998728e-06, "loss": 0.2311, "num_input_tokens_seen": 25813952, "step": 42380 }, { "epoch": 13.15079118833385, "grad_norm": 7.517670631408691, "learning_rate": 3.167103394747524e-06, "loss": 0.2629, "num_input_tokens_seen": 25816736, "step": 42385 }, { "epoch": 13.152342538008067, "grad_norm": 6.853754997253418, "learning_rate": 3.1658438968693466e-06, "loss": 0.2859, "num_input_tokens_seen": 25819648, "step": 42390 }, { "epoch": 13.153893887682283, "grad_norm": 4.8480329513549805, "learning_rate": 3.1645845334565316e-06, "loss": 0.1989, "num_input_tokens_seen": 25823008, "step": 42395 }, { "epoch": 13.155445237356501, "grad_norm": 8.862902641296387, "learning_rate": 3.1633253046014046e-06, "loss": 0.1936, "num_input_tokens_seen": 25825248, "step": 42400 }, { "epoch": 13.156996587030717, "grad_norm": 5.865452766418457, "learning_rate": 3.1620662103962816e-06, "loss": 0.2151, "num_input_tokens_seen": 25828096, "step": 42405 }, { "epoch": 13.158547936704933, "grad_norm": 4.566012859344482, "learning_rate": 3.1608072509334707e-06, "loss": 0.2253, "num_input_tokens_seen": 25831680, "step": 42410 }, { "epoch": 13.16009928637915, "grad_norm": 7.5397047996521, "learning_rate": 3.159548426305268e-06, "loss": 0.2453, "num_input_tokens_seen": 25833984, "step": 42415 }, { "epoch": 13.161650636053366, "grad_norm": 5.6396074295043945, "learning_rate": 3.158289736603959e-06, "loss": 0.2133, "num_input_tokens_seen": 25837632, "step": 42420 }, { "epoch": 13.163201985727582, "grad_norm": 4.463417053222656, "learning_rate": 3.1570311819218212e-06, "loss": 0.1962, "num_input_tokens_seen": 25841152, "step": 42425 }, { "epoch": 13.1647533354018, "grad_norm": 5.151064395904541, "learning_rate": 3.1557727623511227e-06, "loss": 0.2508, "num_input_tokens_seen": 25844384, "step": 42430 }, { "epoch": 13.166304685076016, "grad_norm": 9.912749290466309, "learning_rate": 3.1545144779841184e-06, "loss": 0.2333, "num_input_tokens_seen": 25847040, "step": 42435 }, { "epoch": 13.167856034750233, "grad_norm": 9.214509010314941, "learning_rate": 3.153256328913058e-06, "loss": 0.2, "num_input_tokens_seen": 25849568, "step": 42440 }, { "epoch": 13.16940738442445, "grad_norm": 4.621765613555908, "learning_rate": 3.1519983152301757e-06, "loss": 0.2386, "num_input_tokens_seen": 25853344, "step": 42445 }, { "epoch": 13.170958734098665, "grad_norm": 4.814860820770264, "learning_rate": 3.1507404370277007e-06, "loss": 0.1769, "num_input_tokens_seen": 25855648, "step": 42450 }, { "epoch": 13.172510083772883, "grad_norm": 4.359073638916016, "learning_rate": 3.1494826943978507e-06, "loss": 0.2155, "num_input_tokens_seen": 25860256, "step": 42455 }, { "epoch": 13.174061433447099, "grad_norm": 4.832912445068359, "learning_rate": 3.1482250874328337e-06, "loss": 0.1904, "num_input_tokens_seen": 25862752, "step": 42460 }, { "epoch": 13.175612783121316, "grad_norm": 5.883179187774658, "learning_rate": 3.146967616224845e-06, "loss": 0.1995, "num_input_tokens_seen": 25866400, "step": 42465 }, { "epoch": 13.177164132795532, "grad_norm": 2.958868980407715, "learning_rate": 3.1457102808660724e-06, "loss": 0.2051, "num_input_tokens_seen": 25869312, "step": 42470 }, { "epoch": 13.178715482469748, "grad_norm": 3.696563482284546, "learning_rate": 3.144453081448695e-06, "loss": 0.1692, "num_input_tokens_seen": 25872160, "step": 42475 }, { "epoch": 13.180266832143966, "grad_norm": 7.468106746673584, "learning_rate": 3.1431960180648806e-06, "loss": 0.22, "num_input_tokens_seen": 25875584, "step": 42480 }, { "epoch": 13.181818181818182, "grad_norm": 3.6463429927825928, "learning_rate": 3.141939090806786e-06, "loss": 0.2094, "num_input_tokens_seen": 25878944, "step": 42485 }, { "epoch": 13.183369531492398, "grad_norm": 3.3889825344085693, "learning_rate": 3.1406822997665577e-06, "loss": 0.1603, "num_input_tokens_seen": 25882912, "step": 42490 }, { "epoch": 13.184920881166615, "grad_norm": 4.856264114379883, "learning_rate": 3.1394256450363353e-06, "loss": 0.1874, "num_input_tokens_seen": 25886368, "step": 42495 }, { "epoch": 13.186472230840831, "grad_norm": 14.621794700622559, "learning_rate": 3.1381691267082452e-06, "loss": 0.2196, "num_input_tokens_seen": 25889504, "step": 42500 }, { "epoch": 13.188023580515049, "grad_norm": 6.045188903808594, "learning_rate": 3.1369127448744057e-06, "loss": 0.2351, "num_input_tokens_seen": 25892000, "step": 42505 }, { "epoch": 13.189574930189265, "grad_norm": 5.550804615020752, "learning_rate": 3.1356564996269244e-06, "loss": 0.2639, "num_input_tokens_seen": 25895712, "step": 42510 }, { "epoch": 13.19112627986348, "grad_norm": 2.5198895931243896, "learning_rate": 3.134400391057898e-06, "loss": 0.1753, "num_input_tokens_seen": 25898880, "step": 42515 }, { "epoch": 13.192677629537698, "grad_norm": 4.390833854675293, "learning_rate": 3.133144419259416e-06, "loss": 0.244, "num_input_tokens_seen": 25902560, "step": 42520 }, { "epoch": 13.194228979211914, "grad_norm": 5.727053165435791, "learning_rate": 3.131888584323555e-06, "loss": 0.2112, "num_input_tokens_seen": 25906464, "step": 42525 }, { "epoch": 13.195780328886132, "grad_norm": 9.160625457763672, "learning_rate": 3.1306328863423807e-06, "loss": 0.2115, "num_input_tokens_seen": 25909824, "step": 42530 }, { "epoch": 13.197331678560348, "grad_norm": 2.868626594543457, "learning_rate": 3.1293773254079534e-06, "loss": 0.1976, "num_input_tokens_seen": 25911872, "step": 42535 }, { "epoch": 13.198883028234563, "grad_norm": 4.291235446929932, "learning_rate": 3.12812190161232e-06, "loss": 0.2086, "num_input_tokens_seen": 25913984, "step": 42540 }, { "epoch": 13.200434377908781, "grad_norm": 4.643438816070557, "learning_rate": 3.1268666150475175e-06, "loss": 0.2395, "num_input_tokens_seen": 25916480, "step": 42545 }, { "epoch": 13.201985727582997, "grad_norm": 3.366032361984253, "learning_rate": 3.125611465805574e-06, "loss": 0.1838, "num_input_tokens_seen": 25920032, "step": 42550 }, { "epoch": 13.203537077257213, "grad_norm": 2.583374261856079, "learning_rate": 3.1243564539785047e-06, "loss": 0.2211, "num_input_tokens_seen": 25922752, "step": 42555 }, { "epoch": 13.20508842693143, "grad_norm": 4.443470478057861, "learning_rate": 3.1231015796583192e-06, "loss": 0.2099, "num_input_tokens_seen": 25925632, "step": 42560 }, { "epoch": 13.206639776605646, "grad_norm": 5.373432159423828, "learning_rate": 3.1218468429370143e-06, "loss": 0.1761, "num_input_tokens_seen": 25929376, "step": 42565 }, { "epoch": 13.208191126279864, "grad_norm": 5.393806457519531, "learning_rate": 3.120592243906575e-06, "loss": 0.1945, "num_input_tokens_seen": 25932416, "step": 42570 }, { "epoch": 13.20974247595408, "grad_norm": 3.4721124172210693, "learning_rate": 3.119337782658981e-06, "loss": 0.2039, "num_input_tokens_seen": 25935328, "step": 42575 }, { "epoch": 13.211293825628296, "grad_norm": 4.80646276473999, "learning_rate": 3.1180834592861977e-06, "loss": 0.1469, "num_input_tokens_seen": 25938304, "step": 42580 }, { "epoch": 13.212845175302514, "grad_norm": 9.549144744873047, "learning_rate": 3.116829273880181e-06, "loss": 0.1923, "num_input_tokens_seen": 25941664, "step": 42585 }, { "epoch": 13.21439652497673, "grad_norm": 8.97277545928955, "learning_rate": 3.1155752265328786e-06, "loss": 0.183, "num_input_tokens_seen": 25944352, "step": 42590 }, { "epoch": 13.215947874650947, "grad_norm": 8.19163703918457, "learning_rate": 3.1143213173362273e-06, "loss": 0.2133, "num_input_tokens_seen": 25948096, "step": 42595 }, { "epoch": 13.217499224325163, "grad_norm": 6.626888751983643, "learning_rate": 3.1130675463821543e-06, "loss": 0.1954, "num_input_tokens_seen": 25950944, "step": 42600 }, { "epoch": 13.219050573999379, "grad_norm": 8.049271583557129, "learning_rate": 3.111813913762574e-06, "loss": 0.2423, "num_input_tokens_seen": 25954688, "step": 42605 }, { "epoch": 13.220601923673597, "grad_norm": 5.355251789093018, "learning_rate": 3.1105604195693938e-06, "loss": 0.1832, "num_input_tokens_seen": 25957248, "step": 42610 }, { "epoch": 13.222153273347812, "grad_norm": 7.7704057693481445, "learning_rate": 3.109307063894509e-06, "loss": 0.1985, "num_input_tokens_seen": 25959872, "step": 42615 }, { "epoch": 13.223704623022028, "grad_norm": 3.8707706928253174, "learning_rate": 3.1080538468298047e-06, "loss": 0.1934, "num_input_tokens_seen": 25962784, "step": 42620 }, { "epoch": 13.225255972696246, "grad_norm": 4.9755425453186035, "learning_rate": 3.1068007684671584e-06, "loss": 0.2175, "num_input_tokens_seen": 25966400, "step": 42625 }, { "epoch": 13.226807322370462, "grad_norm": 6.054471015930176, "learning_rate": 3.1055478288984343e-06, "loss": 0.1804, "num_input_tokens_seen": 25969440, "step": 42630 }, { "epoch": 13.22835867204468, "grad_norm": 8.005906105041504, "learning_rate": 3.1042950282154884e-06, "loss": 0.1768, "num_input_tokens_seen": 25972576, "step": 42635 }, { "epoch": 13.229910021718895, "grad_norm": 4.460580348968506, "learning_rate": 3.103042366510165e-06, "loss": 0.2326, "num_input_tokens_seen": 25975008, "step": 42640 }, { "epoch": 13.231461371393111, "grad_norm": 6.350311756134033, "learning_rate": 3.1017898438742996e-06, "loss": 0.2399, "num_input_tokens_seen": 25977376, "step": 42645 }, { "epoch": 13.233012721067329, "grad_norm": 3.0769951343536377, "learning_rate": 3.100537460399715e-06, "loss": 0.1595, "num_input_tokens_seen": 25979936, "step": 42650 }, { "epoch": 13.234564070741545, "grad_norm": 4.275236129760742, "learning_rate": 3.0992852161782296e-06, "loss": 0.2104, "num_input_tokens_seen": 25982944, "step": 42655 }, { "epoch": 13.236115420415763, "grad_norm": 5.9058518409729, "learning_rate": 3.098033111301646e-06, "loss": 0.1618, "num_input_tokens_seen": 25985696, "step": 42660 }, { "epoch": 13.237666770089978, "grad_norm": 5.272197723388672, "learning_rate": 3.0967811458617568e-06, "loss": 0.1843, "num_input_tokens_seen": 25988736, "step": 42665 }, { "epoch": 13.239218119764194, "grad_norm": 3.0170013904571533, "learning_rate": 3.0955293199503477e-06, "loss": 0.1449, "num_input_tokens_seen": 25991360, "step": 42670 }, { "epoch": 13.240769469438412, "grad_norm": 7.384819984436035, "learning_rate": 3.0942776336591918e-06, "loss": 0.2081, "num_input_tokens_seen": 25995424, "step": 42675 }, { "epoch": 13.242320819112628, "grad_norm": 13.286334991455078, "learning_rate": 3.093026087080053e-06, "loss": 0.268, "num_input_tokens_seen": 25998624, "step": 42680 }, { "epoch": 13.243872168786844, "grad_norm": 4.222047805786133, "learning_rate": 3.0917746803046835e-06, "loss": 0.187, "num_input_tokens_seen": 26000992, "step": 42685 }, { "epoch": 13.245423518461061, "grad_norm": 4.543460845947266, "learning_rate": 3.0905234134248264e-06, "loss": 0.1768, "num_input_tokens_seen": 26003712, "step": 42690 }, { "epoch": 13.246974868135277, "grad_norm": 4.391589164733887, "learning_rate": 3.0892722865322144e-06, "loss": 0.2674, "num_input_tokens_seen": 26005920, "step": 42695 }, { "epoch": 13.248526217809495, "grad_norm": 5.148467540740967, "learning_rate": 3.088021299718571e-06, "loss": 0.2452, "num_input_tokens_seen": 26009056, "step": 42700 }, { "epoch": 13.25007756748371, "grad_norm": 5.847850799560547, "learning_rate": 3.086770453075607e-06, "loss": 0.215, "num_input_tokens_seen": 26011936, "step": 42705 }, { "epoch": 13.251628917157927, "grad_norm": 5.915432929992676, "learning_rate": 3.085519746695025e-06, "loss": 0.1889, "num_input_tokens_seen": 26014464, "step": 42710 }, { "epoch": 13.253180266832144, "grad_norm": 7.694468021392822, "learning_rate": 3.0842691806685155e-06, "loss": 0.2366, "num_input_tokens_seen": 26017312, "step": 42715 }, { "epoch": 13.25473161650636, "grad_norm": 3.7488081455230713, "learning_rate": 3.0830187550877614e-06, "loss": 0.2637, "num_input_tokens_seen": 26019936, "step": 42720 }, { "epoch": 13.256282966180578, "grad_norm": 11.342297554016113, "learning_rate": 3.0817684700444337e-06, "loss": 0.2068, "num_input_tokens_seen": 26022816, "step": 42725 }, { "epoch": 13.257834315854794, "grad_norm": 9.12516975402832, "learning_rate": 3.080518325630192e-06, "loss": 0.2206, "num_input_tokens_seen": 26025792, "step": 42730 }, { "epoch": 13.25938566552901, "grad_norm": 5.875674724578857, "learning_rate": 3.0792683219366872e-06, "loss": 0.2459, "num_input_tokens_seen": 26031040, "step": 42735 }, { "epoch": 13.260937015203227, "grad_norm": 4.157156944274902, "learning_rate": 3.0780184590555583e-06, "loss": 0.1954, "num_input_tokens_seen": 26034720, "step": 42740 }, { "epoch": 13.262488364877443, "grad_norm": 6.435369968414307, "learning_rate": 3.0767687370784373e-06, "loss": 0.1897, "num_input_tokens_seen": 26038272, "step": 42745 }, { "epoch": 13.264039714551659, "grad_norm": 9.52171516418457, "learning_rate": 3.075519156096942e-06, "loss": 0.2426, "num_input_tokens_seen": 26041952, "step": 42750 }, { "epoch": 13.265591064225877, "grad_norm": 3.894564151763916, "learning_rate": 3.074269716202681e-06, "loss": 0.2014, "num_input_tokens_seen": 26044640, "step": 42755 }, { "epoch": 13.267142413900093, "grad_norm": 6.274109363555908, "learning_rate": 3.073020417487255e-06, "loss": 0.198, "num_input_tokens_seen": 26047520, "step": 42760 }, { "epoch": 13.26869376357431, "grad_norm": 5.857663631439209, "learning_rate": 3.071771260042251e-06, "loss": 0.2671, "num_input_tokens_seen": 26050080, "step": 42765 }, { "epoch": 13.270245113248526, "grad_norm": 8.990074157714844, "learning_rate": 3.0705222439592474e-06, "loss": 0.2197, "num_input_tokens_seen": 26053568, "step": 42770 }, { "epoch": 13.271796462922742, "grad_norm": 6.1988420486450195, "learning_rate": 3.0692733693298103e-06, "loss": 0.1733, "num_input_tokens_seen": 26056736, "step": 42775 }, { "epoch": 13.27334781259696, "grad_norm": 4.782610893249512, "learning_rate": 3.068024636245499e-06, "loss": 0.2365, "num_input_tokens_seen": 26058656, "step": 42780 }, { "epoch": 13.274899162271176, "grad_norm": 6.197371482849121, "learning_rate": 3.0667760447978613e-06, "loss": 0.2011, "num_input_tokens_seen": 26061952, "step": 42785 }, { "epoch": 13.276450511945393, "grad_norm": 4.791924476623535, "learning_rate": 3.0655275950784314e-06, "loss": 0.2284, "num_input_tokens_seen": 26064256, "step": 42790 }, { "epoch": 13.27800186161961, "grad_norm": 4.840810298919678, "learning_rate": 3.064279287178736e-06, "loss": 0.2621, "num_input_tokens_seen": 26067232, "step": 42795 }, { "epoch": 13.279553211293825, "grad_norm": 5.53178071975708, "learning_rate": 3.0630311211902917e-06, "loss": 0.1708, "num_input_tokens_seen": 26069664, "step": 42800 }, { "epoch": 13.281104560968043, "grad_norm": 10.716412544250488, "learning_rate": 3.061783097204604e-06, "loss": 0.208, "num_input_tokens_seen": 26073120, "step": 42805 }, { "epoch": 13.282655910642259, "grad_norm": 3.437833309173584, "learning_rate": 3.0605352153131664e-06, "loss": 0.2033, "num_input_tokens_seen": 26075904, "step": 42810 }, { "epoch": 13.284207260316474, "grad_norm": 12.430092811584473, "learning_rate": 3.059287475607464e-06, "loss": 0.2539, "num_input_tokens_seen": 26081056, "step": 42815 }, { "epoch": 13.285758609990692, "grad_norm": 6.747759819030762, "learning_rate": 3.058039878178971e-06, "loss": 0.2404, "num_input_tokens_seen": 26084608, "step": 42820 }, { "epoch": 13.287309959664908, "grad_norm": 7.60099458694458, "learning_rate": 3.056792423119152e-06, "loss": 0.1801, "num_input_tokens_seen": 26088576, "step": 42825 }, { "epoch": 13.288861309339126, "grad_norm": 5.687203407287598, "learning_rate": 3.055545110519459e-06, "loss": 0.2167, "num_input_tokens_seen": 26092000, "step": 42830 }, { "epoch": 13.290412659013342, "grad_norm": 7.113968372344971, "learning_rate": 3.0542979404713347e-06, "loss": 0.1949, "num_input_tokens_seen": 26096096, "step": 42835 }, { "epoch": 13.291964008687557, "grad_norm": 3.4597537517547607, "learning_rate": 3.0530509130662123e-06, "loss": 0.2235, "num_input_tokens_seen": 26099104, "step": 42840 }, { "epoch": 13.293515358361775, "grad_norm": 2.7022616863250732, "learning_rate": 3.051804028395513e-06, "loss": 0.2172, "num_input_tokens_seen": 26101376, "step": 42845 }, { "epoch": 13.295066708035991, "grad_norm": 5.8094482421875, "learning_rate": 3.0505572865506493e-06, "loss": 0.1814, "num_input_tokens_seen": 26104096, "step": 42850 }, { "epoch": 13.296618057710209, "grad_norm": 7.613609790802002, "learning_rate": 3.0493106876230204e-06, "loss": 0.1879, "num_input_tokens_seen": 26106848, "step": 42855 }, { "epoch": 13.298169407384425, "grad_norm": 7.171258449554443, "learning_rate": 3.0480642317040187e-06, "loss": 0.2419, "num_input_tokens_seen": 26109536, "step": 42860 }, { "epoch": 13.29972075705864, "grad_norm": 2.771479606628418, "learning_rate": 3.0468179188850226e-06, "loss": 0.1825, "num_input_tokens_seen": 26112288, "step": 42865 }, { "epoch": 13.301272106732858, "grad_norm": 4.383538246154785, "learning_rate": 3.0455717492574027e-06, "loss": 0.2214, "num_input_tokens_seen": 26115136, "step": 42870 }, { "epoch": 13.302823456407074, "grad_norm": 5.537868976593018, "learning_rate": 3.044325722912517e-06, "loss": 0.2299, "num_input_tokens_seen": 26117792, "step": 42875 }, { "epoch": 13.304374806081292, "grad_norm": 2.8036625385284424, "learning_rate": 3.043079839941715e-06, "loss": 0.1953, "num_input_tokens_seen": 26120864, "step": 42880 }, { "epoch": 13.305926155755508, "grad_norm": 3.2949378490448, "learning_rate": 3.041834100436334e-06, "loss": 0.205, "num_input_tokens_seen": 26123456, "step": 42885 }, { "epoch": 13.307477505429723, "grad_norm": 4.044785022735596, "learning_rate": 3.0405885044877008e-06, "loss": 0.1956, "num_input_tokens_seen": 26127040, "step": 42890 }, { "epoch": 13.309028855103941, "grad_norm": 3.353872776031494, "learning_rate": 3.0393430521871326e-06, "loss": 0.1706, "num_input_tokens_seen": 26129728, "step": 42895 }, { "epoch": 13.310580204778157, "grad_norm": 3.3154139518737793, "learning_rate": 3.038097743625937e-06, "loss": 0.1831, "num_input_tokens_seen": 26132224, "step": 42900 }, { "epoch": 13.312131554452373, "grad_norm": 8.653656005859375, "learning_rate": 3.03685257889541e-06, "loss": 0.177, "num_input_tokens_seen": 26135328, "step": 42905 }, { "epoch": 13.31368290412659, "grad_norm": 6.1261982917785645, "learning_rate": 3.035607558086836e-06, "loss": 0.2085, "num_input_tokens_seen": 26137824, "step": 42910 }, { "epoch": 13.315234253800806, "grad_norm": 3.901920795440674, "learning_rate": 3.0343626812914896e-06, "loss": 0.1979, "num_input_tokens_seen": 26140512, "step": 42915 }, { "epoch": 13.316785603475024, "grad_norm": 6.4208855628967285, "learning_rate": 3.033117948600635e-06, "loss": 0.1848, "num_input_tokens_seen": 26143520, "step": 42920 }, { "epoch": 13.31833695314924, "grad_norm": 3.7787892818450928, "learning_rate": 3.031873360105526e-06, "loss": 0.2126, "num_input_tokens_seen": 26145984, "step": 42925 }, { "epoch": 13.319888302823456, "grad_norm": 2.445831060409546, "learning_rate": 3.0306289158974066e-06, "loss": 0.1677, "num_input_tokens_seen": 26149440, "step": 42930 }, { "epoch": 13.321439652497673, "grad_norm": 8.078888893127441, "learning_rate": 3.029384616067508e-06, "loss": 0.2199, "num_input_tokens_seen": 26152160, "step": 42935 }, { "epoch": 13.32299100217189, "grad_norm": 5.957873344421387, "learning_rate": 3.0281404607070532e-06, "loss": 0.2081, "num_input_tokens_seen": 26154144, "step": 42940 }, { "epoch": 13.324542351846105, "grad_norm": 5.246712684631348, "learning_rate": 3.0268964499072522e-06, "loss": 0.206, "num_input_tokens_seen": 26157728, "step": 42945 }, { "epoch": 13.326093701520323, "grad_norm": 7.475433349609375, "learning_rate": 3.0256525837593063e-06, "loss": 0.1791, "num_input_tokens_seen": 26161312, "step": 42950 }, { "epoch": 13.327645051194539, "grad_norm": 3.8563895225524902, "learning_rate": 3.0244088623544053e-06, "loss": 0.2105, "num_input_tokens_seen": 26164032, "step": 42955 }, { "epoch": 13.329196400868756, "grad_norm": 3.350900888442993, "learning_rate": 3.0231652857837302e-06, "loss": 0.1629, "num_input_tokens_seen": 26167520, "step": 42960 }, { "epoch": 13.330747750542972, "grad_norm": 4.547741889953613, "learning_rate": 3.021921854138449e-06, "loss": 0.1559, "num_input_tokens_seen": 26171008, "step": 42965 }, { "epoch": 13.332299100217188, "grad_norm": 9.843168258666992, "learning_rate": 3.0206785675097195e-06, "loss": 0.2305, "num_input_tokens_seen": 26173952, "step": 42970 }, { "epoch": 13.333850449891406, "grad_norm": 4.053600788116455, "learning_rate": 3.0194354259886906e-06, "loss": 0.225, "num_input_tokens_seen": 26176000, "step": 42975 }, { "epoch": 13.335401799565622, "grad_norm": 5.02905797958374, "learning_rate": 3.0181924296664965e-06, "loss": 0.2313, "num_input_tokens_seen": 26180256, "step": 42980 }, { "epoch": 13.33695314923984, "grad_norm": 5.121504783630371, "learning_rate": 3.016949578634267e-06, "loss": 0.1947, "num_input_tokens_seen": 26183424, "step": 42985 }, { "epoch": 13.338504498914055, "grad_norm": 3.802124261856079, "learning_rate": 3.0157068729831157e-06, "loss": 0.1992, "num_input_tokens_seen": 26186560, "step": 42990 }, { "epoch": 13.340055848588271, "grad_norm": 8.596688270568848, "learning_rate": 3.0144643128041483e-06, "loss": 0.2058, "num_input_tokens_seen": 26190208, "step": 42995 }, { "epoch": 13.341607198262489, "grad_norm": 4.745907783508301, "learning_rate": 3.0132218981884587e-06, "loss": 0.1622, "num_input_tokens_seen": 26193696, "step": 43000 }, { "epoch": 13.343158547936705, "grad_norm": 7.218783378601074, "learning_rate": 3.011979629227131e-06, "loss": 0.1737, "num_input_tokens_seen": 26198144, "step": 43005 }, { "epoch": 13.344709897610922, "grad_norm": 7.127207279205322, "learning_rate": 3.0107375060112386e-06, "loss": 0.2251, "num_input_tokens_seen": 26200992, "step": 43010 }, { "epoch": 13.346261247285138, "grad_norm": 8.573729515075684, "learning_rate": 3.0094955286318417e-06, "loss": 0.1973, "num_input_tokens_seen": 26203392, "step": 43015 }, { "epoch": 13.347812596959354, "grad_norm": 5.432164192199707, "learning_rate": 3.0082536971799955e-06, "loss": 0.2071, "num_input_tokens_seen": 26206304, "step": 43020 }, { "epoch": 13.349363946633572, "grad_norm": 2.891587972640991, "learning_rate": 3.0070120117467373e-06, "loss": 0.1714, "num_input_tokens_seen": 26208960, "step": 43025 }, { "epoch": 13.350915296307788, "grad_norm": 3.058321475982666, "learning_rate": 3.0057704724231007e-06, "loss": 0.1871, "num_input_tokens_seen": 26211552, "step": 43030 }, { "epoch": 13.352466645982004, "grad_norm": 3.092181444168091, "learning_rate": 3.0045290793001037e-06, "loss": 0.2536, "num_input_tokens_seen": 26214176, "step": 43035 }, { "epoch": 13.354017995656221, "grad_norm": 2.793999671936035, "learning_rate": 3.0032878324687536e-06, "loss": 0.1855, "num_input_tokens_seen": 26216896, "step": 43040 }, { "epoch": 13.355569345330437, "grad_norm": 5.12668514251709, "learning_rate": 3.0020467320200513e-06, "loss": 0.2159, "num_input_tokens_seen": 26220192, "step": 43045 }, { "epoch": 13.357120695004655, "grad_norm": 6.9647674560546875, "learning_rate": 3.0008057780449827e-06, "loss": 0.2252, "num_input_tokens_seen": 26222688, "step": 43050 }, { "epoch": 13.35867204467887, "grad_norm": 4.690711498260498, "learning_rate": 2.999564970634525e-06, "loss": 0.1624, "num_input_tokens_seen": 26226240, "step": 43055 }, { "epoch": 13.360223394353087, "grad_norm": 4.324706554412842, "learning_rate": 2.9983243098796434e-06, "loss": 0.2448, "num_input_tokens_seen": 26228864, "step": 43060 }, { "epoch": 13.361774744027304, "grad_norm": 4.903393268585205, "learning_rate": 2.9970837958712927e-06, "loss": 0.1986, "num_input_tokens_seen": 26231808, "step": 43065 }, { "epoch": 13.36332609370152, "grad_norm": 4.09811544418335, "learning_rate": 2.995843428700419e-06, "loss": 0.2186, "num_input_tokens_seen": 26234656, "step": 43070 }, { "epoch": 13.364877443375736, "grad_norm": 5.852662563323975, "learning_rate": 2.9946032084579533e-06, "loss": 0.2086, "num_input_tokens_seen": 26238656, "step": 43075 }, { "epoch": 13.366428793049954, "grad_norm": 5.958582878112793, "learning_rate": 2.9933631352348215e-06, "loss": 0.2192, "num_input_tokens_seen": 26241248, "step": 43080 }, { "epoch": 13.36798014272417, "grad_norm": 3.842088460922241, "learning_rate": 2.992123209121933e-06, "loss": 0.1841, "num_input_tokens_seen": 26244736, "step": 43085 }, { "epoch": 13.369531492398387, "grad_norm": 5.500293254852295, "learning_rate": 2.99088343021019e-06, "loss": 0.221, "num_input_tokens_seen": 26247168, "step": 43090 }, { "epoch": 13.371082842072603, "grad_norm": 5.952441215515137, "learning_rate": 2.9896437985904836e-06, "loss": 0.2296, "num_input_tokens_seen": 26249664, "step": 43095 }, { "epoch": 13.372634191746819, "grad_norm": 3.9807345867156982, "learning_rate": 2.988404314353691e-06, "loss": 0.1893, "num_input_tokens_seen": 26253248, "step": 43100 }, { "epoch": 13.374185541421037, "grad_norm": 3.488090753555298, "learning_rate": 2.987164977590685e-06, "loss": 0.1591, "num_input_tokens_seen": 26256608, "step": 43105 }, { "epoch": 13.375736891095253, "grad_norm": 3.6401469707489014, "learning_rate": 2.9859257883923215e-06, "loss": 0.2092, "num_input_tokens_seen": 26260832, "step": 43110 }, { "epoch": 13.37728824076947, "grad_norm": 13.873943328857422, "learning_rate": 2.984686746849448e-06, "loss": 0.235, "num_input_tokens_seen": 26264032, "step": 43115 }, { "epoch": 13.378839590443686, "grad_norm": 3.41335391998291, "learning_rate": 2.9834478530529005e-06, "loss": 0.1886, "num_input_tokens_seen": 26266688, "step": 43120 }, { "epoch": 13.380390940117902, "grad_norm": 6.228855609893799, "learning_rate": 2.982209107093505e-06, "loss": 0.2034, "num_input_tokens_seen": 26269920, "step": 43125 }, { "epoch": 13.38194228979212, "grad_norm": 5.419739723205566, "learning_rate": 2.980970509062076e-06, "loss": 0.2611, "num_input_tokens_seen": 26273280, "step": 43130 }, { "epoch": 13.383493639466336, "grad_norm": 1.5382330417633057, "learning_rate": 2.9797320590494176e-06, "loss": 0.1721, "num_input_tokens_seen": 26276192, "step": 43135 }, { "epoch": 13.385044989140553, "grad_norm": 6.810089588165283, "learning_rate": 2.9784937571463233e-06, "loss": 0.2086, "num_input_tokens_seen": 26279424, "step": 43140 }, { "epoch": 13.386596338814769, "grad_norm": 9.10612964630127, "learning_rate": 2.9772556034435745e-06, "loss": 0.1774, "num_input_tokens_seen": 26282880, "step": 43145 }, { "epoch": 13.388147688488985, "grad_norm": 3.716994524002075, "learning_rate": 2.976017598031943e-06, "loss": 0.2294, "num_input_tokens_seen": 26286112, "step": 43150 }, { "epoch": 13.389699038163203, "grad_norm": 4.379866123199463, "learning_rate": 2.9747797410021887e-06, "loss": 0.2132, "num_input_tokens_seen": 26288672, "step": 43155 }, { "epoch": 13.391250387837419, "grad_norm": 5.69953727722168, "learning_rate": 2.9735420324450608e-06, "loss": 0.2285, "num_input_tokens_seen": 26293856, "step": 43160 }, { "epoch": 13.392801737511634, "grad_norm": 4.404304504394531, "learning_rate": 2.9723044724513e-06, "loss": 0.1563, "num_input_tokens_seen": 26296000, "step": 43165 }, { "epoch": 13.394353087185852, "grad_norm": 8.44470500946045, "learning_rate": 2.9710670611116327e-06, "loss": 0.2062, "num_input_tokens_seen": 26299264, "step": 43170 }, { "epoch": 13.395904436860068, "grad_norm": 9.528876304626465, "learning_rate": 2.9698297985167755e-06, "loss": 0.1983, "num_input_tokens_seen": 26302336, "step": 43175 }, { "epoch": 13.397455786534286, "grad_norm": 4.545472621917725, "learning_rate": 2.968592684757436e-06, "loss": 0.2297, "num_input_tokens_seen": 26305216, "step": 43180 }, { "epoch": 13.399007136208501, "grad_norm": 6.4127583503723145, "learning_rate": 2.9673557199243075e-06, "loss": 0.1858, "num_input_tokens_seen": 26308512, "step": 43185 }, { "epoch": 13.400558485882717, "grad_norm": 8.311413764953613, "learning_rate": 2.9661189041080753e-06, "loss": 0.1635, "num_input_tokens_seen": 26311136, "step": 43190 }, { "epoch": 13.402109835556935, "grad_norm": 6.194355010986328, "learning_rate": 2.9648822373994112e-06, "loss": 0.1794, "num_input_tokens_seen": 26314144, "step": 43195 }, { "epoch": 13.403661185231151, "grad_norm": 6.839847564697266, "learning_rate": 2.963645719888979e-06, "loss": 0.2093, "num_input_tokens_seen": 26318432, "step": 43200 }, { "epoch": 13.405212534905367, "grad_norm": 5.879970073699951, "learning_rate": 2.9624093516674297e-06, "loss": 0.1685, "num_input_tokens_seen": 26320832, "step": 43205 }, { "epoch": 13.406763884579584, "grad_norm": 3.945727825164795, "learning_rate": 2.9611731328254036e-06, "loss": 0.2151, "num_input_tokens_seen": 26323648, "step": 43210 }, { "epoch": 13.4083152342538, "grad_norm": 3.270188570022583, "learning_rate": 2.9599370634535304e-06, "loss": 0.1723, "num_input_tokens_seen": 26326688, "step": 43215 }, { "epoch": 13.409866583928018, "grad_norm": 5.705646514892578, "learning_rate": 2.958701143642427e-06, "loss": 0.1923, "num_input_tokens_seen": 26328896, "step": 43220 }, { "epoch": 13.411417933602234, "grad_norm": 9.205716133117676, "learning_rate": 2.957465373482703e-06, "loss": 0.192, "num_input_tokens_seen": 26331328, "step": 43225 }, { "epoch": 13.41296928327645, "grad_norm": 5.3082475662231445, "learning_rate": 2.956229753064955e-06, "loss": 0.2162, "num_input_tokens_seen": 26333696, "step": 43230 }, { "epoch": 13.414520632950667, "grad_norm": 7.498483657836914, "learning_rate": 2.954994282479768e-06, "loss": 0.3198, "num_input_tokens_seen": 26336864, "step": 43235 }, { "epoch": 13.416071982624883, "grad_norm": 4.1326165199279785, "learning_rate": 2.9537589618177164e-06, "loss": 0.1978, "num_input_tokens_seen": 26341248, "step": 43240 }, { "epoch": 13.417623332299101, "grad_norm": 2.615511178970337, "learning_rate": 2.952523791169364e-06, "loss": 0.2102, "num_input_tokens_seen": 26344640, "step": 43245 }, { "epoch": 13.419174681973317, "grad_norm": 3.8278841972351074, "learning_rate": 2.9512887706252634e-06, "loss": 0.1776, "num_input_tokens_seen": 26347872, "step": 43250 }, { "epoch": 13.420726031647533, "grad_norm": 6.612244129180908, "learning_rate": 2.9500539002759565e-06, "loss": 0.2362, "num_input_tokens_seen": 26351040, "step": 43255 }, { "epoch": 13.42227738132175, "grad_norm": 6.4581828117370605, "learning_rate": 2.9488191802119735e-06, "loss": 0.1815, "num_input_tokens_seen": 26354400, "step": 43260 }, { "epoch": 13.423828730995966, "grad_norm": 4.59423303604126, "learning_rate": 2.947584610523834e-06, "loss": 0.1992, "num_input_tokens_seen": 26356864, "step": 43265 }, { "epoch": 13.425380080670184, "grad_norm": 5.29208517074585, "learning_rate": 2.946350191302047e-06, "loss": 0.2081, "num_input_tokens_seen": 26360032, "step": 43270 }, { "epoch": 13.4269314303444, "grad_norm": 4.4126763343811035, "learning_rate": 2.9451159226371097e-06, "loss": 0.2562, "num_input_tokens_seen": 26362528, "step": 43275 }, { "epoch": 13.428482780018616, "grad_norm": 5.486030578613281, "learning_rate": 2.9438818046195084e-06, "loss": 0.1754, "num_input_tokens_seen": 26365184, "step": 43280 }, { "epoch": 13.430034129692833, "grad_norm": 3.0427486896514893, "learning_rate": 2.9426478373397193e-06, "loss": 0.2427, "num_input_tokens_seen": 26368160, "step": 43285 }, { "epoch": 13.43158547936705, "grad_norm": 6.581653594970703, "learning_rate": 2.9414140208882063e-06, "loss": 0.2734, "num_input_tokens_seen": 26370880, "step": 43290 }, { "epoch": 13.433136829041265, "grad_norm": 4.247635364532471, "learning_rate": 2.9401803553554233e-06, "loss": 0.2303, "num_input_tokens_seen": 26373504, "step": 43295 }, { "epoch": 13.434688178715483, "grad_norm": 7.891760349273682, "learning_rate": 2.938946840831812e-06, "loss": 0.2053, "num_input_tokens_seen": 26377728, "step": 43300 }, { "epoch": 13.436239528389699, "grad_norm": 9.389090538024902, "learning_rate": 2.9377134774078035e-06, "loss": 0.2304, "num_input_tokens_seen": 26382464, "step": 43305 }, { "epoch": 13.437790878063916, "grad_norm": 3.598376512527466, "learning_rate": 2.936480265173819e-06, "loss": 0.1775, "num_input_tokens_seen": 26384992, "step": 43310 }, { "epoch": 13.439342227738132, "grad_norm": 4.096744060516357, "learning_rate": 2.9352472042202663e-06, "loss": 0.2206, "num_input_tokens_seen": 26389184, "step": 43315 }, { "epoch": 13.440893577412348, "grad_norm": 6.71757698059082, "learning_rate": 2.9340142946375432e-06, "loss": 0.1855, "num_input_tokens_seen": 26392384, "step": 43320 }, { "epoch": 13.442444927086566, "grad_norm": 3.3943023681640625, "learning_rate": 2.9327815365160384e-06, "loss": 0.2304, "num_input_tokens_seen": 26394880, "step": 43325 }, { "epoch": 13.443996276760782, "grad_norm": 11.87012004852295, "learning_rate": 2.9315489299461254e-06, "loss": 0.1967, "num_input_tokens_seen": 26398400, "step": 43330 }, { "epoch": 13.445547626434998, "grad_norm": 7.609391212463379, "learning_rate": 2.9303164750181704e-06, "loss": 0.1969, "num_input_tokens_seen": 26401472, "step": 43335 }, { "epoch": 13.447098976109215, "grad_norm": 6.255893707275391, "learning_rate": 2.929084171822526e-06, "loss": 0.22, "num_input_tokens_seen": 26404160, "step": 43340 }, { "epoch": 13.448650325783431, "grad_norm": 8.019762992858887, "learning_rate": 2.927852020449536e-06, "loss": 0.1696, "num_input_tokens_seen": 26407232, "step": 43345 }, { "epoch": 13.450201675457649, "grad_norm": 5.749907493591309, "learning_rate": 2.92662002098953e-06, "loss": 0.1774, "num_input_tokens_seen": 26410112, "step": 43350 }, { "epoch": 13.451753025131865, "grad_norm": 7.708456993103027, "learning_rate": 2.92538817353283e-06, "loss": 0.1846, "num_input_tokens_seen": 26413248, "step": 43355 }, { "epoch": 13.45330437480608, "grad_norm": 6.176053047180176, "learning_rate": 2.924156478169743e-06, "loss": 0.1738, "num_input_tokens_seen": 26416576, "step": 43360 }, { "epoch": 13.454855724480298, "grad_norm": 4.052248001098633, "learning_rate": 2.9229249349905686e-06, "loss": 0.1872, "num_input_tokens_seen": 26420224, "step": 43365 }, { "epoch": 13.456407074154514, "grad_norm": 11.132158279418945, "learning_rate": 2.921693544085592e-06, "loss": 0.3048, "num_input_tokens_seen": 26422784, "step": 43370 }, { "epoch": 13.457958423828732, "grad_norm": 9.057884216308594, "learning_rate": 2.9204623055450896e-06, "loss": 0.1923, "num_input_tokens_seen": 26425536, "step": 43375 }, { "epoch": 13.459509773502948, "grad_norm": 3.3963570594787598, "learning_rate": 2.9192312194593263e-06, "loss": 0.1803, "num_input_tokens_seen": 26428288, "step": 43380 }, { "epoch": 13.461061123177164, "grad_norm": 5.319312572479248, "learning_rate": 2.918000285918553e-06, "loss": 0.1827, "num_input_tokens_seen": 26431648, "step": 43385 }, { "epoch": 13.462612472851381, "grad_norm": 10.910677909851074, "learning_rate": 2.9167695050130155e-06, "loss": 0.2058, "num_input_tokens_seen": 26434016, "step": 43390 }, { "epoch": 13.464163822525597, "grad_norm": 6.384370803833008, "learning_rate": 2.9155388768329407e-06, "loss": 0.2193, "num_input_tokens_seen": 26436416, "step": 43395 }, { "epoch": 13.465715172199815, "grad_norm": 10.706158638000488, "learning_rate": 2.914308401468552e-06, "loss": 0.2038, "num_input_tokens_seen": 26440320, "step": 43400 }, { "epoch": 13.46726652187403, "grad_norm": 5.9566144943237305, "learning_rate": 2.9130780790100533e-06, "loss": 0.2041, "num_input_tokens_seen": 26444448, "step": 43405 }, { "epoch": 13.468817871548247, "grad_norm": 3.7366368770599365, "learning_rate": 2.911847909547646e-06, "loss": 0.1901, "num_input_tokens_seen": 26447936, "step": 43410 }, { "epoch": 13.470369221222464, "grad_norm": 6.96510648727417, "learning_rate": 2.9106178931715125e-06, "loss": 0.2131, "num_input_tokens_seen": 26451168, "step": 43415 }, { "epoch": 13.47192057089668, "grad_norm": 8.324010848999023, "learning_rate": 2.909388029971832e-06, "loss": 0.1974, "num_input_tokens_seen": 26453888, "step": 43420 }, { "epoch": 13.473471920570896, "grad_norm": 9.302887916564941, "learning_rate": 2.908158320038763e-06, "loss": 0.2332, "num_input_tokens_seen": 26457632, "step": 43425 }, { "epoch": 13.475023270245114, "grad_norm": 7.415524959564209, "learning_rate": 2.90692876346246e-06, "loss": 0.1501, "num_input_tokens_seen": 26460992, "step": 43430 }, { "epoch": 13.47657461991933, "grad_norm": 9.164632797241211, "learning_rate": 2.9056993603330667e-06, "loss": 0.1816, "num_input_tokens_seen": 26463584, "step": 43435 }, { "epoch": 13.478125969593547, "grad_norm": 9.670919418334961, "learning_rate": 2.904470110740709e-06, "loss": 0.2613, "num_input_tokens_seen": 26466848, "step": 43440 }, { "epoch": 13.479677319267763, "grad_norm": 6.121445655822754, "learning_rate": 2.903241014775508e-06, "loss": 0.1749, "num_input_tokens_seen": 26469632, "step": 43445 }, { "epoch": 13.481228668941979, "grad_norm": 7.704206943511963, "learning_rate": 2.902012072527568e-06, "loss": 0.2502, "num_input_tokens_seen": 26472064, "step": 43450 }, { "epoch": 13.482780018616197, "grad_norm": 15.337870597839355, "learning_rate": 2.90078328408699e-06, "loss": 0.2257, "num_input_tokens_seen": 26475200, "step": 43455 }, { "epoch": 13.484331368290412, "grad_norm": 7.458017826080322, "learning_rate": 2.899554649543853e-06, "loss": 0.2129, "num_input_tokens_seen": 26478464, "step": 43460 }, { "epoch": 13.485882717964628, "grad_norm": 4.979208469390869, "learning_rate": 2.8983261689882345e-06, "loss": 0.2134, "num_input_tokens_seen": 26480448, "step": 43465 }, { "epoch": 13.487434067638846, "grad_norm": 7.431281566619873, "learning_rate": 2.897097842510195e-06, "loss": 0.1694, "num_input_tokens_seen": 26482944, "step": 43470 }, { "epoch": 13.488985417313062, "grad_norm": 2.566826581954956, "learning_rate": 2.8958696701997867e-06, "loss": 0.1701, "num_input_tokens_seen": 26485920, "step": 43475 }, { "epoch": 13.49053676698728, "grad_norm": 5.822197437286377, "learning_rate": 2.894641652147046e-06, "loss": 0.1461, "num_input_tokens_seen": 26488800, "step": 43480 }, { "epoch": 13.492088116661495, "grad_norm": 2.625717878341675, "learning_rate": 2.893413788442006e-06, "loss": 0.1559, "num_input_tokens_seen": 26491520, "step": 43485 }, { "epoch": 13.493639466335711, "grad_norm": 8.971176147460938, "learning_rate": 2.8921860791746786e-06, "loss": 0.2355, "num_input_tokens_seen": 26494784, "step": 43490 }, { "epoch": 13.495190816009929, "grad_norm": 12.278524398803711, "learning_rate": 2.8909585244350724e-06, "loss": 0.2187, "num_input_tokens_seen": 26497856, "step": 43495 }, { "epoch": 13.496742165684145, "grad_norm": 3.971583127975464, "learning_rate": 2.889731124313182e-06, "loss": 0.1825, "num_input_tokens_seen": 26501120, "step": 43500 }, { "epoch": 13.498293515358363, "grad_norm": 6.66690731048584, "learning_rate": 2.8885038788989885e-06, "loss": 0.2269, "num_input_tokens_seen": 26503648, "step": 43505 }, { "epoch": 13.499844865032578, "grad_norm": 3.2414846420288086, "learning_rate": 2.8872767882824664e-06, "loss": 0.2672, "num_input_tokens_seen": 26505952, "step": 43510 }, { "epoch": 13.501396214706794, "grad_norm": 5.386233329772949, "learning_rate": 2.886049852553572e-06, "loss": 0.1797, "num_input_tokens_seen": 26509184, "step": 43515 }, { "epoch": 13.502947564381012, "grad_norm": 13.309969902038574, "learning_rate": 2.8848230718022586e-06, "loss": 0.2241, "num_input_tokens_seen": 26512032, "step": 43520 }, { "epoch": 13.504498914055228, "grad_norm": 5.389777660369873, "learning_rate": 2.8835964461184587e-06, "loss": 0.2312, "num_input_tokens_seen": 26514816, "step": 43525 }, { "epoch": 13.506050263729446, "grad_norm": 8.936910629272461, "learning_rate": 2.882369975592104e-06, "loss": 0.2757, "num_input_tokens_seen": 26518144, "step": 43530 }, { "epoch": 13.507601613403661, "grad_norm": 4.406894207000732, "learning_rate": 2.8811436603131043e-06, "loss": 0.1644, "num_input_tokens_seen": 26520704, "step": 43535 }, { "epoch": 13.509152963077877, "grad_norm": 6.957320690155029, "learning_rate": 2.8799175003713677e-06, "loss": 0.1935, "num_input_tokens_seen": 26523616, "step": 43540 }, { "epoch": 13.510704312752095, "grad_norm": 9.947507858276367, "learning_rate": 2.8786914958567813e-06, "loss": 0.214, "num_input_tokens_seen": 26526016, "step": 43545 }, { "epoch": 13.51225566242631, "grad_norm": 8.62522029876709, "learning_rate": 2.877465646859228e-06, "loss": 0.2828, "num_input_tokens_seen": 26528096, "step": 43550 }, { "epoch": 13.513807012100527, "grad_norm": 5.381014823913574, "learning_rate": 2.8762399534685804e-06, "loss": 0.2185, "num_input_tokens_seen": 26532128, "step": 43555 }, { "epoch": 13.515358361774744, "grad_norm": 7.2286696434021, "learning_rate": 2.8750144157746907e-06, "loss": 0.2015, "num_input_tokens_seen": 26535328, "step": 43560 }, { "epoch": 13.51690971144896, "grad_norm": 6.14564323425293, "learning_rate": 2.8737890338674094e-06, "loss": 0.1928, "num_input_tokens_seen": 26537472, "step": 43565 }, { "epoch": 13.518461061123178, "grad_norm": 4.599796772003174, "learning_rate": 2.872563807836569e-06, "loss": 0.1664, "num_input_tokens_seen": 26540800, "step": 43570 }, { "epoch": 13.520012410797394, "grad_norm": 9.874030113220215, "learning_rate": 2.8713387377719957e-06, "loss": 0.2046, "num_input_tokens_seen": 26545664, "step": 43575 }, { "epoch": 13.52156376047161, "grad_norm": 12.081555366516113, "learning_rate": 2.870113823763498e-06, "loss": 0.265, "num_input_tokens_seen": 26547712, "step": 43580 }, { "epoch": 13.523115110145827, "grad_norm": 7.601470947265625, "learning_rate": 2.8688890659008807e-06, "loss": 0.1544, "num_input_tokens_seen": 26551168, "step": 43585 }, { "epoch": 13.524666459820043, "grad_norm": 6.741856575012207, "learning_rate": 2.8676644642739287e-06, "loss": 0.178, "num_input_tokens_seen": 26554368, "step": 43590 }, { "epoch": 13.52621780949426, "grad_norm": 6.594756603240967, "learning_rate": 2.8664400189724246e-06, "loss": 0.1632, "num_input_tokens_seen": 26556992, "step": 43595 }, { "epoch": 13.527769159168477, "grad_norm": 4.163570880889893, "learning_rate": 2.86521573008613e-06, "loss": 0.1859, "num_input_tokens_seen": 26560384, "step": 43600 }, { "epoch": 13.529320508842693, "grad_norm": 4.780604839324951, "learning_rate": 2.8639915977048016e-06, "loss": 0.1793, "num_input_tokens_seen": 26562880, "step": 43605 }, { "epoch": 13.53087185851691, "grad_norm": 2.4165940284729004, "learning_rate": 2.862767621918184e-06, "loss": 0.1754, "num_input_tokens_seen": 26565728, "step": 43610 }, { "epoch": 13.532423208191126, "grad_norm": 3.955745220184326, "learning_rate": 2.86154380281601e-06, "loss": 0.1887, "num_input_tokens_seen": 26568128, "step": 43615 }, { "epoch": 13.533974557865342, "grad_norm": 6.1775360107421875, "learning_rate": 2.8603201404879966e-06, "loss": 0.1724, "num_input_tokens_seen": 26571936, "step": 43620 }, { "epoch": 13.53552590753956, "grad_norm": 3.611663818359375, "learning_rate": 2.859096635023856e-06, "loss": 0.1786, "num_input_tokens_seen": 26574528, "step": 43625 }, { "epoch": 13.537077257213776, "grad_norm": 8.741567611694336, "learning_rate": 2.8578732865132817e-06, "loss": 0.2677, "num_input_tokens_seen": 26578048, "step": 43630 }, { "epoch": 13.538628606887993, "grad_norm": 14.99626350402832, "learning_rate": 2.856650095045963e-06, "loss": 0.2933, "num_input_tokens_seen": 26580256, "step": 43635 }, { "epoch": 13.54017995656221, "grad_norm": 16.596708297729492, "learning_rate": 2.855427060711575e-06, "loss": 0.2919, "num_input_tokens_seen": 26582880, "step": 43640 }, { "epoch": 13.541731306236425, "grad_norm": 5.306029319763184, "learning_rate": 2.8542041835997774e-06, "loss": 0.1928, "num_input_tokens_seen": 26585952, "step": 43645 }, { "epoch": 13.543282655910643, "grad_norm": 7.541558742523193, "learning_rate": 2.8529814638002253e-06, "loss": 0.1938, "num_input_tokens_seen": 26590336, "step": 43650 }, { "epoch": 13.544834005584859, "grad_norm": 9.086285591125488, "learning_rate": 2.851758901402554e-06, "loss": 0.1587, "num_input_tokens_seen": 26594432, "step": 43655 }, { "epoch": 13.546385355259076, "grad_norm": 14.169441223144531, "learning_rate": 2.8505364964963955e-06, "loss": 0.244, "num_input_tokens_seen": 26598560, "step": 43660 }, { "epoch": 13.547936704933292, "grad_norm": 8.075129508972168, "learning_rate": 2.8493142491713644e-06, "loss": 0.278, "num_input_tokens_seen": 26601504, "step": 43665 }, { "epoch": 13.549488054607508, "grad_norm": 2.6892013549804688, "learning_rate": 2.8480921595170686e-06, "loss": 0.1949, "num_input_tokens_seen": 26603648, "step": 43670 }, { "epoch": 13.551039404281726, "grad_norm": 10.385000228881836, "learning_rate": 2.8468702276230977e-06, "loss": 0.2274, "num_input_tokens_seen": 26607520, "step": 43675 }, { "epoch": 13.552590753955942, "grad_norm": 7.7423858642578125, "learning_rate": 2.8456484535790375e-06, "loss": 0.2186, "num_input_tokens_seen": 26611648, "step": 43680 }, { "epoch": 13.554142103630157, "grad_norm": 4.4185686111450195, "learning_rate": 2.8444268374744554e-06, "loss": 0.2546, "num_input_tokens_seen": 26613984, "step": 43685 }, { "epoch": 13.555693453304375, "grad_norm": 4.69360876083374, "learning_rate": 2.843205379398914e-06, "loss": 0.1763, "num_input_tokens_seen": 26616352, "step": 43690 }, { "epoch": 13.557244802978591, "grad_norm": 4.7307329177856445, "learning_rate": 2.8419840794419564e-06, "loss": 0.161, "num_input_tokens_seen": 26619552, "step": 43695 }, { "epoch": 13.558796152652809, "grad_norm": 8.140508651733398, "learning_rate": 2.840762937693121e-06, "loss": 0.218, "num_input_tokens_seen": 26623552, "step": 43700 }, { "epoch": 13.560347502327025, "grad_norm": 8.465048789978027, "learning_rate": 2.839541954241933e-06, "loss": 0.2217, "num_input_tokens_seen": 26629536, "step": 43705 }, { "epoch": 13.56189885200124, "grad_norm": 3.0246617794036865, "learning_rate": 2.838321129177901e-06, "loss": 0.1847, "num_input_tokens_seen": 26633248, "step": 43710 }, { "epoch": 13.563450201675458, "grad_norm": 2.921726703643799, "learning_rate": 2.8371004625905307e-06, "loss": 0.2744, "num_input_tokens_seen": 26636384, "step": 43715 }, { "epoch": 13.565001551349674, "grad_norm": 5.584944248199463, "learning_rate": 2.835879954569307e-06, "loss": 0.2265, "num_input_tokens_seen": 26639136, "step": 43720 }, { "epoch": 13.56655290102389, "grad_norm": 9.031481742858887, "learning_rate": 2.8346596052037123e-06, "loss": 0.2646, "num_input_tokens_seen": 26642080, "step": 43725 }, { "epoch": 13.568104250698108, "grad_norm": 5.417328834533691, "learning_rate": 2.833439414583208e-06, "loss": 0.241, "num_input_tokens_seen": 26645568, "step": 43730 }, { "epoch": 13.569655600372323, "grad_norm": 2.786837100982666, "learning_rate": 2.8322193827972515e-06, "loss": 0.17, "num_input_tokens_seen": 26648896, "step": 43735 }, { "epoch": 13.571206950046541, "grad_norm": 3.415639638900757, "learning_rate": 2.830999509935283e-06, "loss": 0.1857, "num_input_tokens_seen": 26651232, "step": 43740 }, { "epoch": 13.572758299720757, "grad_norm": 10.053826332092285, "learning_rate": 2.829779796086738e-06, "loss": 0.251, "num_input_tokens_seen": 26655648, "step": 43745 }, { "epoch": 13.574309649394973, "grad_norm": 4.661192893981934, "learning_rate": 2.8285602413410303e-06, "loss": 0.186, "num_input_tokens_seen": 26658688, "step": 43750 }, { "epoch": 13.57586099906919, "grad_norm": 8.981595039367676, "learning_rate": 2.8273408457875728e-06, "loss": 0.2033, "num_input_tokens_seen": 26660960, "step": 43755 }, { "epoch": 13.577412348743406, "grad_norm": 3.777517080307007, "learning_rate": 2.8261216095157574e-06, "loss": 0.1556, "num_input_tokens_seen": 26663424, "step": 43760 }, { "epoch": 13.578963698417624, "grad_norm": 6.081387519836426, "learning_rate": 2.82490253261497e-06, "loss": 0.3081, "num_input_tokens_seen": 26666432, "step": 43765 }, { "epoch": 13.58051504809184, "grad_norm": 7.439364910125732, "learning_rate": 2.823683615174587e-06, "loss": 0.298, "num_input_tokens_seen": 26669152, "step": 43770 }, { "epoch": 13.582066397766056, "grad_norm": 4.471795558929443, "learning_rate": 2.822464857283965e-06, "loss": 0.1951, "num_input_tokens_seen": 26671872, "step": 43775 }, { "epoch": 13.583617747440274, "grad_norm": 5.163612365722656, "learning_rate": 2.8212462590324553e-06, "loss": 0.1568, "num_input_tokens_seen": 26675136, "step": 43780 }, { "epoch": 13.58516909711449, "grad_norm": 7.8735504150390625, "learning_rate": 2.820027820509394e-06, "loss": 0.212, "num_input_tokens_seen": 26678144, "step": 43785 }, { "epoch": 13.586720446788707, "grad_norm": 5.152619361877441, "learning_rate": 2.81880954180411e-06, "loss": 0.2938, "num_input_tokens_seen": 26681152, "step": 43790 }, { "epoch": 13.588271796462923, "grad_norm": 17.915157318115234, "learning_rate": 2.817591423005914e-06, "loss": 0.22, "num_input_tokens_seen": 26684416, "step": 43795 }, { "epoch": 13.589823146137139, "grad_norm": 9.825602531433105, "learning_rate": 2.8163734642041118e-06, "loss": 0.212, "num_input_tokens_seen": 26687776, "step": 43800 }, { "epoch": 13.591374495811356, "grad_norm": 7.28045129776001, "learning_rate": 2.8151556654879907e-06, "loss": 0.2309, "num_input_tokens_seen": 26690496, "step": 43805 }, { "epoch": 13.592925845485572, "grad_norm": 6.701266288757324, "learning_rate": 2.813938026946834e-06, "loss": 0.2034, "num_input_tokens_seen": 26693312, "step": 43810 }, { "epoch": 13.594477195159788, "grad_norm": 3.9607627391815186, "learning_rate": 2.812720548669905e-06, "loss": 0.2391, "num_input_tokens_seen": 26696960, "step": 43815 }, { "epoch": 13.596028544834006, "grad_norm": 5.144911289215088, "learning_rate": 2.8115032307464607e-06, "loss": 0.1437, "num_input_tokens_seen": 26700320, "step": 43820 }, { "epoch": 13.597579894508222, "grad_norm": 3.8295845985412598, "learning_rate": 2.8102860732657466e-06, "loss": 0.1964, "num_input_tokens_seen": 26703456, "step": 43825 }, { "epoch": 13.59913124418244, "grad_norm": 8.685430526733398, "learning_rate": 2.8090690763169927e-06, "loss": 0.1781, "num_input_tokens_seen": 26706240, "step": 43830 }, { "epoch": 13.600682593856655, "grad_norm": 4.985918045043945, "learning_rate": 2.8078522399894216e-06, "loss": 0.2013, "num_input_tokens_seen": 26708768, "step": 43835 }, { "epoch": 13.602233943530871, "grad_norm": 5.093855381011963, "learning_rate": 2.8066355643722377e-06, "loss": 0.1763, "num_input_tokens_seen": 26711776, "step": 43840 }, { "epoch": 13.603785293205089, "grad_norm": 6.077408313751221, "learning_rate": 2.805419049554643e-06, "loss": 0.1447, "num_input_tokens_seen": 26714976, "step": 43845 }, { "epoch": 13.605336642879305, "grad_norm": 5.831780433654785, "learning_rate": 2.804202695625817e-06, "loss": 0.1805, "num_input_tokens_seen": 26717824, "step": 43850 }, { "epoch": 13.60688799255352, "grad_norm": 8.161910057067871, "learning_rate": 2.802986502674938e-06, "loss": 0.1995, "num_input_tokens_seen": 26720576, "step": 43855 }, { "epoch": 13.608439342227738, "grad_norm": 4.124828338623047, "learning_rate": 2.8017704707911625e-06, "loss": 0.2167, "num_input_tokens_seen": 26723360, "step": 43860 }, { "epoch": 13.609990691901954, "grad_norm": 6.835862636566162, "learning_rate": 2.8005546000636448e-06, "loss": 0.1847, "num_input_tokens_seen": 26726208, "step": 43865 }, { "epoch": 13.611542041576172, "grad_norm": 8.179800987243652, "learning_rate": 2.7993388905815176e-06, "loss": 0.2138, "num_input_tokens_seen": 26729536, "step": 43870 }, { "epoch": 13.613093391250388, "grad_norm": 5.3039093017578125, "learning_rate": 2.7981233424339122e-06, "loss": 0.1785, "num_input_tokens_seen": 26732544, "step": 43875 }, { "epoch": 13.614644740924604, "grad_norm": 8.687328338623047, "learning_rate": 2.7969079557099377e-06, "loss": 0.2014, "num_input_tokens_seen": 26734784, "step": 43880 }, { "epoch": 13.616196090598821, "grad_norm": 5.58275032043457, "learning_rate": 2.7956927304986986e-06, "loss": 0.2573, "num_input_tokens_seen": 26737536, "step": 43885 }, { "epoch": 13.617747440273037, "grad_norm": 5.643910884857178, "learning_rate": 2.794477666889287e-06, "loss": 0.1507, "num_input_tokens_seen": 26740352, "step": 43890 }, { "epoch": 13.619298789947255, "grad_norm": 5.531163215637207, "learning_rate": 2.7932627649707777e-06, "loss": 0.2008, "num_input_tokens_seen": 26743424, "step": 43895 }, { "epoch": 13.62085013962147, "grad_norm": 19.648738861083984, "learning_rate": 2.792048024832242e-06, "loss": 0.2863, "num_input_tokens_seen": 26746432, "step": 43900 }, { "epoch": 13.622401489295687, "grad_norm": 13.84079360961914, "learning_rate": 2.7908334465627297e-06, "loss": 0.2598, "num_input_tokens_seen": 26749472, "step": 43905 }, { "epoch": 13.623952838969904, "grad_norm": 2.6454355716705322, "learning_rate": 2.789619030251288e-06, "loss": 0.2263, "num_input_tokens_seen": 26752160, "step": 43910 }, { "epoch": 13.62550418864412, "grad_norm": 8.660274505615234, "learning_rate": 2.788404775986945e-06, "loss": 0.2677, "num_input_tokens_seen": 26754784, "step": 43915 }, { "epoch": 13.627055538318338, "grad_norm": 2.8042876720428467, "learning_rate": 2.787190683858722e-06, "loss": 0.1899, "num_input_tokens_seen": 26757856, "step": 43920 }, { "epoch": 13.628606887992554, "grad_norm": 7.597531318664551, "learning_rate": 2.785976753955624e-06, "loss": 0.2293, "num_input_tokens_seen": 26761696, "step": 43925 }, { "epoch": 13.63015823766677, "grad_norm": 8.498617172241211, "learning_rate": 2.7847629863666503e-06, "loss": 0.2267, "num_input_tokens_seen": 26764032, "step": 43930 }, { "epoch": 13.631709587340987, "grad_norm": 8.170941352844238, "learning_rate": 2.7835493811807797e-06, "loss": 0.1981, "num_input_tokens_seen": 26766720, "step": 43935 }, { "epoch": 13.633260937015203, "grad_norm": 4.280344486236572, "learning_rate": 2.7823359384869857e-06, "loss": 0.165, "num_input_tokens_seen": 26769184, "step": 43940 }, { "epoch": 13.634812286689419, "grad_norm": 7.843682289123535, "learning_rate": 2.781122658374231e-06, "loss": 0.2304, "num_input_tokens_seen": 26773088, "step": 43945 }, { "epoch": 13.636363636363637, "grad_norm": 9.56835651397705, "learning_rate": 2.779909540931459e-06, "loss": 0.2185, "num_input_tokens_seen": 26775744, "step": 43950 }, { "epoch": 13.637914986037853, "grad_norm": 6.671560287475586, "learning_rate": 2.7786965862476088e-06, "loss": 0.1622, "num_input_tokens_seen": 26780192, "step": 43955 }, { "epoch": 13.63946633571207, "grad_norm": 5.229199409484863, "learning_rate": 2.7774837944116016e-06, "loss": 0.2178, "num_input_tokens_seen": 26782752, "step": 43960 }, { "epoch": 13.641017685386286, "grad_norm": 3.9001219272613525, "learning_rate": 2.776271165512353e-06, "loss": 0.1578, "num_input_tokens_seen": 26786464, "step": 43965 }, { "epoch": 13.642569035060502, "grad_norm": 11.892251014709473, "learning_rate": 2.7750586996387587e-06, "loss": 0.2421, "num_input_tokens_seen": 26788832, "step": 43970 }, { "epoch": 13.64412038473472, "grad_norm": 8.483719825744629, "learning_rate": 2.7738463968797104e-06, "loss": 0.2025, "num_input_tokens_seen": 26791584, "step": 43975 }, { "epoch": 13.645671734408936, "grad_norm": 7.973717212677002, "learning_rate": 2.772634257324081e-06, "loss": 0.2612, "num_input_tokens_seen": 26794208, "step": 43980 }, { "epoch": 13.647223084083151, "grad_norm": 6.940994739532471, "learning_rate": 2.7714222810607387e-06, "loss": 0.1769, "num_input_tokens_seen": 26797728, "step": 43985 }, { "epoch": 13.648774433757369, "grad_norm": 1.85306978225708, "learning_rate": 2.7702104681785313e-06, "loss": 0.2107, "num_input_tokens_seen": 26800096, "step": 43990 }, { "epoch": 13.650325783431585, "grad_norm": 4.490531921386719, "learning_rate": 2.7689988187663038e-06, "loss": 0.1944, "num_input_tokens_seen": 26802944, "step": 43995 }, { "epoch": 13.651877133105803, "grad_norm": 5.127603530883789, "learning_rate": 2.767787332912879e-06, "loss": 0.3058, "num_input_tokens_seen": 26805632, "step": 44000 }, { "epoch": 13.653428482780019, "grad_norm": 4.2045135498046875, "learning_rate": 2.766576010707077e-06, "loss": 0.2102, "num_input_tokens_seen": 26808896, "step": 44005 }, { "epoch": 13.654979832454234, "grad_norm": 1.5745824575424194, "learning_rate": 2.7653648522377027e-06, "loss": 0.1588, "num_input_tokens_seen": 26812896, "step": 44010 }, { "epoch": 13.656531182128452, "grad_norm": 4.836138725280762, "learning_rate": 2.7641538575935443e-06, "loss": 0.2102, "num_input_tokens_seen": 26817216, "step": 44015 }, { "epoch": 13.658082531802668, "grad_norm": 5.455925941467285, "learning_rate": 2.7629430268633873e-06, "loss": 0.1777, "num_input_tokens_seen": 26820000, "step": 44020 }, { "epoch": 13.659633881476886, "grad_norm": 6.990554332733154, "learning_rate": 2.7617323601359948e-06, "loss": 0.21, "num_input_tokens_seen": 26823424, "step": 44025 }, { "epoch": 13.661185231151102, "grad_norm": 9.91886043548584, "learning_rate": 2.760521857500127e-06, "loss": 0.262, "num_input_tokens_seen": 26827584, "step": 44030 }, { "epoch": 13.662736580825317, "grad_norm": 7.636787414550781, "learning_rate": 2.759311519044525e-06, "loss": 0.27, "num_input_tokens_seen": 26831232, "step": 44035 }, { "epoch": 13.664287930499535, "grad_norm": 5.60383939743042, "learning_rate": 2.7581013448579242e-06, "loss": 0.2378, "num_input_tokens_seen": 26833696, "step": 44040 }, { "epoch": 13.665839280173751, "grad_norm": 10.133232116699219, "learning_rate": 2.7568913350290404e-06, "loss": 0.2674, "num_input_tokens_seen": 26836832, "step": 44045 }, { "epoch": 13.667390629847969, "grad_norm": 4.855982780456543, "learning_rate": 2.7556814896465866e-06, "loss": 0.2267, "num_input_tokens_seen": 26839488, "step": 44050 }, { "epoch": 13.668941979522184, "grad_norm": 4.658952236175537, "learning_rate": 2.754471808799255e-06, "loss": 0.1913, "num_input_tokens_seen": 26841984, "step": 44055 }, { "epoch": 13.6704933291964, "grad_norm": 4.2439141273498535, "learning_rate": 2.7532622925757295e-06, "loss": 0.2042, "num_input_tokens_seen": 26845280, "step": 44060 }, { "epoch": 13.672044678870618, "grad_norm": 5.152486801147461, "learning_rate": 2.7520529410646864e-06, "loss": 0.1856, "num_input_tokens_seen": 26848096, "step": 44065 }, { "epoch": 13.673596028544834, "grad_norm": 6.25907564163208, "learning_rate": 2.7508437543547794e-06, "loss": 0.2238, "num_input_tokens_seen": 26850656, "step": 44070 }, { "epoch": 13.67514737821905, "grad_norm": 4.781832695007324, "learning_rate": 2.7496347325346617e-06, "loss": 0.22, "num_input_tokens_seen": 26853824, "step": 44075 }, { "epoch": 13.676698727893267, "grad_norm": 11.801458358764648, "learning_rate": 2.748425875692965e-06, "loss": 0.1983, "num_input_tokens_seen": 26856192, "step": 44080 }, { "epoch": 13.678250077567483, "grad_norm": 2.5241711139678955, "learning_rate": 2.7472171839183153e-06, "loss": 0.1796, "num_input_tokens_seen": 26858176, "step": 44085 }, { "epoch": 13.679801427241701, "grad_norm": 5.253423690795898, "learning_rate": 2.7460086572993215e-06, "loss": 0.1783, "num_input_tokens_seen": 26863712, "step": 44090 }, { "epoch": 13.681352776915917, "grad_norm": 6.740916728973389, "learning_rate": 2.7448002959245863e-06, "loss": 0.2071, "num_input_tokens_seen": 26866688, "step": 44095 }, { "epoch": 13.682904126590133, "grad_norm": 5.459253787994385, "learning_rate": 2.7435920998826927e-06, "loss": 0.2182, "num_input_tokens_seen": 26869696, "step": 44100 }, { "epoch": 13.68445547626435, "grad_norm": 5.137179374694824, "learning_rate": 2.7423840692622206e-06, "loss": 0.2217, "num_input_tokens_seen": 26871968, "step": 44105 }, { "epoch": 13.686006825938566, "grad_norm": 2.8068103790283203, "learning_rate": 2.7411762041517275e-06, "loss": 0.1912, "num_input_tokens_seen": 26874720, "step": 44110 }, { "epoch": 13.687558175612782, "grad_norm": 3.1087939739227295, "learning_rate": 2.7399685046397696e-06, "loss": 0.1697, "num_input_tokens_seen": 26878432, "step": 44115 }, { "epoch": 13.689109525287, "grad_norm": 4.667004585266113, "learning_rate": 2.738760970814881e-06, "loss": 0.2536, "num_input_tokens_seen": 26883296, "step": 44120 }, { "epoch": 13.690660874961216, "grad_norm": 5.5026750564575195, "learning_rate": 2.7375536027655906e-06, "loss": 0.2217, "num_input_tokens_seen": 26885472, "step": 44125 }, { "epoch": 13.692212224635433, "grad_norm": 4.159668922424316, "learning_rate": 2.7363464005804142e-06, "loss": 0.1976, "num_input_tokens_seen": 26888832, "step": 44130 }, { "epoch": 13.69376357430965, "grad_norm": 3.891057014465332, "learning_rate": 2.735139364347851e-06, "loss": 0.1937, "num_input_tokens_seen": 26891808, "step": 44135 }, { "epoch": 13.695314923983865, "grad_norm": 16.636709213256836, "learning_rate": 2.7339324941563937e-06, "loss": 0.2111, "num_input_tokens_seen": 26896032, "step": 44140 }, { "epoch": 13.696866273658083, "grad_norm": 8.618670463562012, "learning_rate": 2.732725790094517e-06, "loss": 0.1997, "num_input_tokens_seen": 26898720, "step": 44145 }, { "epoch": 13.698417623332299, "grad_norm": 3.577224016189575, "learning_rate": 2.73151925225069e-06, "loss": 0.2173, "num_input_tokens_seen": 26901664, "step": 44150 }, { "epoch": 13.699968973006516, "grad_norm": 4.135581016540527, "learning_rate": 2.7303128807133627e-06, "loss": 0.2279, "num_input_tokens_seen": 26904160, "step": 44155 }, { "epoch": 13.701520322680732, "grad_norm": 4.7157301902771, "learning_rate": 2.729106675570981e-06, "loss": 0.226, "num_input_tokens_seen": 26907488, "step": 44160 }, { "epoch": 13.703071672354948, "grad_norm": 4.208896160125732, "learning_rate": 2.7279006369119686e-06, "loss": 0.2417, "num_input_tokens_seen": 26910816, "step": 44165 }, { "epoch": 13.704623022029166, "grad_norm": 14.520755767822266, "learning_rate": 2.7266947648247477e-06, "loss": 0.2647, "num_input_tokens_seen": 26914240, "step": 44170 }, { "epoch": 13.706174371703382, "grad_norm": 6.494384288787842, "learning_rate": 2.725489059397719e-06, "loss": 0.2324, "num_input_tokens_seen": 26917088, "step": 44175 }, { "epoch": 13.7077257213776, "grad_norm": 2.3228299617767334, "learning_rate": 2.7242835207192752e-06, "loss": 0.1806, "num_input_tokens_seen": 26919360, "step": 44180 }, { "epoch": 13.709277071051815, "grad_norm": 4.8729023933410645, "learning_rate": 2.723078148877799e-06, "loss": 0.1965, "num_input_tokens_seen": 26922112, "step": 44185 }, { "epoch": 13.710828420726031, "grad_norm": 5.369515419006348, "learning_rate": 2.721872943961659e-06, "loss": 0.1859, "num_input_tokens_seen": 26924992, "step": 44190 }, { "epoch": 13.712379770400249, "grad_norm": 4.63510799407959, "learning_rate": 2.7206679060592066e-06, "loss": 0.2308, "num_input_tokens_seen": 26927648, "step": 44195 }, { "epoch": 13.713931120074465, "grad_norm": 8.608962059020996, "learning_rate": 2.719463035258791e-06, "loss": 0.241, "num_input_tokens_seen": 26930432, "step": 44200 }, { "epoch": 13.71548246974868, "grad_norm": 4.433343887329102, "learning_rate": 2.7182583316487375e-06, "loss": 0.2071, "num_input_tokens_seen": 26932672, "step": 44205 }, { "epoch": 13.717033819422898, "grad_norm": 2.2929654121398926, "learning_rate": 2.7170537953173693e-06, "loss": 0.1799, "num_input_tokens_seen": 26934976, "step": 44210 }, { "epoch": 13.718585169097114, "grad_norm": 5.019186973571777, "learning_rate": 2.715849426352993e-06, "loss": 0.1945, "num_input_tokens_seen": 26937056, "step": 44215 }, { "epoch": 13.720136518771332, "grad_norm": 7.46735954284668, "learning_rate": 2.7146452248439e-06, "loss": 0.1878, "num_input_tokens_seen": 26939680, "step": 44220 }, { "epoch": 13.721687868445548, "grad_norm": 6.2350382804870605, "learning_rate": 2.7134411908783777e-06, "loss": 0.2063, "num_input_tokens_seen": 26942944, "step": 44225 }, { "epoch": 13.723239218119764, "grad_norm": 3.7954163551330566, "learning_rate": 2.712237324544691e-06, "loss": 0.1576, "num_input_tokens_seen": 26945600, "step": 44230 }, { "epoch": 13.724790567793981, "grad_norm": 9.943763732910156, "learning_rate": 2.711033625931101e-06, "loss": 0.1818, "num_input_tokens_seen": 26948704, "step": 44235 }, { "epoch": 13.726341917468197, "grad_norm": 5.577219486236572, "learning_rate": 2.7098300951258495e-06, "loss": 0.2591, "num_input_tokens_seen": 26951168, "step": 44240 }, { "epoch": 13.727893267142413, "grad_norm": 9.226458549499512, "learning_rate": 2.7086267322171744e-06, "loss": 0.1837, "num_input_tokens_seen": 26954112, "step": 44245 }, { "epoch": 13.72944461681663, "grad_norm": 11.446762084960938, "learning_rate": 2.707423537293291e-06, "loss": 0.1928, "num_input_tokens_seen": 26957312, "step": 44250 }, { "epoch": 13.730995966490847, "grad_norm": 5.882700443267822, "learning_rate": 2.7062205104424126e-06, "loss": 0.2259, "num_input_tokens_seen": 26961312, "step": 44255 }, { "epoch": 13.732547316165064, "grad_norm": 6.62214994430542, "learning_rate": 2.7050176517527316e-06, "loss": 0.2461, "num_input_tokens_seen": 26964416, "step": 44260 }, { "epoch": 13.73409866583928, "grad_norm": 4.647995471954346, "learning_rate": 2.703814961312433e-06, "loss": 0.1995, "num_input_tokens_seen": 26967552, "step": 44265 }, { "epoch": 13.735650015513496, "grad_norm": 7.308901309967041, "learning_rate": 2.7026124392096907e-06, "loss": 0.198, "num_input_tokens_seen": 26969984, "step": 44270 }, { "epoch": 13.737201365187714, "grad_norm": 8.81281566619873, "learning_rate": 2.7014100855326598e-06, "loss": 0.1957, "num_input_tokens_seen": 26974048, "step": 44275 }, { "epoch": 13.73875271486193, "grad_norm": 5.258228778839111, "learning_rate": 2.7002079003694913e-06, "loss": 0.2135, "num_input_tokens_seen": 26977152, "step": 44280 }, { "epoch": 13.740304064536147, "grad_norm": 7.9654669761657715, "learning_rate": 2.699005883808315e-06, "loss": 0.1463, "num_input_tokens_seen": 26981376, "step": 44285 }, { "epoch": 13.741855414210363, "grad_norm": 4.137790203094482, "learning_rate": 2.697804035937257e-06, "loss": 0.1871, "num_input_tokens_seen": 26984064, "step": 44290 }, { "epoch": 13.743406763884579, "grad_norm": 2.54478120803833, "learning_rate": 2.696602356844424e-06, "loss": 0.193, "num_input_tokens_seen": 26986528, "step": 44295 }, { "epoch": 13.744958113558797, "grad_norm": 2.493941307067871, "learning_rate": 2.695400846617916e-06, "loss": 0.1921, "num_input_tokens_seen": 26989568, "step": 44300 }, { "epoch": 13.746509463233012, "grad_norm": 5.397199630737305, "learning_rate": 2.694199505345815e-06, "loss": 0.2882, "num_input_tokens_seen": 26991904, "step": 44305 }, { "epoch": 13.74806081290723, "grad_norm": 4.116835117340088, "learning_rate": 2.6929983331161956e-06, "loss": 0.1346, "num_input_tokens_seen": 26995104, "step": 44310 }, { "epoch": 13.749612162581446, "grad_norm": 5.764228343963623, "learning_rate": 2.691797330017117e-06, "loss": 0.184, "num_input_tokens_seen": 26997888, "step": 44315 }, { "epoch": 13.751163512255662, "grad_norm": 5.807845592498779, "learning_rate": 2.6905964961366282e-06, "loss": 0.1648, "num_input_tokens_seen": 27001408, "step": 44320 }, { "epoch": 13.75271486192988, "grad_norm": 10.412185668945312, "learning_rate": 2.689395831562762e-06, "loss": 0.1693, "num_input_tokens_seen": 27004064, "step": 44325 }, { "epoch": 13.754266211604095, "grad_norm": 7.785220623016357, "learning_rate": 2.6881953363835433e-06, "loss": 0.2152, "num_input_tokens_seen": 27006592, "step": 44330 }, { "epoch": 13.755817561278311, "grad_norm": 8.444267272949219, "learning_rate": 2.6869950106869846e-06, "loss": 0.2369, "num_input_tokens_seen": 27009536, "step": 44335 }, { "epoch": 13.757368910952529, "grad_norm": 9.26117992401123, "learning_rate": 2.6857948545610792e-06, "loss": 0.1931, "num_input_tokens_seen": 27012288, "step": 44340 }, { "epoch": 13.758920260626745, "grad_norm": 6.395172119140625, "learning_rate": 2.684594868093817e-06, "loss": 0.216, "num_input_tokens_seen": 27015712, "step": 44345 }, { "epoch": 13.760471610300963, "grad_norm": 3.2580149173736572, "learning_rate": 2.6833950513731684e-06, "loss": 0.1497, "num_input_tokens_seen": 27017696, "step": 44350 }, { "epoch": 13.762022959975178, "grad_norm": 16.02352523803711, "learning_rate": 2.6821954044870962e-06, "loss": 0.2078, "num_input_tokens_seen": 27022048, "step": 44355 }, { "epoch": 13.763574309649394, "grad_norm": 7.683810234069824, "learning_rate": 2.6809959275235464e-06, "loss": 0.2025, "num_input_tokens_seen": 27025280, "step": 44360 }, { "epoch": 13.765125659323612, "grad_norm": 10.868900299072266, "learning_rate": 2.679796620570458e-06, "loss": 0.2013, "num_input_tokens_seen": 27028576, "step": 44365 }, { "epoch": 13.766677008997828, "grad_norm": 4.253256320953369, "learning_rate": 2.6785974837157504e-06, "loss": 0.2075, "num_input_tokens_seen": 27031200, "step": 44370 }, { "epoch": 13.768228358672044, "grad_norm": 5.639481067657471, "learning_rate": 2.6773985170473394e-06, "loss": 0.2002, "num_input_tokens_seen": 27034080, "step": 44375 }, { "epoch": 13.769779708346261, "grad_norm": 4.185464382171631, "learning_rate": 2.676199720653118e-06, "loss": 0.1651, "num_input_tokens_seen": 27037792, "step": 44380 }, { "epoch": 13.771331058020477, "grad_norm": 9.525425910949707, "learning_rate": 2.6750010946209757e-06, "loss": 0.1785, "num_input_tokens_seen": 27041312, "step": 44385 }, { "epoch": 13.772882407694695, "grad_norm": 9.438898086547852, "learning_rate": 2.6738026390387874e-06, "loss": 0.2067, "num_input_tokens_seen": 27044288, "step": 44390 }, { "epoch": 13.77443375736891, "grad_norm": 3.722285270690918, "learning_rate": 2.672604353994409e-06, "loss": 0.2315, "num_input_tokens_seen": 27047168, "step": 44395 }, { "epoch": 13.775985107043127, "grad_norm": 3.1376798152923584, "learning_rate": 2.6714062395756947e-06, "loss": 0.3464, "num_input_tokens_seen": 27050496, "step": 44400 }, { "epoch": 13.777536456717344, "grad_norm": 3.7195332050323486, "learning_rate": 2.6702082958704754e-06, "loss": 0.2067, "num_input_tokens_seen": 27052896, "step": 44405 }, { "epoch": 13.77908780639156, "grad_norm": 4.37614631652832, "learning_rate": 2.6690105229665786e-06, "loss": 0.2035, "num_input_tokens_seen": 27055744, "step": 44410 }, { "epoch": 13.780639156065778, "grad_norm": 9.017646789550781, "learning_rate": 2.667812920951812e-06, "loss": 0.2244, "num_input_tokens_seen": 27058464, "step": 44415 }, { "epoch": 13.782190505739994, "grad_norm": 7.863735198974609, "learning_rate": 2.6666154899139775e-06, "loss": 0.2426, "num_input_tokens_seen": 27060896, "step": 44420 }, { "epoch": 13.78374185541421, "grad_norm": 1.9708806276321411, "learning_rate": 2.665418229940857e-06, "loss": 0.1981, "num_input_tokens_seen": 27063808, "step": 44425 }, { "epoch": 13.785293205088427, "grad_norm": 2.179600954055786, "learning_rate": 2.664221141120228e-06, "loss": 0.1722, "num_input_tokens_seen": 27066720, "step": 44430 }, { "epoch": 13.786844554762643, "grad_norm": 9.894171714782715, "learning_rate": 2.6630242235398463e-06, "loss": 0.1903, "num_input_tokens_seen": 27069536, "step": 44435 }, { "epoch": 13.788395904436861, "grad_norm": 6.243129730224609, "learning_rate": 2.6618274772874653e-06, "loss": 0.3417, "num_input_tokens_seen": 27073440, "step": 44440 }, { "epoch": 13.789947254111077, "grad_norm": 4.426679611206055, "learning_rate": 2.660630902450817e-06, "loss": 0.1759, "num_input_tokens_seen": 27077024, "step": 44445 }, { "epoch": 13.791498603785293, "grad_norm": 3.5672051906585693, "learning_rate": 2.659434499117625e-06, "loss": 0.2235, "num_input_tokens_seen": 27079232, "step": 44450 }, { "epoch": 13.79304995345951, "grad_norm": 20.361053466796875, "learning_rate": 2.658238267375603e-06, "loss": 0.2278, "num_input_tokens_seen": 27082976, "step": 44455 }, { "epoch": 13.794601303133726, "grad_norm": 2.7297496795654297, "learning_rate": 2.6570422073124447e-06, "loss": 0.1996, "num_input_tokens_seen": 27085312, "step": 44460 }, { "epoch": 13.796152652807942, "grad_norm": 9.738945960998535, "learning_rate": 2.655846319015839e-06, "loss": 0.2158, "num_input_tokens_seen": 27088288, "step": 44465 }, { "epoch": 13.79770400248216, "grad_norm": 10.522796630859375, "learning_rate": 2.654650602573455e-06, "loss": 0.2235, "num_input_tokens_seen": 27090720, "step": 44470 }, { "epoch": 13.799255352156376, "grad_norm": 4.573145389556885, "learning_rate": 2.653455058072958e-06, "loss": 0.2034, "num_input_tokens_seen": 27093952, "step": 44475 }, { "epoch": 13.800806701830593, "grad_norm": 3.2159736156463623, "learning_rate": 2.6522596856019895e-06, "loss": 0.2421, "num_input_tokens_seen": 27096544, "step": 44480 }, { "epoch": 13.80235805150481, "grad_norm": 2.9031717777252197, "learning_rate": 2.6510644852481893e-06, "loss": 0.1642, "num_input_tokens_seen": 27099424, "step": 44485 }, { "epoch": 13.803909401179025, "grad_norm": 3.748671054840088, "learning_rate": 2.649869457099177e-06, "loss": 0.2152, "num_input_tokens_seen": 27102016, "step": 44490 }, { "epoch": 13.805460750853243, "grad_norm": 9.054179191589355, "learning_rate": 2.6486746012425642e-06, "loss": 0.2315, "num_input_tokens_seen": 27105120, "step": 44495 }, { "epoch": 13.807012100527459, "grad_norm": 4.036186695098877, "learning_rate": 2.6474799177659447e-06, "loss": 0.1895, "num_input_tokens_seen": 27107616, "step": 44500 }, { "epoch": 13.808563450201675, "grad_norm": 6.4973554611206055, "learning_rate": 2.6462854067569067e-06, "loss": 0.228, "num_input_tokens_seen": 27110240, "step": 44505 }, { "epoch": 13.810114799875892, "grad_norm": 4.279429912567139, "learning_rate": 2.645091068303021e-06, "loss": 0.2182, "num_input_tokens_seen": 27113824, "step": 44510 }, { "epoch": 13.811666149550108, "grad_norm": 16.598474502563477, "learning_rate": 2.6438969024918447e-06, "loss": 0.2198, "num_input_tokens_seen": 27116608, "step": 44515 }, { "epoch": 13.813217499224326, "grad_norm": 4.835526943206787, "learning_rate": 2.6427029094109287e-06, "loss": 0.2306, "num_input_tokens_seen": 27119168, "step": 44520 }, { "epoch": 13.814768848898542, "grad_norm": 10.259556770324707, "learning_rate": 2.641509089147801e-06, "loss": 0.2612, "num_input_tokens_seen": 27122336, "step": 44525 }, { "epoch": 13.816320198572758, "grad_norm": 5.331175327301025, "learning_rate": 2.6403154417899886e-06, "loss": 0.231, "num_input_tokens_seen": 27125280, "step": 44530 }, { "epoch": 13.817871548246975, "grad_norm": 4.444779872894287, "learning_rate": 2.6391219674249946e-06, "loss": 0.272, "num_input_tokens_seen": 27129568, "step": 44535 }, { "epoch": 13.819422897921191, "grad_norm": 18.844684600830078, "learning_rate": 2.6379286661403193e-06, "loss": 0.2574, "num_input_tokens_seen": 27132192, "step": 44540 }, { "epoch": 13.820974247595409, "grad_norm": 13.236001014709473, "learning_rate": 2.636735538023442e-06, "loss": 0.2023, "num_input_tokens_seen": 27135616, "step": 44545 }, { "epoch": 13.822525597269625, "grad_norm": 5.6205010414123535, "learning_rate": 2.6355425831618375e-06, "loss": 0.2298, "num_input_tokens_seen": 27139008, "step": 44550 }, { "epoch": 13.82407694694384, "grad_norm": 4.036271095275879, "learning_rate": 2.634349801642958e-06, "loss": 0.1984, "num_input_tokens_seen": 27142656, "step": 44555 }, { "epoch": 13.825628296618058, "grad_norm": 2.037461042404175, "learning_rate": 2.6331571935542544e-06, "loss": 0.2462, "num_input_tokens_seen": 27145056, "step": 44560 }, { "epoch": 13.827179646292274, "grad_norm": 3.4829750061035156, "learning_rate": 2.6319647589831543e-06, "loss": 0.1917, "num_input_tokens_seen": 27147584, "step": 44565 }, { "epoch": 13.828730995966492, "grad_norm": 10.983202934265137, "learning_rate": 2.6307724980170786e-06, "loss": 0.2333, "num_input_tokens_seen": 27151456, "step": 44570 }, { "epoch": 13.830282345640708, "grad_norm": 4.38406229019165, "learning_rate": 2.6295804107434362e-06, "loss": 0.2216, "num_input_tokens_seen": 27154272, "step": 44575 }, { "epoch": 13.831833695314923, "grad_norm": 2.842289447784424, "learning_rate": 2.6283884972496187e-06, "loss": 0.207, "num_input_tokens_seen": 27156256, "step": 44580 }, { "epoch": 13.833385044989141, "grad_norm": 5.631694793701172, "learning_rate": 2.6271967576230097e-06, "loss": 0.1772, "num_input_tokens_seen": 27159136, "step": 44585 }, { "epoch": 13.834936394663357, "grad_norm": 5.524141311645508, "learning_rate": 2.6260051919509747e-06, "loss": 0.1989, "num_input_tokens_seen": 27162080, "step": 44590 }, { "epoch": 13.836487744337573, "grad_norm": 6.337265491485596, "learning_rate": 2.6248138003208734e-06, "loss": 0.2023, "num_input_tokens_seen": 27164928, "step": 44595 }, { "epoch": 13.83803909401179, "grad_norm": 8.0065279006958, "learning_rate": 2.6236225828200458e-06, "loss": 0.1978, "num_input_tokens_seen": 27168128, "step": 44600 }, { "epoch": 13.839590443686006, "grad_norm": 4.549407958984375, "learning_rate": 2.622431539535824e-06, "loss": 0.1734, "num_input_tokens_seen": 27171072, "step": 44605 }, { "epoch": 13.841141793360224, "grad_norm": 5.644340515136719, "learning_rate": 2.621240670555524e-06, "loss": 0.2132, "num_input_tokens_seen": 27174592, "step": 44610 }, { "epoch": 13.84269314303444, "grad_norm": 6.390223979949951, "learning_rate": 2.620049975966453e-06, "loss": 0.2024, "num_input_tokens_seen": 27177728, "step": 44615 }, { "epoch": 13.844244492708656, "grad_norm": 8.536504745483398, "learning_rate": 2.6188594558559e-06, "loss": 0.2038, "num_input_tokens_seen": 27180352, "step": 44620 }, { "epoch": 13.845795842382874, "grad_norm": 5.736030101776123, "learning_rate": 2.617669110311145e-06, "loss": 0.2678, "num_input_tokens_seen": 27182976, "step": 44625 }, { "epoch": 13.84734719205709, "grad_norm": 5.253821849822998, "learning_rate": 2.6164789394194577e-06, "loss": 0.1959, "num_input_tokens_seen": 27185504, "step": 44630 }, { "epoch": 13.848898541731305, "grad_norm": 5.436068058013916, "learning_rate": 2.6152889432680876e-06, "loss": 0.1974, "num_input_tokens_seen": 27192224, "step": 44635 }, { "epoch": 13.850449891405523, "grad_norm": 8.357544898986816, "learning_rate": 2.614099121944279e-06, "loss": 0.2501, "num_input_tokens_seen": 27195776, "step": 44640 }, { "epoch": 13.852001241079739, "grad_norm": 3.2600347995758057, "learning_rate": 2.612909475535256e-06, "loss": 0.2124, "num_input_tokens_seen": 27198304, "step": 44645 }, { "epoch": 13.853552590753957, "grad_norm": 2.8725147247314453, "learning_rate": 2.6117200041282375e-06, "loss": 0.2093, "num_input_tokens_seen": 27201312, "step": 44650 }, { "epoch": 13.855103940428172, "grad_norm": 4.006941795349121, "learning_rate": 2.6105307078104233e-06, "loss": 0.216, "num_input_tokens_seen": 27203200, "step": 44655 }, { "epoch": 13.856655290102388, "grad_norm": 12.738641738891602, "learning_rate": 2.609341586669005e-06, "loss": 0.2045, "num_input_tokens_seen": 27206368, "step": 44660 }, { "epoch": 13.858206639776606, "grad_norm": 4.159834384918213, "learning_rate": 2.6081526407911555e-06, "loss": 0.2122, "num_input_tokens_seen": 27209120, "step": 44665 }, { "epoch": 13.859757989450822, "grad_norm": 5.821053981781006, "learning_rate": 2.6069638702640437e-06, "loss": 0.2018, "num_input_tokens_seen": 27211776, "step": 44670 }, { "epoch": 13.86130933912504, "grad_norm": 4.187216758728027, "learning_rate": 2.6057752751748156e-06, "loss": 0.1732, "num_input_tokens_seen": 27214912, "step": 44675 }, { "epoch": 13.862860688799255, "grad_norm": 7.480823993682861, "learning_rate": 2.6045868556106145e-06, "loss": 0.2329, "num_input_tokens_seen": 27218048, "step": 44680 }, { "epoch": 13.864412038473471, "grad_norm": 3.0369250774383545, "learning_rate": 2.60339861165856e-06, "loss": 0.1926, "num_input_tokens_seen": 27221728, "step": 44685 }, { "epoch": 13.865963388147689, "grad_norm": 2.1335113048553467, "learning_rate": 2.602210543405768e-06, "loss": 0.2052, "num_input_tokens_seen": 27224512, "step": 44690 }, { "epoch": 13.867514737821905, "grad_norm": 5.227756500244141, "learning_rate": 2.6010226509393387e-06, "loss": 0.2413, "num_input_tokens_seen": 27227104, "step": 44695 }, { "epoch": 13.869066087496122, "grad_norm": 4.3632493019104, "learning_rate": 2.599834934346355e-06, "loss": 0.2429, "num_input_tokens_seen": 27230720, "step": 44700 }, { "epoch": 13.870617437170338, "grad_norm": 5.785446643829346, "learning_rate": 2.5986473937138957e-06, "loss": 0.215, "num_input_tokens_seen": 27234432, "step": 44705 }, { "epoch": 13.872168786844554, "grad_norm": 3.3213934898376465, "learning_rate": 2.5974600291290157e-06, "loss": 0.2195, "num_input_tokens_seen": 27236512, "step": 44710 }, { "epoch": 13.873720136518772, "grad_norm": 4.226925849914551, "learning_rate": 2.5962728406787683e-06, "loss": 0.2203, "num_input_tokens_seen": 27239488, "step": 44715 }, { "epoch": 13.875271486192988, "grad_norm": 8.359471321105957, "learning_rate": 2.5950858284501847e-06, "loss": 0.1988, "num_input_tokens_seen": 27243840, "step": 44720 }, { "epoch": 13.876822835867204, "grad_norm": 6.861291408538818, "learning_rate": 2.5938989925302892e-06, "loss": 0.2306, "num_input_tokens_seen": 27248288, "step": 44725 }, { "epoch": 13.878374185541421, "grad_norm": 4.681954383850098, "learning_rate": 2.592712333006089e-06, "loss": 0.1983, "num_input_tokens_seen": 27250944, "step": 44730 }, { "epoch": 13.879925535215637, "grad_norm": 4.124634742736816, "learning_rate": 2.591525849964583e-06, "loss": 0.2057, "num_input_tokens_seen": 27254048, "step": 44735 }, { "epoch": 13.881476884889855, "grad_norm": 7.541003704071045, "learning_rate": 2.5903395434927504e-06, "loss": 0.2199, "num_input_tokens_seen": 27257504, "step": 44740 }, { "epoch": 13.88302823456407, "grad_norm": 5.3332695960998535, "learning_rate": 2.589153413677564e-06, "loss": 0.2258, "num_input_tokens_seen": 27261248, "step": 44745 }, { "epoch": 13.884579584238287, "grad_norm": 3.2053656578063965, "learning_rate": 2.587967460605984e-06, "loss": 0.1607, "num_input_tokens_seen": 27264032, "step": 44750 }, { "epoch": 13.886130933912504, "grad_norm": 6.538039684295654, "learning_rate": 2.5867816843649492e-06, "loss": 0.2294, "num_input_tokens_seen": 27267936, "step": 44755 }, { "epoch": 13.88768228358672, "grad_norm": 6.721709251403809, "learning_rate": 2.5855960850413936e-06, "loss": 0.2117, "num_input_tokens_seen": 27270656, "step": 44760 }, { "epoch": 13.889233633260936, "grad_norm": 4.215181827545166, "learning_rate": 2.5844106627222376e-06, "loss": 0.1881, "num_input_tokens_seen": 27273472, "step": 44765 }, { "epoch": 13.890784982935154, "grad_norm": 5.06752347946167, "learning_rate": 2.5832254174943838e-06, "loss": 0.2726, "num_input_tokens_seen": 27276320, "step": 44770 }, { "epoch": 13.89233633260937, "grad_norm": 7.768869876861572, "learning_rate": 2.5820403494447255e-06, "loss": 0.1808, "num_input_tokens_seen": 27280384, "step": 44775 }, { "epoch": 13.893887682283587, "grad_norm": 5.148759365081787, "learning_rate": 2.5808554586601437e-06, "loss": 0.2112, "num_input_tokens_seen": 27283040, "step": 44780 }, { "epoch": 13.895439031957803, "grad_norm": 12.123834609985352, "learning_rate": 2.5796707452275026e-06, "loss": 0.2586, "num_input_tokens_seen": 27285888, "step": 44785 }, { "epoch": 13.896990381632019, "grad_norm": 4.544934272766113, "learning_rate": 2.578486209233658e-06, "loss": 0.2175, "num_input_tokens_seen": 27290016, "step": 44790 }, { "epoch": 13.898541731306237, "grad_norm": 4.031675338745117, "learning_rate": 2.577301850765448e-06, "loss": 0.2715, "num_input_tokens_seen": 27292576, "step": 44795 }, { "epoch": 13.900093080980453, "grad_norm": 4.746832847595215, "learning_rate": 2.5761176699097035e-06, "loss": 0.2339, "num_input_tokens_seen": 27295072, "step": 44800 }, { "epoch": 13.90164443065467, "grad_norm": 6.471000671386719, "learning_rate": 2.5749336667532343e-06, "loss": 0.1936, "num_input_tokens_seen": 27297472, "step": 44805 }, { "epoch": 13.903195780328886, "grad_norm": 3.245091438293457, "learning_rate": 2.5737498413828465e-06, "loss": 0.1742, "num_input_tokens_seen": 27300032, "step": 44810 }, { "epoch": 13.904747130003102, "grad_norm": 3.590848445892334, "learning_rate": 2.572566193885324e-06, "loss": 0.1838, "num_input_tokens_seen": 27304704, "step": 44815 }, { "epoch": 13.90629847967732, "grad_norm": 4.568096160888672, "learning_rate": 2.5713827243474475e-06, "loss": 0.2684, "num_input_tokens_seen": 27307872, "step": 44820 }, { "epoch": 13.907849829351536, "grad_norm": 5.316540241241455, "learning_rate": 2.5701994328559743e-06, "loss": 0.2332, "num_input_tokens_seen": 27310272, "step": 44825 }, { "epoch": 13.909401179025753, "grad_norm": 2.359739065170288, "learning_rate": 2.5690163194976576e-06, "loss": 0.1472, "num_input_tokens_seen": 27313344, "step": 44830 }, { "epoch": 13.91095252869997, "grad_norm": 6.734898090362549, "learning_rate": 2.5678333843592294e-06, "loss": 0.2194, "num_input_tokens_seen": 27315840, "step": 44835 }, { "epoch": 13.912503878374185, "grad_norm": 12.580954551696777, "learning_rate": 2.566650627527416e-06, "loss": 0.2462, "num_input_tokens_seen": 27318816, "step": 44840 }, { "epoch": 13.914055228048403, "grad_norm": 11.41434097290039, "learning_rate": 2.565468049088928e-06, "loss": 0.2094, "num_input_tokens_seen": 27323136, "step": 44845 }, { "epoch": 13.915606577722619, "grad_norm": 5.286488056182861, "learning_rate": 2.56428564913046e-06, "loss": 0.1967, "num_input_tokens_seen": 27325568, "step": 44850 }, { "epoch": 13.917157927396834, "grad_norm": 9.355755805969238, "learning_rate": 2.563103427738699e-06, "loss": 0.2327, "num_input_tokens_seen": 27328704, "step": 44855 }, { "epoch": 13.918709277071052, "grad_norm": 2.4832675457000732, "learning_rate": 2.5619213850003117e-06, "loss": 0.1908, "num_input_tokens_seen": 27331264, "step": 44860 }, { "epoch": 13.920260626745268, "grad_norm": 4.402151584625244, "learning_rate": 2.5607395210019605e-06, "loss": 0.1819, "num_input_tokens_seen": 27334464, "step": 44865 }, { "epoch": 13.921811976419486, "grad_norm": 1.8307759761810303, "learning_rate": 2.5595578358302846e-06, "loss": 0.2322, "num_input_tokens_seen": 27338432, "step": 44870 }, { "epoch": 13.923363326093702, "grad_norm": 4.159381866455078, "learning_rate": 2.5583763295719212e-06, "loss": 0.2066, "num_input_tokens_seen": 27340960, "step": 44875 }, { "epoch": 13.924914675767917, "grad_norm": 6.037288665771484, "learning_rate": 2.557195002313484e-06, "loss": 0.2046, "num_input_tokens_seen": 27343552, "step": 44880 }, { "epoch": 13.926466025442135, "grad_norm": 2.7335402965545654, "learning_rate": 2.5560138541415814e-06, "loss": 0.1932, "num_input_tokens_seen": 27347040, "step": 44885 }, { "epoch": 13.928017375116351, "grad_norm": 4.775529384613037, "learning_rate": 2.5548328851428032e-06, "loss": 0.2046, "num_input_tokens_seen": 27350976, "step": 44890 }, { "epoch": 13.929568724790569, "grad_norm": 4.877234935760498, "learning_rate": 2.5536520954037295e-06, "loss": 0.1881, "num_input_tokens_seen": 27355232, "step": 44895 }, { "epoch": 13.931120074464785, "grad_norm": 4.76680326461792, "learning_rate": 2.5524714850109288e-06, "loss": 0.1853, "num_input_tokens_seen": 27358720, "step": 44900 }, { "epoch": 13.932671424139, "grad_norm": 7.665160179138184, "learning_rate": 2.551291054050948e-06, "loss": 0.2698, "num_input_tokens_seen": 27363104, "step": 44905 }, { "epoch": 13.934222773813218, "grad_norm": 7.709256172180176, "learning_rate": 2.5501108026103334e-06, "loss": 0.2571, "num_input_tokens_seen": 27365728, "step": 44910 }, { "epoch": 13.935774123487434, "grad_norm": 7.178210735321045, "learning_rate": 2.548930730775605e-06, "loss": 0.1846, "num_input_tokens_seen": 27369312, "step": 44915 }, { "epoch": 13.93732547316165, "grad_norm": 6.81754732131958, "learning_rate": 2.547750838633282e-06, "loss": 0.1667, "num_input_tokens_seen": 27372736, "step": 44920 }, { "epoch": 13.938876822835867, "grad_norm": 3.3685896396636963, "learning_rate": 2.5465711262698587e-06, "loss": 0.2421, "num_input_tokens_seen": 27376320, "step": 44925 }, { "epoch": 13.940428172510083, "grad_norm": 2.9785399436950684, "learning_rate": 2.545391593771827e-06, "loss": 0.1577, "num_input_tokens_seen": 27379008, "step": 44930 }, { "epoch": 13.941979522184301, "grad_norm": 4.575586795806885, "learning_rate": 2.5442122412256563e-06, "loss": 0.1885, "num_input_tokens_seen": 27381856, "step": 44935 }, { "epoch": 13.943530871858517, "grad_norm": 5.118680953979492, "learning_rate": 2.543033068717812e-06, "loss": 0.2231, "num_input_tokens_seen": 27385024, "step": 44940 }, { "epoch": 13.945082221532733, "grad_norm": 5.85064697265625, "learning_rate": 2.5418540763347356e-06, "loss": 0.2337, "num_input_tokens_seen": 27387392, "step": 44945 }, { "epoch": 13.94663357120695, "grad_norm": 3.250203847885132, "learning_rate": 2.5406752641628664e-06, "loss": 0.1918, "num_input_tokens_seen": 27390528, "step": 44950 }, { "epoch": 13.948184920881166, "grad_norm": 5.232473373413086, "learning_rate": 2.5394966322886215e-06, "loss": 0.1908, "num_input_tokens_seen": 27396448, "step": 44955 }, { "epoch": 13.949736270555384, "grad_norm": 6.50579833984375, "learning_rate": 2.5383181807984097e-06, "loss": 0.2317, "num_input_tokens_seen": 27398912, "step": 44960 }, { "epoch": 13.9512876202296, "grad_norm": 4.0344014167785645, "learning_rate": 2.5371399097786283e-06, "loss": 0.1953, "num_input_tokens_seen": 27402784, "step": 44965 }, { "epoch": 13.952838969903816, "grad_norm": 3.9675586223602295, "learning_rate": 2.5359618193156536e-06, "loss": 0.1994, "num_input_tokens_seen": 27406496, "step": 44970 }, { "epoch": 13.954390319578033, "grad_norm": 3.6719517707824707, "learning_rate": 2.534783909495859e-06, "loss": 0.2322, "num_input_tokens_seen": 27409280, "step": 44975 }, { "epoch": 13.95594166925225, "grad_norm": 5.66693639755249, "learning_rate": 2.5336061804055934e-06, "loss": 0.1814, "num_input_tokens_seen": 27412128, "step": 44980 }, { "epoch": 13.957493018926465, "grad_norm": 3.163191318511963, "learning_rate": 2.5324286321312043e-06, "loss": 0.1445, "num_input_tokens_seen": 27416160, "step": 44985 }, { "epoch": 13.959044368600683, "grad_norm": 3.97978138923645, "learning_rate": 2.5312512647590136e-06, "loss": 0.1657, "num_input_tokens_seen": 27421184, "step": 44990 }, { "epoch": 13.960595718274899, "grad_norm": 11.946145057678223, "learning_rate": 2.5300740783753427e-06, "loss": 0.181, "num_input_tokens_seen": 27424096, "step": 44995 }, { "epoch": 13.962147067949116, "grad_norm": 3.9590260982513428, "learning_rate": 2.5288970730664873e-06, "loss": 0.1815, "num_input_tokens_seen": 27426752, "step": 45000 }, { "epoch": 13.963698417623332, "grad_norm": 5.260643005371094, "learning_rate": 2.527720248918741e-06, "loss": 0.2311, "num_input_tokens_seen": 27429152, "step": 45005 }, { "epoch": 13.965249767297548, "grad_norm": 7.858834266662598, "learning_rate": 2.526543606018375e-06, "loss": 0.2007, "num_input_tokens_seen": 27432608, "step": 45010 }, { "epoch": 13.966801116971766, "grad_norm": 4.091526985168457, "learning_rate": 2.5253671444516526e-06, "loss": 0.2608, "num_input_tokens_seen": 27434976, "step": 45015 }, { "epoch": 13.968352466645982, "grad_norm": 3.4290995597839355, "learning_rate": 2.524190864304824e-06, "loss": 0.2092, "num_input_tokens_seen": 27437696, "step": 45020 }, { "epoch": 13.9699038163202, "grad_norm": 4.063498020172119, "learning_rate": 2.523014765664122e-06, "loss": 0.1493, "num_input_tokens_seen": 27441280, "step": 45025 }, { "epoch": 13.971455165994415, "grad_norm": 10.682255744934082, "learning_rate": 2.521838848615771e-06, "loss": 0.3052, "num_input_tokens_seen": 27444128, "step": 45030 }, { "epoch": 13.973006515668631, "grad_norm": 5.1994099617004395, "learning_rate": 2.5206631132459756e-06, "loss": 0.1736, "num_input_tokens_seen": 27447040, "step": 45035 }, { "epoch": 13.974557865342849, "grad_norm": 4.474007606506348, "learning_rate": 2.5194875596409368e-06, "loss": 0.18, "num_input_tokens_seen": 27449696, "step": 45040 }, { "epoch": 13.976109215017065, "grad_norm": 9.493388175964355, "learning_rate": 2.518312187886831e-06, "loss": 0.2463, "num_input_tokens_seen": 27452480, "step": 45045 }, { "epoch": 13.97766056469128, "grad_norm": 2.7288401126861572, "learning_rate": 2.5171369980698313e-06, "loss": 0.1692, "num_input_tokens_seen": 27455360, "step": 45050 }, { "epoch": 13.979211914365498, "grad_norm": 4.425580024719238, "learning_rate": 2.5159619902760897e-06, "loss": 0.1979, "num_input_tokens_seen": 27458240, "step": 45055 }, { "epoch": 13.980763264039714, "grad_norm": 8.03256893157959, "learning_rate": 2.514787164591751e-06, "loss": 0.2397, "num_input_tokens_seen": 27461184, "step": 45060 }, { "epoch": 13.982314613713932, "grad_norm": 5.402874946594238, "learning_rate": 2.513612521102941e-06, "loss": 0.2215, "num_input_tokens_seen": 27463616, "step": 45065 }, { "epoch": 13.983865963388148, "grad_norm": 6.6332316398620605, "learning_rate": 2.512438059895778e-06, "loss": 0.1703, "num_input_tokens_seen": 27466240, "step": 45070 }, { "epoch": 13.985417313062364, "grad_norm": 5.2669477462768555, "learning_rate": 2.5112637810563605e-06, "loss": 0.2031, "num_input_tokens_seen": 27468640, "step": 45075 }, { "epoch": 13.986968662736581, "grad_norm": 5.875992298126221, "learning_rate": 2.510089684670779e-06, "loss": 0.1812, "num_input_tokens_seen": 27472864, "step": 45080 }, { "epoch": 13.988520012410797, "grad_norm": 4.03463077545166, "learning_rate": 2.5089157708251105e-06, "loss": 0.1781, "num_input_tokens_seen": 27475744, "step": 45085 }, { "epoch": 13.990071362085015, "grad_norm": 5.5654754638671875, "learning_rate": 2.5077420396054133e-06, "loss": 0.2188, "num_input_tokens_seen": 27477728, "step": 45090 }, { "epoch": 13.99162271175923, "grad_norm": 4.6832051277160645, "learning_rate": 2.5065684910977383e-06, "loss": 0.1975, "num_input_tokens_seen": 27480832, "step": 45095 }, { "epoch": 13.993174061433447, "grad_norm": 5.687803745269775, "learning_rate": 2.5053951253881183e-06, "loss": 0.1998, "num_input_tokens_seen": 27486016, "step": 45100 }, { "epoch": 13.994725411107664, "grad_norm": 3.3630967140197754, "learning_rate": 2.504221942562578e-06, "loss": 0.3088, "num_input_tokens_seen": 27488256, "step": 45105 }, { "epoch": 13.99627676078188, "grad_norm": 10.968807220458984, "learning_rate": 2.503048942707121e-06, "loss": 0.2112, "num_input_tokens_seen": 27491840, "step": 45110 }, { "epoch": 13.997828110456096, "grad_norm": 3.3503944873809814, "learning_rate": 2.5018761259077485e-06, "loss": 0.1966, "num_input_tokens_seen": 27494848, "step": 45115 }, { "epoch": 13.999379460130314, "grad_norm": 4.62678337097168, "learning_rate": 2.5007034922504346e-06, "loss": 0.2071, "num_input_tokens_seen": 27497120, "step": 45120 }, { "epoch": 14.0, "eval_loss": 0.2693102955818176, "eval_runtime": 34.4628, "eval_samples_per_second": 93.521, "eval_steps_per_second": 23.388, "num_input_tokens_seen": 27497648, "step": 45122 }, { "epoch": 14.00093080980453, "grad_norm": 6.424166202545166, "learning_rate": 2.4995310418211538e-06, "loss": 0.1561, "num_input_tokens_seen": 27499440, "step": 45125 }, { "epoch": 14.002482159478747, "grad_norm": 2.932443141937256, "learning_rate": 2.4983587747058553e-06, "loss": 0.1769, "num_input_tokens_seen": 27502448, "step": 45130 }, { "epoch": 14.004033509152963, "grad_norm": 6.339284896850586, "learning_rate": 2.4971866909904824e-06, "loss": 0.2093, "num_input_tokens_seen": 27504944, "step": 45135 }, { "epoch": 14.005584858827179, "grad_norm": 15.401819229125977, "learning_rate": 2.496014790760965e-06, "loss": 0.1591, "num_input_tokens_seen": 27508496, "step": 45140 }, { "epoch": 14.007136208501397, "grad_norm": 4.311582088470459, "learning_rate": 2.4948430741032127e-06, "loss": 0.1957, "num_input_tokens_seen": 27510640, "step": 45145 }, { "epoch": 14.008687558175613, "grad_norm": 6.281970024108887, "learning_rate": 2.493671541103131e-06, "loss": 0.2088, "num_input_tokens_seen": 27514192, "step": 45150 }, { "epoch": 14.01023890784983, "grad_norm": 5.425478935241699, "learning_rate": 2.4925001918466025e-06, "loss": 0.2, "num_input_tokens_seen": 27516944, "step": 45155 }, { "epoch": 14.011790257524046, "grad_norm": 6.383081436157227, "learning_rate": 2.491329026419505e-06, "loss": 0.1922, "num_input_tokens_seen": 27519632, "step": 45160 }, { "epoch": 14.013341607198262, "grad_norm": 6.4600396156311035, "learning_rate": 2.490158044907695e-06, "loss": 0.2171, "num_input_tokens_seen": 27522576, "step": 45165 }, { "epoch": 14.01489295687248, "grad_norm": 4.6187214851379395, "learning_rate": 2.488987247397023e-06, "loss": 0.1576, "num_input_tokens_seen": 27524880, "step": 45170 }, { "epoch": 14.016444306546695, "grad_norm": 4.625185966491699, "learning_rate": 2.4878166339733194e-06, "loss": 0.1974, "num_input_tokens_seen": 27527408, "step": 45175 }, { "epoch": 14.017995656220911, "grad_norm": 2.9332404136657715, "learning_rate": 2.4866462047224064e-06, "loss": 0.1309, "num_input_tokens_seen": 27530064, "step": 45180 }, { "epoch": 14.019547005895129, "grad_norm": 5.1265363693237305, "learning_rate": 2.485475959730088e-06, "loss": 0.1515, "num_input_tokens_seen": 27532656, "step": 45185 }, { "epoch": 14.021098355569345, "grad_norm": 7.964199542999268, "learning_rate": 2.4843058990821596e-06, "loss": 0.172, "num_input_tokens_seen": 27535888, "step": 45190 }, { "epoch": 14.022649705243563, "grad_norm": 1.5721685886383057, "learning_rate": 2.4831360228643976e-06, "loss": 0.1779, "num_input_tokens_seen": 27539120, "step": 45195 }, { "epoch": 14.024201054917778, "grad_norm": 5.619290351867676, "learning_rate": 2.4819663311625686e-06, "loss": 0.1885, "num_input_tokens_seen": 27541744, "step": 45200 }, { "epoch": 14.025752404591994, "grad_norm": 7.023716449737549, "learning_rate": 2.4807968240624275e-06, "loss": 0.1985, "num_input_tokens_seen": 27544400, "step": 45205 }, { "epoch": 14.027303754266212, "grad_norm": 3.3485310077667236, "learning_rate": 2.4796275016497095e-06, "loss": 0.1942, "num_input_tokens_seen": 27547056, "step": 45210 }, { "epoch": 14.028855103940428, "grad_norm": 4.8435540199279785, "learning_rate": 2.4784583640101435e-06, "loss": 0.1389, "num_input_tokens_seen": 27550128, "step": 45215 }, { "epoch": 14.030406453614646, "grad_norm": 6.12144660949707, "learning_rate": 2.477289411229436e-06, "loss": 0.1727, "num_input_tokens_seen": 27552688, "step": 45220 }, { "epoch": 14.031957803288861, "grad_norm": 5.835507392883301, "learning_rate": 2.476120643393291e-06, "loss": 0.1422, "num_input_tokens_seen": 27555952, "step": 45225 }, { "epoch": 14.033509152963077, "grad_norm": 6.302326202392578, "learning_rate": 2.474952060587387e-06, "loss": 0.206, "num_input_tokens_seen": 27559632, "step": 45230 }, { "epoch": 14.035060502637295, "grad_norm": 7.125744819641113, "learning_rate": 2.473783662897401e-06, "loss": 0.202, "num_input_tokens_seen": 27562448, "step": 45235 }, { "epoch": 14.03661185231151, "grad_norm": 6.787424564361572, "learning_rate": 2.472615450408985e-06, "loss": 0.1604, "num_input_tokens_seen": 27565552, "step": 45240 }, { "epoch": 14.038163201985727, "grad_norm": 4.40067720413208, "learning_rate": 2.4714474232077873e-06, "loss": 0.2465, "num_input_tokens_seen": 27567952, "step": 45245 }, { "epoch": 14.039714551659944, "grad_norm": 10.615056991577148, "learning_rate": 2.4702795813794337e-06, "loss": 0.1727, "num_input_tokens_seen": 27570992, "step": 45250 }, { "epoch": 14.04126590133416, "grad_norm": 6.62509822845459, "learning_rate": 2.4691119250095437e-06, "loss": 0.2494, "num_input_tokens_seen": 27573648, "step": 45255 }, { "epoch": 14.042817251008378, "grad_norm": 9.231042861938477, "learning_rate": 2.4679444541837213e-06, "loss": 0.1734, "num_input_tokens_seen": 27577168, "step": 45260 }, { "epoch": 14.044368600682594, "grad_norm": 6.6259446144104, "learning_rate": 2.4667771689875523e-06, "loss": 0.2539, "num_input_tokens_seen": 27579536, "step": 45265 }, { "epoch": 14.04591995035681, "grad_norm": 2.2303571701049805, "learning_rate": 2.465610069506617e-06, "loss": 0.197, "num_input_tokens_seen": 27582640, "step": 45270 }, { "epoch": 14.047471300031027, "grad_norm": 3.2112772464752197, "learning_rate": 2.4644431558264738e-06, "loss": 0.134, "num_input_tokens_seen": 27585680, "step": 45275 }, { "epoch": 14.049022649705243, "grad_norm": 3.811225414276123, "learning_rate": 2.4632764280326737e-06, "loss": 0.142, "num_input_tokens_seen": 27589040, "step": 45280 }, { "epoch": 14.050573999379461, "grad_norm": 11.531603813171387, "learning_rate": 2.4621098862107496e-06, "loss": 0.2248, "num_input_tokens_seen": 27592688, "step": 45285 }, { "epoch": 14.052125349053677, "grad_norm": 8.258041381835938, "learning_rate": 2.460943530446225e-06, "loss": 0.1908, "num_input_tokens_seen": 27595312, "step": 45290 }, { "epoch": 14.053676698727893, "grad_norm": 8.427811622619629, "learning_rate": 2.459777360824606e-06, "loss": 0.1837, "num_input_tokens_seen": 27598096, "step": 45295 }, { "epoch": 14.05522804840211, "grad_norm": 5.968338489532471, "learning_rate": 2.458611377431388e-06, "loss": 0.2102, "num_input_tokens_seen": 27601392, "step": 45300 }, { "epoch": 14.056779398076326, "grad_norm": 14.934571266174316, "learning_rate": 2.4574455803520486e-06, "loss": 0.1363, "num_input_tokens_seen": 27604432, "step": 45305 }, { "epoch": 14.058330747750542, "grad_norm": 4.7103729248046875, "learning_rate": 2.456279969672059e-06, "loss": 0.157, "num_input_tokens_seen": 27606960, "step": 45310 }, { "epoch": 14.05988209742476, "grad_norm": 5.464115142822266, "learning_rate": 2.455114545476868e-06, "loss": 0.2234, "num_input_tokens_seen": 27610192, "step": 45315 }, { "epoch": 14.061433447098976, "grad_norm": 4.809729099273682, "learning_rate": 2.4539493078519163e-06, "loss": 0.2314, "num_input_tokens_seen": 27612304, "step": 45320 }, { "epoch": 14.062984796773193, "grad_norm": 8.454761505126953, "learning_rate": 2.4527842568826317e-06, "loss": 0.1784, "num_input_tokens_seen": 27615920, "step": 45325 }, { "epoch": 14.06453614644741, "grad_norm": 7.783824443817139, "learning_rate": 2.451619392654423e-06, "loss": 0.1541, "num_input_tokens_seen": 27620336, "step": 45330 }, { "epoch": 14.066087496121625, "grad_norm": 5.835822582244873, "learning_rate": 2.4504547152526905e-06, "loss": 0.1888, "num_input_tokens_seen": 27623312, "step": 45335 }, { "epoch": 14.067638845795843, "grad_norm": 5.348264694213867, "learning_rate": 2.449290224762818e-06, "loss": 0.1525, "num_input_tokens_seen": 27626480, "step": 45340 }, { "epoch": 14.069190195470059, "grad_norm": 2.6625142097473145, "learning_rate": 2.448125921270179e-06, "loss": 0.195, "num_input_tokens_seen": 27630128, "step": 45345 }, { "epoch": 14.070741545144276, "grad_norm": 5.427873134613037, "learning_rate": 2.4469618048601268e-06, "loss": 0.1851, "num_input_tokens_seen": 27633328, "step": 45350 }, { "epoch": 14.072292894818492, "grad_norm": 5.696822643280029, "learning_rate": 2.4457978756180088e-06, "loss": 0.1678, "num_input_tokens_seen": 27636336, "step": 45355 }, { "epoch": 14.073844244492708, "grad_norm": 12.316914558410645, "learning_rate": 2.4446341336291514e-06, "loss": 0.2018, "num_input_tokens_seen": 27638608, "step": 45360 }, { "epoch": 14.075395594166926, "grad_norm": 12.481403350830078, "learning_rate": 2.4434705789788734e-06, "loss": 0.1787, "num_input_tokens_seen": 27641648, "step": 45365 }, { "epoch": 14.076946943841142, "grad_norm": 7.281675815582275, "learning_rate": 2.442307211752474e-06, "loss": 0.2617, "num_input_tokens_seen": 27644336, "step": 45370 }, { "epoch": 14.078498293515358, "grad_norm": 9.233019828796387, "learning_rate": 2.4411440320352453e-06, "loss": 0.1648, "num_input_tokens_seen": 27647952, "step": 45375 }, { "epoch": 14.080049643189575, "grad_norm": 6.274461269378662, "learning_rate": 2.4399810399124585e-06, "loss": 0.1937, "num_input_tokens_seen": 27650352, "step": 45380 }, { "epoch": 14.081600992863791, "grad_norm": 6.807887077331543, "learning_rate": 2.4388182354693783e-06, "loss": 0.1878, "num_input_tokens_seen": 27655472, "step": 45385 }, { "epoch": 14.083152342538009, "grad_norm": 11.366890907287598, "learning_rate": 2.437655618791249e-06, "loss": 0.1672, "num_input_tokens_seen": 27659472, "step": 45390 }, { "epoch": 14.084703692212225, "grad_norm": 4.232616901397705, "learning_rate": 2.4364931899633078e-06, "loss": 0.2394, "num_input_tokens_seen": 27662608, "step": 45395 }, { "epoch": 14.08625504188644, "grad_norm": 4.016177654266357, "learning_rate": 2.4353309490707693e-06, "loss": 0.1758, "num_input_tokens_seen": 27666608, "step": 45400 }, { "epoch": 14.087806391560658, "grad_norm": 3.895451784133911, "learning_rate": 2.4341688961988437e-06, "loss": 0.2513, "num_input_tokens_seen": 27669328, "step": 45405 }, { "epoch": 14.089357741234874, "grad_norm": 7.515134811401367, "learning_rate": 2.4330070314327225e-06, "loss": 0.1762, "num_input_tokens_seen": 27671888, "step": 45410 }, { "epoch": 14.090909090909092, "grad_norm": 7.3591485023498535, "learning_rate": 2.4318453548575825e-06, "loss": 0.1299, "num_input_tokens_seen": 27675184, "step": 45415 }, { "epoch": 14.092460440583308, "grad_norm": 12.994499206542969, "learning_rate": 2.4306838665585915e-06, "loss": 0.1556, "num_input_tokens_seen": 27678608, "step": 45420 }, { "epoch": 14.094011790257523, "grad_norm": 6.370477676391602, "learning_rate": 2.4295225666208964e-06, "loss": 0.2742, "num_input_tokens_seen": 27682512, "step": 45425 }, { "epoch": 14.095563139931741, "grad_norm": 20.159276962280273, "learning_rate": 2.428361455129638e-06, "loss": 0.2701, "num_input_tokens_seen": 27685456, "step": 45430 }, { "epoch": 14.097114489605957, "grad_norm": 8.232123374938965, "learning_rate": 2.4272005321699356e-06, "loss": 0.2854, "num_input_tokens_seen": 27689008, "step": 45435 }, { "epoch": 14.098665839280173, "grad_norm": 9.226680755615234, "learning_rate": 2.4260397978269028e-06, "loss": 0.2891, "num_input_tokens_seen": 27692624, "step": 45440 }, { "epoch": 14.10021718895439, "grad_norm": 4.019114017486572, "learning_rate": 2.424879252185631e-06, "loss": 0.2236, "num_input_tokens_seen": 27695888, "step": 45445 }, { "epoch": 14.101768538628606, "grad_norm": 11.30560302734375, "learning_rate": 2.423718895331206e-06, "loss": 0.1662, "num_input_tokens_seen": 27699184, "step": 45450 }, { "epoch": 14.103319888302824, "grad_norm": 4.682065486907959, "learning_rate": 2.4225587273486915e-06, "loss": 0.2121, "num_input_tokens_seen": 27702032, "step": 45455 }, { "epoch": 14.10487123797704, "grad_norm": 29.119842529296875, "learning_rate": 2.4213987483231443e-06, "loss": 0.2274, "num_input_tokens_seen": 27705200, "step": 45460 }, { "epoch": 14.106422587651256, "grad_norm": 6.219662189483643, "learning_rate": 2.420238958339606e-06, "loss": 0.2193, "num_input_tokens_seen": 27707952, "step": 45465 }, { "epoch": 14.107973937325474, "grad_norm": 4.512369155883789, "learning_rate": 2.419079357483099e-06, "loss": 0.2604, "num_input_tokens_seen": 27710608, "step": 45470 }, { "epoch": 14.10952528699969, "grad_norm": 10.132920265197754, "learning_rate": 2.4179199458386393e-06, "loss": 0.2364, "num_input_tokens_seen": 27714352, "step": 45475 }, { "epoch": 14.111076636673907, "grad_norm": 3.9570047855377197, "learning_rate": 2.416760723491222e-06, "loss": 0.198, "num_input_tokens_seen": 27717520, "step": 45480 }, { "epoch": 14.112627986348123, "grad_norm": 9.508323669433594, "learning_rate": 2.415601690525836e-06, "loss": 0.2288, "num_input_tokens_seen": 27720496, "step": 45485 }, { "epoch": 14.114179336022339, "grad_norm": 5.4533514976501465, "learning_rate": 2.4144428470274483e-06, "loss": 0.1445, "num_input_tokens_seen": 27723024, "step": 45490 }, { "epoch": 14.115730685696557, "grad_norm": 12.313097953796387, "learning_rate": 2.413284193081019e-06, "loss": 0.1697, "num_input_tokens_seen": 27726256, "step": 45495 }, { "epoch": 14.117282035370772, "grad_norm": 11.731790542602539, "learning_rate": 2.4121257287714877e-06, "loss": 0.2, "num_input_tokens_seen": 27730544, "step": 45500 }, { "epoch": 14.118833385044988, "grad_norm": 5.6083831787109375, "learning_rate": 2.4109674541837873e-06, "loss": 0.1659, "num_input_tokens_seen": 27732976, "step": 45505 }, { "epoch": 14.120384734719206, "grad_norm": 5.529743194580078, "learning_rate": 2.4098093694028296e-06, "loss": 0.2493, "num_input_tokens_seen": 27735920, "step": 45510 }, { "epoch": 14.121936084393422, "grad_norm": 19.93313980102539, "learning_rate": 2.40865147451352e-06, "loss": 0.1725, "num_input_tokens_seen": 27739248, "step": 45515 }, { "epoch": 14.12348743406764, "grad_norm": 9.64719009399414, "learning_rate": 2.4074937696007407e-06, "loss": 0.2144, "num_input_tokens_seen": 27742192, "step": 45520 }, { "epoch": 14.125038783741855, "grad_norm": 10.811317443847656, "learning_rate": 2.4063362547493685e-06, "loss": 0.2129, "num_input_tokens_seen": 27744944, "step": 45525 }, { "epoch": 14.126590133416071, "grad_norm": 8.031442642211914, "learning_rate": 2.405178930044264e-06, "loss": 0.1968, "num_input_tokens_seen": 27747984, "step": 45530 }, { "epoch": 14.128141483090289, "grad_norm": 4.884719371795654, "learning_rate": 2.4040217955702693e-06, "loss": 0.1646, "num_input_tokens_seen": 27751088, "step": 45535 }, { "epoch": 14.129692832764505, "grad_norm": 7.25480842590332, "learning_rate": 2.40286485141222e-06, "loss": 0.2239, "num_input_tokens_seen": 27753808, "step": 45540 }, { "epoch": 14.131244182438722, "grad_norm": 6.054532051086426, "learning_rate": 2.4017080976549295e-06, "loss": 0.1858, "num_input_tokens_seen": 27757168, "step": 45545 }, { "epoch": 14.132795532112938, "grad_norm": 6.437033653259277, "learning_rate": 2.4005515343832063e-06, "loss": 0.1623, "num_input_tokens_seen": 27759952, "step": 45550 }, { "epoch": 14.134346881787154, "grad_norm": 6.3498406410217285, "learning_rate": 2.3993951616818357e-06, "loss": 0.2002, "num_input_tokens_seen": 27762384, "step": 45555 }, { "epoch": 14.135898231461372, "grad_norm": 7.362238883972168, "learning_rate": 2.3982389796355972e-06, "loss": 0.169, "num_input_tokens_seen": 27765008, "step": 45560 }, { "epoch": 14.137449581135588, "grad_norm": 9.503920555114746, "learning_rate": 2.397082988329249e-06, "loss": 0.1824, "num_input_tokens_seen": 27769584, "step": 45565 }, { "epoch": 14.139000930809804, "grad_norm": 6.52617073059082, "learning_rate": 2.3959271878475427e-06, "loss": 0.1668, "num_input_tokens_seen": 27772560, "step": 45570 }, { "epoch": 14.140552280484021, "grad_norm": 12.770366668701172, "learning_rate": 2.394771578275209e-06, "loss": 0.2009, "num_input_tokens_seen": 27774928, "step": 45575 }, { "epoch": 14.142103630158237, "grad_norm": 17.421876907348633, "learning_rate": 2.393616159696969e-06, "loss": 0.256, "num_input_tokens_seen": 27777328, "step": 45580 }, { "epoch": 14.143654979832455, "grad_norm": 11.478917121887207, "learning_rate": 2.39246093219753e-06, "loss": 0.1617, "num_input_tokens_seen": 27781168, "step": 45585 }, { "epoch": 14.14520632950667, "grad_norm": 11.479084014892578, "learning_rate": 2.391305895861581e-06, "loss": 0.2385, "num_input_tokens_seen": 27784432, "step": 45590 }, { "epoch": 14.146757679180887, "grad_norm": 4.981937408447266, "learning_rate": 2.3901510507738037e-06, "loss": 0.1474, "num_input_tokens_seen": 27786928, "step": 45595 }, { "epoch": 14.148309028855104, "grad_norm": 9.924497604370117, "learning_rate": 2.3889963970188574e-06, "loss": 0.1904, "num_input_tokens_seen": 27790032, "step": 45600 }, { "epoch": 14.14986037852932, "grad_norm": 4.607985973358154, "learning_rate": 2.3878419346813958e-06, "loss": 0.2024, "num_input_tokens_seen": 27792432, "step": 45605 }, { "epoch": 14.151411728203538, "grad_norm": 10.32619571685791, "learning_rate": 2.386687663846051e-06, "loss": 0.2468, "num_input_tokens_seen": 27795024, "step": 45610 }, { "epoch": 14.152963077877754, "grad_norm": 10.45390510559082, "learning_rate": 2.3855335845974493e-06, "loss": 0.1935, "num_input_tokens_seen": 27797648, "step": 45615 }, { "epoch": 14.15451442755197, "grad_norm": 8.59262752532959, "learning_rate": 2.384379697020193e-06, "loss": 0.2108, "num_input_tokens_seen": 27800080, "step": 45620 }, { "epoch": 14.156065777226187, "grad_norm": 5.408799171447754, "learning_rate": 2.3832260011988813e-06, "loss": 0.1762, "num_input_tokens_seen": 27802608, "step": 45625 }, { "epoch": 14.157617126900403, "grad_norm": 7.313859939575195, "learning_rate": 2.3820724972180882e-06, "loss": 0.2747, "num_input_tokens_seen": 27805264, "step": 45630 }, { "epoch": 14.159168476574619, "grad_norm": 7.290823936462402, "learning_rate": 2.3809191851623842e-06, "loss": 0.2109, "num_input_tokens_seen": 27808272, "step": 45635 }, { "epoch": 14.160719826248837, "grad_norm": 3.8161535263061523, "learning_rate": 2.3797660651163164e-06, "loss": 0.1518, "num_input_tokens_seen": 27811344, "step": 45640 }, { "epoch": 14.162271175923053, "grad_norm": 7.008798122406006, "learning_rate": 2.3786131371644244e-06, "loss": 0.1807, "num_input_tokens_seen": 27814288, "step": 45645 }, { "epoch": 14.16382252559727, "grad_norm": 2.271406412124634, "learning_rate": 2.3774604013912334e-06, "loss": 0.1734, "num_input_tokens_seen": 27817296, "step": 45650 }, { "epoch": 14.165373875271486, "grad_norm": 11.6380033493042, "learning_rate": 2.376307857881248e-06, "loss": 0.2845, "num_input_tokens_seen": 27820240, "step": 45655 }, { "epoch": 14.166925224945702, "grad_norm": 5.674911022186279, "learning_rate": 2.3751555067189687e-06, "loss": 0.2018, "num_input_tokens_seen": 27823024, "step": 45660 }, { "epoch": 14.16847657461992, "grad_norm": 24.516185760498047, "learning_rate": 2.3740033479888708e-06, "loss": 0.1982, "num_input_tokens_seen": 27825808, "step": 45665 }, { "epoch": 14.170027924294136, "grad_norm": 7.252277374267578, "learning_rate": 2.3728513817754264e-06, "loss": 0.2458, "num_input_tokens_seen": 27828080, "step": 45670 }, { "epoch": 14.171579273968353, "grad_norm": 4.545963287353516, "learning_rate": 2.3716996081630834e-06, "loss": 0.2001, "num_input_tokens_seen": 27830704, "step": 45675 }, { "epoch": 14.17313062364257, "grad_norm": 4.897068500518799, "learning_rate": 2.3705480272362848e-06, "loss": 0.1963, "num_input_tokens_seen": 27833264, "step": 45680 }, { "epoch": 14.174681973316785, "grad_norm": 11.576921463012695, "learning_rate": 2.3693966390794516e-06, "loss": 0.2392, "num_input_tokens_seen": 27835600, "step": 45685 }, { "epoch": 14.176233322991003, "grad_norm": 5.427720546722412, "learning_rate": 2.3682454437769975e-06, "loss": 0.1729, "num_input_tokens_seen": 27838736, "step": 45690 }, { "epoch": 14.177784672665219, "grad_norm": 7.226963520050049, "learning_rate": 2.367094441413315e-06, "loss": 0.1602, "num_input_tokens_seen": 27841520, "step": 45695 }, { "epoch": 14.179336022339434, "grad_norm": 6.893664836883545, "learning_rate": 2.3659436320727885e-06, "loss": 0.2134, "num_input_tokens_seen": 27845424, "step": 45700 }, { "epoch": 14.180887372013652, "grad_norm": 11.589091300964355, "learning_rate": 2.364793015839787e-06, "loss": 0.2035, "num_input_tokens_seen": 27849296, "step": 45705 }, { "epoch": 14.182438721687868, "grad_norm": 14.2567720413208, "learning_rate": 2.363642592798662e-06, "loss": 0.1561, "num_input_tokens_seen": 27852336, "step": 45710 }, { "epoch": 14.183990071362086, "grad_norm": 2.961733102798462, "learning_rate": 2.362492363033755e-06, "loss": 0.1211, "num_input_tokens_seen": 27857328, "step": 45715 }, { "epoch": 14.185541421036302, "grad_norm": 6.319824695587158, "learning_rate": 2.361342326629389e-06, "loss": 0.2506, "num_input_tokens_seen": 27859568, "step": 45720 }, { "epoch": 14.187092770710517, "grad_norm": 9.959772109985352, "learning_rate": 2.360192483669879e-06, "loss": 0.2321, "num_input_tokens_seen": 27862352, "step": 45725 }, { "epoch": 14.188644120384735, "grad_norm": 5.685427665710449, "learning_rate": 2.3590428342395176e-06, "loss": 0.171, "num_input_tokens_seen": 27865136, "step": 45730 }, { "epoch": 14.190195470058951, "grad_norm": 6.167307376861572, "learning_rate": 2.3578933784225926e-06, "loss": 0.1879, "num_input_tokens_seen": 27868112, "step": 45735 }, { "epoch": 14.191746819733169, "grad_norm": 5.148198127746582, "learning_rate": 2.3567441163033676e-06, "loss": 0.2796, "num_input_tokens_seen": 27870832, "step": 45740 }, { "epoch": 14.193298169407385, "grad_norm": 4.343691349029541, "learning_rate": 2.3555950479661024e-06, "loss": 0.2094, "num_input_tokens_seen": 27873584, "step": 45745 }, { "epoch": 14.1948495190816, "grad_norm": 4.879261016845703, "learning_rate": 2.354446173495032e-06, "loss": 0.2084, "num_input_tokens_seen": 27878672, "step": 45750 }, { "epoch": 14.196400868755818, "grad_norm": 12.22840690612793, "learning_rate": 2.3532974929743875e-06, "loss": 0.2034, "num_input_tokens_seen": 27882192, "step": 45755 }, { "epoch": 14.197952218430034, "grad_norm": 5.4211297035217285, "learning_rate": 2.3521490064883763e-06, "loss": 0.1297, "num_input_tokens_seen": 27885328, "step": 45760 }, { "epoch": 14.19950356810425, "grad_norm": 10.30030632019043, "learning_rate": 2.3510007141211976e-06, "loss": 0.1921, "num_input_tokens_seen": 27888464, "step": 45765 }, { "epoch": 14.201054917778468, "grad_norm": 17.226116180419922, "learning_rate": 2.349852615957038e-06, "loss": 0.2383, "num_input_tokens_seen": 27891088, "step": 45770 }, { "epoch": 14.202606267452683, "grad_norm": 3.933120012283325, "learning_rate": 2.348704712080062e-06, "loss": 0.15, "num_input_tokens_seen": 27893808, "step": 45775 }, { "epoch": 14.204157617126901, "grad_norm": 3.1171743869781494, "learning_rate": 2.347557002574429e-06, "loss": 0.1385, "num_input_tokens_seen": 27896080, "step": 45780 }, { "epoch": 14.205708966801117, "grad_norm": 10.638519287109375, "learning_rate": 2.3464094875242747e-06, "loss": 0.2161, "num_input_tokens_seen": 27901168, "step": 45785 }, { "epoch": 14.207260316475333, "grad_norm": 9.202115058898926, "learning_rate": 2.3452621670137303e-06, "loss": 0.1934, "num_input_tokens_seen": 27903504, "step": 45790 }, { "epoch": 14.20881166614955, "grad_norm": 13.972176551818848, "learning_rate": 2.344115041126904e-06, "loss": 0.2186, "num_input_tokens_seen": 27906160, "step": 45795 }, { "epoch": 14.210363015823766, "grad_norm": 6.08847188949585, "learning_rate": 2.3429681099478977e-06, "loss": 0.1748, "num_input_tokens_seen": 27908272, "step": 45800 }, { "epoch": 14.211914365497984, "grad_norm": 10.967059135437012, "learning_rate": 2.341821373560791e-06, "loss": 0.1727, "num_input_tokens_seen": 27911440, "step": 45805 }, { "epoch": 14.2134657151722, "grad_norm": 10.645483016967773, "learning_rate": 2.340674832049657e-06, "loss": 0.1948, "num_input_tokens_seen": 27915056, "step": 45810 }, { "epoch": 14.215017064846416, "grad_norm": 5.610249996185303, "learning_rate": 2.3395284854985468e-06, "loss": 0.2365, "num_input_tokens_seen": 27918224, "step": 45815 }, { "epoch": 14.216568414520633, "grad_norm": 15.759492874145508, "learning_rate": 2.3383823339915034e-06, "loss": 0.1997, "num_input_tokens_seen": 27920944, "step": 45820 }, { "epoch": 14.21811976419485, "grad_norm": 12.650461196899414, "learning_rate": 2.337236377612555e-06, "loss": 0.2241, "num_input_tokens_seen": 27926224, "step": 45825 }, { "epoch": 14.219671113869065, "grad_norm": 3.3438363075256348, "learning_rate": 2.3360906164457102e-06, "loss": 0.1748, "num_input_tokens_seen": 27929008, "step": 45830 }, { "epoch": 14.221222463543283, "grad_norm": 12.556947708129883, "learning_rate": 2.3349450505749706e-06, "loss": 0.1668, "num_input_tokens_seen": 27934032, "step": 45835 }, { "epoch": 14.222773813217499, "grad_norm": 3.1473114490509033, "learning_rate": 2.3337996800843155e-06, "loss": 0.1871, "num_input_tokens_seen": 27936976, "step": 45840 }, { "epoch": 14.224325162891716, "grad_norm": 7.704403400421143, "learning_rate": 2.3326545050577187e-06, "loss": 0.2138, "num_input_tokens_seen": 27939728, "step": 45845 }, { "epoch": 14.225876512565932, "grad_norm": 5.926602840423584, "learning_rate": 2.331509525579131e-06, "loss": 0.2072, "num_input_tokens_seen": 27941776, "step": 45850 }, { "epoch": 14.227427862240148, "grad_norm": 3.1891942024230957, "learning_rate": 2.330364741732496e-06, "loss": 0.1568, "num_input_tokens_seen": 27944720, "step": 45855 }, { "epoch": 14.228979211914366, "grad_norm": 5.0593953132629395, "learning_rate": 2.329220153601737e-06, "loss": 0.236, "num_input_tokens_seen": 27947312, "step": 45860 }, { "epoch": 14.230530561588582, "grad_norm": 3.1268470287323, "learning_rate": 2.3280757612707696e-06, "loss": 0.134, "num_input_tokens_seen": 27950992, "step": 45865 }, { "epoch": 14.2320819112628, "grad_norm": 5.956373691558838, "learning_rate": 2.3269315648234874e-06, "loss": 0.2081, "num_input_tokens_seen": 27953840, "step": 45870 }, { "epoch": 14.233633260937015, "grad_norm": 14.441205978393555, "learning_rate": 2.3257875643437772e-06, "loss": 0.2241, "num_input_tokens_seen": 27957360, "step": 45875 }, { "epoch": 14.235184610611231, "grad_norm": 2.648411512374878, "learning_rate": 2.3246437599155035e-06, "loss": 0.1744, "num_input_tokens_seen": 27960336, "step": 45880 }, { "epoch": 14.236735960285449, "grad_norm": 9.771599769592285, "learning_rate": 2.3235001516225236e-06, "loss": 0.2, "num_input_tokens_seen": 27964240, "step": 45885 }, { "epoch": 14.238287309959665, "grad_norm": 6.058264255523682, "learning_rate": 2.322356739548679e-06, "loss": 0.2098, "num_input_tokens_seen": 27966896, "step": 45890 }, { "epoch": 14.23983865963388, "grad_norm": 12.34087085723877, "learning_rate": 2.3212135237777917e-06, "loss": 0.1789, "num_input_tokens_seen": 27969296, "step": 45895 }, { "epoch": 14.241390009308098, "grad_norm": 3.9463980197906494, "learning_rate": 2.320070504393676e-06, "loss": 0.2042, "num_input_tokens_seen": 27973648, "step": 45900 }, { "epoch": 14.242941358982314, "grad_norm": 10.21056079864502, "learning_rate": 2.3189276814801265e-06, "loss": 0.2212, "num_input_tokens_seen": 27976336, "step": 45905 }, { "epoch": 14.244492708656532, "grad_norm": 8.779504776000977, "learning_rate": 2.3177850551209273e-06, "loss": 0.1651, "num_input_tokens_seen": 27978704, "step": 45910 }, { "epoch": 14.246044058330748, "grad_norm": 8.785868644714355, "learning_rate": 2.316642625399843e-06, "loss": 0.1869, "num_input_tokens_seen": 27981040, "step": 45915 }, { "epoch": 14.247595408004964, "grad_norm": 7.767013072967529, "learning_rate": 2.315500392400633e-06, "loss": 0.1592, "num_input_tokens_seen": 27983760, "step": 45920 }, { "epoch": 14.249146757679181, "grad_norm": 6.733401775360107, "learning_rate": 2.314358356207032e-06, "loss": 0.1846, "num_input_tokens_seen": 27986800, "step": 45925 }, { "epoch": 14.250698107353397, "grad_norm": 1.9562395811080933, "learning_rate": 2.313216516902768e-06, "loss": 0.1602, "num_input_tokens_seen": 27989232, "step": 45930 }, { "epoch": 14.252249457027615, "grad_norm": 1.706649661064148, "learning_rate": 2.3120748745715477e-06, "loss": 0.203, "num_input_tokens_seen": 27992144, "step": 45935 }, { "epoch": 14.25380080670183, "grad_norm": 19.192840576171875, "learning_rate": 2.3109334292970708e-06, "loss": 0.1826, "num_input_tokens_seen": 27995792, "step": 45940 }, { "epoch": 14.255352156376047, "grad_norm": 10.612716674804688, "learning_rate": 2.309792181163015e-06, "loss": 0.2523, "num_input_tokens_seen": 27998128, "step": 45945 }, { "epoch": 14.256903506050264, "grad_norm": 10.789305686950684, "learning_rate": 2.308651130253051e-06, "loss": 0.1882, "num_input_tokens_seen": 28002064, "step": 45950 }, { "epoch": 14.25845485572448, "grad_norm": 6.446588039398193, "learning_rate": 2.307510276650828e-06, "loss": 0.1608, "num_input_tokens_seen": 28004784, "step": 45955 }, { "epoch": 14.260006205398696, "grad_norm": 10.292073249816895, "learning_rate": 2.3063696204399866e-06, "loss": 0.2194, "num_input_tokens_seen": 28007472, "step": 45960 }, { "epoch": 14.261557555072914, "grad_norm": 9.16768741607666, "learning_rate": 2.3052291617041484e-06, "loss": 0.1791, "num_input_tokens_seen": 28010736, "step": 45965 }, { "epoch": 14.26310890474713, "grad_norm": 8.241341590881348, "learning_rate": 2.3040889005269227e-06, "loss": 0.1834, "num_input_tokens_seen": 28014128, "step": 45970 }, { "epoch": 14.264660254421347, "grad_norm": 15.43418025970459, "learning_rate": 2.3029488369919075e-06, "loss": 0.1626, "num_input_tokens_seen": 28016784, "step": 45975 }, { "epoch": 14.266211604095563, "grad_norm": 12.387420654296875, "learning_rate": 2.301808971182678e-06, "loss": 0.223, "num_input_tokens_seen": 28019536, "step": 45980 }, { "epoch": 14.267762953769779, "grad_norm": 8.056310653686523, "learning_rate": 2.300669303182804e-06, "loss": 0.1788, "num_input_tokens_seen": 28023024, "step": 45985 }, { "epoch": 14.269314303443997, "grad_norm": 5.545440196990967, "learning_rate": 2.2995298330758335e-06, "loss": 0.179, "num_input_tokens_seen": 28025584, "step": 45990 }, { "epoch": 14.270865653118213, "grad_norm": 7.388247966766357, "learning_rate": 2.2983905609453065e-06, "loss": 0.1604, "num_input_tokens_seen": 28028464, "step": 45995 }, { "epoch": 14.27241700279243, "grad_norm": 11.801166534423828, "learning_rate": 2.2972514868747415e-06, "loss": 0.229, "num_input_tokens_seen": 28030448, "step": 46000 }, { "epoch": 14.273968352466646, "grad_norm": 7.620347499847412, "learning_rate": 2.2961126109476496e-06, "loss": 0.1901, "num_input_tokens_seen": 28032848, "step": 46005 }, { "epoch": 14.275519702140862, "grad_norm": 7.072335243225098, "learning_rate": 2.2949739332475202e-06, "loss": 0.1674, "num_input_tokens_seen": 28035632, "step": 46010 }, { "epoch": 14.27707105181508, "grad_norm": 12.216421127319336, "learning_rate": 2.2938354538578357e-06, "loss": 0.2315, "num_input_tokens_seen": 28038160, "step": 46015 }, { "epoch": 14.278622401489296, "grad_norm": 6.835507869720459, "learning_rate": 2.2926971728620555e-06, "loss": 0.1373, "num_input_tokens_seen": 28042256, "step": 46020 }, { "epoch": 14.280173751163511, "grad_norm": 6.106566905975342, "learning_rate": 2.2915590903436347e-06, "loss": 0.1803, "num_input_tokens_seen": 28045232, "step": 46025 }, { "epoch": 14.281725100837729, "grad_norm": 13.071391105651855, "learning_rate": 2.2904212063860033e-06, "loss": 0.2508, "num_input_tokens_seen": 28047408, "step": 46030 }, { "epoch": 14.283276450511945, "grad_norm": 17.52528953552246, "learning_rate": 2.289283521072583e-06, "loss": 0.1576, "num_input_tokens_seen": 28050672, "step": 46035 }, { "epoch": 14.284827800186163, "grad_norm": 4.0209784507751465, "learning_rate": 2.2881460344867827e-06, "loss": 0.1794, "num_input_tokens_seen": 28053520, "step": 46040 }, { "epoch": 14.286379149860378, "grad_norm": 6.622351169586182, "learning_rate": 2.2870087467119893e-06, "loss": 0.1732, "num_input_tokens_seen": 28056112, "step": 46045 }, { "epoch": 14.287930499534594, "grad_norm": 8.348435401916504, "learning_rate": 2.285871657831584e-06, "loss": 0.2241, "num_input_tokens_seen": 28058512, "step": 46050 }, { "epoch": 14.289481849208812, "grad_norm": 19.06709861755371, "learning_rate": 2.284734767928923e-06, "loss": 0.1564, "num_input_tokens_seen": 28061360, "step": 46055 }, { "epoch": 14.291033198883028, "grad_norm": 8.678658485412598, "learning_rate": 2.28359807708736e-06, "loss": 0.1668, "num_input_tokens_seen": 28063696, "step": 46060 }, { "epoch": 14.292584548557246, "grad_norm": 3.8745346069335938, "learning_rate": 2.2824615853902226e-06, "loss": 0.1509, "num_input_tokens_seen": 28066256, "step": 46065 }, { "epoch": 14.294135898231461, "grad_norm": 25.559619903564453, "learning_rate": 2.2813252929208336e-06, "loss": 0.27, "num_input_tokens_seen": 28070224, "step": 46070 }, { "epoch": 14.295687247905677, "grad_norm": 7.379058361053467, "learning_rate": 2.2801891997624924e-06, "loss": 0.2128, "num_input_tokens_seen": 28074416, "step": 46075 }, { "epoch": 14.297238597579895, "grad_norm": 9.536956787109375, "learning_rate": 2.2790533059984924e-06, "loss": 0.1677, "num_input_tokens_seen": 28078416, "step": 46080 }, { "epoch": 14.298789947254111, "grad_norm": 7.61325216293335, "learning_rate": 2.277917611712104e-06, "loss": 0.2144, "num_input_tokens_seen": 28081264, "step": 46085 }, { "epoch": 14.300341296928327, "grad_norm": 20.789443969726562, "learning_rate": 2.2767821169865887e-06, "loss": 0.1965, "num_input_tokens_seen": 28084880, "step": 46090 }, { "epoch": 14.301892646602544, "grad_norm": 22.146957397460938, "learning_rate": 2.2756468219051942e-06, "loss": 0.2182, "num_input_tokens_seen": 28087920, "step": 46095 }, { "epoch": 14.30344399627676, "grad_norm": 8.306485176086426, "learning_rate": 2.274511726551146e-06, "loss": 0.1929, "num_input_tokens_seen": 28090544, "step": 46100 }, { "epoch": 14.304995345950978, "grad_norm": 9.64700698852539, "learning_rate": 2.273376831007666e-06, "loss": 0.2362, "num_input_tokens_seen": 28093552, "step": 46105 }, { "epoch": 14.306546695625194, "grad_norm": 10.372699737548828, "learning_rate": 2.27224213535795e-06, "loss": 0.2163, "num_input_tokens_seen": 28096688, "step": 46110 }, { "epoch": 14.30809804529941, "grad_norm": 3.5404398441314697, "learning_rate": 2.2711076396851883e-06, "loss": 0.1959, "num_input_tokens_seen": 28099248, "step": 46115 }, { "epoch": 14.309649394973627, "grad_norm": 10.361734390258789, "learning_rate": 2.2699733440725502e-06, "loss": 0.1889, "num_input_tokens_seen": 28104080, "step": 46120 }, { "epoch": 14.311200744647843, "grad_norm": 14.337789535522461, "learning_rate": 2.268839248603196e-06, "loss": 0.2847, "num_input_tokens_seen": 28106448, "step": 46125 }, { "epoch": 14.312752094322061, "grad_norm": 5.788785934448242, "learning_rate": 2.267705353360265e-06, "loss": 0.2468, "num_input_tokens_seen": 28109232, "step": 46130 }, { "epoch": 14.314303443996277, "grad_norm": 11.46983528137207, "learning_rate": 2.266571658426888e-06, "loss": 0.2442, "num_input_tokens_seen": 28111952, "step": 46135 }, { "epoch": 14.315854793670493, "grad_norm": 5.365719795227051, "learning_rate": 2.265438163886176e-06, "loss": 0.1505, "num_input_tokens_seen": 28115152, "step": 46140 }, { "epoch": 14.31740614334471, "grad_norm": 13.480051040649414, "learning_rate": 2.2643048698212305e-06, "loss": 0.2132, "num_input_tokens_seen": 28117840, "step": 46145 }, { "epoch": 14.318957493018926, "grad_norm": 10.664459228515625, "learning_rate": 2.2631717763151313e-06, "loss": 0.1643, "num_input_tokens_seen": 28121232, "step": 46150 }, { "epoch": 14.320508842693144, "grad_norm": 15.444902420043945, "learning_rate": 2.26203888345095e-06, "loss": 0.1833, "num_input_tokens_seen": 28124016, "step": 46155 }, { "epoch": 14.32206019236736, "grad_norm": 6.870750427246094, "learning_rate": 2.2609061913117424e-06, "loss": 0.1758, "num_input_tokens_seen": 28125968, "step": 46160 }, { "epoch": 14.323611542041576, "grad_norm": 8.466903686523438, "learning_rate": 2.259773699980545e-06, "loss": 0.1956, "num_input_tokens_seen": 28128208, "step": 46165 }, { "epoch": 14.325162891715793, "grad_norm": 4.463929653167725, "learning_rate": 2.2586414095403863e-06, "loss": 0.2071, "num_input_tokens_seen": 28130864, "step": 46170 }, { "epoch": 14.32671424139001, "grad_norm": 8.700133323669434, "learning_rate": 2.2575093200742733e-06, "loss": 0.171, "num_input_tokens_seen": 28133424, "step": 46175 }, { "epoch": 14.328265591064225, "grad_norm": 8.204453468322754, "learning_rate": 2.2563774316652047e-06, "loss": 0.2293, "num_input_tokens_seen": 28136464, "step": 46180 }, { "epoch": 14.329816940738443, "grad_norm": 9.212326049804688, "learning_rate": 2.2552457443961577e-06, "loss": 0.1815, "num_input_tokens_seen": 28138608, "step": 46185 }, { "epoch": 14.331368290412659, "grad_norm": 9.400568962097168, "learning_rate": 2.254114258350103e-06, "loss": 0.1993, "num_input_tokens_seen": 28141328, "step": 46190 }, { "epoch": 14.332919640086876, "grad_norm": 10.002387046813965, "learning_rate": 2.2529829736099868e-06, "loss": 0.1442, "num_input_tokens_seen": 28143600, "step": 46195 }, { "epoch": 14.334470989761092, "grad_norm": 7.026825904846191, "learning_rate": 2.25185189025875e-06, "loss": 0.1614, "num_input_tokens_seen": 28146192, "step": 46200 }, { "epoch": 14.336022339435308, "grad_norm": 10.209850311279297, "learning_rate": 2.2507210083793105e-06, "loss": 0.2398, "num_input_tokens_seen": 28148560, "step": 46205 }, { "epoch": 14.337573689109526, "grad_norm": 8.093499183654785, "learning_rate": 2.2495903280545782e-06, "loss": 0.1432, "num_input_tokens_seen": 28151376, "step": 46210 }, { "epoch": 14.339125038783742, "grad_norm": 26.249393463134766, "learning_rate": 2.248459849367446e-06, "loss": 0.3112, "num_input_tokens_seen": 28154096, "step": 46215 }, { "epoch": 14.340676388457958, "grad_norm": 19.574960708618164, "learning_rate": 2.2473295724007882e-06, "loss": 0.2613, "num_input_tokens_seen": 28157936, "step": 46220 }, { "epoch": 14.342227738132175, "grad_norm": 8.879544258117676, "learning_rate": 2.2461994972374707e-06, "loss": 0.1916, "num_input_tokens_seen": 28159984, "step": 46225 }, { "epoch": 14.343779087806391, "grad_norm": 9.926496505737305, "learning_rate": 2.2450696239603388e-06, "loss": 0.2344, "num_input_tokens_seen": 28163280, "step": 46230 }, { "epoch": 14.345330437480609, "grad_norm": 10.81551456451416, "learning_rate": 2.2439399526522284e-06, "loss": 0.2314, "num_input_tokens_seen": 28166608, "step": 46235 }, { "epoch": 14.346881787154825, "grad_norm": 16.873802185058594, "learning_rate": 2.2428104833959536e-06, "loss": 0.2578, "num_input_tokens_seen": 28169456, "step": 46240 }, { "epoch": 14.34843313682904, "grad_norm": 10.552977561950684, "learning_rate": 2.2416812162743223e-06, "loss": 0.1717, "num_input_tokens_seen": 28172528, "step": 46245 }, { "epoch": 14.349984486503258, "grad_norm": 7.566709518432617, "learning_rate": 2.24055215137012e-06, "loss": 0.2174, "num_input_tokens_seen": 28175120, "step": 46250 }, { "epoch": 14.351535836177474, "grad_norm": 5.643270015716553, "learning_rate": 2.2394232887661234e-06, "loss": 0.1992, "num_input_tokens_seen": 28178576, "step": 46255 }, { "epoch": 14.353087185851692, "grad_norm": 11.123568534851074, "learning_rate": 2.238294628545088e-06, "loss": 0.1888, "num_input_tokens_seen": 28181744, "step": 46260 }, { "epoch": 14.354638535525908, "grad_norm": 8.634444236755371, "learning_rate": 2.2371661707897615e-06, "loss": 0.2249, "num_input_tokens_seen": 28184048, "step": 46265 }, { "epoch": 14.356189885200124, "grad_norm": 7.6174726486206055, "learning_rate": 2.23603791558287e-06, "loss": 0.2709, "num_input_tokens_seen": 28186832, "step": 46270 }, { "epoch": 14.357741234874341, "grad_norm": 3.7555928230285645, "learning_rate": 2.2349098630071293e-06, "loss": 0.1779, "num_input_tokens_seen": 28189200, "step": 46275 }, { "epoch": 14.359292584548557, "grad_norm": 6.813736915588379, "learning_rate": 2.2337820131452407e-06, "loss": 0.1436, "num_input_tokens_seen": 28191984, "step": 46280 }, { "epoch": 14.360843934222775, "grad_norm": 17.363529205322266, "learning_rate": 2.232654366079886e-06, "loss": 0.2161, "num_input_tokens_seen": 28195024, "step": 46285 }, { "epoch": 14.36239528389699, "grad_norm": 7.187280654907227, "learning_rate": 2.2315269218937378e-06, "loss": 0.1597, "num_input_tokens_seen": 28198928, "step": 46290 }, { "epoch": 14.363946633571206, "grad_norm": 9.043731689453125, "learning_rate": 2.230399680669449e-06, "loss": 0.188, "num_input_tokens_seen": 28201488, "step": 46295 }, { "epoch": 14.365497983245424, "grad_norm": 4.156003952026367, "learning_rate": 2.229272642489662e-06, "loss": 0.1903, "num_input_tokens_seen": 28204880, "step": 46300 }, { "epoch": 14.36704933291964, "grad_norm": 8.566265106201172, "learning_rate": 2.228145807436999e-06, "loss": 0.2117, "num_input_tokens_seen": 28207600, "step": 46305 }, { "epoch": 14.368600682593856, "grad_norm": 4.672186374664307, "learning_rate": 2.2270191755940727e-06, "loss": 0.2306, "num_input_tokens_seen": 28210160, "step": 46310 }, { "epoch": 14.370152032268074, "grad_norm": 6.666008949279785, "learning_rate": 2.225892747043477e-06, "loss": 0.199, "num_input_tokens_seen": 28213104, "step": 46315 }, { "epoch": 14.37170338194229, "grad_norm": 5.970495223999023, "learning_rate": 2.2247665218677957e-06, "loss": 0.1446, "num_input_tokens_seen": 28217232, "step": 46320 }, { "epoch": 14.373254731616507, "grad_norm": 11.765449523925781, "learning_rate": 2.223640500149589e-06, "loss": 0.1733, "num_input_tokens_seen": 28219824, "step": 46325 }, { "epoch": 14.374806081290723, "grad_norm": 8.171984672546387, "learning_rate": 2.222514681971411e-06, "loss": 0.2485, "num_input_tokens_seen": 28222576, "step": 46330 }, { "epoch": 14.376357430964939, "grad_norm": 15.348831176757812, "learning_rate": 2.221389067415799e-06, "loss": 0.2025, "num_input_tokens_seen": 28225040, "step": 46335 }, { "epoch": 14.377908780639157, "grad_norm": 6.767430305480957, "learning_rate": 2.220263656565271e-06, "loss": 0.2557, "num_input_tokens_seen": 28227472, "step": 46340 }, { "epoch": 14.379460130313372, "grad_norm": 17.05235481262207, "learning_rate": 2.2191384495023343e-06, "loss": 0.1968, "num_input_tokens_seen": 28233776, "step": 46345 }, { "epoch": 14.381011479987588, "grad_norm": 3.668348789215088, "learning_rate": 2.2180134463094788e-06, "loss": 0.1902, "num_input_tokens_seen": 28237072, "step": 46350 }, { "epoch": 14.382562829661806, "grad_norm": 4.948861122131348, "learning_rate": 2.216888647069183e-06, "loss": 0.1792, "num_input_tokens_seen": 28239824, "step": 46355 }, { "epoch": 14.384114179336022, "grad_norm": 3.760331869125366, "learning_rate": 2.2157640518639043e-06, "loss": 0.1619, "num_input_tokens_seen": 28243056, "step": 46360 }, { "epoch": 14.38566552901024, "grad_norm": 4.690047264099121, "learning_rate": 2.214639660776093e-06, "loss": 0.1721, "num_input_tokens_seen": 28246032, "step": 46365 }, { "epoch": 14.387216878684455, "grad_norm": 5.582885265350342, "learning_rate": 2.2135154738881765e-06, "loss": 0.1728, "num_input_tokens_seen": 28250000, "step": 46370 }, { "epoch": 14.388768228358671, "grad_norm": 7.5143656730651855, "learning_rate": 2.212391491282574e-06, "loss": 0.2276, "num_input_tokens_seen": 28252592, "step": 46375 }, { "epoch": 14.390319578032889, "grad_norm": 6.340465545654297, "learning_rate": 2.2112677130416838e-06, "loss": 0.1943, "num_input_tokens_seen": 28255024, "step": 46380 }, { "epoch": 14.391870927707105, "grad_norm": 5.391209602355957, "learning_rate": 2.2101441392478956e-06, "loss": 0.1517, "num_input_tokens_seen": 28258416, "step": 46385 }, { "epoch": 14.393422277381323, "grad_norm": 4.592672824859619, "learning_rate": 2.209020769983577e-06, "loss": 0.2249, "num_input_tokens_seen": 28261264, "step": 46390 }, { "epoch": 14.394973627055538, "grad_norm": 3.111504077911377, "learning_rate": 2.207897605331086e-06, "loss": 0.2465, "num_input_tokens_seen": 28263664, "step": 46395 }, { "epoch": 14.396524976729754, "grad_norm": 6.132524490356445, "learning_rate": 2.2067746453727657e-06, "loss": 0.1987, "num_input_tokens_seen": 28266576, "step": 46400 }, { "epoch": 14.398076326403972, "grad_norm": 15.254534721374512, "learning_rate": 2.205651890190939e-06, "loss": 0.1747, "num_input_tokens_seen": 28269200, "step": 46405 }, { "epoch": 14.399627676078188, "grad_norm": 7.538872718811035, "learning_rate": 2.204529339867921e-06, "loss": 0.2133, "num_input_tokens_seen": 28272816, "step": 46410 }, { "epoch": 14.401179025752405, "grad_norm": 5.935425758361816, "learning_rate": 2.203406994486003e-06, "loss": 0.1911, "num_input_tokens_seen": 28276176, "step": 46415 }, { "epoch": 14.402730375426621, "grad_norm": 6.824592113494873, "learning_rate": 2.2022848541274712e-06, "loss": 0.21, "num_input_tokens_seen": 28280144, "step": 46420 }, { "epoch": 14.404281725100837, "grad_norm": 8.817602157592773, "learning_rate": 2.2011629188745875e-06, "loss": 0.1655, "num_input_tokens_seen": 28283536, "step": 46425 }, { "epoch": 14.405833074775055, "grad_norm": 6.473733901977539, "learning_rate": 2.2000411888096072e-06, "loss": 0.2065, "num_input_tokens_seen": 28286480, "step": 46430 }, { "epoch": 14.40738442444927, "grad_norm": 7.719821453094482, "learning_rate": 2.1989196640147625e-06, "loss": 0.1952, "num_input_tokens_seen": 28288368, "step": 46435 }, { "epoch": 14.408935774123487, "grad_norm": 7.763303756713867, "learning_rate": 2.1977983445722778e-06, "loss": 0.2036, "num_input_tokens_seen": 28290416, "step": 46440 }, { "epoch": 14.410487123797704, "grad_norm": 7.58727502822876, "learning_rate": 2.196677230564355e-06, "loss": 0.1646, "num_input_tokens_seen": 28292880, "step": 46445 }, { "epoch": 14.41203847347192, "grad_norm": 8.040167808532715, "learning_rate": 2.1955563220731887e-06, "loss": 0.2306, "num_input_tokens_seen": 28295920, "step": 46450 }, { "epoch": 14.413589823146138, "grad_norm": 14.133621215820312, "learning_rate": 2.1944356191809544e-06, "loss": 0.1451, "num_input_tokens_seen": 28299312, "step": 46455 }, { "epoch": 14.415141172820354, "grad_norm": 6.0125226974487305, "learning_rate": 2.1933151219698107e-06, "loss": 0.1724, "num_input_tokens_seen": 28301360, "step": 46460 }, { "epoch": 14.41669252249457, "grad_norm": 14.316507339477539, "learning_rate": 2.1921948305219066e-06, "loss": 0.1954, "num_input_tokens_seen": 28305008, "step": 46465 }, { "epoch": 14.418243872168787, "grad_norm": 7.967357158660889, "learning_rate": 2.1910747449193687e-06, "loss": 0.2, "num_input_tokens_seen": 28307472, "step": 46470 }, { "epoch": 14.419795221843003, "grad_norm": 8.565178871154785, "learning_rate": 2.189954865244317e-06, "loss": 0.1799, "num_input_tokens_seen": 28310320, "step": 46475 }, { "epoch": 14.421346571517219, "grad_norm": 7.568575859069824, "learning_rate": 2.188835191578847e-06, "loss": 0.1856, "num_input_tokens_seen": 28313360, "step": 46480 }, { "epoch": 14.422897921191437, "grad_norm": 6.368300914764404, "learning_rate": 2.187715724005049e-06, "loss": 0.2055, "num_input_tokens_seen": 28315952, "step": 46485 }, { "epoch": 14.424449270865653, "grad_norm": 15.178995132446289, "learning_rate": 2.186596462604989e-06, "loss": 0.1788, "num_input_tokens_seen": 28319536, "step": 46490 }, { "epoch": 14.42600062053987, "grad_norm": 13.584659576416016, "learning_rate": 2.1854774074607236e-06, "loss": 0.1805, "num_input_tokens_seen": 28323184, "step": 46495 }, { "epoch": 14.427551970214086, "grad_norm": 5.327381134033203, "learning_rate": 2.1843585586542936e-06, "loss": 0.1823, "num_input_tokens_seen": 28325968, "step": 46500 }, { "epoch": 14.429103319888302, "grad_norm": 15.591906547546387, "learning_rate": 2.1832399162677247e-06, "loss": 0.1959, "num_input_tokens_seen": 28329456, "step": 46505 }, { "epoch": 14.43065466956252, "grad_norm": 11.0278959274292, "learning_rate": 2.1821214803830243e-06, "loss": 0.1861, "num_input_tokens_seen": 28333296, "step": 46510 }, { "epoch": 14.432206019236736, "grad_norm": 5.852175712585449, "learning_rate": 2.1810032510821893e-06, "loss": 0.1945, "num_input_tokens_seen": 28335856, "step": 46515 }, { "epoch": 14.433757368910953, "grad_norm": 8.068278312683105, "learning_rate": 2.179885228447197e-06, "loss": 0.1852, "num_input_tokens_seen": 28338864, "step": 46520 }, { "epoch": 14.43530871858517, "grad_norm": 7.756221771240234, "learning_rate": 2.1787674125600135e-06, "loss": 0.1551, "num_input_tokens_seen": 28341968, "step": 46525 }, { "epoch": 14.436860068259385, "grad_norm": 9.4908447265625, "learning_rate": 2.177649803502585e-06, "loss": 0.2112, "num_input_tokens_seen": 28345008, "step": 46530 }, { "epoch": 14.438411417933603, "grad_norm": 24.326480865478516, "learning_rate": 2.1765324013568477e-06, "loss": 0.2505, "num_input_tokens_seen": 28347760, "step": 46535 }, { "epoch": 14.439962767607819, "grad_norm": 4.938264846801758, "learning_rate": 2.175415206204723e-06, "loss": 0.2162, "num_input_tokens_seen": 28351056, "step": 46540 }, { "epoch": 14.441514117282036, "grad_norm": 14.044286727905273, "learning_rate": 2.1742982181281093e-06, "loss": 0.2714, "num_input_tokens_seen": 28353904, "step": 46545 }, { "epoch": 14.443065466956252, "grad_norm": 4.6903462409973145, "learning_rate": 2.1731814372088988e-06, "loss": 0.1868, "num_input_tokens_seen": 28356144, "step": 46550 }, { "epoch": 14.444616816630468, "grad_norm": 5.678619384765625, "learning_rate": 2.172064863528963e-06, "loss": 0.1869, "num_input_tokens_seen": 28358992, "step": 46555 }, { "epoch": 14.446168166304686, "grad_norm": 6.992372035980225, "learning_rate": 2.170948497170161e-06, "loss": 0.1977, "num_input_tokens_seen": 28361776, "step": 46560 }, { "epoch": 14.447719515978902, "grad_norm": 11.249649047851562, "learning_rate": 2.169832338214334e-06, "loss": 0.1662, "num_input_tokens_seen": 28364432, "step": 46565 }, { "epoch": 14.449270865653117, "grad_norm": 4.429062366485596, "learning_rate": 2.168716386743312e-06, "loss": 0.1986, "num_input_tokens_seen": 28366992, "step": 46570 }, { "epoch": 14.450822215327335, "grad_norm": 8.2557954788208, "learning_rate": 2.167600642838905e-06, "loss": 0.1734, "num_input_tokens_seen": 28369424, "step": 46575 }, { "epoch": 14.452373565001551, "grad_norm": 7.680972099304199, "learning_rate": 2.1664851065829136e-06, "loss": 0.149, "num_input_tokens_seen": 28372400, "step": 46580 }, { "epoch": 14.453924914675769, "grad_norm": 25.53049659729004, "learning_rate": 2.1653697780571153e-06, "loss": 0.2394, "num_input_tokens_seen": 28374704, "step": 46585 }, { "epoch": 14.455476264349985, "grad_norm": 12.657435417175293, "learning_rate": 2.164254657343281e-06, "loss": 0.1723, "num_input_tokens_seen": 28377968, "step": 46590 }, { "epoch": 14.4570276140242, "grad_norm": 4.706840991973877, "learning_rate": 2.1631397445231596e-06, "loss": 0.2061, "num_input_tokens_seen": 28381136, "step": 46595 }, { "epoch": 14.458578963698418, "grad_norm": 6.746052265167236, "learning_rate": 2.1620250396784875e-06, "loss": 0.2404, "num_input_tokens_seen": 28383152, "step": 46600 }, { "epoch": 14.460130313372634, "grad_norm": 11.830025672912598, "learning_rate": 2.1609105428909887e-06, "loss": 0.2212, "num_input_tokens_seen": 28385520, "step": 46605 }, { "epoch": 14.46168166304685, "grad_norm": 3.8608808517456055, "learning_rate": 2.1597962542423656e-06, "loss": 0.1834, "num_input_tokens_seen": 28388880, "step": 46610 }, { "epoch": 14.463233012721068, "grad_norm": 9.725711822509766, "learning_rate": 2.1586821738143114e-06, "loss": 0.1525, "num_input_tokens_seen": 28391792, "step": 46615 }, { "epoch": 14.464784362395283, "grad_norm": 7.15950345993042, "learning_rate": 2.1575683016884984e-06, "loss": 0.212, "num_input_tokens_seen": 28394960, "step": 46620 }, { "epoch": 14.466335712069501, "grad_norm": 7.2202558517456055, "learning_rate": 2.15645463794659e-06, "loss": 0.1468, "num_input_tokens_seen": 28399536, "step": 46625 }, { "epoch": 14.467887061743717, "grad_norm": 5.345359802246094, "learning_rate": 2.155341182670228e-06, "loss": 0.196, "num_input_tokens_seen": 28401968, "step": 46630 }, { "epoch": 14.469438411417933, "grad_norm": 31.34811019897461, "learning_rate": 2.1542279359410446e-06, "loss": 0.2394, "num_input_tokens_seen": 28405456, "step": 46635 }, { "epoch": 14.47098976109215, "grad_norm": 12.832953453063965, "learning_rate": 2.153114897840651e-06, "loss": 0.2227, "num_input_tokens_seen": 28407888, "step": 46640 }, { "epoch": 14.472541110766366, "grad_norm": 21.984285354614258, "learning_rate": 2.152002068450649e-06, "loss": 0.2157, "num_input_tokens_seen": 28410352, "step": 46645 }, { "epoch": 14.474092460440584, "grad_norm": 11.422250747680664, "learning_rate": 2.1508894478526182e-06, "loss": 0.2186, "num_input_tokens_seen": 28412592, "step": 46650 }, { "epoch": 14.4756438101148, "grad_norm": 20.541597366333008, "learning_rate": 2.1497770361281305e-06, "loss": 0.2754, "num_input_tokens_seen": 28415664, "step": 46655 }, { "epoch": 14.477195159789016, "grad_norm": 7.881502628326416, "learning_rate": 2.148664833358739e-06, "loss": 0.1377, "num_input_tokens_seen": 28419504, "step": 46660 }, { "epoch": 14.478746509463233, "grad_norm": 11.3777494430542, "learning_rate": 2.1475528396259782e-06, "loss": 0.2136, "num_input_tokens_seen": 28421712, "step": 46665 }, { "epoch": 14.48029785913745, "grad_norm": 5.7140398025512695, "learning_rate": 2.1464410550113747e-06, "loss": 0.1355, "num_input_tokens_seen": 28424304, "step": 46670 }, { "epoch": 14.481849208811667, "grad_norm": 8.091277122497559, "learning_rate": 2.145329479596431e-06, "loss": 0.1964, "num_input_tokens_seen": 28427792, "step": 46675 }, { "epoch": 14.483400558485883, "grad_norm": 4.048941135406494, "learning_rate": 2.1442181134626423e-06, "loss": 0.1797, "num_input_tokens_seen": 28431376, "step": 46680 }, { "epoch": 14.484951908160099, "grad_norm": 6.5745158195495605, "learning_rate": 2.1431069566914814e-06, "loss": 0.1544, "num_input_tokens_seen": 28435536, "step": 46685 }, { "epoch": 14.486503257834316, "grad_norm": 6.102020263671875, "learning_rate": 2.1419960093644137e-06, "loss": 0.17, "num_input_tokens_seen": 28438576, "step": 46690 }, { "epoch": 14.488054607508532, "grad_norm": 6.494055271148682, "learning_rate": 2.1408852715628802e-06, "loss": 0.1478, "num_input_tokens_seen": 28441104, "step": 46695 }, { "epoch": 14.489605957182748, "grad_norm": 9.87181282043457, "learning_rate": 2.139774743368315e-06, "loss": 0.2354, "num_input_tokens_seen": 28446192, "step": 46700 }, { "epoch": 14.491157306856966, "grad_norm": 14.260652542114258, "learning_rate": 2.1386644248621297e-06, "loss": 0.2549, "num_input_tokens_seen": 28449296, "step": 46705 }, { "epoch": 14.492708656531182, "grad_norm": 6.691125869750977, "learning_rate": 2.1375543161257268e-06, "loss": 0.1917, "num_input_tokens_seen": 28452784, "step": 46710 }, { "epoch": 14.4942600062054, "grad_norm": 12.215095520019531, "learning_rate": 2.1364444172404875e-06, "loss": 0.2122, "num_input_tokens_seen": 28455632, "step": 46715 }, { "epoch": 14.495811355879615, "grad_norm": 16.805578231811523, "learning_rate": 2.135334728287782e-06, "loss": 0.2256, "num_input_tokens_seen": 28459600, "step": 46720 }, { "epoch": 14.497362705553831, "grad_norm": 6.286313533782959, "learning_rate": 2.134225249348965e-06, "loss": 0.1465, "num_input_tokens_seen": 28462576, "step": 46725 }, { "epoch": 14.498914055228049, "grad_norm": 9.039971351623535, "learning_rate": 2.133115980505372e-06, "loss": 0.2077, "num_input_tokens_seen": 28464848, "step": 46730 }, { "epoch": 14.500465404902265, "grad_norm": 6.100834846496582, "learning_rate": 2.1320069218383277e-06, "loss": 0.1586, "num_input_tokens_seen": 28467920, "step": 46735 }, { "epoch": 14.50201675457648, "grad_norm": 5.893074989318848, "learning_rate": 2.130898073429137e-06, "loss": 0.14, "num_input_tokens_seen": 28470704, "step": 46740 }, { "epoch": 14.503568104250698, "grad_norm": 27.717737197875977, "learning_rate": 2.1297894353590935e-06, "loss": 0.2247, "num_input_tokens_seen": 28474480, "step": 46745 }, { "epoch": 14.505119453924914, "grad_norm": 10.24570369720459, "learning_rate": 2.128681007709472e-06, "loss": 0.2009, "num_input_tokens_seen": 28477168, "step": 46750 }, { "epoch": 14.506670803599132, "grad_norm": 18.150012969970703, "learning_rate": 2.1275727905615358e-06, "loss": 0.1938, "num_input_tokens_seen": 28479760, "step": 46755 }, { "epoch": 14.508222153273348, "grad_norm": 9.618480682373047, "learning_rate": 2.1264647839965264e-06, "loss": 0.2533, "num_input_tokens_seen": 28482736, "step": 46760 }, { "epoch": 14.509773502947564, "grad_norm": 11.459857940673828, "learning_rate": 2.125356988095678e-06, "loss": 0.1928, "num_input_tokens_seen": 28485296, "step": 46765 }, { "epoch": 14.511324852621781, "grad_norm": 3.6540720462799072, "learning_rate": 2.1242494029402017e-06, "loss": 0.1661, "num_input_tokens_seen": 28488368, "step": 46770 }, { "epoch": 14.512876202295997, "grad_norm": 5.405239582061768, "learning_rate": 2.1231420286112982e-06, "loss": 0.2006, "num_input_tokens_seen": 28491696, "step": 46775 }, { "epoch": 14.514427551970215, "grad_norm": 2.0225226879119873, "learning_rate": 2.122034865190153e-06, "loss": 0.1597, "num_input_tokens_seen": 28494608, "step": 46780 }, { "epoch": 14.51597890164443, "grad_norm": 3.943725824356079, "learning_rate": 2.12092791275793e-06, "loss": 0.1511, "num_input_tokens_seen": 28497072, "step": 46785 }, { "epoch": 14.517530251318647, "grad_norm": 6.571254730224609, "learning_rate": 2.1198211713957866e-06, "loss": 0.2014, "num_input_tokens_seen": 28500112, "step": 46790 }, { "epoch": 14.519081600992864, "grad_norm": 6.23098087310791, "learning_rate": 2.118714641184856e-06, "loss": 0.1794, "num_input_tokens_seen": 28502384, "step": 46795 }, { "epoch": 14.52063295066708, "grad_norm": 15.212446212768555, "learning_rate": 2.1176083222062633e-06, "loss": 0.2298, "num_input_tokens_seen": 28505424, "step": 46800 }, { "epoch": 14.522184300341298, "grad_norm": 7.537843704223633, "learning_rate": 2.116502214541111e-06, "loss": 0.1655, "num_input_tokens_seen": 28508240, "step": 46805 }, { "epoch": 14.523735650015514, "grad_norm": 12.268932342529297, "learning_rate": 2.1153963182704946e-06, "loss": 0.2287, "num_input_tokens_seen": 28510768, "step": 46810 }, { "epoch": 14.52528699968973, "grad_norm": 7.882757663726807, "learning_rate": 2.114290633475485e-06, "loss": 0.2395, "num_input_tokens_seen": 28513552, "step": 46815 }, { "epoch": 14.526838349363947, "grad_norm": 13.570510864257812, "learning_rate": 2.113185160237145e-06, "loss": 0.186, "num_input_tokens_seen": 28518736, "step": 46820 }, { "epoch": 14.528389699038163, "grad_norm": 6.7118330001831055, "learning_rate": 2.1120798986365167e-06, "loss": 0.2316, "num_input_tokens_seen": 28521520, "step": 46825 }, { "epoch": 14.529941048712379, "grad_norm": 9.727408409118652, "learning_rate": 2.110974848754631e-06, "loss": 0.2586, "num_input_tokens_seen": 28524208, "step": 46830 }, { "epoch": 14.531492398386597, "grad_norm": 24.502422332763672, "learning_rate": 2.109870010672499e-06, "loss": 0.2223, "num_input_tokens_seen": 28527088, "step": 46835 }, { "epoch": 14.533043748060813, "grad_norm": 6.27020788192749, "learning_rate": 2.108765384471119e-06, "loss": 0.1905, "num_input_tokens_seen": 28529232, "step": 46840 }, { "epoch": 14.53459509773503, "grad_norm": 10.949499130249023, "learning_rate": 2.107660970231476e-06, "loss": 0.1998, "num_input_tokens_seen": 28532528, "step": 46845 }, { "epoch": 14.536146447409246, "grad_norm": 16.83680534362793, "learning_rate": 2.1065567680345324e-06, "loss": 0.1626, "num_input_tokens_seen": 28534992, "step": 46850 }, { "epoch": 14.537697797083462, "grad_norm": 15.62686538696289, "learning_rate": 2.1054527779612428e-06, "loss": 0.2133, "num_input_tokens_seen": 28537232, "step": 46855 }, { "epoch": 14.53924914675768, "grad_norm": 7.606319904327393, "learning_rate": 2.1043490000925386e-06, "loss": 0.1905, "num_input_tokens_seen": 28539792, "step": 46860 }, { "epoch": 14.540800496431896, "grad_norm": 9.259350776672363, "learning_rate": 2.103245434509345e-06, "loss": 0.1779, "num_input_tokens_seen": 28542736, "step": 46865 }, { "epoch": 14.542351846106111, "grad_norm": 8.52301025390625, "learning_rate": 2.102142081292562e-06, "loss": 0.1862, "num_input_tokens_seen": 28545360, "step": 46870 }, { "epoch": 14.543903195780329, "grad_norm": 7.826541900634766, "learning_rate": 2.101038940523082e-06, "loss": 0.1759, "num_input_tokens_seen": 28548560, "step": 46875 }, { "epoch": 14.545454545454545, "grad_norm": 9.542296409606934, "learning_rate": 2.099936012281774e-06, "loss": 0.2189, "num_input_tokens_seen": 28551728, "step": 46880 }, { "epoch": 14.547005895128763, "grad_norm": 9.199196815490723, "learning_rate": 2.098833296649501e-06, "loss": 0.1474, "num_input_tokens_seen": 28554992, "step": 46885 }, { "epoch": 14.548557244802979, "grad_norm": 4.664843559265137, "learning_rate": 2.0977307937070993e-06, "loss": 0.1859, "num_input_tokens_seen": 28557712, "step": 46890 }, { "epoch": 14.550108594477194, "grad_norm": 5.81804084777832, "learning_rate": 2.096628503535401e-06, "loss": 0.1966, "num_input_tokens_seen": 28560720, "step": 46895 }, { "epoch": 14.551659944151412, "grad_norm": 14.52091121673584, "learning_rate": 2.095526426215213e-06, "loss": 0.172, "num_input_tokens_seen": 28565808, "step": 46900 }, { "epoch": 14.553211293825628, "grad_norm": 14.626030921936035, "learning_rate": 2.0944245618273317e-06, "loss": 0.2374, "num_input_tokens_seen": 28569904, "step": 46905 }, { "epoch": 14.554762643499846, "grad_norm": 16.60772132873535, "learning_rate": 2.0933229104525386e-06, "loss": 0.2463, "num_input_tokens_seen": 28572336, "step": 46910 }, { "epoch": 14.556313993174061, "grad_norm": 7.278087139129639, "learning_rate": 2.092221472171595e-06, "loss": 0.1452, "num_input_tokens_seen": 28575024, "step": 46915 }, { "epoch": 14.557865342848277, "grad_norm": 10.838269233703613, "learning_rate": 2.0911202470652525e-06, "loss": 0.1937, "num_input_tokens_seen": 28577936, "step": 46920 }, { "epoch": 14.559416692522495, "grad_norm": 5.511266708374023, "learning_rate": 2.0900192352142408e-06, "loss": 0.1461, "num_input_tokens_seen": 28583504, "step": 46925 }, { "epoch": 14.560968042196711, "grad_norm": 8.715644836425781, "learning_rate": 2.0889184366992795e-06, "loss": 0.1587, "num_input_tokens_seen": 28585744, "step": 46930 }, { "epoch": 14.562519391870929, "grad_norm": 12.080365180969238, "learning_rate": 2.087817851601068e-06, "loss": 0.1406, "num_input_tokens_seen": 28589008, "step": 46935 }, { "epoch": 14.564070741545144, "grad_norm": 13.626981735229492, "learning_rate": 2.086717480000294e-06, "loss": 0.2039, "num_input_tokens_seen": 28594032, "step": 46940 }, { "epoch": 14.56562209121936, "grad_norm": 8.05092716217041, "learning_rate": 2.0856173219776264e-06, "loss": 0.2017, "num_input_tokens_seen": 28596048, "step": 46945 }, { "epoch": 14.567173440893578, "grad_norm": 6.281787395477295, "learning_rate": 2.0845173776137223e-06, "loss": 0.1504, "num_input_tokens_seen": 28598960, "step": 46950 }, { "epoch": 14.568724790567794, "grad_norm": 12.175294876098633, "learning_rate": 2.083417646989217e-06, "loss": 0.1786, "num_input_tokens_seen": 28601648, "step": 46955 }, { "epoch": 14.57027614024201, "grad_norm": 9.723611831665039, "learning_rate": 2.0823181301847356e-06, "loss": 0.2462, "num_input_tokens_seen": 28604432, "step": 46960 }, { "epoch": 14.571827489916227, "grad_norm": 8.3881196975708, "learning_rate": 2.0812188272808874e-06, "loss": 0.2197, "num_input_tokens_seen": 28607152, "step": 46965 }, { "epoch": 14.573378839590443, "grad_norm": 5.701934814453125, "learning_rate": 2.080119738358261e-06, "loss": 0.2459, "num_input_tokens_seen": 28610960, "step": 46970 }, { "epoch": 14.574930189264661, "grad_norm": 4.859155654907227, "learning_rate": 2.079020863497436e-06, "loss": 0.1411, "num_input_tokens_seen": 28613936, "step": 46975 }, { "epoch": 14.576481538938877, "grad_norm": 13.705469131469727, "learning_rate": 2.0779222027789704e-06, "loss": 0.1431, "num_input_tokens_seen": 28617648, "step": 46980 }, { "epoch": 14.578032888613093, "grad_norm": 11.593825340270996, "learning_rate": 2.076823756283411e-06, "loss": 0.2209, "num_input_tokens_seen": 28621584, "step": 46985 }, { "epoch": 14.57958423828731, "grad_norm": 9.371803283691406, "learning_rate": 2.0757255240912836e-06, "loss": 0.1642, "num_input_tokens_seen": 28624240, "step": 46990 }, { "epoch": 14.581135587961526, "grad_norm": 7.9749298095703125, "learning_rate": 2.074627506283106e-06, "loss": 0.1335, "num_input_tokens_seen": 28627504, "step": 46995 }, { "epoch": 14.582686937635742, "grad_norm": 8.585179328918457, "learning_rate": 2.0735297029393723e-06, "loss": 0.1369, "num_input_tokens_seen": 28630704, "step": 47000 }, { "epoch": 14.58423828730996, "grad_norm": 12.703118324279785, "learning_rate": 2.0724321141405677e-06, "loss": 0.3367, "num_input_tokens_seen": 28636720, "step": 47005 }, { "epoch": 14.585789636984176, "grad_norm": 15.771602630615234, "learning_rate": 2.071334739967155e-06, "loss": 0.2567, "num_input_tokens_seen": 28639088, "step": 47010 }, { "epoch": 14.587340986658393, "grad_norm": 7.284816265106201, "learning_rate": 2.0702375804995877e-06, "loss": 0.1496, "num_input_tokens_seen": 28642128, "step": 47015 }, { "epoch": 14.58889233633261, "grad_norm": 7.5990166664123535, "learning_rate": 2.069140635818298e-06, "loss": 0.1459, "num_input_tokens_seen": 28645648, "step": 47020 }, { "epoch": 14.590443686006825, "grad_norm": 9.65488338470459, "learning_rate": 2.068043906003706e-06, "loss": 0.2673, "num_input_tokens_seen": 28648560, "step": 47025 }, { "epoch": 14.591995035681043, "grad_norm": 6.161591529846191, "learning_rate": 2.0669473911362174e-06, "loss": 0.2002, "num_input_tokens_seen": 28651920, "step": 47030 }, { "epoch": 14.593546385355259, "grad_norm": 16.35296058654785, "learning_rate": 2.0658510912962156e-06, "loss": 0.2144, "num_input_tokens_seen": 28655056, "step": 47035 }, { "epoch": 14.595097735029476, "grad_norm": 10.562538146972656, "learning_rate": 2.064755006564076e-06, "loss": 0.2215, "num_input_tokens_seen": 28657168, "step": 47040 }, { "epoch": 14.596649084703692, "grad_norm": 3.956968307495117, "learning_rate": 2.0636591370201515e-06, "loss": 0.1563, "num_input_tokens_seen": 28659984, "step": 47045 }, { "epoch": 14.598200434377908, "grad_norm": 9.230794906616211, "learning_rate": 2.062563482744785e-06, "loss": 0.234, "num_input_tokens_seen": 28662672, "step": 47050 }, { "epoch": 14.599751784052126, "grad_norm": 15.164192199707031, "learning_rate": 2.061468043818298e-06, "loss": 0.34, "num_input_tokens_seen": 28666768, "step": 47055 }, { "epoch": 14.601303133726342, "grad_norm": 15.023146629333496, "learning_rate": 2.060372820321003e-06, "loss": 0.2831, "num_input_tokens_seen": 28670512, "step": 47060 }, { "epoch": 14.60285448340056, "grad_norm": 12.374741554260254, "learning_rate": 2.0592778123331888e-06, "loss": 0.139, "num_input_tokens_seen": 28672848, "step": 47065 }, { "epoch": 14.604405833074775, "grad_norm": 3.2912707328796387, "learning_rate": 2.0581830199351337e-06, "loss": 0.1979, "num_input_tokens_seen": 28676688, "step": 47070 }, { "epoch": 14.605957182748991, "grad_norm": 25.71442222595215, "learning_rate": 2.0570884432071e-06, "loss": 0.2968, "num_input_tokens_seen": 28679440, "step": 47075 }, { "epoch": 14.607508532423209, "grad_norm": 14.21524429321289, "learning_rate": 2.0559940822293344e-06, "loss": 0.2118, "num_input_tokens_seen": 28682224, "step": 47080 }, { "epoch": 14.609059882097425, "grad_norm": 5.848026752471924, "learning_rate": 2.054899937082063e-06, "loss": 0.1611, "num_input_tokens_seen": 28687152, "step": 47085 }, { "epoch": 14.61061123177164, "grad_norm": 4.1092939376831055, "learning_rate": 2.0538060078455036e-06, "loss": 0.1817, "num_input_tokens_seen": 28690480, "step": 47090 }, { "epoch": 14.612162581445858, "grad_norm": 8.806320190429688, "learning_rate": 2.0527122945998494e-06, "loss": 0.1571, "num_input_tokens_seen": 28692752, "step": 47095 }, { "epoch": 14.613713931120074, "grad_norm": 11.272196769714355, "learning_rate": 2.0516187974252875e-06, "loss": 0.2258, "num_input_tokens_seen": 28696656, "step": 47100 }, { "epoch": 14.615265280794292, "grad_norm": 3.0455880165100098, "learning_rate": 2.05052551640198e-06, "loss": 0.1953, "num_input_tokens_seen": 28699312, "step": 47105 }, { "epoch": 14.616816630468508, "grad_norm": 12.995007514953613, "learning_rate": 2.0494324516100788e-06, "loss": 0.2686, "num_input_tokens_seen": 28702256, "step": 47110 }, { "epoch": 14.618367980142724, "grad_norm": 13.2315673828125, "learning_rate": 2.048339603129721e-06, "loss": 0.1843, "num_input_tokens_seen": 28706256, "step": 47115 }, { "epoch": 14.619919329816941, "grad_norm": 4.411521911621094, "learning_rate": 2.0472469710410213e-06, "loss": 0.1645, "num_input_tokens_seen": 28710064, "step": 47120 }, { "epoch": 14.621470679491157, "grad_norm": 5.299132823944092, "learning_rate": 2.0461545554240865e-06, "loss": 0.2212, "num_input_tokens_seen": 28713104, "step": 47125 }, { "epoch": 14.623022029165373, "grad_norm": 19.10101890563965, "learning_rate": 2.0450623563589996e-06, "loss": 0.238, "num_input_tokens_seen": 28715856, "step": 47130 }, { "epoch": 14.62457337883959, "grad_norm": 6.401401519775391, "learning_rate": 2.0439703739258348e-06, "loss": 0.164, "num_input_tokens_seen": 28717904, "step": 47135 }, { "epoch": 14.626124728513807, "grad_norm": 17.48552703857422, "learning_rate": 2.042878608204645e-06, "loss": 0.2012, "num_input_tokens_seen": 28720528, "step": 47140 }, { "epoch": 14.627676078188024, "grad_norm": 6.949411869049072, "learning_rate": 2.0417870592754727e-06, "loss": 0.1766, "num_input_tokens_seen": 28723216, "step": 47145 }, { "epoch": 14.62922742786224, "grad_norm": 14.145062446594238, "learning_rate": 2.0406957272183376e-06, "loss": 0.2284, "num_input_tokens_seen": 28725744, "step": 47150 }, { "epoch": 14.630778777536456, "grad_norm": 10.613968849182129, "learning_rate": 2.0396046121132506e-06, "loss": 0.3218, "num_input_tokens_seen": 28727824, "step": 47155 }, { "epoch": 14.632330127210674, "grad_norm": 23.665956497192383, "learning_rate": 2.0385137140402006e-06, "loss": 0.2121, "num_input_tokens_seen": 28730608, "step": 47160 }, { "epoch": 14.63388147688489, "grad_norm": 2.2262842655181885, "learning_rate": 2.037423033079164e-06, "loss": 0.1967, "num_input_tokens_seen": 28733744, "step": 47165 }, { "epoch": 14.635432826559107, "grad_norm": 7.130504608154297, "learning_rate": 2.036332569310103e-06, "loss": 0.2421, "num_input_tokens_seen": 28736880, "step": 47170 }, { "epoch": 14.636984176233323, "grad_norm": 8.22089958190918, "learning_rate": 2.0352423228129585e-06, "loss": 0.167, "num_input_tokens_seen": 28739536, "step": 47175 }, { "epoch": 14.638535525907539, "grad_norm": 4.802649021148682, "learning_rate": 2.034152293667661e-06, "loss": 0.1612, "num_input_tokens_seen": 28742512, "step": 47180 }, { "epoch": 14.640086875581757, "grad_norm": 7.900918483734131, "learning_rate": 2.03306248195412e-06, "loss": 0.1619, "num_input_tokens_seen": 28745232, "step": 47185 }, { "epoch": 14.641638225255972, "grad_norm": 14.953804016113281, "learning_rate": 2.0319728877522345e-06, "loss": 0.1677, "num_input_tokens_seen": 28748400, "step": 47190 }, { "epoch": 14.64318957493019, "grad_norm": 9.257165908813477, "learning_rate": 2.0308835111418805e-06, "loss": 0.1881, "num_input_tokens_seen": 28751280, "step": 47195 }, { "epoch": 14.644740924604406, "grad_norm": 13.02015209197998, "learning_rate": 2.0297943522029274e-06, "loss": 0.2632, "num_input_tokens_seen": 28754480, "step": 47200 }, { "epoch": 14.646292274278622, "grad_norm": 4.4308977127075195, "learning_rate": 2.0287054110152186e-06, "loss": 0.1275, "num_input_tokens_seen": 28756912, "step": 47205 }, { "epoch": 14.64784362395284, "grad_norm": 6.093876838684082, "learning_rate": 2.0276166876585905e-06, "loss": 0.28, "num_input_tokens_seen": 28759824, "step": 47210 }, { "epoch": 14.649394973627055, "grad_norm": 4.749088764190674, "learning_rate": 2.026528182212856e-06, "loss": 0.2079, "num_input_tokens_seen": 28762480, "step": 47215 }, { "epoch": 14.650946323301271, "grad_norm": 11.799234390258789, "learning_rate": 2.025439894757818e-06, "loss": 0.217, "num_input_tokens_seen": 28764816, "step": 47220 }, { "epoch": 14.652497672975489, "grad_norm": 7.0923895835876465, "learning_rate": 2.0243518253732587e-06, "loss": 0.1355, "num_input_tokens_seen": 28767184, "step": 47225 }, { "epoch": 14.654049022649705, "grad_norm": 10.669132232666016, "learning_rate": 2.023263974138947e-06, "loss": 0.1983, "num_input_tokens_seen": 28771056, "step": 47230 }, { "epoch": 14.655600372323923, "grad_norm": 24.132169723510742, "learning_rate": 2.022176341134638e-06, "loss": 0.2355, "num_input_tokens_seen": 28774352, "step": 47235 }, { "epoch": 14.657151721998138, "grad_norm": 18.762157440185547, "learning_rate": 2.0210889264400635e-06, "loss": 0.3106, "num_input_tokens_seen": 28776944, "step": 47240 }, { "epoch": 14.658703071672354, "grad_norm": 6.051537036895752, "learning_rate": 2.020001730134949e-06, "loss": 0.1393, "num_input_tokens_seen": 28780304, "step": 47245 }, { "epoch": 14.660254421346572, "grad_norm": 7.71204948425293, "learning_rate": 2.0189147522989937e-06, "loss": 0.1296, "num_input_tokens_seen": 28784688, "step": 47250 }, { "epoch": 14.661805771020788, "grad_norm": 7.833967208862305, "learning_rate": 2.0178279930118904e-06, "loss": 0.1793, "num_input_tokens_seen": 28787504, "step": 47255 }, { "epoch": 14.663357120695004, "grad_norm": 6.84053373336792, "learning_rate": 2.0167414523533073e-06, "loss": 0.1934, "num_input_tokens_seen": 28789840, "step": 47260 }, { "epoch": 14.664908470369221, "grad_norm": 20.524215698242188, "learning_rate": 2.015655130402905e-06, "loss": 0.2746, "num_input_tokens_seen": 28792688, "step": 47265 }, { "epoch": 14.666459820043437, "grad_norm": 12.266488075256348, "learning_rate": 2.0145690272403195e-06, "loss": 0.1694, "num_input_tokens_seen": 28795600, "step": 47270 }, { "epoch": 14.668011169717655, "grad_norm": 6.944738388061523, "learning_rate": 2.013483142945179e-06, "loss": 0.194, "num_input_tokens_seen": 28799536, "step": 47275 }, { "epoch": 14.66956251939187, "grad_norm": 16.66614532470703, "learning_rate": 2.012397477597088e-06, "loss": 0.2282, "num_input_tokens_seen": 28802768, "step": 47280 }, { "epoch": 14.671113869066087, "grad_norm": 5.5296759605407715, "learning_rate": 2.01131203127564e-06, "loss": 0.2078, "num_input_tokens_seen": 28805360, "step": 47285 }, { "epoch": 14.672665218740304, "grad_norm": 5.636786460876465, "learning_rate": 2.010226804060414e-06, "loss": 0.2166, "num_input_tokens_seen": 28807824, "step": 47290 }, { "epoch": 14.67421656841452, "grad_norm": 8.899688720703125, "learning_rate": 2.0091417960309658e-06, "loss": 0.2021, "num_input_tokens_seen": 28810640, "step": 47295 }, { "epoch": 14.675767918088738, "grad_norm": 7.655430793762207, "learning_rate": 2.008057007266842e-06, "loss": 0.1504, "num_input_tokens_seen": 28813456, "step": 47300 }, { "epoch": 14.677319267762954, "grad_norm": 8.21194839477539, "learning_rate": 2.006972437847568e-06, "loss": 0.2074, "num_input_tokens_seen": 28815856, "step": 47305 }, { "epoch": 14.67887061743717, "grad_norm": 8.185332298278809, "learning_rate": 2.0058880878526587e-06, "loss": 0.2075, "num_input_tokens_seen": 28819056, "step": 47310 }, { "epoch": 14.680421967111387, "grad_norm": 13.857458114624023, "learning_rate": 2.004803957361607e-06, "loss": 0.2516, "num_input_tokens_seen": 28821872, "step": 47315 }, { "epoch": 14.681973316785603, "grad_norm": 9.21326732635498, "learning_rate": 2.0037200464538945e-06, "loss": 0.1723, "num_input_tokens_seen": 28824816, "step": 47320 }, { "epoch": 14.683524666459821, "grad_norm": 8.357462882995605, "learning_rate": 2.002636355208983e-06, "loss": 0.2094, "num_input_tokens_seen": 28827504, "step": 47325 }, { "epoch": 14.685076016134037, "grad_norm": 7.555082321166992, "learning_rate": 2.001552883706321e-06, "loss": 0.1919, "num_input_tokens_seen": 28830800, "step": 47330 }, { "epoch": 14.686627365808253, "grad_norm": 6.736879825592041, "learning_rate": 2.0004696320253395e-06, "loss": 0.1783, "num_input_tokens_seen": 28833552, "step": 47335 }, { "epoch": 14.68817871548247, "grad_norm": 13.408549308776855, "learning_rate": 1.999386600245455e-06, "loss": 0.2505, "num_input_tokens_seen": 28836464, "step": 47340 }, { "epoch": 14.689730065156686, "grad_norm": 4.677724361419678, "learning_rate": 1.9983037884460634e-06, "loss": 0.1355, "num_input_tokens_seen": 28839632, "step": 47345 }, { "epoch": 14.691281414830902, "grad_norm": 7.593091011047363, "learning_rate": 1.997221196706549e-06, "loss": 0.16, "num_input_tokens_seen": 28842448, "step": 47350 }, { "epoch": 14.69283276450512, "grad_norm": 10.777823448181152, "learning_rate": 1.9961388251062814e-06, "loss": 0.2124, "num_input_tokens_seen": 28845200, "step": 47355 }, { "epoch": 14.694384114179336, "grad_norm": 20.018108367919922, "learning_rate": 1.9950566737246075e-06, "loss": 0.2277, "num_input_tokens_seen": 28848720, "step": 47360 }, { "epoch": 14.695935463853553, "grad_norm": 11.061967849731445, "learning_rate": 1.9939747426408647e-06, "loss": 0.2471, "num_input_tokens_seen": 28851568, "step": 47365 }, { "epoch": 14.69748681352777, "grad_norm": 13.051291465759277, "learning_rate": 1.9928930319343674e-06, "loss": 0.1793, "num_input_tokens_seen": 28854032, "step": 47370 }, { "epoch": 14.699038163201985, "grad_norm": 7.248337268829346, "learning_rate": 1.991811541684423e-06, "loss": 0.224, "num_input_tokens_seen": 28857424, "step": 47375 }, { "epoch": 14.700589512876203, "grad_norm": 7.654515266418457, "learning_rate": 1.990730271970313e-06, "loss": 0.193, "num_input_tokens_seen": 28860080, "step": 47380 }, { "epoch": 14.702140862550419, "grad_norm": 14.457225799560547, "learning_rate": 1.989649222871311e-06, "loss": 0.1761, "num_input_tokens_seen": 28862864, "step": 47385 }, { "epoch": 14.703692212224635, "grad_norm": 10.433137893676758, "learning_rate": 1.9885683944666667e-06, "loss": 0.1567, "num_input_tokens_seen": 28867376, "step": 47390 }, { "epoch": 14.705243561898852, "grad_norm": 12.847010612487793, "learning_rate": 1.9874877868356217e-06, "loss": 0.2212, "num_input_tokens_seen": 28871312, "step": 47395 }, { "epoch": 14.706794911573068, "grad_norm": 5.033293724060059, "learning_rate": 1.986407400057394e-06, "loss": 0.1377, "num_input_tokens_seen": 28875216, "step": 47400 }, { "epoch": 14.708346261247286, "grad_norm": 9.955057144165039, "learning_rate": 1.98532723421119e-06, "loss": 0.2616, "num_input_tokens_seen": 28877904, "step": 47405 }, { "epoch": 14.709897610921502, "grad_norm": 5.967113018035889, "learning_rate": 1.9842472893762003e-06, "loss": 0.2909, "num_input_tokens_seen": 28880752, "step": 47410 }, { "epoch": 14.711448960595717, "grad_norm": 9.690028190612793, "learning_rate": 1.9831675656315947e-06, "loss": 0.2014, "num_input_tokens_seen": 28883920, "step": 47415 }, { "epoch": 14.713000310269935, "grad_norm": 6.15491247177124, "learning_rate": 1.9820880630565327e-06, "loss": 0.1993, "num_input_tokens_seen": 28887568, "step": 47420 }, { "epoch": 14.714551659944151, "grad_norm": 5.089820861816406, "learning_rate": 1.981008781730151e-06, "loss": 0.1677, "num_input_tokens_seen": 28890256, "step": 47425 }, { "epoch": 14.716103009618369, "grad_norm": 11.616708755493164, "learning_rate": 1.979929721731578e-06, "loss": 0.1838, "num_input_tokens_seen": 28892944, "step": 47430 }, { "epoch": 14.717654359292585, "grad_norm": 10.230097770690918, "learning_rate": 1.9788508831399176e-06, "loss": 0.2247, "num_input_tokens_seen": 28896144, "step": 47435 }, { "epoch": 14.7192057089668, "grad_norm": 2.8034253120422363, "learning_rate": 1.9777722660342654e-06, "loss": 0.1408, "num_input_tokens_seen": 28899440, "step": 47440 }, { "epoch": 14.720757058641018, "grad_norm": 4.651202201843262, "learning_rate": 1.9766938704936924e-06, "loss": 0.2606, "num_input_tokens_seen": 28901680, "step": 47445 }, { "epoch": 14.722308408315234, "grad_norm": 8.719097137451172, "learning_rate": 1.975615696597261e-06, "loss": 0.181, "num_input_tokens_seen": 28904720, "step": 47450 }, { "epoch": 14.723859757989452, "grad_norm": 6.462963104248047, "learning_rate": 1.974537744424013e-06, "loss": 0.1898, "num_input_tokens_seen": 28907184, "step": 47455 }, { "epoch": 14.725411107663668, "grad_norm": 6.193231582641602, "learning_rate": 1.973460014052976e-06, "loss": 0.157, "num_input_tokens_seen": 28909840, "step": 47460 }, { "epoch": 14.726962457337883, "grad_norm": 9.99256420135498, "learning_rate": 1.9723825055631574e-06, "loss": 0.1795, "num_input_tokens_seen": 28912560, "step": 47465 }, { "epoch": 14.728513807012101, "grad_norm": 5.356614589691162, "learning_rate": 1.971305219033554e-06, "loss": 0.245, "num_input_tokens_seen": 28914832, "step": 47470 }, { "epoch": 14.730065156686317, "grad_norm": 9.779902458190918, "learning_rate": 1.970228154543145e-06, "loss": 0.2225, "num_input_tokens_seen": 28917584, "step": 47475 }, { "epoch": 14.731616506360533, "grad_norm": 3.8245198726654053, "learning_rate": 1.969151312170888e-06, "loss": 0.1761, "num_input_tokens_seen": 28920208, "step": 47480 }, { "epoch": 14.73316785603475, "grad_norm": 6.705308437347412, "learning_rate": 1.9680746919957326e-06, "loss": 0.1596, "num_input_tokens_seen": 28923280, "step": 47485 }, { "epoch": 14.734719205708966, "grad_norm": 9.27151107788086, "learning_rate": 1.9669982940966036e-06, "loss": 0.2151, "num_input_tokens_seen": 28926416, "step": 47490 }, { "epoch": 14.736270555383184, "grad_norm": 10.806581497192383, "learning_rate": 1.9659221185524176e-06, "loss": 0.2138, "num_input_tokens_seen": 28929616, "step": 47495 }, { "epoch": 14.7378219050574, "grad_norm": 3.9697933197021484, "learning_rate": 1.9648461654420676e-06, "loss": 0.2029, "num_input_tokens_seen": 28932464, "step": 47500 }, { "epoch": 14.739373254731616, "grad_norm": 10.309894561767578, "learning_rate": 1.963770434844437e-06, "loss": 0.1739, "num_input_tokens_seen": 28935152, "step": 47505 }, { "epoch": 14.740924604405834, "grad_norm": 7.110080242156982, "learning_rate": 1.962694926838386e-06, "loss": 0.1347, "num_input_tokens_seen": 28938256, "step": 47510 }, { "epoch": 14.74247595408005, "grad_norm": 4.943237781524658, "learning_rate": 1.9616196415027657e-06, "loss": 0.1219, "num_input_tokens_seen": 28941552, "step": 47515 }, { "epoch": 14.744027303754265, "grad_norm": 3.7052462100982666, "learning_rate": 1.9605445789164035e-06, "loss": 0.1728, "num_input_tokens_seen": 28943920, "step": 47520 }, { "epoch": 14.745578653428483, "grad_norm": 8.378334999084473, "learning_rate": 1.959469739158116e-06, "loss": 0.2599, "num_input_tokens_seen": 28949456, "step": 47525 }, { "epoch": 14.747130003102699, "grad_norm": 6.39622688293457, "learning_rate": 1.9583951223067043e-06, "loss": 0.173, "num_input_tokens_seen": 28952112, "step": 47530 }, { "epoch": 14.748681352776916, "grad_norm": 5.231078624725342, "learning_rate": 1.9573207284409464e-06, "loss": 0.2241, "num_input_tokens_seen": 28955216, "step": 47535 }, { "epoch": 14.750232702451132, "grad_norm": 16.4456844329834, "learning_rate": 1.9562465576396113e-06, "loss": 0.2041, "num_input_tokens_seen": 28957744, "step": 47540 }, { "epoch": 14.751784052125348, "grad_norm": 3.7451109886169434, "learning_rate": 1.9551726099814448e-06, "loss": 0.1363, "num_input_tokens_seen": 28961840, "step": 47545 }, { "epoch": 14.753335401799566, "grad_norm": 10.972792625427246, "learning_rate": 1.9540988855451844e-06, "loss": 0.2029, "num_input_tokens_seen": 28964464, "step": 47550 }, { "epoch": 14.754886751473782, "grad_norm": 8.910443305969238, "learning_rate": 1.9530253844095425e-06, "loss": 0.2154, "num_input_tokens_seen": 28967440, "step": 47555 }, { "epoch": 14.756438101148, "grad_norm": 12.052919387817383, "learning_rate": 1.9519521066532236e-06, "loss": 0.248, "num_input_tokens_seen": 28970288, "step": 47560 }, { "epoch": 14.757989450822215, "grad_norm": 4.985607624053955, "learning_rate": 1.9508790523549077e-06, "loss": 0.1521, "num_input_tokens_seen": 28972944, "step": 47565 }, { "epoch": 14.759540800496431, "grad_norm": 4.338404655456543, "learning_rate": 1.949806221593266e-06, "loss": 0.2171, "num_input_tokens_seen": 28975952, "step": 47570 }, { "epoch": 14.761092150170649, "grad_norm": 3.1659817695617676, "learning_rate": 1.948733614446946e-06, "loss": 0.2034, "num_input_tokens_seen": 28979888, "step": 47575 }, { "epoch": 14.762643499844865, "grad_norm": 5.562023162841797, "learning_rate": 1.9476612309945863e-06, "loss": 0.1514, "num_input_tokens_seen": 28983152, "step": 47580 }, { "epoch": 14.764194849519082, "grad_norm": 11.0587158203125, "learning_rate": 1.9465890713148023e-06, "loss": 0.228, "num_input_tokens_seen": 28985712, "step": 47585 }, { "epoch": 14.765746199193298, "grad_norm": 4.0056471824646, "learning_rate": 1.9455171354861963e-06, "loss": 0.1644, "num_input_tokens_seen": 28988752, "step": 47590 }, { "epoch": 14.767297548867514, "grad_norm": 3.351557970046997, "learning_rate": 1.9444454235873573e-06, "loss": 0.1764, "num_input_tokens_seen": 28991952, "step": 47595 }, { "epoch": 14.768848898541732, "grad_norm": 5.7779459953308105, "learning_rate": 1.9433739356968497e-06, "loss": 0.1855, "num_input_tokens_seen": 28995248, "step": 47600 }, { "epoch": 14.770400248215948, "grad_norm": 9.252043724060059, "learning_rate": 1.9423026718932298e-06, "loss": 0.1853, "num_input_tokens_seen": 28998320, "step": 47605 }, { "epoch": 14.771951597890164, "grad_norm": 6.4958720207214355, "learning_rate": 1.9412316322550316e-06, "loss": 0.1884, "num_input_tokens_seen": 29002000, "step": 47610 }, { "epoch": 14.773502947564381, "grad_norm": 5.327998161315918, "learning_rate": 1.940160816860777e-06, "loss": 0.1684, "num_input_tokens_seen": 29005296, "step": 47615 }, { "epoch": 14.775054297238597, "grad_norm": 14.348482131958008, "learning_rate": 1.9390902257889678e-06, "loss": 0.238, "num_input_tokens_seen": 29008048, "step": 47620 }, { "epoch": 14.776605646912815, "grad_norm": 7.2053093910217285, "learning_rate": 1.9380198591180925e-06, "loss": 0.1868, "num_input_tokens_seen": 29011024, "step": 47625 }, { "epoch": 14.77815699658703, "grad_norm": 7.285340785980225, "learning_rate": 1.9369497169266193e-06, "loss": 0.1834, "num_input_tokens_seen": 29013872, "step": 47630 }, { "epoch": 14.779708346261247, "grad_norm": 13.456886291503906, "learning_rate": 1.9358797992930057e-06, "loss": 0.1937, "num_input_tokens_seen": 29017360, "step": 47635 }, { "epoch": 14.781259695935464, "grad_norm": 8.948063850402832, "learning_rate": 1.9348101062956853e-06, "loss": 0.1861, "num_input_tokens_seen": 29020528, "step": 47640 }, { "epoch": 14.78281104560968, "grad_norm": 10.952498435974121, "learning_rate": 1.933740638013082e-06, "loss": 0.208, "num_input_tokens_seen": 29023056, "step": 47645 }, { "epoch": 14.784362395283896, "grad_norm": 5.507214069366455, "learning_rate": 1.9326713945235993e-06, "loss": 0.1363, "num_input_tokens_seen": 29025520, "step": 47650 }, { "epoch": 14.785913744958114, "grad_norm": 6.771298408508301, "learning_rate": 1.931602375905628e-06, "loss": 0.1645, "num_input_tokens_seen": 29028336, "step": 47655 }, { "epoch": 14.78746509463233, "grad_norm": 16.99022102355957, "learning_rate": 1.930533582237536e-06, "loss": 0.197, "num_input_tokens_seen": 29031568, "step": 47660 }, { "epoch": 14.789016444306547, "grad_norm": 9.99662971496582, "learning_rate": 1.9294650135976816e-06, "loss": 0.1115, "num_input_tokens_seen": 29034736, "step": 47665 }, { "epoch": 14.790567793980763, "grad_norm": 3.7692806720733643, "learning_rate": 1.9283966700644012e-06, "loss": 0.168, "num_input_tokens_seen": 29036816, "step": 47670 }, { "epoch": 14.792119143654979, "grad_norm": 8.74315357208252, "learning_rate": 1.9273285517160178e-06, "loss": 0.1708, "num_input_tokens_seen": 29039632, "step": 47675 }, { "epoch": 14.793670493329197, "grad_norm": 16.820951461791992, "learning_rate": 1.9262606586308395e-06, "loss": 0.2813, "num_input_tokens_seen": 29042704, "step": 47680 }, { "epoch": 14.795221843003413, "grad_norm": 6.299307346343994, "learning_rate": 1.9251929908871514e-06, "loss": 0.2038, "num_input_tokens_seen": 29045616, "step": 47685 }, { "epoch": 14.79677319267763, "grad_norm": 8.381993293762207, "learning_rate": 1.9241255485632306e-06, "loss": 0.1596, "num_input_tokens_seen": 29048368, "step": 47690 }, { "epoch": 14.798324542351846, "grad_norm": 7.9701972007751465, "learning_rate": 1.923058331737328e-06, "loss": 0.2457, "num_input_tokens_seen": 29050832, "step": 47695 }, { "epoch": 14.799875892026062, "grad_norm": 3.835780382156372, "learning_rate": 1.921991340487689e-06, "loss": 0.1505, "num_input_tokens_seen": 29054128, "step": 47700 }, { "epoch": 14.80142724170028, "grad_norm": 3.7151992321014404, "learning_rate": 1.920924574892531e-06, "loss": 0.1717, "num_input_tokens_seen": 29056688, "step": 47705 }, { "epoch": 14.802978591374496, "grad_norm": 5.812088966369629, "learning_rate": 1.9198580350300662e-06, "loss": 0.3282, "num_input_tokens_seen": 29060496, "step": 47710 }, { "epoch": 14.804529941048713, "grad_norm": 8.297982215881348, "learning_rate": 1.918791720978479e-06, "loss": 0.1834, "num_input_tokens_seen": 29063312, "step": 47715 }, { "epoch": 14.806081290722929, "grad_norm": 16.42821502685547, "learning_rate": 1.917725632815948e-06, "loss": 0.228, "num_input_tokens_seen": 29066352, "step": 47720 }, { "epoch": 14.807632640397145, "grad_norm": 7.331854343414307, "learning_rate": 1.916659770620626e-06, "loss": 0.1923, "num_input_tokens_seen": 29069200, "step": 47725 }, { "epoch": 14.809183990071363, "grad_norm": 18.88544273376465, "learning_rate": 1.9155941344706547e-06, "loss": 0.1686, "num_input_tokens_seen": 29071728, "step": 47730 }, { "epoch": 14.810735339745579, "grad_norm": 10.163895606994629, "learning_rate": 1.91452872444416e-06, "loss": 0.2078, "num_input_tokens_seen": 29074416, "step": 47735 }, { "epoch": 14.812286689419794, "grad_norm": 4.5985894203186035, "learning_rate": 1.9134635406192453e-06, "loss": 0.1123, "num_input_tokens_seen": 29077200, "step": 47740 }, { "epoch": 14.813838039094012, "grad_norm": 7.673450469970703, "learning_rate": 1.9123985830740054e-06, "loss": 0.2195, "num_input_tokens_seen": 29079440, "step": 47745 }, { "epoch": 14.815389388768228, "grad_norm": 16.006179809570312, "learning_rate": 1.91133385188651e-06, "loss": 0.2603, "num_input_tokens_seen": 29082224, "step": 47750 }, { "epoch": 14.816940738442446, "grad_norm": 7.928204536437988, "learning_rate": 1.9102693471348206e-06, "loss": 0.213, "num_input_tokens_seen": 29085136, "step": 47755 }, { "epoch": 14.818492088116662, "grad_norm": 9.889561653137207, "learning_rate": 1.9092050688969736e-06, "loss": 0.2413, "num_input_tokens_seen": 29087856, "step": 47760 }, { "epoch": 14.820043437790877, "grad_norm": 25.424131393432617, "learning_rate": 1.9081410172509975e-06, "loss": 0.3179, "num_input_tokens_seen": 29090416, "step": 47765 }, { "epoch": 14.821594787465095, "grad_norm": 6.5542683601379395, "learning_rate": 1.9070771922748965e-06, "loss": 0.1733, "num_input_tokens_seen": 29093328, "step": 47770 }, { "epoch": 14.823146137139311, "grad_norm": 7.574337482452393, "learning_rate": 1.9060135940466645e-06, "loss": 0.2452, "num_input_tokens_seen": 29096112, "step": 47775 }, { "epoch": 14.824697486813527, "grad_norm": 9.419388771057129, "learning_rate": 1.9049502226442724e-06, "loss": 0.176, "num_input_tokens_seen": 29099664, "step": 47780 }, { "epoch": 14.826248836487744, "grad_norm": 21.026714324951172, "learning_rate": 1.903887078145682e-06, "loss": 0.224, "num_input_tokens_seen": 29101968, "step": 47785 }, { "epoch": 14.82780018616196, "grad_norm": 10.09738826751709, "learning_rate": 1.90282416062883e-06, "loss": 0.1703, "num_input_tokens_seen": 29104976, "step": 47790 }, { "epoch": 14.829351535836178, "grad_norm": 9.463761329650879, "learning_rate": 1.9017614701716435e-06, "loss": 0.1994, "num_input_tokens_seen": 29107440, "step": 47795 }, { "epoch": 14.830902885510394, "grad_norm": 7.448822021484375, "learning_rate": 1.9006990068520314e-06, "loss": 0.1426, "num_input_tokens_seen": 29110256, "step": 47800 }, { "epoch": 14.83245423518461, "grad_norm": 7.648019790649414, "learning_rate": 1.8996367707478814e-06, "loss": 0.2328, "num_input_tokens_seen": 29113520, "step": 47805 }, { "epoch": 14.834005584858827, "grad_norm": 7.969701766967773, "learning_rate": 1.898574761937072e-06, "loss": 0.2004, "num_input_tokens_seen": 29118192, "step": 47810 }, { "epoch": 14.835556934533043, "grad_norm": 11.475984573364258, "learning_rate": 1.8975129804974569e-06, "loss": 0.1856, "num_input_tokens_seen": 29120784, "step": 47815 }, { "epoch": 14.837108284207261, "grad_norm": 21.933948516845703, "learning_rate": 1.8964514265068812e-06, "loss": 0.2252, "num_input_tokens_seen": 29123248, "step": 47820 }, { "epoch": 14.838659633881477, "grad_norm": 8.716480255126953, "learning_rate": 1.8953901000431656e-06, "loss": 0.155, "num_input_tokens_seen": 29126352, "step": 47825 }, { "epoch": 14.840210983555693, "grad_norm": 6.1448822021484375, "learning_rate": 1.894329001184122e-06, "loss": 0.1407, "num_input_tokens_seen": 29128784, "step": 47830 }, { "epoch": 14.84176233322991, "grad_norm": 13.567063331604004, "learning_rate": 1.8932681300075366e-06, "loss": 0.2023, "num_input_tokens_seen": 29131632, "step": 47835 }, { "epoch": 14.843313682904126, "grad_norm": 2.947502613067627, "learning_rate": 1.8922074865911889e-06, "loss": 0.1206, "num_input_tokens_seen": 29134224, "step": 47840 }, { "epoch": 14.844865032578344, "grad_norm": 6.073634147644043, "learning_rate": 1.891147071012832e-06, "loss": 0.2362, "num_input_tokens_seen": 29136752, "step": 47845 }, { "epoch": 14.84641638225256, "grad_norm": 5.239945888519287, "learning_rate": 1.8900868833502095e-06, "loss": 0.1861, "num_input_tokens_seen": 29139632, "step": 47850 }, { "epoch": 14.847967731926776, "grad_norm": 5.874393939971924, "learning_rate": 1.889026923681047e-06, "loss": 0.1863, "num_input_tokens_seen": 29143152, "step": 47855 }, { "epoch": 14.849519081600993, "grad_norm": 30.057111740112305, "learning_rate": 1.887967192083049e-06, "loss": 0.269, "num_input_tokens_seen": 29146608, "step": 47860 }, { "epoch": 14.85107043127521, "grad_norm": 11.130732536315918, "learning_rate": 1.886907688633909e-06, "loss": 0.1755, "num_input_tokens_seen": 29150192, "step": 47865 }, { "epoch": 14.852621780949425, "grad_norm": 5.422685146331787, "learning_rate": 1.885848413411298e-06, "loss": 0.182, "num_input_tokens_seen": 29153456, "step": 47870 }, { "epoch": 14.854173130623643, "grad_norm": 4.0653862953186035, "learning_rate": 1.884789366492878e-06, "loss": 0.1773, "num_input_tokens_seen": 29157680, "step": 47875 }, { "epoch": 14.855724480297859, "grad_norm": 24.131811141967773, "learning_rate": 1.8837305479562845e-06, "loss": 0.179, "num_input_tokens_seen": 29160400, "step": 47880 }, { "epoch": 14.857275829972076, "grad_norm": 5.003453254699707, "learning_rate": 1.8826719578791463e-06, "loss": 0.2071, "num_input_tokens_seen": 29162832, "step": 47885 }, { "epoch": 14.858827179646292, "grad_norm": 16.167585372924805, "learning_rate": 1.8816135963390663e-06, "loss": 0.202, "num_input_tokens_seen": 29167120, "step": 47890 }, { "epoch": 14.860378529320508, "grad_norm": 10.871011734008789, "learning_rate": 1.8805554634136385e-06, "loss": 0.2479, "num_input_tokens_seen": 29170576, "step": 47895 }, { "epoch": 14.861929878994726, "grad_norm": 9.8560152053833, "learning_rate": 1.8794975591804332e-06, "loss": 0.1794, "num_input_tokens_seen": 29173584, "step": 47900 }, { "epoch": 14.863481228668942, "grad_norm": 2.563666343688965, "learning_rate": 1.8784398837170109e-06, "loss": 0.1496, "num_input_tokens_seen": 29177360, "step": 47905 }, { "epoch": 14.865032578343158, "grad_norm": 14.893199920654297, "learning_rate": 1.8773824371009087e-06, "loss": 0.2425, "num_input_tokens_seen": 29180368, "step": 47910 }, { "epoch": 14.866583928017375, "grad_norm": 18.932416915893555, "learning_rate": 1.8763252194096504e-06, "loss": 0.2029, "num_input_tokens_seen": 29183312, "step": 47915 }, { "epoch": 14.868135277691591, "grad_norm": 9.153458595275879, "learning_rate": 1.8752682307207454e-06, "loss": 0.1627, "num_input_tokens_seen": 29185552, "step": 47920 }, { "epoch": 14.869686627365809, "grad_norm": 11.108380317687988, "learning_rate": 1.8742114711116798e-06, "loss": 0.2372, "num_input_tokens_seen": 29188144, "step": 47925 }, { "epoch": 14.871237977040025, "grad_norm": 6.778042316436768, "learning_rate": 1.8731549406599303e-06, "loss": 0.2716, "num_input_tokens_seen": 29191632, "step": 47930 }, { "epoch": 14.87278932671424, "grad_norm": 12.726127624511719, "learning_rate": 1.8720986394429485e-06, "loss": 0.1874, "num_input_tokens_seen": 29194032, "step": 47935 }, { "epoch": 14.874340676388458, "grad_norm": 8.254398345947266, "learning_rate": 1.8710425675381788e-06, "loss": 0.2166, "num_input_tokens_seen": 29196720, "step": 47940 }, { "epoch": 14.875892026062674, "grad_norm": 5.205128192901611, "learning_rate": 1.8699867250230392e-06, "loss": 0.1525, "num_input_tokens_seen": 29201808, "step": 47945 }, { "epoch": 14.877443375736892, "grad_norm": 8.936247825622559, "learning_rate": 1.8689311119749393e-06, "loss": 0.1417, "num_input_tokens_seen": 29205072, "step": 47950 }, { "epoch": 14.878994725411108, "grad_norm": 10.815876007080078, "learning_rate": 1.8678757284712646e-06, "loss": 0.155, "num_input_tokens_seen": 29207568, "step": 47955 }, { "epoch": 14.880546075085324, "grad_norm": 13.672472953796387, "learning_rate": 1.8668205745893909e-06, "loss": 0.2251, "num_input_tokens_seen": 29210256, "step": 47960 }, { "epoch": 14.882097424759541, "grad_norm": 8.973258018493652, "learning_rate": 1.865765650406669e-06, "loss": 0.1686, "num_input_tokens_seen": 29214512, "step": 47965 }, { "epoch": 14.883648774433757, "grad_norm": 7.497221946716309, "learning_rate": 1.8647109560004422e-06, "loss": 0.2166, "num_input_tokens_seen": 29217424, "step": 47970 }, { "epoch": 14.885200124107975, "grad_norm": 9.991074562072754, "learning_rate": 1.8636564914480282e-06, "loss": 0.1604, "num_input_tokens_seen": 29220272, "step": 47975 }, { "epoch": 14.88675147378219, "grad_norm": 11.785683631896973, "learning_rate": 1.8626022568267326e-06, "loss": 0.1774, "num_input_tokens_seen": 29223152, "step": 47980 }, { "epoch": 14.888302823456407, "grad_norm": 16.73513412475586, "learning_rate": 1.8615482522138455e-06, "loss": 0.2501, "num_input_tokens_seen": 29226128, "step": 47985 }, { "epoch": 14.889854173130624, "grad_norm": 15.270927429199219, "learning_rate": 1.8604944776866352e-06, "loss": 0.185, "num_input_tokens_seen": 29229744, "step": 47990 }, { "epoch": 14.89140552280484, "grad_norm": 13.720638275146484, "learning_rate": 1.859440933322359e-06, "loss": 0.1761, "num_input_tokens_seen": 29234384, "step": 47995 }, { "epoch": 14.892956872479056, "grad_norm": 11.38033390045166, "learning_rate": 1.8583876191982497e-06, "loss": 0.198, "num_input_tokens_seen": 29236848, "step": 48000 }, { "epoch": 14.894508222153274, "grad_norm": 9.590109825134277, "learning_rate": 1.8573345353915323e-06, "loss": 0.2419, "num_input_tokens_seen": 29241648, "step": 48005 }, { "epoch": 14.89605957182749, "grad_norm": 5.762617111206055, "learning_rate": 1.856281681979406e-06, "loss": 0.2396, "num_input_tokens_seen": 29244720, "step": 48010 }, { "epoch": 14.897610921501707, "grad_norm": 6.374375343322754, "learning_rate": 1.855229059039062e-06, "loss": 0.1437, "num_input_tokens_seen": 29249488, "step": 48015 }, { "epoch": 14.899162271175923, "grad_norm": 8.054864883422852, "learning_rate": 1.8541766666476658e-06, "loss": 0.184, "num_input_tokens_seen": 29252592, "step": 48020 }, { "epoch": 14.900713620850139, "grad_norm": 8.623296737670898, "learning_rate": 1.8531245048823731e-06, "loss": 0.1767, "num_input_tokens_seen": 29255440, "step": 48025 }, { "epoch": 14.902264970524357, "grad_norm": 13.098541259765625, "learning_rate": 1.8520725738203177e-06, "loss": 0.2323, "num_input_tokens_seen": 29257808, "step": 48030 }, { "epoch": 14.903816320198572, "grad_norm": 9.411223411560059, "learning_rate": 1.8510208735386193e-06, "loss": 0.1408, "num_input_tokens_seen": 29260368, "step": 48035 }, { "epoch": 14.905367669872788, "grad_norm": 8.062325477600098, "learning_rate": 1.8499694041143823e-06, "loss": 0.1949, "num_input_tokens_seen": 29262608, "step": 48040 }, { "epoch": 14.906919019547006, "grad_norm": 5.834527015686035, "learning_rate": 1.848918165624688e-06, "loss": 0.2553, "num_input_tokens_seen": 29265200, "step": 48045 }, { "epoch": 14.908470369221222, "grad_norm": 5.904531955718994, "learning_rate": 1.8478671581466085e-06, "loss": 0.1899, "num_input_tokens_seen": 29268144, "step": 48050 }, { "epoch": 14.91002171889544, "grad_norm": 8.098915100097656, "learning_rate": 1.8468163817571916e-06, "loss": 0.1737, "num_input_tokens_seen": 29272944, "step": 48055 }, { "epoch": 14.911573068569655, "grad_norm": 8.255998611450195, "learning_rate": 1.8457658365334746e-06, "loss": 0.2513, "num_input_tokens_seen": 29275056, "step": 48060 }, { "epoch": 14.913124418243871, "grad_norm": 5.997672080993652, "learning_rate": 1.844715522552472e-06, "loss": 0.1885, "num_input_tokens_seen": 29278800, "step": 48065 }, { "epoch": 14.914675767918089, "grad_norm": 11.363935470581055, "learning_rate": 1.8436654398911874e-06, "loss": 0.2176, "num_input_tokens_seen": 29281104, "step": 48070 }, { "epoch": 14.916227117592305, "grad_norm": 12.127452850341797, "learning_rate": 1.8426155886266006e-06, "loss": 0.1738, "num_input_tokens_seen": 29284240, "step": 48075 }, { "epoch": 14.917778467266523, "grad_norm": 3.9083352088928223, "learning_rate": 1.8415659688356824e-06, "loss": 0.1442, "num_input_tokens_seen": 29286928, "step": 48080 }, { "epoch": 14.919329816940738, "grad_norm": 8.437204360961914, "learning_rate": 1.8405165805953778e-06, "loss": 0.186, "num_input_tokens_seen": 29289136, "step": 48085 }, { "epoch": 14.920881166614954, "grad_norm": 7.495588779449463, "learning_rate": 1.8394674239826239e-06, "loss": 0.1952, "num_input_tokens_seen": 29291952, "step": 48090 }, { "epoch": 14.922432516289172, "grad_norm": 4.670316219329834, "learning_rate": 1.8384184990743326e-06, "loss": 0.2576, "num_input_tokens_seen": 29295056, "step": 48095 }, { "epoch": 14.923983865963388, "grad_norm": 4.131702423095703, "learning_rate": 1.8373698059474038e-06, "loss": 0.1789, "num_input_tokens_seen": 29297872, "step": 48100 }, { "epoch": 14.925535215637606, "grad_norm": 13.355961799621582, "learning_rate": 1.836321344678721e-06, "loss": 0.2284, "num_input_tokens_seen": 29302160, "step": 48105 }, { "epoch": 14.927086565311821, "grad_norm": 6.842533111572266, "learning_rate": 1.8352731153451452e-06, "loss": 0.1813, "num_input_tokens_seen": 29305392, "step": 48110 }, { "epoch": 14.928637914986037, "grad_norm": 7.488839626312256, "learning_rate": 1.8342251180235283e-06, "loss": 0.2011, "num_input_tokens_seen": 29308784, "step": 48115 }, { "epoch": 14.930189264660255, "grad_norm": 7.364821434020996, "learning_rate": 1.8331773527906965e-06, "loss": 0.2458, "num_input_tokens_seen": 29311728, "step": 48120 }, { "epoch": 14.93174061433447, "grad_norm": 10.857532501220703, "learning_rate": 1.8321298197234676e-06, "loss": 0.2076, "num_input_tokens_seen": 29314896, "step": 48125 }, { "epoch": 14.933291964008687, "grad_norm": 8.663873672485352, "learning_rate": 1.8310825188986342e-06, "loss": 0.1372, "num_input_tokens_seen": 29318544, "step": 48130 }, { "epoch": 14.934843313682904, "grad_norm": 8.162335395812988, "learning_rate": 1.8300354503929796e-06, "loss": 0.1836, "num_input_tokens_seen": 29321392, "step": 48135 }, { "epoch": 14.93639466335712, "grad_norm": 5.931266784667969, "learning_rate": 1.8289886142832624e-06, "loss": 0.153, "num_input_tokens_seen": 29324464, "step": 48140 }, { "epoch": 14.937946013031338, "grad_norm": 5.982448577880859, "learning_rate": 1.8279420106462326e-06, "loss": 0.2227, "num_input_tokens_seen": 29326448, "step": 48145 }, { "epoch": 14.939497362705554, "grad_norm": 7.879666328430176, "learning_rate": 1.826895639558614e-06, "loss": 0.1618, "num_input_tokens_seen": 29330960, "step": 48150 }, { "epoch": 14.94104871237977, "grad_norm": 11.842412948608398, "learning_rate": 1.82584950109712e-06, "loss": 0.2042, "num_input_tokens_seen": 29333712, "step": 48155 }, { "epoch": 14.942600062053987, "grad_norm": 8.536123275756836, "learning_rate": 1.8248035953384474e-06, "loss": 0.2194, "num_input_tokens_seen": 29336400, "step": 48160 }, { "epoch": 14.944151411728203, "grad_norm": 8.167868614196777, "learning_rate": 1.8237579223592689e-06, "loss": 0.1663, "num_input_tokens_seen": 29339248, "step": 48165 }, { "epoch": 14.945702761402421, "grad_norm": 4.926609992980957, "learning_rate": 1.822712482236249e-06, "loss": 0.2339, "num_input_tokens_seen": 29341680, "step": 48170 }, { "epoch": 14.947254111076637, "grad_norm": 7.112048149108887, "learning_rate": 1.8216672750460274e-06, "loss": 0.1605, "num_input_tokens_seen": 29344560, "step": 48175 }, { "epoch": 14.948805460750853, "grad_norm": 11.589404106140137, "learning_rate": 1.820622300865233e-06, "loss": 0.1817, "num_input_tokens_seen": 29347344, "step": 48180 }, { "epoch": 14.95035681042507, "grad_norm": 8.974213600158691, "learning_rate": 1.8195775597704719e-06, "loss": 0.1591, "num_input_tokens_seen": 29350544, "step": 48185 }, { "epoch": 14.951908160099286, "grad_norm": 2.965285301208496, "learning_rate": 1.8185330518383392e-06, "loss": 0.1732, "num_input_tokens_seen": 29353680, "step": 48190 }, { "epoch": 14.953459509773502, "grad_norm": 11.492241859436035, "learning_rate": 1.817488777145406e-06, "loss": 0.3606, "num_input_tokens_seen": 29358448, "step": 48195 }, { "epoch": 14.95501085944772, "grad_norm": 9.421202659606934, "learning_rate": 1.8164447357682342e-06, "loss": 0.2351, "num_input_tokens_seen": 29361040, "step": 48200 }, { "epoch": 14.956562209121936, "grad_norm": 7.734641075134277, "learning_rate": 1.81540092778336e-06, "loss": 0.1508, "num_input_tokens_seen": 29364912, "step": 48205 }, { "epoch": 14.958113558796153, "grad_norm": 12.547369956970215, "learning_rate": 1.8143573532673108e-06, "loss": 0.2718, "num_input_tokens_seen": 29366960, "step": 48210 }, { "epoch": 14.95966490847037, "grad_norm": 10.736639976501465, "learning_rate": 1.8133140122965904e-06, "loss": 0.2515, "num_input_tokens_seen": 29369552, "step": 48215 }, { "epoch": 14.961216258144585, "grad_norm": 6.393827438354492, "learning_rate": 1.8122709049476877e-06, "loss": 0.2302, "num_input_tokens_seen": 29372368, "step": 48220 }, { "epoch": 14.962767607818803, "grad_norm": 2.9452998638153076, "learning_rate": 1.811228031297077e-06, "loss": 0.2201, "num_input_tokens_seen": 29377008, "step": 48225 }, { "epoch": 14.964318957493019, "grad_norm": 4.68449068069458, "learning_rate": 1.8101853914212137e-06, "loss": 0.2049, "num_input_tokens_seen": 29379408, "step": 48230 }, { "epoch": 14.965870307167236, "grad_norm": 7.572815895080566, "learning_rate": 1.8091429853965325e-06, "loss": 0.1599, "num_input_tokens_seen": 29382800, "step": 48235 }, { "epoch": 14.967421656841452, "grad_norm": 4.620917320251465, "learning_rate": 1.808100813299456e-06, "loss": 0.1667, "num_input_tokens_seen": 29385136, "step": 48240 }, { "epoch": 14.968973006515668, "grad_norm": 16.848308563232422, "learning_rate": 1.807058875206389e-06, "loss": 0.1781, "num_input_tokens_seen": 29388112, "step": 48245 }, { "epoch": 14.970524356189886, "grad_norm": 3.729469060897827, "learning_rate": 1.8060171711937152e-06, "loss": 0.1062, "num_input_tokens_seen": 29391920, "step": 48250 }, { "epoch": 14.972075705864102, "grad_norm": 6.229975700378418, "learning_rate": 1.8049757013378066e-06, "loss": 0.1405, "num_input_tokens_seen": 29394320, "step": 48255 }, { "epoch": 14.973627055538318, "grad_norm": 8.380876541137695, "learning_rate": 1.803934465715012e-06, "loss": 0.1982, "num_input_tokens_seen": 29397424, "step": 48260 }, { "epoch": 14.975178405212535, "grad_norm": 17.465408325195312, "learning_rate": 1.80289346440167e-06, "loss": 0.2144, "num_input_tokens_seen": 29400432, "step": 48265 }, { "epoch": 14.976729754886751, "grad_norm": 9.0106840133667, "learning_rate": 1.8018526974740952e-06, "loss": 0.2494, "num_input_tokens_seen": 29403152, "step": 48270 }, { "epoch": 14.978281104560969, "grad_norm": 7.395153999328613, "learning_rate": 1.8008121650085908e-06, "loss": 0.2137, "num_input_tokens_seen": 29405776, "step": 48275 }, { "epoch": 14.979832454235185, "grad_norm": 11.123861312866211, "learning_rate": 1.7997718670814367e-06, "loss": 0.1874, "num_input_tokens_seen": 29408816, "step": 48280 }, { "epoch": 14.9813838039094, "grad_norm": 5.408795356750488, "learning_rate": 1.7987318037689034e-06, "loss": 0.2021, "num_input_tokens_seen": 29411760, "step": 48285 }, { "epoch": 14.982935153583618, "grad_norm": 12.833882331848145, "learning_rate": 1.7976919751472355e-06, "loss": 0.1853, "num_input_tokens_seen": 29416112, "step": 48290 }, { "epoch": 14.984486503257834, "grad_norm": 7.091540336608887, "learning_rate": 1.7966523812926684e-06, "loss": 0.168, "num_input_tokens_seen": 29418704, "step": 48295 }, { "epoch": 14.986037852932052, "grad_norm": 6.95535945892334, "learning_rate": 1.7956130222814138e-06, "loss": 0.1621, "num_input_tokens_seen": 29422128, "step": 48300 }, { "epoch": 14.987589202606268, "grad_norm": 9.426612854003906, "learning_rate": 1.7945738981896699e-06, "loss": 0.1901, "num_input_tokens_seen": 29426992, "step": 48305 }, { "epoch": 14.989140552280483, "grad_norm": 12.225062370300293, "learning_rate": 1.7935350090936189e-06, "loss": 0.194, "num_input_tokens_seen": 29429904, "step": 48310 }, { "epoch": 14.990691901954701, "grad_norm": 14.32420825958252, "learning_rate": 1.7924963550694207e-06, "loss": 0.1999, "num_input_tokens_seen": 29433840, "step": 48315 }, { "epoch": 14.992243251628917, "grad_norm": 10.442678451538086, "learning_rate": 1.7914579361932233e-06, "loss": 0.1536, "num_input_tokens_seen": 29436464, "step": 48320 }, { "epoch": 14.993794601303133, "grad_norm": 8.845357894897461, "learning_rate": 1.7904197525411525e-06, "loss": 0.2701, "num_input_tokens_seen": 29440144, "step": 48325 }, { "epoch": 14.99534595097735, "grad_norm": 8.233278274536133, "learning_rate": 1.789381804189323e-06, "loss": 0.1864, "num_input_tokens_seen": 29442736, "step": 48330 }, { "epoch": 14.996897300651566, "grad_norm": 10.129339218139648, "learning_rate": 1.7883440912138244e-06, "loss": 0.1444, "num_input_tokens_seen": 29446352, "step": 48335 }, { "epoch": 14.998448650325784, "grad_norm": 18.58011817932129, "learning_rate": 1.787306613690738e-06, "loss": 0.2042, "num_input_tokens_seen": 29452656, "step": 48340 }, { "epoch": 15.0, "grad_norm": 29.751136779785156, "learning_rate": 1.786269371696119e-06, "loss": 0.2538, "num_input_tokens_seen": 29454800, "step": 48345 }, { "epoch": 15.001551349674216, "grad_norm": 9.589617729187012, "learning_rate": 1.785232365306012e-06, "loss": 0.1631, "num_input_tokens_seen": 29458352, "step": 48350 }, { "epoch": 15.003102699348434, "grad_norm": 6.726154327392578, "learning_rate": 1.7841955945964407e-06, "loss": 0.2492, "num_input_tokens_seen": 29461648, "step": 48355 }, { "epoch": 15.00465404902265, "grad_norm": 6.3470587730407715, "learning_rate": 1.7831590596434128e-06, "loss": 0.1573, "num_input_tokens_seen": 29464560, "step": 48360 }, { "epoch": 15.006205398696867, "grad_norm": 5.586417198181152, "learning_rate": 1.7821227605229203e-06, "loss": 0.1834, "num_input_tokens_seen": 29467216, "step": 48365 }, { "epoch": 15.007756748371083, "grad_norm": 11.849202156066895, "learning_rate": 1.781086697310933e-06, "loss": 0.2054, "num_input_tokens_seen": 29470000, "step": 48370 }, { "epoch": 15.009308098045299, "grad_norm": 5.407435417175293, "learning_rate": 1.7800508700834107e-06, "loss": 0.1621, "num_input_tokens_seen": 29473136, "step": 48375 }, { "epoch": 15.010859447719517, "grad_norm": 6.2473578453063965, "learning_rate": 1.7790152789162874e-06, "loss": 0.2324, "num_input_tokens_seen": 29475824, "step": 48380 }, { "epoch": 15.012410797393732, "grad_norm": 7.717048645019531, "learning_rate": 1.7779799238854883e-06, "loss": 0.1597, "num_input_tokens_seen": 29478256, "step": 48385 }, { "epoch": 15.013962147067948, "grad_norm": 6.27689266204834, "learning_rate": 1.7769448050669136e-06, "loss": 0.1961, "num_input_tokens_seen": 29482000, "step": 48390 }, { "epoch": 15.015513496742166, "grad_norm": 3.6711413860321045, "learning_rate": 1.7759099225364522e-06, "loss": 0.1205, "num_input_tokens_seen": 29485296, "step": 48395 }, { "epoch": 15.017064846416382, "grad_norm": 3.661778688430786, "learning_rate": 1.7748752763699717e-06, "loss": 0.1407, "num_input_tokens_seen": 29489744, "step": 48400 }, { "epoch": 15.0186161960906, "grad_norm": 13.14724063873291, "learning_rate": 1.773840866643326e-06, "loss": 0.2302, "num_input_tokens_seen": 29492304, "step": 48405 }, { "epoch": 15.020167545764815, "grad_norm": 3.7321338653564453, "learning_rate": 1.7728066934323462e-06, "loss": 0.1726, "num_input_tokens_seen": 29494800, "step": 48410 }, { "epoch": 15.021718895439031, "grad_norm": 5.4763312339782715, "learning_rate": 1.771772756812854e-06, "loss": 0.1736, "num_input_tokens_seen": 29497744, "step": 48415 }, { "epoch": 15.023270245113249, "grad_norm": 11.435462951660156, "learning_rate": 1.7707390568606442e-06, "loss": 0.1781, "num_input_tokens_seen": 29500016, "step": 48420 }, { "epoch": 15.024821594787465, "grad_norm": 3.6523334980010986, "learning_rate": 1.7697055936515018e-06, "loss": 0.1785, "num_input_tokens_seen": 29502480, "step": 48425 }, { "epoch": 15.026372944461682, "grad_norm": 18.56734275817871, "learning_rate": 1.7686723672611938e-06, "loss": 0.2136, "num_input_tokens_seen": 29504752, "step": 48430 }, { "epoch": 15.027924294135898, "grad_norm": 15.514934539794922, "learning_rate": 1.7676393777654637e-06, "loss": 0.1789, "num_input_tokens_seen": 29508496, "step": 48435 }, { "epoch": 15.029475643810114, "grad_norm": 21.254846572875977, "learning_rate": 1.7666066252400465e-06, "loss": 0.187, "num_input_tokens_seen": 29511120, "step": 48440 }, { "epoch": 15.031026993484332, "grad_norm": 3.20977520942688, "learning_rate": 1.7655741097606505e-06, "loss": 0.153, "num_input_tokens_seen": 29514736, "step": 48445 }, { "epoch": 15.032578343158548, "grad_norm": 1.7849931716918945, "learning_rate": 1.7645418314029755e-06, "loss": 0.1699, "num_input_tokens_seen": 29517808, "step": 48450 }, { "epoch": 15.034129692832764, "grad_norm": 3.087503433227539, "learning_rate": 1.7635097902426962e-06, "loss": 0.1034, "num_input_tokens_seen": 29520912, "step": 48455 }, { "epoch": 15.035681042506981, "grad_norm": 10.986174583435059, "learning_rate": 1.7624779863554765e-06, "loss": 0.1602, "num_input_tokens_seen": 29523344, "step": 48460 }, { "epoch": 15.037232392181197, "grad_norm": 8.55032730102539, "learning_rate": 1.7614464198169568e-06, "loss": 0.1798, "num_input_tokens_seen": 29526640, "step": 48465 }, { "epoch": 15.038783741855415, "grad_norm": 7.852141857147217, "learning_rate": 1.7604150907027667e-06, "loss": 0.1282, "num_input_tokens_seen": 29529744, "step": 48470 }, { "epoch": 15.04033509152963, "grad_norm": 15.07365608215332, "learning_rate": 1.7593839990885108e-06, "loss": 0.2171, "num_input_tokens_seen": 29532656, "step": 48475 }, { "epoch": 15.041886441203847, "grad_norm": 21.86186981201172, "learning_rate": 1.7583531450497826e-06, "loss": 0.15, "num_input_tokens_seen": 29535568, "step": 48480 }, { "epoch": 15.043437790878064, "grad_norm": 4.737612247467041, "learning_rate": 1.7573225286621575e-06, "loss": 0.1655, "num_input_tokens_seen": 29538704, "step": 48485 }, { "epoch": 15.04498914055228, "grad_norm": 12.730254173278809, "learning_rate": 1.756292150001188e-06, "loss": 0.1518, "num_input_tokens_seen": 29541744, "step": 48490 }, { "epoch": 15.046540490226498, "grad_norm": 9.3704252243042, "learning_rate": 1.7552620091424173e-06, "loss": 0.165, "num_input_tokens_seen": 29544560, "step": 48495 }, { "epoch": 15.048091839900714, "grad_norm": 3.8258907794952393, "learning_rate": 1.7542321061613632e-06, "loss": 0.1055, "num_input_tokens_seen": 29547632, "step": 48500 }, { "epoch": 15.04964318957493, "grad_norm": 9.792938232421875, "learning_rate": 1.7532024411335325e-06, "loss": 0.1397, "num_input_tokens_seen": 29550192, "step": 48505 }, { "epoch": 15.051194539249147, "grad_norm": 9.313910484313965, "learning_rate": 1.752173014134409e-06, "loss": 0.1387, "num_input_tokens_seen": 29552848, "step": 48510 }, { "epoch": 15.052745888923363, "grad_norm": 16.39999008178711, "learning_rate": 1.7511438252394659e-06, "loss": 0.1969, "num_input_tokens_seen": 29557072, "step": 48515 }, { "epoch": 15.054297238597579, "grad_norm": 11.219014167785645, "learning_rate": 1.7501148745241503e-06, "loss": 0.178, "num_input_tokens_seen": 29560240, "step": 48520 }, { "epoch": 15.055848588271797, "grad_norm": 4.698267459869385, "learning_rate": 1.7490861620639011e-06, "loss": 0.1884, "num_input_tokens_seen": 29564080, "step": 48525 }, { "epoch": 15.057399937946013, "grad_norm": 7.331088066101074, "learning_rate": 1.7480576879341304e-06, "loss": 0.111, "num_input_tokens_seen": 29567312, "step": 48530 }, { "epoch": 15.05895128762023, "grad_norm": 2.5355992317199707, "learning_rate": 1.7470294522102421e-06, "loss": 0.1295, "num_input_tokens_seen": 29569872, "step": 48535 }, { "epoch": 15.060502637294446, "grad_norm": 6.320805072784424, "learning_rate": 1.746001454967614e-06, "loss": 0.1482, "num_input_tokens_seen": 29572944, "step": 48540 }, { "epoch": 15.062053986968662, "grad_norm": 14.476011276245117, "learning_rate": 1.7449736962816127e-06, "loss": 0.2567, "num_input_tokens_seen": 29576016, "step": 48545 }, { "epoch": 15.06360533664288, "grad_norm": 6.059591770172119, "learning_rate": 1.743946176227586e-06, "loss": 0.1315, "num_input_tokens_seen": 29578896, "step": 48550 }, { "epoch": 15.065156686317096, "grad_norm": 13.22606372833252, "learning_rate": 1.7429188948808607e-06, "loss": 0.2179, "num_input_tokens_seen": 29582064, "step": 48555 }, { "epoch": 15.066708035991313, "grad_norm": 19.186235427856445, "learning_rate": 1.7418918523167517e-06, "loss": 0.2288, "num_input_tokens_seen": 29584560, "step": 48560 }, { "epoch": 15.06825938566553, "grad_norm": 2.860297441482544, "learning_rate": 1.7408650486105494e-06, "loss": 0.1347, "num_input_tokens_seen": 29586832, "step": 48565 }, { "epoch": 15.069810735339745, "grad_norm": 25.521984100341797, "learning_rate": 1.7398384838375354e-06, "loss": 0.2199, "num_input_tokens_seen": 29589232, "step": 48570 }, { "epoch": 15.071362085013963, "grad_norm": 12.109073638916016, "learning_rate": 1.7388121580729645e-06, "loss": 0.1449, "num_input_tokens_seen": 29592368, "step": 48575 }, { "epoch": 15.072913434688179, "grad_norm": 4.163100242614746, "learning_rate": 1.7377860713920825e-06, "loss": 0.1676, "num_input_tokens_seen": 29594768, "step": 48580 }, { "epoch": 15.074464784362394, "grad_norm": 16.326366424560547, "learning_rate": 1.7367602238701104e-06, "loss": 0.1479, "num_input_tokens_seen": 29597168, "step": 48585 }, { "epoch": 15.076016134036612, "grad_norm": 11.438888549804688, "learning_rate": 1.7357346155822575e-06, "loss": 0.1352, "num_input_tokens_seen": 29599536, "step": 48590 }, { "epoch": 15.077567483710828, "grad_norm": 13.265207290649414, "learning_rate": 1.7347092466037108e-06, "loss": 0.2272, "num_input_tokens_seen": 29602608, "step": 48595 }, { "epoch": 15.079118833385046, "grad_norm": 6.569094657897949, "learning_rate": 1.733684117009643e-06, "loss": 0.155, "num_input_tokens_seen": 29605648, "step": 48600 }, { "epoch": 15.080670183059262, "grad_norm": 13.361554145812988, "learning_rate": 1.7326592268752102e-06, "loss": 0.2587, "num_input_tokens_seen": 29608176, "step": 48605 }, { "epoch": 15.082221532733477, "grad_norm": 10.739200592041016, "learning_rate": 1.7316345762755448e-06, "loss": 0.1456, "num_input_tokens_seen": 29612752, "step": 48610 }, { "epoch": 15.083772882407695, "grad_norm": 8.602633476257324, "learning_rate": 1.7306101652857704e-06, "loss": 0.2541, "num_input_tokens_seen": 29615824, "step": 48615 }, { "epoch": 15.085324232081911, "grad_norm": 8.598095893859863, "learning_rate": 1.7295859939809851e-06, "loss": 0.131, "num_input_tokens_seen": 29618512, "step": 48620 }, { "epoch": 15.086875581756129, "grad_norm": 12.980985641479492, "learning_rate": 1.7285620624362748e-06, "loss": 0.1567, "num_input_tokens_seen": 29620720, "step": 48625 }, { "epoch": 15.088426931430345, "grad_norm": 9.752817153930664, "learning_rate": 1.7275383707267035e-06, "loss": 0.2836, "num_input_tokens_seen": 29624112, "step": 48630 }, { "epoch": 15.08997828110456, "grad_norm": 7.499190807342529, "learning_rate": 1.7265149189273234e-06, "loss": 0.1002, "num_input_tokens_seen": 29626736, "step": 48635 }, { "epoch": 15.091529630778778, "grad_norm": 14.647180557250977, "learning_rate": 1.7254917071131616e-06, "loss": 0.1179, "num_input_tokens_seen": 29629744, "step": 48640 }, { "epoch": 15.093080980452994, "grad_norm": 22.81258201599121, "learning_rate": 1.7244687353592355e-06, "loss": 0.2097, "num_input_tokens_seen": 29632752, "step": 48645 }, { "epoch": 15.09463233012721, "grad_norm": 8.362040519714355, "learning_rate": 1.723446003740537e-06, "loss": 0.1305, "num_input_tokens_seen": 29635440, "step": 48650 }, { "epoch": 15.096183679801428, "grad_norm": 6.117126941680908, "learning_rate": 1.7224235123320487e-06, "loss": 0.1694, "num_input_tokens_seen": 29638320, "step": 48655 }, { "epoch": 15.097735029475643, "grad_norm": 6.119518756866455, "learning_rate": 1.7214012612087277e-06, "loss": 0.1694, "num_input_tokens_seen": 29641648, "step": 48660 }, { "epoch": 15.099286379149861, "grad_norm": 3.3042843341827393, "learning_rate": 1.7203792504455186e-06, "loss": 0.1942, "num_input_tokens_seen": 29644464, "step": 48665 }, { "epoch": 15.100837728824077, "grad_norm": 19.483224868774414, "learning_rate": 1.7193574801173485e-06, "loss": 0.1738, "num_input_tokens_seen": 29647216, "step": 48670 }, { "epoch": 15.102389078498293, "grad_norm": 4.368000507354736, "learning_rate": 1.718335950299122e-06, "loss": 0.1195, "num_input_tokens_seen": 29650032, "step": 48675 }, { "epoch": 15.10394042817251, "grad_norm": 7.7541093826293945, "learning_rate": 1.7173146610657331e-06, "loss": 0.1871, "num_input_tokens_seen": 29653392, "step": 48680 }, { "epoch": 15.105491777846726, "grad_norm": 3.6227705478668213, "learning_rate": 1.7162936124920504e-06, "loss": 0.1602, "num_input_tokens_seen": 29656400, "step": 48685 }, { "epoch": 15.107043127520944, "grad_norm": 24.172788619995117, "learning_rate": 1.7152728046529327e-06, "loss": 0.1646, "num_input_tokens_seen": 29659632, "step": 48690 }, { "epoch": 15.10859447719516, "grad_norm": 11.332493782043457, "learning_rate": 1.7142522376232135e-06, "loss": 0.1768, "num_input_tokens_seen": 29662480, "step": 48695 }, { "epoch": 15.110145826869376, "grad_norm": 3.1023497581481934, "learning_rate": 1.7132319114777162e-06, "loss": 0.1409, "num_input_tokens_seen": 29664784, "step": 48700 }, { "epoch": 15.111697176543593, "grad_norm": 4.075058937072754, "learning_rate": 1.712211826291239e-06, "loss": 0.1195, "num_input_tokens_seen": 29668336, "step": 48705 }, { "epoch": 15.11324852621781, "grad_norm": 10.188809394836426, "learning_rate": 1.7111919821385703e-06, "loss": 0.1683, "num_input_tokens_seen": 29671280, "step": 48710 }, { "epoch": 15.114799875892025, "grad_norm": 5.165541648864746, "learning_rate": 1.7101723790944724e-06, "loss": 0.1604, "num_input_tokens_seen": 29674448, "step": 48715 }, { "epoch": 15.116351225566243, "grad_norm": 10.331707000732422, "learning_rate": 1.7091530172336968e-06, "loss": 0.1551, "num_input_tokens_seen": 29677168, "step": 48720 }, { "epoch": 15.117902575240459, "grad_norm": 3.211588144302368, "learning_rate": 1.7081338966309764e-06, "loss": 0.2383, "num_input_tokens_seen": 29680080, "step": 48725 }, { "epoch": 15.119453924914676, "grad_norm": 5.949109077453613, "learning_rate": 1.7071150173610208e-06, "loss": 0.2166, "num_input_tokens_seen": 29682640, "step": 48730 }, { "epoch": 15.121005274588892, "grad_norm": 9.998931884765625, "learning_rate": 1.7060963794985297e-06, "loss": 0.15, "num_input_tokens_seen": 29685744, "step": 48735 }, { "epoch": 15.122556624263108, "grad_norm": 5.910433769226074, "learning_rate": 1.7050779831181779e-06, "loss": 0.1816, "num_input_tokens_seen": 29687888, "step": 48740 }, { "epoch": 15.124107973937326, "grad_norm": 7.1073994636535645, "learning_rate": 1.7040598282946297e-06, "loss": 0.1426, "num_input_tokens_seen": 29690928, "step": 48745 }, { "epoch": 15.125659323611542, "grad_norm": 7.6568450927734375, "learning_rate": 1.703041915102524e-06, "loss": 0.2048, "num_input_tokens_seen": 29693840, "step": 48750 }, { "epoch": 15.12721067328576, "grad_norm": 4.309432029724121, "learning_rate": 1.7020242436164896e-06, "loss": 0.2033, "num_input_tokens_seen": 29697296, "step": 48755 }, { "epoch": 15.128762022959975, "grad_norm": 12.69279670715332, "learning_rate": 1.7010068139111302e-06, "loss": 0.1771, "num_input_tokens_seen": 29700464, "step": 48760 }, { "epoch": 15.130313372634191, "grad_norm": 13.423264503479004, "learning_rate": 1.6999896260610388e-06, "loss": 0.1428, "num_input_tokens_seen": 29703664, "step": 48765 }, { "epoch": 15.131864722308409, "grad_norm": 7.045945167541504, "learning_rate": 1.698972680140784e-06, "loss": 0.1702, "num_input_tokens_seen": 29706608, "step": 48770 }, { "epoch": 15.133416071982625, "grad_norm": 7.1125078201293945, "learning_rate": 1.6979559762249237e-06, "loss": 0.1638, "num_input_tokens_seen": 29709616, "step": 48775 }, { "epoch": 15.13496742165684, "grad_norm": 20.91486930847168, "learning_rate": 1.6969395143879908e-06, "loss": 0.1795, "num_input_tokens_seen": 29713264, "step": 48780 }, { "epoch": 15.136518771331058, "grad_norm": 18.5782470703125, "learning_rate": 1.6959232947045056e-06, "loss": 0.2939, "num_input_tokens_seen": 29717264, "step": 48785 }, { "epoch": 15.138070121005274, "grad_norm": 11.159541130065918, "learning_rate": 1.6949073172489705e-06, "loss": 0.1582, "num_input_tokens_seen": 29720048, "step": 48790 }, { "epoch": 15.139621470679492, "grad_norm": 4.960688591003418, "learning_rate": 1.6938915820958657e-06, "loss": 0.1529, "num_input_tokens_seen": 29724400, "step": 48795 }, { "epoch": 15.141172820353708, "grad_norm": 7.3528852462768555, "learning_rate": 1.6928760893196593e-06, "loss": 0.1402, "num_input_tokens_seen": 29727952, "step": 48800 }, { "epoch": 15.142724170027924, "grad_norm": 6.264150142669678, "learning_rate": 1.691860838994795e-06, "loss": 0.1344, "num_input_tokens_seen": 29730672, "step": 48805 }, { "epoch": 15.144275519702141, "grad_norm": 6.242552757263184, "learning_rate": 1.6908458311957088e-06, "loss": 0.1657, "num_input_tokens_seen": 29732976, "step": 48810 }, { "epoch": 15.145826869376357, "grad_norm": 6.7830586433410645, "learning_rate": 1.6898310659968077e-06, "loss": 0.1806, "num_input_tokens_seen": 29735376, "step": 48815 }, { "epoch": 15.147378219050575, "grad_norm": 7.424136161804199, "learning_rate": 1.688816543472489e-06, "loss": 0.1389, "num_input_tokens_seen": 29738096, "step": 48820 }, { "epoch": 15.14892956872479, "grad_norm": 11.417274475097656, "learning_rate": 1.6878022636971263e-06, "loss": 0.2088, "num_input_tokens_seen": 29740432, "step": 48825 }, { "epoch": 15.150480918399007, "grad_norm": 6.120090484619141, "learning_rate": 1.6867882267450819e-06, "loss": 0.2091, "num_input_tokens_seen": 29743216, "step": 48830 }, { "epoch": 15.152032268073224, "grad_norm": 12.541790962219238, "learning_rate": 1.6857744326906933e-06, "loss": 0.1871, "num_input_tokens_seen": 29745872, "step": 48835 }, { "epoch": 15.15358361774744, "grad_norm": 12.588924407958984, "learning_rate": 1.6847608816082861e-06, "loss": 0.1713, "num_input_tokens_seen": 29747728, "step": 48840 }, { "epoch": 15.155134967421656, "grad_norm": 13.539385795593262, "learning_rate": 1.683747573572163e-06, "loss": 0.1514, "num_input_tokens_seen": 29750928, "step": 48845 }, { "epoch": 15.156686317095874, "grad_norm": 9.539217948913574, "learning_rate": 1.6827345086566155e-06, "loss": 0.1816, "num_input_tokens_seen": 29753648, "step": 48850 }, { "epoch": 15.15823766677009, "grad_norm": 15.18515682220459, "learning_rate": 1.6817216869359088e-06, "loss": 0.1954, "num_input_tokens_seen": 29756304, "step": 48855 }, { "epoch": 15.159789016444307, "grad_norm": 21.70553207397461, "learning_rate": 1.680709108484298e-06, "loss": 0.1773, "num_input_tokens_seen": 29759344, "step": 48860 }, { "epoch": 15.161340366118523, "grad_norm": 17.64934730529785, "learning_rate": 1.6796967733760145e-06, "loss": 0.1709, "num_input_tokens_seen": 29761872, "step": 48865 }, { "epoch": 15.162891715792739, "grad_norm": 11.741543769836426, "learning_rate": 1.6786846816852758e-06, "loss": 0.1506, "num_input_tokens_seen": 29764688, "step": 48870 }, { "epoch": 15.164443065466957, "grad_norm": 46.973941802978516, "learning_rate": 1.677672833486282e-06, "loss": 0.1959, "num_input_tokens_seen": 29766928, "step": 48875 }, { "epoch": 15.165994415141173, "grad_norm": 7.544903755187988, "learning_rate": 1.6766612288532097e-06, "loss": 0.2019, "num_input_tokens_seen": 29768880, "step": 48880 }, { "epoch": 15.16754576481539, "grad_norm": 11.145894050598145, "learning_rate": 1.6756498678602251e-06, "loss": 0.1271, "num_input_tokens_seen": 29771664, "step": 48885 }, { "epoch": 15.169097114489606, "grad_norm": 42.219322204589844, "learning_rate": 1.6746387505814699e-06, "loss": 0.2081, "num_input_tokens_seen": 29775536, "step": 48890 }, { "epoch": 15.170648464163822, "grad_norm": 3.916191339492798, "learning_rate": 1.673627877091074e-06, "loss": 0.2493, "num_input_tokens_seen": 29779632, "step": 48895 }, { "epoch": 15.17219981383804, "grad_norm": 6.837774276733398, "learning_rate": 1.6726172474631435e-06, "loss": 0.1746, "num_input_tokens_seen": 29782064, "step": 48900 }, { "epoch": 15.173751163512256, "grad_norm": 7.441344261169434, "learning_rate": 1.6716068617717728e-06, "loss": 0.1869, "num_input_tokens_seen": 29784976, "step": 48905 }, { "epoch": 15.175302513186471, "grad_norm": 5.376856803894043, "learning_rate": 1.6705967200910312e-06, "loss": 0.1678, "num_input_tokens_seen": 29788176, "step": 48910 }, { "epoch": 15.176853862860689, "grad_norm": 10.07693862915039, "learning_rate": 1.6695868224949774e-06, "loss": 0.139, "num_input_tokens_seen": 29790736, "step": 48915 }, { "epoch": 15.178405212534905, "grad_norm": 18.40978240966797, "learning_rate": 1.6685771690576464e-06, "loss": 0.2278, "num_input_tokens_seen": 29793712, "step": 48920 }, { "epoch": 15.179956562209123, "grad_norm": 5.4489426612854, "learning_rate": 1.6675677598530598e-06, "loss": 0.2527, "num_input_tokens_seen": 29795984, "step": 48925 }, { "epoch": 15.181507911883338, "grad_norm": 14.697517395019531, "learning_rate": 1.666558594955217e-06, "loss": 0.1702, "num_input_tokens_seen": 29799152, "step": 48930 }, { "epoch": 15.183059261557554, "grad_norm": 8.704848289489746, "learning_rate": 1.6655496744381034e-06, "loss": 0.1683, "num_input_tokens_seen": 29801936, "step": 48935 }, { "epoch": 15.184610611231772, "grad_norm": 12.004307746887207, "learning_rate": 1.664540998375686e-06, "loss": 0.1646, "num_input_tokens_seen": 29805040, "step": 48940 }, { "epoch": 15.186161960905988, "grad_norm": 5.400594234466553, "learning_rate": 1.6635325668419088e-06, "loss": 0.1815, "num_input_tokens_seen": 29808240, "step": 48945 }, { "epoch": 15.187713310580206, "grad_norm": 20.31134605407715, "learning_rate": 1.6625243799107065e-06, "loss": 0.1859, "num_input_tokens_seen": 29811440, "step": 48950 }, { "epoch": 15.189264660254421, "grad_norm": 4.274477481842041, "learning_rate": 1.6615164376559873e-06, "loss": 0.1866, "num_input_tokens_seen": 29815408, "step": 48955 }, { "epoch": 15.190816009928637, "grad_norm": 5.413863182067871, "learning_rate": 1.6605087401516479e-06, "loss": 0.169, "num_input_tokens_seen": 29818096, "step": 48960 }, { "epoch": 15.192367359602855, "grad_norm": 11.400948524475098, "learning_rate": 1.6595012874715622e-06, "loss": 0.2888, "num_input_tokens_seen": 29820912, "step": 48965 }, { "epoch": 15.193918709277071, "grad_norm": 18.578142166137695, "learning_rate": 1.6584940796895904e-06, "loss": 0.1913, "num_input_tokens_seen": 29823760, "step": 48970 }, { "epoch": 15.195470058951287, "grad_norm": 13.668009757995605, "learning_rate": 1.657487116879571e-06, "loss": 0.2282, "num_input_tokens_seen": 29826416, "step": 48975 }, { "epoch": 15.197021408625504, "grad_norm": 7.060543537139893, "learning_rate": 1.6564803991153283e-06, "loss": 0.2074, "num_input_tokens_seen": 29828848, "step": 48980 }, { "epoch": 15.19857275829972, "grad_norm": 15.055294036865234, "learning_rate": 1.6554739264706644e-06, "loss": 0.1571, "num_input_tokens_seen": 29831824, "step": 48985 }, { "epoch": 15.200124107973938, "grad_norm": 7.9973015785217285, "learning_rate": 1.6544676990193664e-06, "loss": 0.1848, "num_input_tokens_seen": 29833968, "step": 48990 }, { "epoch": 15.201675457648154, "grad_norm": 10.8353910446167, "learning_rate": 1.6534617168352052e-06, "loss": 0.1424, "num_input_tokens_seen": 29837072, "step": 48995 }, { "epoch": 15.20322680732237, "grad_norm": 10.18640422821045, "learning_rate": 1.6524559799919272e-06, "loss": 0.1744, "num_input_tokens_seen": 29840336, "step": 49000 }, { "epoch": 15.204778156996587, "grad_norm": 13.068400382995605, "learning_rate": 1.6514504885632682e-06, "loss": 0.1446, "num_input_tokens_seen": 29843056, "step": 49005 }, { "epoch": 15.206329506670803, "grad_norm": 12.981431007385254, "learning_rate": 1.6504452426229395e-06, "loss": 0.1801, "num_input_tokens_seen": 29845904, "step": 49010 }, { "epoch": 15.207880856345021, "grad_norm": 4.55180549621582, "learning_rate": 1.649440242244641e-06, "loss": 0.1753, "num_input_tokens_seen": 29848432, "step": 49015 }, { "epoch": 15.209432206019237, "grad_norm": 13.695168495178223, "learning_rate": 1.6484354875020475e-06, "loss": 0.236, "num_input_tokens_seen": 29851120, "step": 49020 }, { "epoch": 15.210983555693453, "grad_norm": 20.643903732299805, "learning_rate": 1.647430978468823e-06, "loss": 0.2092, "num_input_tokens_seen": 29853936, "step": 49025 }, { "epoch": 15.21253490536767, "grad_norm": 10.597617149353027, "learning_rate": 1.6464267152186063e-06, "loss": 0.2772, "num_input_tokens_seen": 29856304, "step": 49030 }, { "epoch": 15.214086255041886, "grad_norm": 25.834062576293945, "learning_rate": 1.645422697825026e-06, "loss": 0.23, "num_input_tokens_seen": 29859632, "step": 49035 }, { "epoch": 15.215637604716102, "grad_norm": 9.87045955657959, "learning_rate": 1.6444189263616838e-06, "loss": 0.2044, "num_input_tokens_seen": 29862096, "step": 49040 }, { "epoch": 15.21718895439032, "grad_norm": 15.6215238571167, "learning_rate": 1.6434154009021718e-06, "loss": 0.1968, "num_input_tokens_seen": 29865552, "step": 49045 }, { "epoch": 15.218740304064536, "grad_norm": 11.617147445678711, "learning_rate": 1.6424121215200579e-06, "loss": 0.1709, "num_input_tokens_seen": 29870192, "step": 49050 }, { "epoch": 15.220291653738753, "grad_norm": 8.541135787963867, "learning_rate": 1.6414090882888944e-06, "loss": 0.2159, "num_input_tokens_seen": 29873168, "step": 49055 }, { "epoch": 15.22184300341297, "grad_norm": 3.8541953563690186, "learning_rate": 1.6404063012822186e-06, "loss": 0.2069, "num_input_tokens_seen": 29875632, "step": 49060 }, { "epoch": 15.223394353087185, "grad_norm": 10.509275436401367, "learning_rate": 1.6394037605735425e-06, "loss": 0.1586, "num_input_tokens_seen": 29879856, "step": 49065 }, { "epoch": 15.224945702761403, "grad_norm": 12.106846809387207, "learning_rate": 1.6384014662363684e-06, "loss": 0.15, "num_input_tokens_seen": 29883152, "step": 49070 }, { "epoch": 15.226497052435619, "grad_norm": 25.38691520690918, "learning_rate": 1.6373994183441715e-06, "loss": 0.2201, "num_input_tokens_seen": 29886160, "step": 49075 }, { "epoch": 15.228048402109836, "grad_norm": 9.76522159576416, "learning_rate": 1.6363976169704187e-06, "loss": 0.1529, "num_input_tokens_seen": 29889072, "step": 49080 }, { "epoch": 15.229599751784052, "grad_norm": 10.166725158691406, "learning_rate": 1.6353960621885501e-06, "loss": 0.2315, "num_input_tokens_seen": 29892688, "step": 49085 }, { "epoch": 15.231151101458268, "grad_norm": 6.7926812171936035, "learning_rate": 1.6343947540719946e-06, "loss": 0.1565, "num_input_tokens_seen": 29896560, "step": 49090 }, { "epoch": 15.232702451132486, "grad_norm": 18.356714248657227, "learning_rate": 1.6333936926941563e-06, "loss": 0.1302, "num_input_tokens_seen": 29899472, "step": 49095 }, { "epoch": 15.234253800806702, "grad_norm": 12.888683319091797, "learning_rate": 1.6323928781284286e-06, "loss": 0.1206, "num_input_tokens_seen": 29903440, "step": 49100 }, { "epoch": 15.235805150480918, "grad_norm": 6.340825080871582, "learning_rate": 1.63139231044818e-06, "loss": 0.1433, "num_input_tokens_seen": 29905872, "step": 49105 }, { "epoch": 15.237356500155135, "grad_norm": 4.828688144683838, "learning_rate": 1.630391989726765e-06, "loss": 0.1435, "num_input_tokens_seen": 29908816, "step": 49110 }, { "epoch": 15.238907849829351, "grad_norm": 22.886533737182617, "learning_rate": 1.629391916037521e-06, "loss": 0.2242, "num_input_tokens_seen": 29912080, "step": 49115 }, { "epoch": 15.240459199503569, "grad_norm": 4.130530834197998, "learning_rate": 1.6283920894537618e-06, "loss": 0.1246, "num_input_tokens_seen": 29915088, "step": 49120 }, { "epoch": 15.242010549177785, "grad_norm": 17.56119155883789, "learning_rate": 1.6273925100487904e-06, "loss": 0.2129, "num_input_tokens_seen": 29917840, "step": 49125 }, { "epoch": 15.243561898852, "grad_norm": 6.864195823669434, "learning_rate": 1.6263931778958836e-06, "loss": 0.1288, "num_input_tokens_seen": 29920528, "step": 49130 }, { "epoch": 15.245113248526218, "grad_norm": 9.475461959838867, "learning_rate": 1.6253940930683081e-06, "loss": 0.1594, "num_input_tokens_seen": 29923536, "step": 49135 }, { "epoch": 15.246664598200434, "grad_norm": 11.824066162109375, "learning_rate": 1.6243952556393056e-06, "loss": 0.082, "num_input_tokens_seen": 29926576, "step": 49140 }, { "epoch": 15.248215947874652, "grad_norm": 10.801629066467285, "learning_rate": 1.623396665682105e-06, "loss": 0.1911, "num_input_tokens_seen": 29928880, "step": 49145 }, { "epoch": 15.249767297548868, "grad_norm": 29.814483642578125, "learning_rate": 1.6223983232699126e-06, "loss": 0.3317, "num_input_tokens_seen": 29932176, "step": 49150 }, { "epoch": 15.251318647223084, "grad_norm": 8.585098266601562, "learning_rate": 1.6214002284759212e-06, "loss": 0.1961, "num_input_tokens_seen": 29934800, "step": 49155 }, { "epoch": 15.252869996897301, "grad_norm": 7.0777435302734375, "learning_rate": 1.6204023813733e-06, "loss": 0.1873, "num_input_tokens_seen": 29937072, "step": 49160 }, { "epoch": 15.254421346571517, "grad_norm": 19.503801345825195, "learning_rate": 1.6194047820352066e-06, "loss": 0.2917, "num_input_tokens_seen": 29940592, "step": 49165 }, { "epoch": 15.255972696245733, "grad_norm": 3.3754615783691406, "learning_rate": 1.618407430534773e-06, "loss": 0.1984, "num_input_tokens_seen": 29943856, "step": 49170 }, { "epoch": 15.25752404591995, "grad_norm": 7.2997636795043945, "learning_rate": 1.617410326945119e-06, "loss": 0.262, "num_input_tokens_seen": 29946512, "step": 49175 }, { "epoch": 15.259075395594166, "grad_norm": 37.385074615478516, "learning_rate": 1.616413471339346e-06, "loss": 0.2466, "num_input_tokens_seen": 29949328, "step": 49180 }, { "epoch": 15.260626745268384, "grad_norm": 20.78286361694336, "learning_rate": 1.6154168637905304e-06, "loss": 0.2258, "num_input_tokens_seen": 29951760, "step": 49185 }, { "epoch": 15.2621780949426, "grad_norm": 7.854645252227783, "learning_rate": 1.614420504371741e-06, "loss": 0.1435, "num_input_tokens_seen": 29954928, "step": 49190 }, { "epoch": 15.263729444616816, "grad_norm": 12.884589195251465, "learning_rate": 1.6134243931560173e-06, "loss": 0.2694, "num_input_tokens_seen": 29959344, "step": 49195 }, { "epoch": 15.265280794291034, "grad_norm": 10.200896263122559, "learning_rate": 1.6124285302163906e-06, "loss": 0.2028, "num_input_tokens_seen": 29962512, "step": 49200 }, { "epoch": 15.26683214396525, "grad_norm": 11.224364280700684, "learning_rate": 1.6114329156258663e-06, "loss": 0.2178, "num_input_tokens_seen": 29965296, "step": 49205 }, { "epoch": 15.268383493639467, "grad_norm": 8.097084999084473, "learning_rate": 1.610437549457437e-06, "loss": 0.1489, "num_input_tokens_seen": 29967600, "step": 49210 }, { "epoch": 15.269934843313683, "grad_norm": 13.683327674865723, "learning_rate": 1.6094424317840724e-06, "loss": 0.1842, "num_input_tokens_seen": 29970736, "step": 49215 }, { "epoch": 15.271486192987899, "grad_norm": 5.374046802520752, "learning_rate": 1.6084475626787289e-06, "loss": 0.17, "num_input_tokens_seen": 29974544, "step": 49220 }, { "epoch": 15.273037542662117, "grad_norm": 17.917367935180664, "learning_rate": 1.6074529422143398e-06, "loss": 0.1908, "num_input_tokens_seen": 29977744, "step": 49225 }, { "epoch": 15.274588892336332, "grad_norm": 43.38541030883789, "learning_rate": 1.6064585704638236e-06, "loss": 0.2083, "num_input_tokens_seen": 29981104, "step": 49230 }, { "epoch": 15.276140242010548, "grad_norm": 12.501398086547852, "learning_rate": 1.605464447500082e-06, "loss": 0.2104, "num_input_tokens_seen": 29985488, "step": 49235 }, { "epoch": 15.277691591684766, "grad_norm": 18.75869369506836, "learning_rate": 1.6044705733959914e-06, "loss": 0.119, "num_input_tokens_seen": 29988912, "step": 49240 }, { "epoch": 15.279242941358982, "grad_norm": 11.51761245727539, "learning_rate": 1.6034769482244188e-06, "loss": 0.1411, "num_input_tokens_seen": 29992592, "step": 49245 }, { "epoch": 15.2807942910332, "grad_norm": 19.843603134155273, "learning_rate": 1.602483572058205e-06, "loss": 0.2398, "num_input_tokens_seen": 29995312, "step": 49250 }, { "epoch": 15.282345640707415, "grad_norm": 10.621042251586914, "learning_rate": 1.6014904449701796e-06, "loss": 0.2764, "num_input_tokens_seen": 29998128, "step": 49255 }, { "epoch": 15.283896990381631, "grad_norm": 23.441762924194336, "learning_rate": 1.6004975670331475e-06, "loss": 0.2878, "num_input_tokens_seen": 30000560, "step": 49260 }, { "epoch": 15.285448340055849, "grad_norm": 14.75835132598877, "learning_rate": 1.5995049383199013e-06, "loss": 0.2088, "num_input_tokens_seen": 30003280, "step": 49265 }, { "epoch": 15.286999689730065, "grad_norm": 6.200197219848633, "learning_rate": 1.598512558903209e-06, "loss": 0.1964, "num_input_tokens_seen": 30006192, "step": 49270 }, { "epoch": 15.288551039404283, "grad_norm": 6.150052070617676, "learning_rate": 1.5975204288558283e-06, "loss": 0.2146, "num_input_tokens_seen": 30009168, "step": 49275 }, { "epoch": 15.290102389078498, "grad_norm": 6.723007678985596, "learning_rate": 1.5965285482504893e-06, "loss": 0.2044, "num_input_tokens_seen": 30011952, "step": 49280 }, { "epoch": 15.291653738752714, "grad_norm": 4.184910297393799, "learning_rate": 1.5955369171599128e-06, "loss": 0.1587, "num_input_tokens_seen": 30015216, "step": 49285 }, { "epoch": 15.293205088426932, "grad_norm": 15.430315017700195, "learning_rate": 1.594545535656793e-06, "loss": 0.1726, "num_input_tokens_seen": 30017424, "step": 49290 }, { "epoch": 15.294756438101148, "grad_norm": 5.810516357421875, "learning_rate": 1.5935544038138123e-06, "loss": 0.1605, "num_input_tokens_seen": 30020208, "step": 49295 }, { "epoch": 15.296307787775364, "grad_norm": 2.7174904346466064, "learning_rate": 1.592563521703634e-06, "loss": 0.1878, "num_input_tokens_seen": 30023440, "step": 49300 }, { "epoch": 15.297859137449581, "grad_norm": 9.83482837677002, "learning_rate": 1.591572889398898e-06, "loss": 0.2084, "num_input_tokens_seen": 30026032, "step": 49305 }, { "epoch": 15.299410487123797, "grad_norm": 5.39086389541626, "learning_rate": 1.5905825069722324e-06, "loss": 0.1595, "num_input_tokens_seen": 30028880, "step": 49310 }, { "epoch": 15.300961836798015, "grad_norm": 7.657444000244141, "learning_rate": 1.589592374496241e-06, "loss": 0.179, "num_input_tokens_seen": 30032336, "step": 49315 }, { "epoch": 15.30251318647223, "grad_norm": 12.094938278198242, "learning_rate": 1.5886024920435155e-06, "loss": 0.2834, "num_input_tokens_seen": 30035600, "step": 49320 }, { "epoch": 15.304064536146447, "grad_norm": 20.604921340942383, "learning_rate": 1.5876128596866231e-06, "loss": 0.1902, "num_input_tokens_seen": 30039184, "step": 49325 }, { "epoch": 15.305615885820664, "grad_norm": 5.509582042694092, "learning_rate": 1.5866234774981175e-06, "loss": 0.1796, "num_input_tokens_seen": 30041584, "step": 49330 }, { "epoch": 15.30716723549488, "grad_norm": 13.855926513671875, "learning_rate": 1.5856343455505308e-06, "loss": 0.1946, "num_input_tokens_seen": 30043984, "step": 49335 }, { "epoch": 15.308718585169098, "grad_norm": 8.655577659606934, "learning_rate": 1.58464546391638e-06, "loss": 0.2281, "num_input_tokens_seen": 30047152, "step": 49340 }, { "epoch": 15.310269934843314, "grad_norm": 10.439610481262207, "learning_rate": 1.5836568326681583e-06, "loss": 0.1555, "num_input_tokens_seen": 30050640, "step": 49345 }, { "epoch": 15.31182128451753, "grad_norm": 9.765238761901855, "learning_rate": 1.582668451878347e-06, "loss": 0.2417, "num_input_tokens_seen": 30054000, "step": 49350 }, { "epoch": 15.313372634191747, "grad_norm": 22.138103485107422, "learning_rate": 1.5816803216194071e-06, "loss": 0.143, "num_input_tokens_seen": 30058160, "step": 49355 }, { "epoch": 15.314923983865963, "grad_norm": 19.41131591796875, "learning_rate": 1.5806924419637765e-06, "loss": 0.149, "num_input_tokens_seen": 30061552, "step": 49360 }, { "epoch": 15.316475333540179, "grad_norm": 14.561539649963379, "learning_rate": 1.5797048129838827e-06, "loss": 0.2009, "num_input_tokens_seen": 30064560, "step": 49365 }, { "epoch": 15.318026683214397, "grad_norm": 8.902101516723633, "learning_rate": 1.5787174347521267e-06, "loss": 0.1619, "num_input_tokens_seen": 30067856, "step": 49370 }, { "epoch": 15.319578032888613, "grad_norm": 27.660179138183594, "learning_rate": 1.5777303073408978e-06, "loss": 0.1804, "num_input_tokens_seen": 30071024, "step": 49375 }, { "epoch": 15.32112938256283, "grad_norm": 8.523417472839355, "learning_rate": 1.5767434308225603e-06, "loss": 0.2746, "num_input_tokens_seen": 30074352, "step": 49380 }, { "epoch": 15.322680732237046, "grad_norm": 5.766780376434326, "learning_rate": 1.5757568052694705e-06, "loss": 0.1864, "num_input_tokens_seen": 30077168, "step": 49385 }, { "epoch": 15.324232081911262, "grad_norm": 8.159637451171875, "learning_rate": 1.5747704307539536e-06, "loss": 0.1405, "num_input_tokens_seen": 30080080, "step": 49390 }, { "epoch": 15.32578343158548, "grad_norm": 7.031156063079834, "learning_rate": 1.5737843073483266e-06, "loss": 0.1374, "num_input_tokens_seen": 30084592, "step": 49395 }, { "epoch": 15.327334781259696, "grad_norm": 23.34037208557129, "learning_rate": 1.572798435124881e-06, "loss": 0.1684, "num_input_tokens_seen": 30087088, "step": 49400 }, { "epoch": 15.328886130933913, "grad_norm": 14.965869903564453, "learning_rate": 1.5718128141558958e-06, "loss": 0.1862, "num_input_tokens_seen": 30090608, "step": 49405 }, { "epoch": 15.33043748060813, "grad_norm": 19.6839542388916, "learning_rate": 1.5708274445136256e-06, "loss": 0.2016, "num_input_tokens_seen": 30093200, "step": 49410 }, { "epoch": 15.331988830282345, "grad_norm": 26.802824020385742, "learning_rate": 1.5698423262703128e-06, "loss": 0.1955, "num_input_tokens_seen": 30097776, "step": 49415 }, { "epoch": 15.333540179956563, "grad_norm": 15.718706130981445, "learning_rate": 1.5688574594981749e-06, "loss": 0.2253, "num_input_tokens_seen": 30101040, "step": 49420 }, { "epoch": 15.335091529630779, "grad_norm": 17.272743225097656, "learning_rate": 1.567872844269418e-06, "loss": 0.2026, "num_input_tokens_seen": 30104560, "step": 49425 }, { "epoch": 15.336642879304996, "grad_norm": 6.762728691101074, "learning_rate": 1.5668884806562217e-06, "loss": 0.172, "num_input_tokens_seen": 30107248, "step": 49430 }, { "epoch": 15.338194228979212, "grad_norm": 13.992733001708984, "learning_rate": 1.5659043687307547e-06, "loss": 0.1882, "num_input_tokens_seen": 30110192, "step": 49435 }, { "epoch": 15.339745578653428, "grad_norm": 22.093692779541016, "learning_rate": 1.5649205085651642e-06, "loss": 0.2053, "num_input_tokens_seen": 30113680, "step": 49440 }, { "epoch": 15.341296928327646, "grad_norm": 11.78072738647461, "learning_rate": 1.5639369002315769e-06, "loss": 0.193, "num_input_tokens_seen": 30118096, "step": 49445 }, { "epoch": 15.342848278001862, "grad_norm": 19.363872528076172, "learning_rate": 1.5629535438021049e-06, "loss": 0.1628, "num_input_tokens_seen": 30121456, "step": 49450 }, { "epoch": 15.344399627676077, "grad_norm": 7.368194103240967, "learning_rate": 1.5619704393488372e-06, "loss": 0.1848, "num_input_tokens_seen": 30123952, "step": 49455 }, { "epoch": 15.345950977350295, "grad_norm": 16.53936195373535, "learning_rate": 1.5609875869438508e-06, "loss": 0.1699, "num_input_tokens_seen": 30126544, "step": 49460 }, { "epoch": 15.347502327024511, "grad_norm": 4.217804908752441, "learning_rate": 1.5600049866591966e-06, "loss": 0.1721, "num_input_tokens_seen": 30129904, "step": 49465 }, { "epoch": 15.349053676698729, "grad_norm": 22.068511962890625, "learning_rate": 1.5590226385669138e-06, "loss": 0.1919, "num_input_tokens_seen": 30132496, "step": 49470 }, { "epoch": 15.350605026372945, "grad_norm": 13.661025047302246, "learning_rate": 1.5580405427390172e-06, "loss": 0.1487, "num_input_tokens_seen": 30135888, "step": 49475 }, { "epoch": 15.35215637604716, "grad_norm": 8.470528602600098, "learning_rate": 1.5570586992475095e-06, "loss": 0.164, "num_input_tokens_seen": 30138608, "step": 49480 }, { "epoch": 15.353707725721378, "grad_norm": 23.989479064941406, "learning_rate": 1.5560771081643682e-06, "loss": 0.1662, "num_input_tokens_seen": 30141424, "step": 49485 }, { "epoch": 15.355259075395594, "grad_norm": 12.924599647521973, "learning_rate": 1.5550957695615582e-06, "loss": 0.1456, "num_input_tokens_seen": 30143952, "step": 49490 }, { "epoch": 15.35681042506981, "grad_norm": 12.437760353088379, "learning_rate": 1.554114683511021e-06, "loss": 0.1902, "num_input_tokens_seen": 30147888, "step": 49495 }, { "epoch": 15.358361774744028, "grad_norm": 3.781290054321289, "learning_rate": 1.5531338500846827e-06, "loss": 0.1828, "num_input_tokens_seen": 30150256, "step": 49500 }, { "epoch": 15.359913124418243, "grad_norm": 9.671501159667969, "learning_rate": 1.5521532693544517e-06, "loss": 0.1902, "num_input_tokens_seen": 30153776, "step": 49505 }, { "epoch": 15.361464474092461, "grad_norm": 27.2315673828125, "learning_rate": 1.5511729413922134e-06, "loss": 0.1619, "num_input_tokens_seen": 30156080, "step": 49510 }, { "epoch": 15.363015823766677, "grad_norm": 17.653522491455078, "learning_rate": 1.5501928662698397e-06, "loss": 0.184, "num_input_tokens_seen": 30159184, "step": 49515 }, { "epoch": 15.364567173440893, "grad_norm": 11.150038719177246, "learning_rate": 1.5492130440591797e-06, "loss": 0.1447, "num_input_tokens_seen": 30162896, "step": 49520 }, { "epoch": 15.36611852311511, "grad_norm": 8.197624206542969, "learning_rate": 1.5482334748320681e-06, "loss": 0.09, "num_input_tokens_seen": 30166896, "step": 49525 }, { "epoch": 15.367669872789326, "grad_norm": 10.923483848571777, "learning_rate": 1.5472541586603163e-06, "loss": 0.1585, "num_input_tokens_seen": 30169552, "step": 49530 }, { "epoch": 15.369221222463544, "grad_norm": 25.378969192504883, "learning_rate": 1.546275095615723e-06, "loss": 0.2198, "num_input_tokens_seen": 30171600, "step": 49535 }, { "epoch": 15.37077257213776, "grad_norm": 11.884284973144531, "learning_rate": 1.545296285770062e-06, "loss": 0.2174, "num_input_tokens_seen": 30174096, "step": 49540 }, { "epoch": 15.372323921811976, "grad_norm": 17.057092666625977, "learning_rate": 1.5443177291950946e-06, "loss": 0.172, "num_input_tokens_seen": 30176496, "step": 49545 }, { "epoch": 15.373875271486193, "grad_norm": 10.261533737182617, "learning_rate": 1.5433394259625573e-06, "loss": 0.1764, "num_input_tokens_seen": 30179152, "step": 49550 }, { "epoch": 15.37542662116041, "grad_norm": 22.690574645996094, "learning_rate": 1.542361376144173e-06, "loss": 0.1741, "num_input_tokens_seen": 30183088, "step": 49555 }, { "epoch": 15.376977970834627, "grad_norm": 6.183963298797607, "learning_rate": 1.5413835798116467e-06, "loss": 0.1823, "num_input_tokens_seen": 30186416, "step": 49560 }, { "epoch": 15.378529320508843, "grad_norm": 6.998028755187988, "learning_rate": 1.5404060370366576e-06, "loss": 0.1516, "num_input_tokens_seen": 30189712, "step": 49565 }, { "epoch": 15.380080670183059, "grad_norm": 10.893641471862793, "learning_rate": 1.539428747890876e-06, "loss": 0.2612, "num_input_tokens_seen": 30192592, "step": 49570 }, { "epoch": 15.381632019857276, "grad_norm": 15.070691108703613, "learning_rate": 1.5384517124459441e-06, "loss": 0.1905, "num_input_tokens_seen": 30195056, "step": 49575 }, { "epoch": 15.383183369531492, "grad_norm": 7.636777877807617, "learning_rate": 1.5374749307734948e-06, "loss": 0.1589, "num_input_tokens_seen": 30197648, "step": 49580 }, { "epoch": 15.384734719205708, "grad_norm": 10.10472297668457, "learning_rate": 1.5364984029451341e-06, "loss": 0.1848, "num_input_tokens_seen": 30200304, "step": 49585 }, { "epoch": 15.386286068879926, "grad_norm": 15.894612312316895, "learning_rate": 1.5355221290324556e-06, "loss": 0.223, "num_input_tokens_seen": 30203824, "step": 49590 }, { "epoch": 15.387837418554142, "grad_norm": 5.276369571685791, "learning_rate": 1.5345461091070291e-06, "loss": 0.1659, "num_input_tokens_seen": 30206192, "step": 49595 }, { "epoch": 15.38938876822836, "grad_norm": 19.644577026367188, "learning_rate": 1.5335703432404114e-06, "loss": 0.1431, "num_input_tokens_seen": 30210064, "step": 49600 }, { "epoch": 15.390940117902575, "grad_norm": 12.342912673950195, "learning_rate": 1.5325948315041345e-06, "loss": 0.1952, "num_input_tokens_seen": 30212528, "step": 49605 }, { "epoch": 15.392491467576791, "grad_norm": 17.22603988647461, "learning_rate": 1.5316195739697187e-06, "loss": 0.1449, "num_input_tokens_seen": 30216624, "step": 49610 }, { "epoch": 15.394042817251009, "grad_norm": 15.231115341186523, "learning_rate": 1.5306445707086582e-06, "loss": 0.2517, "num_input_tokens_seen": 30219376, "step": 49615 }, { "epoch": 15.395594166925225, "grad_norm": 12.903997421264648, "learning_rate": 1.5296698217924339e-06, "loss": 0.1872, "num_input_tokens_seen": 30222128, "step": 49620 }, { "epoch": 15.39714551659944, "grad_norm": 10.548358917236328, "learning_rate": 1.5286953272925082e-06, "loss": 0.2086, "num_input_tokens_seen": 30224656, "step": 49625 }, { "epoch": 15.398696866273658, "grad_norm": 20.555524826049805, "learning_rate": 1.5277210872803194e-06, "loss": 0.2622, "num_input_tokens_seen": 30226864, "step": 49630 }, { "epoch": 15.400248215947874, "grad_norm": 4.2332940101623535, "learning_rate": 1.5267471018272945e-06, "loss": 0.1666, "num_input_tokens_seen": 30232240, "step": 49635 }, { "epoch": 15.401799565622092, "grad_norm": 6.510969161987305, "learning_rate": 1.525773371004835e-06, "loss": 0.1948, "num_input_tokens_seen": 30234832, "step": 49640 }, { "epoch": 15.403350915296308, "grad_norm": 5.481441974639893, "learning_rate": 1.5247998948843306e-06, "loss": 0.1965, "num_input_tokens_seen": 30236912, "step": 49645 }, { "epoch": 15.404902264970524, "grad_norm": 9.588263511657715, "learning_rate": 1.5238266735371443e-06, "loss": 0.2347, "num_input_tokens_seen": 30239664, "step": 49650 }, { "epoch": 15.406453614644741, "grad_norm": 6.99320125579834, "learning_rate": 1.5228537070346284e-06, "loss": 0.1371, "num_input_tokens_seen": 30242864, "step": 49655 }, { "epoch": 15.408004964318957, "grad_norm": 7.71436882019043, "learning_rate": 1.52188099544811e-06, "loss": 0.1579, "num_input_tokens_seen": 30245648, "step": 49660 }, { "epoch": 15.409556313993175, "grad_norm": 13.66139030456543, "learning_rate": 1.5209085388489032e-06, "loss": 0.137, "num_input_tokens_seen": 30248784, "step": 49665 }, { "epoch": 15.41110766366739, "grad_norm": 14.674764633178711, "learning_rate": 1.5199363373082976e-06, "loss": 0.1735, "num_input_tokens_seen": 30251472, "step": 49670 }, { "epoch": 15.412659013341607, "grad_norm": 19.402482986450195, "learning_rate": 1.5189643908975682e-06, "loss": 0.1682, "num_input_tokens_seen": 30254064, "step": 49675 }, { "epoch": 15.414210363015824, "grad_norm": 7.787759780883789, "learning_rate": 1.5179926996879729e-06, "loss": 0.1752, "num_input_tokens_seen": 30256336, "step": 49680 }, { "epoch": 15.41576171269004, "grad_norm": 9.775091171264648, "learning_rate": 1.5170212637507437e-06, "loss": 0.177, "num_input_tokens_seen": 30259184, "step": 49685 }, { "epoch": 15.417313062364258, "grad_norm": 19.772829055786133, "learning_rate": 1.5160500831571024e-06, "loss": 0.156, "num_input_tokens_seen": 30262960, "step": 49690 }, { "epoch": 15.418864412038474, "grad_norm": 8.271780967712402, "learning_rate": 1.5150791579782443e-06, "loss": 0.1943, "num_input_tokens_seen": 30266000, "step": 49695 }, { "epoch": 15.42041576171269, "grad_norm": 14.713614463806152, "learning_rate": 1.514108488285353e-06, "loss": 0.19, "num_input_tokens_seen": 30268144, "step": 49700 }, { "epoch": 15.421967111386907, "grad_norm": 15.067091941833496, "learning_rate": 1.513138074149587e-06, "loss": 0.1353, "num_input_tokens_seen": 30272304, "step": 49705 }, { "epoch": 15.423518461061123, "grad_norm": 15.474542617797852, "learning_rate": 1.5121679156420932e-06, "loss": 0.2476, "num_input_tokens_seen": 30275024, "step": 49710 }, { "epoch": 15.425069810735339, "grad_norm": 8.272257804870605, "learning_rate": 1.511198012833991e-06, "loss": 0.1914, "num_input_tokens_seen": 30277360, "step": 49715 }, { "epoch": 15.426621160409557, "grad_norm": 23.20563507080078, "learning_rate": 1.5102283657963895e-06, "loss": 0.2763, "num_input_tokens_seen": 30280752, "step": 49720 }, { "epoch": 15.428172510083773, "grad_norm": 16.207826614379883, "learning_rate": 1.5092589746003727e-06, "loss": 0.1469, "num_input_tokens_seen": 30284112, "step": 49725 }, { "epoch": 15.42972385975799, "grad_norm": 4.062133312225342, "learning_rate": 1.5082898393170115e-06, "loss": 0.1781, "num_input_tokens_seen": 30286384, "step": 49730 }, { "epoch": 15.431275209432206, "grad_norm": 11.388071060180664, "learning_rate": 1.507320960017351e-06, "loss": 0.1502, "num_input_tokens_seen": 30289104, "step": 49735 }, { "epoch": 15.432826559106422, "grad_norm": 3.7599937915802, "learning_rate": 1.5063523367724237e-06, "loss": 0.1636, "num_input_tokens_seen": 30292016, "step": 49740 }, { "epoch": 15.43437790878064, "grad_norm": 18.31781768798828, "learning_rate": 1.505383969653243e-06, "loss": 0.1719, "num_input_tokens_seen": 30294320, "step": 49745 }, { "epoch": 15.435929258454856, "grad_norm": 1.6924763917922974, "learning_rate": 1.5044158587307977e-06, "loss": 0.1673, "num_input_tokens_seen": 30297168, "step": 49750 }, { "epoch": 15.437480608129071, "grad_norm": 8.493295669555664, "learning_rate": 1.5034480040760658e-06, "loss": 0.1207, "num_input_tokens_seen": 30300624, "step": 49755 }, { "epoch": 15.439031957803289, "grad_norm": 5.373509883880615, "learning_rate": 1.5024804057599984e-06, "loss": 0.1269, "num_input_tokens_seen": 30303312, "step": 49760 }, { "epoch": 15.440583307477505, "grad_norm": 10.726236343383789, "learning_rate": 1.5015130638535363e-06, "loss": 0.154, "num_input_tokens_seen": 30306608, "step": 49765 }, { "epoch": 15.442134657151723, "grad_norm": 12.9093599319458, "learning_rate": 1.5005459784275932e-06, "loss": 0.2358, "num_input_tokens_seen": 30309712, "step": 49770 }, { "epoch": 15.443686006825939, "grad_norm": 4.12785530090332, "learning_rate": 1.4995791495530715e-06, "loss": 0.1567, "num_input_tokens_seen": 30312080, "step": 49775 }, { "epoch": 15.445237356500154, "grad_norm": 13.725855827331543, "learning_rate": 1.498612577300847e-06, "loss": 0.1584, "num_input_tokens_seen": 30315056, "step": 49780 }, { "epoch": 15.446788706174372, "grad_norm": 24.786771774291992, "learning_rate": 1.4976462617417854e-06, "loss": 0.2444, "num_input_tokens_seen": 30317776, "step": 49785 }, { "epoch": 15.448340055848588, "grad_norm": 10.9076566696167, "learning_rate": 1.4966802029467249e-06, "loss": 0.1991, "num_input_tokens_seen": 30320528, "step": 49790 }, { "epoch": 15.449891405522806, "grad_norm": 17.24254608154297, "learning_rate": 1.4957144009864915e-06, "loss": 0.1374, "num_input_tokens_seen": 30324528, "step": 49795 }, { "epoch": 15.451442755197021, "grad_norm": 13.977991104125977, "learning_rate": 1.4947488559318908e-06, "loss": 0.1368, "num_input_tokens_seen": 30327760, "step": 49800 }, { "epoch": 15.452994104871237, "grad_norm": 15.085531234741211, "learning_rate": 1.4937835678537054e-06, "loss": 0.1664, "num_input_tokens_seen": 30330512, "step": 49805 }, { "epoch": 15.454545454545455, "grad_norm": 4.077820301055908, "learning_rate": 1.4928185368227066e-06, "loss": 0.0893, "num_input_tokens_seen": 30334800, "step": 49810 }, { "epoch": 15.456096804219671, "grad_norm": 14.075664520263672, "learning_rate": 1.491853762909638e-06, "loss": 0.2174, "num_input_tokens_seen": 30337424, "step": 49815 }, { "epoch": 15.457648153893889, "grad_norm": 11.514604568481445, "learning_rate": 1.4908892461852331e-06, "loss": 0.2235, "num_input_tokens_seen": 30339760, "step": 49820 }, { "epoch": 15.459199503568104, "grad_norm": 14.935983657836914, "learning_rate": 1.4899249867201992e-06, "loss": 0.2224, "num_input_tokens_seen": 30342416, "step": 49825 }, { "epoch": 15.46075085324232, "grad_norm": 17.70763397216797, "learning_rate": 1.4889609845852305e-06, "loss": 0.1851, "num_input_tokens_seen": 30345456, "step": 49830 }, { "epoch": 15.462302202916538, "grad_norm": 6.8988938331604, "learning_rate": 1.4879972398509973e-06, "loss": 0.1769, "num_input_tokens_seen": 30348464, "step": 49835 }, { "epoch": 15.463853552590754, "grad_norm": 12.830462455749512, "learning_rate": 1.4870337525881561e-06, "loss": 0.1669, "num_input_tokens_seen": 30351344, "step": 49840 }, { "epoch": 15.46540490226497, "grad_norm": 10.358344078063965, "learning_rate": 1.486070522867339e-06, "loss": 0.1242, "num_input_tokens_seen": 30354032, "step": 49845 }, { "epoch": 15.466956251939187, "grad_norm": 17.52992820739746, "learning_rate": 1.4851075507591656e-06, "loss": 0.2347, "num_input_tokens_seen": 30358064, "step": 49850 }, { "epoch": 15.468507601613403, "grad_norm": 5.502399921417236, "learning_rate": 1.4841448363342292e-06, "loss": 0.1665, "num_input_tokens_seen": 30361424, "step": 49855 }, { "epoch": 15.470058951287621, "grad_norm": 25.123483657836914, "learning_rate": 1.4831823796631107e-06, "loss": 0.1969, "num_input_tokens_seen": 30365712, "step": 49860 }, { "epoch": 15.471610300961837, "grad_norm": 21.069429397583008, "learning_rate": 1.4822201808163705e-06, "loss": 0.2134, "num_input_tokens_seen": 30368208, "step": 49865 }, { "epoch": 15.473161650636053, "grad_norm": 9.536541938781738, "learning_rate": 1.4812582398645463e-06, "loss": 0.2205, "num_input_tokens_seen": 30370864, "step": 49870 }, { "epoch": 15.47471300031027, "grad_norm": 16.923828125, "learning_rate": 1.4802965568781625e-06, "loss": 0.1963, "num_input_tokens_seen": 30373904, "step": 49875 }, { "epoch": 15.476264349984486, "grad_norm": 22.154821395874023, "learning_rate": 1.4793351319277194e-06, "loss": 0.3049, "num_input_tokens_seen": 30377392, "step": 49880 }, { "epoch": 15.477815699658702, "grad_norm": 9.982351303100586, "learning_rate": 1.4783739650837036e-06, "loss": 0.1776, "num_input_tokens_seen": 30379984, "step": 49885 }, { "epoch": 15.47936704933292, "grad_norm": 16.65127944946289, "learning_rate": 1.4774130564165767e-06, "loss": 0.2959, "num_input_tokens_seen": 30382608, "step": 49890 }, { "epoch": 15.480918399007136, "grad_norm": 8.479745864868164, "learning_rate": 1.4764524059967884e-06, "loss": 0.1185, "num_input_tokens_seen": 30385456, "step": 49895 }, { "epoch": 15.482469748681353, "grad_norm": 23.109111785888672, "learning_rate": 1.4754920138947614e-06, "loss": 0.2936, "num_input_tokens_seen": 30388464, "step": 49900 }, { "epoch": 15.48402109835557, "grad_norm": 16.465347290039062, "learning_rate": 1.4745318801809083e-06, "loss": 0.1706, "num_input_tokens_seen": 30391696, "step": 49905 }, { "epoch": 15.485572448029785, "grad_norm": 12.790813446044922, "learning_rate": 1.4735720049256148e-06, "loss": 0.2018, "num_input_tokens_seen": 30394416, "step": 49910 }, { "epoch": 15.487123797704003, "grad_norm": 12.347496032714844, "learning_rate": 1.4726123881992521e-06, "loss": 0.1932, "num_input_tokens_seen": 30396528, "step": 49915 }, { "epoch": 15.488675147378219, "grad_norm": 31.15656852722168, "learning_rate": 1.4716530300721737e-06, "loss": 0.1829, "num_input_tokens_seen": 30399376, "step": 49920 }, { "epoch": 15.490226497052436, "grad_norm": 10.013463973999023, "learning_rate": 1.4706939306147083e-06, "loss": 0.2085, "num_input_tokens_seen": 30402000, "step": 49925 }, { "epoch": 15.491777846726652, "grad_norm": 11.502848625183105, "learning_rate": 1.4697350898971729e-06, "loss": 0.1709, "num_input_tokens_seen": 30404240, "step": 49930 }, { "epoch": 15.493329196400868, "grad_norm": 6.463834762573242, "learning_rate": 1.468776507989858e-06, "loss": 0.2258, "num_input_tokens_seen": 30410736, "step": 49935 }, { "epoch": 15.494880546075086, "grad_norm": 18.111230850219727, "learning_rate": 1.467818184963043e-06, "loss": 0.1653, "num_input_tokens_seen": 30414000, "step": 49940 }, { "epoch": 15.496431895749302, "grad_norm": 8.422319412231445, "learning_rate": 1.4668601208869804e-06, "loss": 0.1781, "num_input_tokens_seen": 30416784, "step": 49945 }, { "epoch": 15.49798324542352, "grad_norm": 25.619464874267578, "learning_rate": 1.4659023158319113e-06, "loss": 0.2062, "num_input_tokens_seen": 30419664, "step": 49950 }, { "epoch": 15.499534595097735, "grad_norm": 4.76701545715332, "learning_rate": 1.4649447698680513e-06, "loss": 0.1487, "num_input_tokens_seen": 30422320, "step": 49955 }, { "epoch": 15.501085944771951, "grad_norm": 9.514177322387695, "learning_rate": 1.4639874830656003e-06, "loss": 0.197, "num_input_tokens_seen": 30425296, "step": 49960 }, { "epoch": 15.502637294446169, "grad_norm": 4.353306293487549, "learning_rate": 1.46303045549474e-06, "loss": 0.1123, "num_input_tokens_seen": 30428336, "step": 49965 }, { "epoch": 15.504188644120385, "grad_norm": 6.319928169250488, "learning_rate": 1.4620736872256325e-06, "loss": 0.1664, "num_input_tokens_seen": 30432176, "step": 49970 }, { "epoch": 15.5057399937946, "grad_norm": 23.759201049804688, "learning_rate": 1.4611171783284173e-06, "loss": 0.191, "num_input_tokens_seen": 30435536, "step": 49975 }, { "epoch": 15.507291343468818, "grad_norm": 18.716218948364258, "learning_rate": 1.4601609288732217e-06, "loss": 0.1808, "num_input_tokens_seen": 30438512, "step": 49980 }, { "epoch": 15.508842693143034, "grad_norm": 11.148244857788086, "learning_rate": 1.459204938930146e-06, "loss": 0.1869, "num_input_tokens_seen": 30441264, "step": 49985 }, { "epoch": 15.510394042817252, "grad_norm": 12.276227951049805, "learning_rate": 1.458249208569279e-06, "loss": 0.1217, "num_input_tokens_seen": 30444048, "step": 49990 }, { "epoch": 15.511945392491468, "grad_norm": 5.850061416625977, "learning_rate": 1.4572937378606844e-06, "loss": 0.1693, "num_input_tokens_seen": 30446448, "step": 49995 }, { "epoch": 15.513496742165684, "grad_norm": 8.38856029510498, "learning_rate": 1.4563385268744124e-06, "loss": 0.176, "num_input_tokens_seen": 30450992, "step": 50000 }, { "epoch": 15.515048091839901, "grad_norm": 27.376689910888672, "learning_rate": 1.4553835756804873e-06, "loss": 0.2035, "num_input_tokens_seen": 30454160, "step": 50005 }, { "epoch": 15.516599441514117, "grad_norm": 20.162551879882812, "learning_rate": 1.4544288843489212e-06, "loss": 0.1592, "num_input_tokens_seen": 30456912, "step": 50010 }, { "epoch": 15.518150791188333, "grad_norm": 28.896059036254883, "learning_rate": 1.4534744529497058e-06, "loss": 0.1882, "num_input_tokens_seen": 30460272, "step": 50015 }, { "epoch": 15.51970214086255, "grad_norm": 6.929715156555176, "learning_rate": 1.4525202815528078e-06, "loss": 0.1324, "num_input_tokens_seen": 30463408, "step": 50020 }, { "epoch": 15.521253490536767, "grad_norm": 5.145373821258545, "learning_rate": 1.4515663702281835e-06, "loss": 0.2035, "num_input_tokens_seen": 30465872, "step": 50025 }, { "epoch": 15.522804840210984, "grad_norm": 12.858858108520508, "learning_rate": 1.450612719045762e-06, "loss": 0.2028, "num_input_tokens_seen": 30469200, "step": 50030 }, { "epoch": 15.5243561898852, "grad_norm": 2.561558485031128, "learning_rate": 1.4496593280754612e-06, "loss": 0.1593, "num_input_tokens_seen": 30472208, "step": 50035 }, { "epoch": 15.525907539559416, "grad_norm": 5.9437785148620605, "learning_rate": 1.4487061973871725e-06, "loss": 0.1532, "num_input_tokens_seen": 30474960, "step": 50040 }, { "epoch": 15.527458889233634, "grad_norm": 17.922101974487305, "learning_rate": 1.4477533270507743e-06, "loss": 0.2322, "num_input_tokens_seen": 30477136, "step": 50045 }, { "epoch": 15.52901023890785, "grad_norm": 20.068004608154297, "learning_rate": 1.446800717136121e-06, "loss": 0.1886, "num_input_tokens_seen": 30481008, "step": 50050 }, { "epoch": 15.530561588582067, "grad_norm": 5.371922016143799, "learning_rate": 1.4458483677130525e-06, "loss": 0.2024, "num_input_tokens_seen": 30484080, "step": 50055 }, { "epoch": 15.532112938256283, "grad_norm": 16.201499938964844, "learning_rate": 1.4448962788513848e-06, "loss": 0.1945, "num_input_tokens_seen": 30486992, "step": 50060 }, { "epoch": 15.533664287930499, "grad_norm": 2.5452520847320557, "learning_rate": 1.4439444506209187e-06, "loss": 0.1378, "num_input_tokens_seen": 30489584, "step": 50065 }, { "epoch": 15.535215637604717, "grad_norm": 16.01787757873535, "learning_rate": 1.4429928830914358e-06, "loss": 0.2814, "num_input_tokens_seen": 30492688, "step": 50070 }, { "epoch": 15.536766987278932, "grad_norm": 11.390056610107422, "learning_rate": 1.442041576332694e-06, "loss": 0.1428, "num_input_tokens_seen": 30495568, "step": 50075 }, { "epoch": 15.53831833695315, "grad_norm": 19.100414276123047, "learning_rate": 1.441090530414439e-06, "loss": 0.2212, "num_input_tokens_seen": 30498416, "step": 50080 }, { "epoch": 15.539869686627366, "grad_norm": 6.442463397979736, "learning_rate": 1.4401397454063904e-06, "loss": 0.2342, "num_input_tokens_seen": 30500976, "step": 50085 }, { "epoch": 15.541421036301582, "grad_norm": 6.789567470550537, "learning_rate": 1.439189221378255e-06, "loss": 0.1551, "num_input_tokens_seen": 30505040, "step": 50090 }, { "epoch": 15.5429723859758, "grad_norm": 11.493887901306152, "learning_rate": 1.4382389583997141e-06, "loss": 0.1375, "num_input_tokens_seen": 30508976, "step": 50095 }, { "epoch": 15.544523735650015, "grad_norm": 29.17184066772461, "learning_rate": 1.4372889565404368e-06, "loss": 0.2626, "num_input_tokens_seen": 30511472, "step": 50100 }, { "epoch": 15.546075085324231, "grad_norm": 17.235349655151367, "learning_rate": 1.4363392158700667e-06, "loss": 0.1881, "num_input_tokens_seen": 30516464, "step": 50105 }, { "epoch": 15.547626434998449, "grad_norm": 5.672308921813965, "learning_rate": 1.435389736458233e-06, "loss": 0.1365, "num_input_tokens_seen": 30519024, "step": 50110 }, { "epoch": 15.549177784672665, "grad_norm": 33.14468765258789, "learning_rate": 1.4344405183745419e-06, "loss": 0.1964, "num_input_tokens_seen": 30521968, "step": 50115 }, { "epoch": 15.550729134346883, "grad_norm": 7.441002368927002, "learning_rate": 1.4334915616885847e-06, "loss": 0.207, "num_input_tokens_seen": 30524368, "step": 50120 }, { "epoch": 15.552280484021098, "grad_norm": 18.04414939880371, "learning_rate": 1.4325428664699287e-06, "loss": 0.2312, "num_input_tokens_seen": 30527408, "step": 50125 }, { "epoch": 15.553831833695314, "grad_norm": 6.959349155426025, "learning_rate": 1.4315944327881249e-06, "loss": 0.2083, "num_input_tokens_seen": 30530640, "step": 50130 }, { "epoch": 15.555383183369532, "grad_norm": 21.31810760498047, "learning_rate": 1.4306462607127075e-06, "loss": 0.229, "num_input_tokens_seen": 30533392, "step": 50135 }, { "epoch": 15.556934533043748, "grad_norm": 5.540663242340088, "learning_rate": 1.4296983503131851e-06, "loss": 0.1773, "num_input_tokens_seen": 30536112, "step": 50140 }, { "epoch": 15.558485882717964, "grad_norm": 13.306255340576172, "learning_rate": 1.4287507016590534e-06, "loss": 0.2149, "num_input_tokens_seen": 30539728, "step": 50145 }, { "epoch": 15.560037232392181, "grad_norm": 4.309845924377441, "learning_rate": 1.427803314819784e-06, "loss": 0.2074, "num_input_tokens_seen": 30542864, "step": 50150 }, { "epoch": 15.561588582066397, "grad_norm": 12.722367286682129, "learning_rate": 1.4268561898648342e-06, "loss": 0.1775, "num_input_tokens_seen": 30545168, "step": 50155 }, { "epoch": 15.563139931740615, "grad_norm": 2.64888334274292, "learning_rate": 1.4259093268636364e-06, "loss": 0.0977, "num_input_tokens_seen": 30547152, "step": 50160 }, { "epoch": 15.56469128141483, "grad_norm": 1.8469609022140503, "learning_rate": 1.4249627258856103e-06, "loss": 0.1521, "num_input_tokens_seen": 30550800, "step": 50165 }, { "epoch": 15.566242631089047, "grad_norm": 15.166324615478516, "learning_rate": 1.4240163870001494e-06, "loss": 0.1568, "num_input_tokens_seen": 30553232, "step": 50170 }, { "epoch": 15.567793980763264, "grad_norm": 16.154518127441406, "learning_rate": 1.4230703102766347e-06, "loss": 0.1828, "num_input_tokens_seen": 30555664, "step": 50175 }, { "epoch": 15.56934533043748, "grad_norm": 9.14148235321045, "learning_rate": 1.422124495784422e-06, "loss": 0.2011, "num_input_tokens_seen": 30559216, "step": 50180 }, { "epoch": 15.570896680111698, "grad_norm": 25.94032859802246, "learning_rate": 1.4211789435928525e-06, "loss": 0.2453, "num_input_tokens_seen": 30562640, "step": 50185 }, { "epoch": 15.572448029785914, "grad_norm": 9.08768081665039, "learning_rate": 1.4202336537712474e-06, "loss": 0.1387, "num_input_tokens_seen": 30565296, "step": 50190 }, { "epoch": 15.57399937946013, "grad_norm": 14.803275108337402, "learning_rate": 1.4192886263889043e-06, "loss": 0.1828, "num_input_tokens_seen": 30567664, "step": 50195 }, { "epoch": 15.575550729134347, "grad_norm": 5.708300590515137, "learning_rate": 1.4183438615151091e-06, "loss": 0.1589, "num_input_tokens_seen": 30570768, "step": 50200 }, { "epoch": 15.577102078808563, "grad_norm": 11.868738174438477, "learning_rate": 1.4173993592191199e-06, "loss": 0.1944, "num_input_tokens_seen": 30573552, "step": 50205 }, { "epoch": 15.578653428482781, "grad_norm": 13.725201606750488, "learning_rate": 1.4164551195701837e-06, "loss": 0.2262, "num_input_tokens_seen": 30577904, "step": 50210 }, { "epoch": 15.580204778156997, "grad_norm": 9.622286796569824, "learning_rate": 1.4155111426375213e-06, "loss": 0.1367, "num_input_tokens_seen": 30580848, "step": 50215 }, { "epoch": 15.581756127831213, "grad_norm": 12.743050575256348, "learning_rate": 1.4145674284903404e-06, "loss": 0.1677, "num_input_tokens_seen": 30583824, "step": 50220 }, { "epoch": 15.58330747750543, "grad_norm": 4.392817974090576, "learning_rate": 1.4136239771978232e-06, "loss": 0.0874, "num_input_tokens_seen": 30587056, "step": 50225 }, { "epoch": 15.584858827179646, "grad_norm": 7.143552780151367, "learning_rate": 1.4126807888291394e-06, "loss": 0.1983, "num_input_tokens_seen": 30590448, "step": 50230 }, { "epoch": 15.586410176853862, "grad_norm": 5.537806034088135, "learning_rate": 1.4117378634534318e-06, "loss": 0.2369, "num_input_tokens_seen": 30592944, "step": 50235 }, { "epoch": 15.58796152652808, "grad_norm": 25.11905288696289, "learning_rate": 1.4107952011398324e-06, "loss": 0.2224, "num_input_tokens_seen": 30596816, "step": 50240 }, { "epoch": 15.589512876202296, "grad_norm": 6.462485313415527, "learning_rate": 1.4098528019574454e-06, "loss": 0.1817, "num_input_tokens_seen": 30599696, "step": 50245 }, { "epoch": 15.591064225876513, "grad_norm": 8.552559852600098, "learning_rate": 1.4089106659753616e-06, "loss": 0.2455, "num_input_tokens_seen": 30604272, "step": 50250 }, { "epoch": 15.59261557555073, "grad_norm": 12.257357597351074, "learning_rate": 1.407968793262653e-06, "loss": 0.2709, "num_input_tokens_seen": 30606992, "step": 50255 }, { "epoch": 15.594166925224945, "grad_norm": 8.579197883605957, "learning_rate": 1.407027183888366e-06, "loss": 0.1546, "num_input_tokens_seen": 30609712, "step": 50260 }, { "epoch": 15.595718274899163, "grad_norm": 16.50975799560547, "learning_rate": 1.4060858379215347e-06, "loss": 0.1712, "num_input_tokens_seen": 30614032, "step": 50265 }, { "epoch": 15.597269624573379, "grad_norm": 27.033447265625, "learning_rate": 1.4051447554311687e-06, "loss": 0.3195, "num_input_tokens_seen": 30616176, "step": 50270 }, { "epoch": 15.598820974247595, "grad_norm": 14.374798774719238, "learning_rate": 1.4042039364862631e-06, "loss": 0.1564, "num_input_tokens_seen": 30619312, "step": 50275 }, { "epoch": 15.600372323921812, "grad_norm": 7.7302446365356445, "learning_rate": 1.4032633811557878e-06, "loss": 0.1344, "num_input_tokens_seen": 30622128, "step": 50280 }, { "epoch": 15.601923673596028, "grad_norm": 6.09732723236084, "learning_rate": 1.4023230895086997e-06, "loss": 0.2339, "num_input_tokens_seen": 30625168, "step": 50285 }, { "epoch": 15.603475023270246, "grad_norm": 6.281012535095215, "learning_rate": 1.4013830616139313e-06, "loss": 0.185, "num_input_tokens_seen": 30627536, "step": 50290 }, { "epoch": 15.605026372944462, "grad_norm": 13.972583770751953, "learning_rate": 1.4004432975403992e-06, "loss": 0.165, "num_input_tokens_seen": 30631440, "step": 50295 }, { "epoch": 15.606577722618677, "grad_norm": 17.294448852539062, "learning_rate": 1.3995037973569975e-06, "loss": 0.2221, "num_input_tokens_seen": 30633840, "step": 50300 }, { "epoch": 15.608129072292895, "grad_norm": 4.302087783813477, "learning_rate": 1.3985645611326031e-06, "loss": 0.1535, "num_input_tokens_seen": 30636208, "step": 50305 }, { "epoch": 15.609680421967111, "grad_norm": 16.647233963012695, "learning_rate": 1.3976255889360763e-06, "loss": 0.1424, "num_input_tokens_seen": 30639024, "step": 50310 }, { "epoch": 15.611231771641329, "grad_norm": 21.91349983215332, "learning_rate": 1.39668688083625e-06, "loss": 0.1732, "num_input_tokens_seen": 30642032, "step": 50315 }, { "epoch": 15.612783121315545, "grad_norm": 12.147905349731445, "learning_rate": 1.3957484369019465e-06, "loss": 0.2163, "num_input_tokens_seen": 30645488, "step": 50320 }, { "epoch": 15.61433447098976, "grad_norm": 5.815384387969971, "learning_rate": 1.3948102572019617e-06, "loss": 0.1072, "num_input_tokens_seen": 30649328, "step": 50325 }, { "epoch": 15.615885820663978, "grad_norm": 9.718963623046875, "learning_rate": 1.3938723418050786e-06, "loss": 0.1021, "num_input_tokens_seen": 30652176, "step": 50330 }, { "epoch": 15.617437170338194, "grad_norm": 7.295934677124023, "learning_rate": 1.3929346907800545e-06, "loss": 0.1805, "num_input_tokens_seen": 30655504, "step": 50335 }, { "epoch": 15.618988520012412, "grad_norm": 5.694187641143799, "learning_rate": 1.391997304195633e-06, "loss": 0.1233, "num_input_tokens_seen": 30658480, "step": 50340 }, { "epoch": 15.620539869686628, "grad_norm": 7.023221492767334, "learning_rate": 1.3910601821205327e-06, "loss": 0.1124, "num_input_tokens_seen": 30661648, "step": 50345 }, { "epoch": 15.622091219360843, "grad_norm": 5.6227192878723145, "learning_rate": 1.3901233246234585e-06, "loss": 0.1678, "num_input_tokens_seen": 30665808, "step": 50350 }, { "epoch": 15.623642569035061, "grad_norm": 43.10183334350586, "learning_rate": 1.3891867317730912e-06, "loss": 0.2539, "num_input_tokens_seen": 30668976, "step": 50355 }, { "epoch": 15.625193918709277, "grad_norm": 13.061875343322754, "learning_rate": 1.3882504036380956e-06, "loss": 0.1466, "num_input_tokens_seen": 30672240, "step": 50360 }, { "epoch": 15.626745268383493, "grad_norm": 19.81296730041504, "learning_rate": 1.3873143402871136e-06, "loss": 0.1728, "num_input_tokens_seen": 30675088, "step": 50365 }, { "epoch": 15.62829661805771, "grad_norm": 9.14294719696045, "learning_rate": 1.3863785417887714e-06, "loss": 0.1306, "num_input_tokens_seen": 30677648, "step": 50370 }, { "epoch": 15.629847967731926, "grad_norm": 19.135194778442383, "learning_rate": 1.3854430082116749e-06, "loss": 0.2821, "num_input_tokens_seen": 30680400, "step": 50375 }, { "epoch": 15.631399317406144, "grad_norm": 24.121034622192383, "learning_rate": 1.3845077396244071e-06, "loss": 0.159, "num_input_tokens_seen": 30682896, "step": 50380 }, { "epoch": 15.63295066708036, "grad_norm": 12.86888599395752, "learning_rate": 1.383572736095538e-06, "loss": 0.1366, "num_input_tokens_seen": 30686608, "step": 50385 }, { "epoch": 15.634502016754576, "grad_norm": 16.04666519165039, "learning_rate": 1.3826379976936099e-06, "loss": 0.2688, "num_input_tokens_seen": 30689552, "step": 50390 }, { "epoch": 15.636053366428794, "grad_norm": 16.63875961303711, "learning_rate": 1.3817035244871546e-06, "loss": 0.1937, "num_input_tokens_seen": 30692208, "step": 50395 }, { "epoch": 15.63760471610301, "grad_norm": 20.273273468017578, "learning_rate": 1.3807693165446761e-06, "loss": 0.1929, "num_input_tokens_seen": 30696560, "step": 50400 }, { "epoch": 15.639156065777225, "grad_norm": 6.376904487609863, "learning_rate": 1.3798353739346665e-06, "loss": 0.1257, "num_input_tokens_seen": 30700528, "step": 50405 }, { "epoch": 15.640707415451443, "grad_norm": 11.35950756072998, "learning_rate": 1.378901696725592e-06, "loss": 0.1871, "num_input_tokens_seen": 30703600, "step": 50410 }, { "epoch": 15.642258765125659, "grad_norm": 15.499277114868164, "learning_rate": 1.3779682849859043e-06, "loss": 0.1876, "num_input_tokens_seen": 30706000, "step": 50415 }, { "epoch": 15.643810114799876, "grad_norm": 7.550565242767334, "learning_rate": 1.3770351387840314e-06, "loss": 0.1636, "num_input_tokens_seen": 30708816, "step": 50420 }, { "epoch": 15.645361464474092, "grad_norm": 17.535131454467773, "learning_rate": 1.3761022581883848e-06, "loss": 0.1803, "num_input_tokens_seen": 30711696, "step": 50425 }, { "epoch": 15.646912814148308, "grad_norm": 6.967916965484619, "learning_rate": 1.3751696432673578e-06, "loss": 0.2628, "num_input_tokens_seen": 30715664, "step": 50430 }, { "epoch": 15.648464163822526, "grad_norm": 5.838742733001709, "learning_rate": 1.3742372940893189e-06, "loss": 0.1098, "num_input_tokens_seen": 30719184, "step": 50435 }, { "epoch": 15.650015513496742, "grad_norm": 9.225179672241211, "learning_rate": 1.3733052107226236e-06, "loss": 0.2062, "num_input_tokens_seen": 30721456, "step": 50440 }, { "epoch": 15.65156686317096, "grad_norm": 10.057290077209473, "learning_rate": 1.3723733932356009e-06, "loss": 0.1239, "num_input_tokens_seen": 30724976, "step": 50445 }, { "epoch": 15.653118212845175, "grad_norm": 11.273436546325684, "learning_rate": 1.3714418416965675e-06, "loss": 0.2767, "num_input_tokens_seen": 30727856, "step": 50450 }, { "epoch": 15.654669562519391, "grad_norm": 13.737223625183105, "learning_rate": 1.3705105561738141e-06, "loss": 0.1649, "num_input_tokens_seen": 30730128, "step": 50455 }, { "epoch": 15.656220912193609, "grad_norm": 11.4681978225708, "learning_rate": 1.3695795367356185e-06, "loss": 0.1779, "num_input_tokens_seen": 30732240, "step": 50460 }, { "epoch": 15.657772261867825, "grad_norm": 8.885154724121094, "learning_rate": 1.3686487834502316e-06, "loss": 0.2252, "num_input_tokens_seen": 30735664, "step": 50465 }, { "epoch": 15.659323611542042, "grad_norm": 17.225032806396484, "learning_rate": 1.367718296385892e-06, "loss": 0.2471, "num_input_tokens_seen": 30738736, "step": 50470 }, { "epoch": 15.660874961216258, "grad_norm": 22.21307945251465, "learning_rate": 1.3667880756108126e-06, "loss": 0.2281, "num_input_tokens_seen": 30741520, "step": 50475 }, { "epoch": 15.662426310890474, "grad_norm": 28.719085693359375, "learning_rate": 1.365858121193192e-06, "loss": 0.1919, "num_input_tokens_seen": 30746000, "step": 50480 }, { "epoch": 15.663977660564692, "grad_norm": 13.000232696533203, "learning_rate": 1.3649284332012048e-06, "loss": 0.2181, "num_input_tokens_seen": 30748784, "step": 50485 }, { "epoch": 15.665529010238908, "grad_norm": 6.230853080749512, "learning_rate": 1.3639990117030089e-06, "loss": 0.193, "num_input_tokens_seen": 30752368, "step": 50490 }, { "epoch": 15.667080359913124, "grad_norm": 7.374227046966553, "learning_rate": 1.3630698567667434e-06, "loss": 0.2073, "num_input_tokens_seen": 30755632, "step": 50495 }, { "epoch": 15.668631709587341, "grad_norm": 3.5978071689605713, "learning_rate": 1.3621409684605242e-06, "loss": 0.1361, "num_input_tokens_seen": 30758512, "step": 50500 }, { "epoch": 15.670183059261557, "grad_norm": 16.793567657470703, "learning_rate": 1.361212346852452e-06, "loss": 0.2272, "num_input_tokens_seen": 30761328, "step": 50505 }, { "epoch": 15.671734408935775, "grad_norm": 5.244437217712402, "learning_rate": 1.3602839920106026e-06, "loss": 0.1354, "num_input_tokens_seen": 30763632, "step": 50510 }, { "epoch": 15.67328575860999, "grad_norm": 9.786576271057129, "learning_rate": 1.3593559040030391e-06, "loss": 0.1669, "num_input_tokens_seen": 30767952, "step": 50515 }, { "epoch": 15.674837108284207, "grad_norm": 8.117924690246582, "learning_rate": 1.3584280828977975e-06, "loss": 0.1896, "num_input_tokens_seen": 30770448, "step": 50520 }, { "epoch": 15.676388457958424, "grad_norm": 12.582221984863281, "learning_rate": 1.3575005287629022e-06, "loss": 0.1556, "num_input_tokens_seen": 30772464, "step": 50525 }, { "epoch": 15.67793980763264, "grad_norm": 13.395674705505371, "learning_rate": 1.3565732416663501e-06, "loss": 0.168, "num_input_tokens_seen": 30774864, "step": 50530 }, { "epoch": 15.679491157306856, "grad_norm": 10.466094017028809, "learning_rate": 1.3556462216761257e-06, "loss": 0.1659, "num_input_tokens_seen": 30777456, "step": 50535 }, { "epoch": 15.681042506981074, "grad_norm": 13.236490249633789, "learning_rate": 1.3547194688601855e-06, "loss": 0.1539, "num_input_tokens_seen": 30781264, "step": 50540 }, { "epoch": 15.68259385665529, "grad_norm": 13.884682655334473, "learning_rate": 1.3537929832864777e-06, "loss": 0.2089, "num_input_tokens_seen": 30784656, "step": 50545 }, { "epoch": 15.684145206329507, "grad_norm": 14.235245704650879, "learning_rate": 1.3528667650229205e-06, "loss": 0.1981, "num_input_tokens_seen": 30786992, "step": 50550 }, { "epoch": 15.685696556003723, "grad_norm": 10.328995704650879, "learning_rate": 1.3519408141374196e-06, "loss": 0.2146, "num_input_tokens_seen": 30790064, "step": 50555 }, { "epoch": 15.687247905677939, "grad_norm": 9.819690704345703, "learning_rate": 1.3510151306978547e-06, "loss": 0.1679, "num_input_tokens_seen": 30793360, "step": 50560 }, { "epoch": 15.688799255352157, "grad_norm": 9.86176586151123, "learning_rate": 1.3500897147720931e-06, "loss": 0.2024, "num_input_tokens_seen": 30796944, "step": 50565 }, { "epoch": 15.690350605026373, "grad_norm": 11.584815979003906, "learning_rate": 1.3491645664279752e-06, "loss": 0.2196, "num_input_tokens_seen": 30801008, "step": 50570 }, { "epoch": 15.69190195470059, "grad_norm": 12.603604316711426, "learning_rate": 1.348239685733327e-06, "loss": 0.1372, "num_input_tokens_seen": 30804048, "step": 50575 }, { "epoch": 15.693453304374806, "grad_norm": 4.706139087677002, "learning_rate": 1.3473150727559543e-06, "loss": 0.2081, "num_input_tokens_seen": 30806768, "step": 50580 }, { "epoch": 15.695004654049022, "grad_norm": 10.2958345413208, "learning_rate": 1.3463907275636395e-06, "loss": 0.1586, "num_input_tokens_seen": 30809776, "step": 50585 }, { "epoch": 15.69655600372324, "grad_norm": 14.047338485717773, "learning_rate": 1.3454666502241514e-06, "loss": 0.2477, "num_input_tokens_seen": 30813424, "step": 50590 }, { "epoch": 15.698107353397456, "grad_norm": 7.773207187652588, "learning_rate": 1.3445428408052325e-06, "loss": 0.1862, "num_input_tokens_seen": 30815920, "step": 50595 }, { "epoch": 15.699658703071673, "grad_norm": 9.327605247497559, "learning_rate": 1.3436192993746117e-06, "loss": 0.1778, "num_input_tokens_seen": 30820016, "step": 50600 }, { "epoch": 15.701210052745889, "grad_norm": 12.720491409301758, "learning_rate": 1.3426960259999927e-06, "loss": 0.2441, "num_input_tokens_seen": 30824496, "step": 50605 }, { "epoch": 15.702761402420105, "grad_norm": 17.682710647583008, "learning_rate": 1.341773020749066e-06, "loss": 0.2179, "num_input_tokens_seen": 30827024, "step": 50610 }, { "epoch": 15.704312752094323, "grad_norm": 13.665332794189453, "learning_rate": 1.3408502836894943e-06, "loss": 0.1694, "num_input_tokens_seen": 30830416, "step": 50615 }, { "epoch": 15.705864101768539, "grad_norm": 16.201753616333008, "learning_rate": 1.33992781488893e-06, "loss": 0.1914, "num_input_tokens_seen": 30833648, "step": 50620 }, { "epoch": 15.707415451442754, "grad_norm": 15.278275489807129, "learning_rate": 1.3390056144149966e-06, "loss": 0.2425, "num_input_tokens_seen": 30836880, "step": 50625 }, { "epoch": 15.708966801116972, "grad_norm": 5.065091133117676, "learning_rate": 1.338083682335305e-06, "loss": 0.1762, "num_input_tokens_seen": 30840112, "step": 50630 }, { "epoch": 15.710518150791188, "grad_norm": 10.62905216217041, "learning_rate": 1.3371620187174438e-06, "loss": 0.1692, "num_input_tokens_seen": 30843600, "step": 50635 }, { "epoch": 15.712069500465406, "grad_norm": 10.2205228805542, "learning_rate": 1.3362406236289799e-06, "loss": 0.1805, "num_input_tokens_seen": 30847024, "step": 50640 }, { "epoch": 15.713620850139622, "grad_norm": 21.187137603759766, "learning_rate": 1.3353194971374657e-06, "loss": 0.2717, "num_input_tokens_seen": 30849872, "step": 50645 }, { "epoch": 15.715172199813837, "grad_norm": 6.412413597106934, "learning_rate": 1.3343986393104263e-06, "loss": 0.1762, "num_input_tokens_seen": 30853808, "step": 50650 }, { "epoch": 15.716723549488055, "grad_norm": 19.837345123291016, "learning_rate": 1.333478050215376e-06, "loss": 0.2118, "num_input_tokens_seen": 30856752, "step": 50655 }, { "epoch": 15.718274899162271, "grad_norm": 10.132004737854004, "learning_rate": 1.3325577299198005e-06, "loss": 0.1511, "num_input_tokens_seen": 30859632, "step": 50660 }, { "epoch": 15.719826248836487, "grad_norm": 1.2043125629425049, "learning_rate": 1.3316376784911745e-06, "loss": 0.2114, "num_input_tokens_seen": 30862896, "step": 50665 }, { "epoch": 15.721377598510704, "grad_norm": 14.13935661315918, "learning_rate": 1.3307178959969453e-06, "loss": 0.1651, "num_input_tokens_seen": 30865136, "step": 50670 }, { "epoch": 15.72292894818492, "grad_norm": 6.4402031898498535, "learning_rate": 1.3297983825045462e-06, "loss": 0.2231, "num_input_tokens_seen": 30868560, "step": 50675 }, { "epoch": 15.724480297859138, "grad_norm": 12.490310668945312, "learning_rate": 1.328879138081386e-06, "loss": 0.1252, "num_input_tokens_seen": 30871664, "step": 50680 }, { "epoch": 15.726031647533354, "grad_norm": 8.928667068481445, "learning_rate": 1.327960162794859e-06, "loss": 0.2016, "num_input_tokens_seen": 30874448, "step": 50685 }, { "epoch": 15.72758299720757, "grad_norm": 7.462630271911621, "learning_rate": 1.3270414567123342e-06, "loss": 0.207, "num_input_tokens_seen": 30877136, "step": 50690 }, { "epoch": 15.729134346881787, "grad_norm": 4.455593109130859, "learning_rate": 1.3261230199011643e-06, "loss": 0.1857, "num_input_tokens_seen": 30880272, "step": 50695 }, { "epoch": 15.730685696556003, "grad_norm": 8.90855598449707, "learning_rate": 1.3252048524286843e-06, "loss": 0.1233, "num_input_tokens_seen": 30883344, "step": 50700 }, { "epoch": 15.732237046230221, "grad_norm": 22.50623893737793, "learning_rate": 1.3242869543622034e-06, "loss": 0.288, "num_input_tokens_seen": 30886000, "step": 50705 }, { "epoch": 15.733788395904437, "grad_norm": 4.430984020233154, "learning_rate": 1.3233693257690167e-06, "loss": 0.2062, "num_input_tokens_seen": 30889072, "step": 50710 }, { "epoch": 15.735339745578653, "grad_norm": 8.266517639160156, "learning_rate": 1.322451966716395e-06, "loss": 0.1545, "num_input_tokens_seen": 30893648, "step": 50715 }, { "epoch": 15.73689109525287, "grad_norm": 10.315122604370117, "learning_rate": 1.321534877271594e-06, "loss": 0.1998, "num_input_tokens_seen": 30897648, "step": 50720 }, { "epoch": 15.738442444927086, "grad_norm": 17.148941040039062, "learning_rate": 1.320618057501845e-06, "loss": 0.1759, "num_input_tokens_seen": 30900752, "step": 50725 }, { "epoch": 15.739993794601304, "grad_norm": 15.601459503173828, "learning_rate": 1.3197015074743642e-06, "loss": 0.1835, "num_input_tokens_seen": 30903408, "step": 50730 }, { "epoch": 15.74154514427552, "grad_norm": 15.145771026611328, "learning_rate": 1.3187852272563427e-06, "loss": 0.1711, "num_input_tokens_seen": 30906416, "step": 50735 }, { "epoch": 15.743096493949736, "grad_norm": 6.745412826538086, "learning_rate": 1.3178692169149576e-06, "loss": 0.1105, "num_input_tokens_seen": 30909200, "step": 50740 }, { "epoch": 15.744647843623953, "grad_norm": 4.323965072631836, "learning_rate": 1.3169534765173604e-06, "loss": 0.164, "num_input_tokens_seen": 30912688, "step": 50745 }, { "epoch": 15.74619919329817, "grad_norm": 5.550434589385986, "learning_rate": 1.316038006130687e-06, "loss": 0.2118, "num_input_tokens_seen": 30915504, "step": 50750 }, { "epoch": 15.747750542972385, "grad_norm": 7.272108554840088, "learning_rate": 1.3151228058220539e-06, "loss": 0.2091, "num_input_tokens_seen": 30917872, "step": 50755 }, { "epoch": 15.749301892646603, "grad_norm": 14.533554077148438, "learning_rate": 1.314207875658553e-06, "loss": 0.1671, "num_input_tokens_seen": 30920624, "step": 50760 }, { "epoch": 15.750853242320819, "grad_norm": 10.910465240478516, "learning_rate": 1.3132932157072626e-06, "loss": 0.1817, "num_input_tokens_seen": 30922896, "step": 50765 }, { "epoch": 15.752404591995036, "grad_norm": 6.722623348236084, "learning_rate": 1.3123788260352355e-06, "loss": 0.1563, "num_input_tokens_seen": 30926832, "step": 50770 }, { "epoch": 15.753955941669252, "grad_norm": 9.886101722717285, "learning_rate": 1.3114647067095099e-06, "loss": 0.2303, "num_input_tokens_seen": 30929744, "step": 50775 }, { "epoch": 15.755507291343468, "grad_norm": 7.593928813934326, "learning_rate": 1.3105508577970982e-06, "loss": 0.1782, "num_input_tokens_seen": 30932304, "step": 50780 }, { "epoch": 15.757058641017686, "grad_norm": 17.769508361816406, "learning_rate": 1.3096372793649998e-06, "loss": 0.2386, "num_input_tokens_seen": 30934992, "step": 50785 }, { "epoch": 15.758609990691902, "grad_norm": 6.678618431091309, "learning_rate": 1.308723971480188e-06, "loss": 0.2551, "num_input_tokens_seen": 30938064, "step": 50790 }, { "epoch": 15.760161340366118, "grad_norm": 8.16019058227539, "learning_rate": 1.3078109342096219e-06, "loss": 0.2172, "num_input_tokens_seen": 30940272, "step": 50795 }, { "epoch": 15.761712690040335, "grad_norm": 9.29994010925293, "learning_rate": 1.3068981676202347e-06, "loss": 0.1572, "num_input_tokens_seen": 30942704, "step": 50800 }, { "epoch": 15.763264039714551, "grad_norm": 14.528179168701172, "learning_rate": 1.305985671778946e-06, "loss": 0.261, "num_input_tokens_seen": 30945776, "step": 50805 }, { "epoch": 15.764815389388769, "grad_norm": 22.441614151000977, "learning_rate": 1.30507344675265e-06, "loss": 0.2081, "num_input_tokens_seen": 30949584, "step": 50810 }, { "epoch": 15.766366739062985, "grad_norm": 17.552352905273438, "learning_rate": 1.3041614926082247e-06, "loss": 0.1941, "num_input_tokens_seen": 30952272, "step": 50815 }, { "epoch": 15.7679180887372, "grad_norm": 11.084322929382324, "learning_rate": 1.303249809412529e-06, "loss": 0.1719, "num_input_tokens_seen": 30954704, "step": 50820 }, { "epoch": 15.769469438411418, "grad_norm": 11.420726776123047, "learning_rate": 1.3023383972323967e-06, "loss": 0.1343, "num_input_tokens_seen": 30957488, "step": 50825 }, { "epoch": 15.771020788085634, "grad_norm": 10.25281047821045, "learning_rate": 1.301427256134648e-06, "loss": 0.2147, "num_input_tokens_seen": 30960848, "step": 50830 }, { "epoch": 15.772572137759852, "grad_norm": 14.57943058013916, "learning_rate": 1.3005163861860781e-06, "loss": 0.1696, "num_input_tokens_seen": 30964176, "step": 50835 }, { "epoch": 15.774123487434068, "grad_norm": 13.90860366821289, "learning_rate": 1.2996057874534662e-06, "loss": 0.2592, "num_input_tokens_seen": 30966896, "step": 50840 }, { "epoch": 15.775674837108284, "grad_norm": 7.578100204467773, "learning_rate": 1.2986954600035683e-06, "loss": 0.1301, "num_input_tokens_seen": 30969136, "step": 50845 }, { "epoch": 15.777226186782501, "grad_norm": 22.673097610473633, "learning_rate": 1.2977854039031251e-06, "loss": 0.2179, "num_input_tokens_seen": 30972656, "step": 50850 }, { "epoch": 15.778777536456717, "grad_norm": 6.640272617340088, "learning_rate": 1.2968756192188509e-06, "loss": 0.1881, "num_input_tokens_seen": 30975760, "step": 50855 }, { "epoch": 15.780328886130935, "grad_norm": 33.98240280151367, "learning_rate": 1.2959661060174462e-06, "loss": 0.1593, "num_input_tokens_seen": 30980464, "step": 50860 }, { "epoch": 15.78188023580515, "grad_norm": 9.941519737243652, "learning_rate": 1.2950568643655876e-06, "loss": 0.2581, "num_input_tokens_seen": 30984528, "step": 50865 }, { "epoch": 15.783431585479367, "grad_norm": 4.888494491577148, "learning_rate": 1.2941478943299334e-06, "loss": 0.1972, "num_input_tokens_seen": 30986928, "step": 50870 }, { "epoch": 15.784982935153584, "grad_norm": 6.2335615158081055, "learning_rate": 1.293239195977124e-06, "loss": 0.2105, "num_input_tokens_seen": 30989520, "step": 50875 }, { "epoch": 15.7865342848278, "grad_norm": 20.610328674316406, "learning_rate": 1.2923307693737747e-06, "loss": 0.3011, "num_input_tokens_seen": 30992336, "step": 50880 }, { "epoch": 15.788085634502016, "grad_norm": 14.8938570022583, "learning_rate": 1.2914226145864866e-06, "loss": 0.2027, "num_input_tokens_seen": 30996016, "step": 50885 }, { "epoch": 15.789636984176234, "grad_norm": 18.12192153930664, "learning_rate": 1.2905147316818363e-06, "loss": 0.2145, "num_input_tokens_seen": 30998704, "step": 50890 }, { "epoch": 15.79118833385045, "grad_norm": 4.284637928009033, "learning_rate": 1.2896071207263838e-06, "loss": 0.1981, "num_input_tokens_seen": 31000848, "step": 50895 }, { "epoch": 15.792739683524667, "grad_norm": 21.157320022583008, "learning_rate": 1.2886997817866653e-06, "loss": 0.1881, "num_input_tokens_seen": 31004752, "step": 50900 }, { "epoch": 15.794291033198883, "grad_norm": 7.343675136566162, "learning_rate": 1.287792714929203e-06, "loss": 0.1762, "num_input_tokens_seen": 31007568, "step": 50905 }, { "epoch": 15.795842382873099, "grad_norm": 10.865788459777832, "learning_rate": 1.2868859202204925e-06, "loss": 0.1804, "num_input_tokens_seen": 31010064, "step": 50910 }, { "epoch": 15.797393732547317, "grad_norm": 11.736265182495117, "learning_rate": 1.2859793977270151e-06, "loss": 0.1724, "num_input_tokens_seen": 31013136, "step": 50915 }, { "epoch": 15.798945082221532, "grad_norm": 17.466598510742188, "learning_rate": 1.2850731475152274e-06, "loss": 0.2458, "num_input_tokens_seen": 31015952, "step": 50920 }, { "epoch": 15.800496431895748, "grad_norm": 4.110691547393799, "learning_rate": 1.28416716965157e-06, "loss": 0.2359, "num_input_tokens_seen": 31018384, "step": 50925 }, { "epoch": 15.802047781569966, "grad_norm": 6.247149467468262, "learning_rate": 1.2832614642024604e-06, "loss": 0.2288, "num_input_tokens_seen": 31021072, "step": 50930 }, { "epoch": 15.803599131244182, "grad_norm": 8.089757919311523, "learning_rate": 1.282356031234298e-06, "loss": 0.1593, "num_input_tokens_seen": 31024400, "step": 50935 }, { "epoch": 15.8051504809184, "grad_norm": 9.738499641418457, "learning_rate": 1.2814508708134631e-06, "loss": 0.1811, "num_input_tokens_seen": 31027024, "step": 50940 }, { "epoch": 15.806701830592615, "grad_norm": 10.865693092346191, "learning_rate": 1.2805459830063126e-06, "loss": 0.1508, "num_input_tokens_seen": 31030064, "step": 50945 }, { "epoch": 15.808253180266831, "grad_norm": 14.388339042663574, "learning_rate": 1.279641367879188e-06, "loss": 0.209, "num_input_tokens_seen": 31033264, "step": 50950 }, { "epoch": 15.809804529941049, "grad_norm": 4.882588863372803, "learning_rate": 1.2787370254984049e-06, "loss": 0.1451, "num_input_tokens_seen": 31035568, "step": 50955 }, { "epoch": 15.811355879615265, "grad_norm": 14.994144439697266, "learning_rate": 1.2778329559302654e-06, "loss": 0.1875, "num_input_tokens_seen": 31038160, "step": 50960 }, { "epoch": 15.812907229289483, "grad_norm": 11.278197288513184, "learning_rate": 1.2769291592410465e-06, "loss": 0.2539, "num_input_tokens_seen": 31040560, "step": 50965 }, { "epoch": 15.814458578963698, "grad_norm": 4.786709785461426, "learning_rate": 1.2760256354970091e-06, "loss": 0.2228, "num_input_tokens_seen": 31044048, "step": 50970 }, { "epoch": 15.816009928637914, "grad_norm": 12.960304260253906, "learning_rate": 1.2751223847643896e-06, "loss": 0.2201, "num_input_tokens_seen": 31046928, "step": 50975 }, { "epoch": 15.817561278312132, "grad_norm": 19.84006118774414, "learning_rate": 1.27421940710941e-06, "loss": 0.1674, "num_input_tokens_seen": 31051696, "step": 50980 }, { "epoch": 15.819112627986348, "grad_norm": 10.075423240661621, "learning_rate": 1.2733167025982663e-06, "loss": 0.1471, "num_input_tokens_seen": 31054608, "step": 50985 }, { "epoch": 15.820663977660566, "grad_norm": 10.260873794555664, "learning_rate": 1.2724142712971388e-06, "loss": 0.1858, "num_input_tokens_seen": 31058320, "step": 50990 }, { "epoch": 15.822215327334781, "grad_norm": 7.167118549346924, "learning_rate": 1.2715121132721881e-06, "loss": 0.1442, "num_input_tokens_seen": 31062288, "step": 50995 }, { "epoch": 15.823766677008997, "grad_norm": 14.023274421691895, "learning_rate": 1.2706102285895493e-06, "loss": 0.2431, "num_input_tokens_seen": 31065104, "step": 51000 }, { "epoch": 15.825318026683215, "grad_norm": 23.076141357421875, "learning_rate": 1.269708617315345e-06, "loss": 0.1843, "num_input_tokens_seen": 31068208, "step": 51005 }, { "epoch": 15.82686937635743, "grad_norm": 8.213700294494629, "learning_rate": 1.2688072795156714e-06, "loss": 0.1991, "num_input_tokens_seen": 31070864, "step": 51010 }, { "epoch": 15.828420726031647, "grad_norm": 8.746732711791992, "learning_rate": 1.267906215256609e-06, "loss": 0.135, "num_input_tokens_seen": 31074288, "step": 51015 }, { "epoch": 15.829972075705864, "grad_norm": 12.606372833251953, "learning_rate": 1.2670054246042145e-06, "loss": 0.1585, "num_input_tokens_seen": 31080528, "step": 51020 }, { "epoch": 15.83152342538008, "grad_norm": 10.113419532775879, "learning_rate": 1.266104907624529e-06, "loss": 0.2234, "num_input_tokens_seen": 31083056, "step": 51025 }, { "epoch": 15.833074775054298, "grad_norm": 11.578136444091797, "learning_rate": 1.265204664383568e-06, "loss": 0.1738, "num_input_tokens_seen": 31085776, "step": 51030 }, { "epoch": 15.834626124728514, "grad_norm": 25.608680725097656, "learning_rate": 1.2643046949473337e-06, "loss": 0.1687, "num_input_tokens_seen": 31089360, "step": 51035 }, { "epoch": 15.83617747440273, "grad_norm": 14.845134735107422, "learning_rate": 1.2634049993818004e-06, "loss": 0.1759, "num_input_tokens_seen": 31092048, "step": 51040 }, { "epoch": 15.837728824076947, "grad_norm": 10.284463882446289, "learning_rate": 1.2625055777529305e-06, "loss": 0.1654, "num_input_tokens_seen": 31094544, "step": 51045 }, { "epoch": 15.839280173751163, "grad_norm": 5.785654544830322, "learning_rate": 1.261606430126659e-06, "loss": 0.1648, "num_input_tokens_seen": 31097200, "step": 51050 }, { "epoch": 15.84083152342538, "grad_norm": 7.195449352264404, "learning_rate": 1.2607075565689047e-06, "loss": 0.1886, "num_input_tokens_seen": 31100112, "step": 51055 }, { "epoch": 15.842382873099597, "grad_norm": 7.764495849609375, "learning_rate": 1.2598089571455685e-06, "loss": 0.1692, "num_input_tokens_seen": 31102512, "step": 51060 }, { "epoch": 15.843934222773813, "grad_norm": 18.982181549072266, "learning_rate": 1.2589106319225242e-06, "loss": 0.2104, "num_input_tokens_seen": 31105552, "step": 51065 }, { "epoch": 15.84548557244803, "grad_norm": 12.339273452758789, "learning_rate": 1.2580125809656334e-06, "loss": 0.139, "num_input_tokens_seen": 31109680, "step": 51070 }, { "epoch": 15.847036922122246, "grad_norm": 8.279536247253418, "learning_rate": 1.2571148043407306e-06, "loss": 0.1116, "num_input_tokens_seen": 31113616, "step": 51075 }, { "epoch": 15.848588271796462, "grad_norm": 8.678544044494629, "learning_rate": 1.2562173021136371e-06, "loss": 0.2011, "num_input_tokens_seen": 31116720, "step": 51080 }, { "epoch": 15.85013962147068, "grad_norm": 8.19948673248291, "learning_rate": 1.255320074350146e-06, "loss": 0.116, "num_input_tokens_seen": 31120240, "step": 51085 }, { "epoch": 15.851690971144896, "grad_norm": 5.175789833068848, "learning_rate": 1.254423121116039e-06, "loss": 0.2124, "num_input_tokens_seen": 31123088, "step": 51090 }, { "epoch": 15.853242320819113, "grad_norm": 15.527016639709473, "learning_rate": 1.2535264424770699e-06, "loss": 0.1848, "num_input_tokens_seen": 31126416, "step": 51095 }, { "epoch": 15.85479367049333, "grad_norm": 19.256515502929688, "learning_rate": 1.2526300384989793e-06, "loss": 0.2202, "num_input_tokens_seen": 31129008, "step": 51100 }, { "epoch": 15.856345020167545, "grad_norm": 7.688034534454346, "learning_rate": 1.2517339092474807e-06, "loss": 0.1681, "num_input_tokens_seen": 31132240, "step": 51105 }, { "epoch": 15.857896369841763, "grad_norm": 9.774076461791992, "learning_rate": 1.2508380547882742e-06, "loss": 0.2655, "num_input_tokens_seen": 31134768, "step": 51110 }, { "epoch": 15.859447719515979, "grad_norm": 20.039993286132812, "learning_rate": 1.2499424751870316e-06, "loss": 0.2142, "num_input_tokens_seen": 31137488, "step": 51115 }, { "epoch": 15.860999069190196, "grad_norm": 8.519967079162598, "learning_rate": 1.2490471705094164e-06, "loss": 0.1872, "num_input_tokens_seen": 31140368, "step": 51120 }, { "epoch": 15.862550418864412, "grad_norm": 4.868825435638428, "learning_rate": 1.2481521408210595e-06, "loss": 0.205, "num_input_tokens_seen": 31142800, "step": 51125 }, { "epoch": 15.864101768538628, "grad_norm": 11.72671890258789, "learning_rate": 1.2472573861875815e-06, "loss": 0.2074, "num_input_tokens_seen": 31147120, "step": 51130 }, { "epoch": 15.865653118212846, "grad_norm": 23.471298217773438, "learning_rate": 1.2463629066745742e-06, "loss": 0.1819, "num_input_tokens_seen": 31150800, "step": 51135 }, { "epoch": 15.867204467887062, "grad_norm": 15.934649467468262, "learning_rate": 1.2454687023476153e-06, "loss": 0.1813, "num_input_tokens_seen": 31155312, "step": 51140 }, { "epoch": 15.868755817561278, "grad_norm": 9.504817962646484, "learning_rate": 1.244574773272263e-06, "loss": 0.1977, "num_input_tokens_seen": 31157872, "step": 51145 }, { "epoch": 15.870307167235495, "grad_norm": 11.547167778015137, "learning_rate": 1.2436811195140485e-06, "loss": 0.1354, "num_input_tokens_seen": 31160272, "step": 51150 }, { "epoch": 15.871858516909711, "grad_norm": 6.809048175811768, "learning_rate": 1.2427877411384915e-06, "loss": 0.1407, "num_input_tokens_seen": 31162800, "step": 51155 }, { "epoch": 15.873409866583929, "grad_norm": 8.260101318359375, "learning_rate": 1.241894638211083e-06, "loss": 0.1842, "num_input_tokens_seen": 31165936, "step": 51160 }, { "epoch": 15.874961216258145, "grad_norm": 3.6326797008514404, "learning_rate": 1.241001810797302e-06, "loss": 0.1366, "num_input_tokens_seen": 31168176, "step": 51165 }, { "epoch": 15.87651256593236, "grad_norm": 9.529352188110352, "learning_rate": 1.2401092589626001e-06, "loss": 0.2464, "num_input_tokens_seen": 31171888, "step": 51170 }, { "epoch": 15.878063915606578, "grad_norm": 4.088884353637695, "learning_rate": 1.239216982772415e-06, "loss": 0.2743, "num_input_tokens_seen": 31174960, "step": 51175 }, { "epoch": 15.879615265280794, "grad_norm": 18.186504364013672, "learning_rate": 1.2383249822921572e-06, "loss": 0.184, "num_input_tokens_seen": 31178608, "step": 51180 }, { "epoch": 15.88116661495501, "grad_norm": 27.973773956298828, "learning_rate": 1.237433257587225e-06, "loss": 0.1971, "num_input_tokens_seen": 31181264, "step": 51185 }, { "epoch": 15.882717964629228, "grad_norm": 19.235828399658203, "learning_rate": 1.2365418087229885e-06, "loss": 0.1568, "num_input_tokens_seen": 31184144, "step": 51190 }, { "epoch": 15.884269314303443, "grad_norm": 11.79239273071289, "learning_rate": 1.2356506357648058e-06, "loss": 0.144, "num_input_tokens_seen": 31186800, "step": 51195 }, { "epoch": 15.885820663977661, "grad_norm": 21.858842849731445, "learning_rate": 1.2347597387780058e-06, "loss": 0.2103, "num_input_tokens_seen": 31189872, "step": 51200 }, { "epoch": 15.887372013651877, "grad_norm": 9.050572395324707, "learning_rate": 1.2338691178279045e-06, "loss": 0.1599, "num_input_tokens_seen": 31192976, "step": 51205 }, { "epoch": 15.888923363326093, "grad_norm": 10.649767875671387, "learning_rate": 1.232978772979796e-06, "loss": 0.174, "num_input_tokens_seen": 31195280, "step": 51210 }, { "epoch": 15.89047471300031, "grad_norm": 5.512466907501221, "learning_rate": 1.2320887042989498e-06, "loss": 0.1734, "num_input_tokens_seen": 31198672, "step": 51215 }, { "epoch": 15.892026062674526, "grad_norm": 9.850401878356934, "learning_rate": 1.2311989118506224e-06, "loss": 0.2128, "num_input_tokens_seen": 31201648, "step": 51220 }, { "epoch": 15.893577412348744, "grad_norm": 8.293558120727539, "learning_rate": 1.2303093957000422e-06, "loss": 0.1746, "num_input_tokens_seen": 31204528, "step": 51225 }, { "epoch": 15.89512876202296, "grad_norm": 6.985992431640625, "learning_rate": 1.229420155912424e-06, "loss": 0.2306, "num_input_tokens_seen": 31208144, "step": 51230 }, { "epoch": 15.896680111697176, "grad_norm": 5.159328937530518, "learning_rate": 1.228531192552958e-06, "loss": 0.2497, "num_input_tokens_seen": 31211120, "step": 51235 }, { "epoch": 15.898231461371394, "grad_norm": 5.072651386260986, "learning_rate": 1.227642505686818e-06, "loss": 0.168, "num_input_tokens_seen": 31214320, "step": 51240 }, { "epoch": 15.89978281104561, "grad_norm": 24.992280960083008, "learning_rate": 1.2267540953791517e-06, "loss": 0.2098, "num_input_tokens_seen": 31217200, "step": 51245 }, { "epoch": 15.901334160719827, "grad_norm": 10.918668746948242, "learning_rate": 1.2258659616950942e-06, "loss": 0.1649, "num_input_tokens_seen": 31219760, "step": 51250 }, { "epoch": 15.902885510394043, "grad_norm": 7.782419681549072, "learning_rate": 1.2249781046997527e-06, "loss": 0.1562, "num_input_tokens_seen": 31222480, "step": 51255 }, { "epoch": 15.904436860068259, "grad_norm": 9.08116340637207, "learning_rate": 1.2240905244582186e-06, "loss": 0.2028, "num_input_tokens_seen": 31225392, "step": 51260 }, { "epoch": 15.905988209742477, "grad_norm": 5.299497604370117, "learning_rate": 1.2232032210355643e-06, "loss": 0.1943, "num_input_tokens_seen": 31227376, "step": 51265 }, { "epoch": 15.907539559416692, "grad_norm": 13.645376205444336, "learning_rate": 1.2223161944968359e-06, "loss": 0.1683, "num_input_tokens_seen": 31229840, "step": 51270 }, { "epoch": 15.909090909090908, "grad_norm": 10.198389053344727, "learning_rate": 1.2214294449070668e-06, "loss": 0.2251, "num_input_tokens_seen": 31232944, "step": 51275 }, { "epoch": 15.910642258765126, "grad_norm": 22.956186294555664, "learning_rate": 1.2205429723312628e-06, "loss": 0.2245, "num_input_tokens_seen": 31235120, "step": 51280 }, { "epoch": 15.912193608439342, "grad_norm": 10.096149444580078, "learning_rate": 1.2196567768344158e-06, "loss": 0.1697, "num_input_tokens_seen": 31238288, "step": 51285 }, { "epoch": 15.91374495811356, "grad_norm": 6.49105167388916, "learning_rate": 1.2187708584814917e-06, "loss": 0.1466, "num_input_tokens_seen": 31241232, "step": 51290 }, { "epoch": 15.915296307787775, "grad_norm": 4.664527416229248, "learning_rate": 1.2178852173374417e-06, "loss": 0.156, "num_input_tokens_seen": 31244208, "step": 51295 }, { "epoch": 15.916847657461991, "grad_norm": 7.972297668457031, "learning_rate": 1.2169998534671907e-06, "loss": 0.2231, "num_input_tokens_seen": 31247696, "step": 51300 }, { "epoch": 15.918399007136209, "grad_norm": 15.795745849609375, "learning_rate": 1.2161147669356488e-06, "loss": 0.2176, "num_input_tokens_seen": 31250352, "step": 51305 }, { "epoch": 15.919950356810425, "grad_norm": 6.450018405914307, "learning_rate": 1.2152299578077016e-06, "loss": 0.1789, "num_input_tokens_seen": 31253040, "step": 51310 }, { "epoch": 15.921501706484642, "grad_norm": 13.025314331054688, "learning_rate": 1.2143454261482178e-06, "loss": 0.2429, "num_input_tokens_seen": 31256208, "step": 51315 }, { "epoch": 15.923053056158858, "grad_norm": 20.748388290405273, "learning_rate": 1.2134611720220424e-06, "loss": 0.1979, "num_input_tokens_seen": 31259504, "step": 51320 }, { "epoch": 15.924604405833074, "grad_norm": 4.382246971130371, "learning_rate": 1.212577195494002e-06, "loss": 0.1983, "num_input_tokens_seen": 31264368, "step": 51325 }, { "epoch": 15.926155755507292, "grad_norm": 11.800980567932129, "learning_rate": 1.211693496628904e-06, "loss": 0.134, "num_input_tokens_seen": 31267856, "step": 51330 }, { "epoch": 15.927707105181508, "grad_norm": 7.818701267242432, "learning_rate": 1.2108100754915319e-06, "loss": 0.1655, "num_input_tokens_seen": 31270864, "step": 51335 }, { "epoch": 15.929258454855724, "grad_norm": 12.704591751098633, "learning_rate": 1.2099269321466533e-06, "loss": 0.1608, "num_input_tokens_seen": 31273328, "step": 51340 }, { "epoch": 15.930809804529941, "grad_norm": 19.796131134033203, "learning_rate": 1.2090440666590102e-06, "loss": 0.175, "num_input_tokens_seen": 31277008, "step": 51345 }, { "epoch": 15.932361154204157, "grad_norm": 11.060186386108398, "learning_rate": 1.2081614790933304e-06, "loss": 0.1457, "num_input_tokens_seen": 31281232, "step": 51350 }, { "epoch": 15.933912503878375, "grad_norm": 19.101205825805664, "learning_rate": 1.2072791695143148e-06, "loss": 0.2222, "num_input_tokens_seen": 31283856, "step": 51355 }, { "epoch": 15.93546385355259, "grad_norm": 4.3253397941589355, "learning_rate": 1.2063971379866501e-06, "loss": 0.1999, "num_input_tokens_seen": 31286800, "step": 51360 }, { "epoch": 15.937015203226807, "grad_norm": 15.023667335510254, "learning_rate": 1.2055153845749967e-06, "loss": 0.2035, "num_input_tokens_seen": 31290448, "step": 51365 }, { "epoch": 15.938566552901024, "grad_norm": 6.732093334197998, "learning_rate": 1.2046339093440001e-06, "loss": 0.1955, "num_input_tokens_seen": 31293232, "step": 51370 }, { "epoch": 15.94011790257524, "grad_norm": 20.529327392578125, "learning_rate": 1.203752712358281e-06, "loss": 0.2066, "num_input_tokens_seen": 31296016, "step": 51375 }, { "epoch": 15.941669252249458, "grad_norm": 11.7559232711792, "learning_rate": 1.202871793682442e-06, "loss": 0.164, "num_input_tokens_seen": 31298864, "step": 51380 }, { "epoch": 15.943220601923674, "grad_norm": 7.045316696166992, "learning_rate": 1.2019911533810662e-06, "loss": 0.1444, "num_input_tokens_seen": 31301648, "step": 51385 }, { "epoch": 15.94477195159789, "grad_norm": 27.4735050201416, "learning_rate": 1.2011107915187131e-06, "loss": 0.2723, "num_input_tokens_seen": 31304720, "step": 51390 }, { "epoch": 15.946323301272107, "grad_norm": 12.392524719238281, "learning_rate": 1.2002307081599264e-06, "loss": 0.1807, "num_input_tokens_seen": 31307152, "step": 51395 }, { "epoch": 15.947874650946323, "grad_norm": 12.095904350280762, "learning_rate": 1.199350903369223e-06, "loss": 0.1297, "num_input_tokens_seen": 31310288, "step": 51400 }, { "epoch": 15.949426000620539, "grad_norm": 20.6434383392334, "learning_rate": 1.198471377211106e-06, "loss": 0.2646, "num_input_tokens_seen": 31313424, "step": 51405 }, { "epoch": 15.950977350294757, "grad_norm": 6.932227611541748, "learning_rate": 1.1975921297500527e-06, "loss": 0.1707, "num_input_tokens_seen": 31317360, "step": 51410 }, { "epoch": 15.952528699968973, "grad_norm": 15.124361991882324, "learning_rate": 1.1967131610505246e-06, "loss": 0.1644, "num_input_tokens_seen": 31322288, "step": 51415 }, { "epoch": 15.95408004964319, "grad_norm": 19.983505249023438, "learning_rate": 1.1958344711769582e-06, "loss": 0.2142, "num_input_tokens_seen": 31324784, "step": 51420 }, { "epoch": 15.955631399317406, "grad_norm": 10.513725280761719, "learning_rate": 1.194956060193775e-06, "loss": 0.2168, "num_input_tokens_seen": 31329264, "step": 51425 }, { "epoch": 15.957182748991622, "grad_norm": 14.099644660949707, "learning_rate": 1.1940779281653691e-06, "loss": 0.1779, "num_input_tokens_seen": 31332528, "step": 51430 }, { "epoch": 15.95873409866584, "grad_norm": 14.255189895629883, "learning_rate": 1.1932000751561217e-06, "loss": 0.2604, "num_input_tokens_seen": 31335536, "step": 51435 }, { "epoch": 15.960285448340056, "grad_norm": 19.17215919494629, "learning_rate": 1.1923225012303864e-06, "loss": 0.1754, "num_input_tokens_seen": 31339184, "step": 51440 }, { "epoch": 15.961836798014273, "grad_norm": 8.330462455749512, "learning_rate": 1.1914452064525011e-06, "loss": 0.1509, "num_input_tokens_seen": 31343504, "step": 51445 }, { "epoch": 15.96338814768849, "grad_norm": 11.568755149841309, "learning_rate": 1.1905681908867845e-06, "loss": 0.1545, "num_input_tokens_seen": 31348016, "step": 51450 }, { "epoch": 15.964939497362705, "grad_norm": 13.06137466430664, "learning_rate": 1.1896914545975279e-06, "loss": 0.1806, "num_input_tokens_seen": 31351248, "step": 51455 }, { "epoch": 15.966490847036923, "grad_norm": 8.910050392150879, "learning_rate": 1.1888149976490098e-06, "loss": 0.1377, "num_input_tokens_seen": 31353808, "step": 51460 }, { "epoch": 15.968042196711139, "grad_norm": 11.11376667022705, "learning_rate": 1.1879388201054826e-06, "loss": 0.1729, "num_input_tokens_seen": 31357136, "step": 51465 }, { "epoch": 15.969593546385354, "grad_norm": 5.8781352043151855, "learning_rate": 1.1870629220311825e-06, "loss": 0.1357, "num_input_tokens_seen": 31360240, "step": 51470 }, { "epoch": 15.971144896059572, "grad_norm": 6.273642063140869, "learning_rate": 1.1861873034903204e-06, "loss": 0.1644, "num_input_tokens_seen": 31362864, "step": 51475 }, { "epoch": 15.972696245733788, "grad_norm": 4.139793395996094, "learning_rate": 1.1853119645470934e-06, "loss": 0.1693, "num_input_tokens_seen": 31364880, "step": 51480 }, { "epoch": 15.974247595408006, "grad_norm": 22.22797393798828, "learning_rate": 1.1844369052656702e-06, "loss": 0.1849, "num_input_tokens_seen": 31368112, "step": 51485 }, { "epoch": 15.975798945082222, "grad_norm": 26.879501342773438, "learning_rate": 1.1835621257102066e-06, "loss": 0.1215, "num_input_tokens_seen": 31371760, "step": 51490 }, { "epoch": 15.977350294756437, "grad_norm": 10.221174240112305, "learning_rate": 1.1826876259448305e-06, "loss": 0.1563, "num_input_tokens_seen": 31374800, "step": 51495 }, { "epoch": 15.978901644430655, "grad_norm": 9.04226303100586, "learning_rate": 1.1818134060336562e-06, "loss": 0.1652, "num_input_tokens_seen": 31377328, "step": 51500 }, { "epoch": 15.980452994104871, "grad_norm": 18.960031509399414, "learning_rate": 1.180939466040774e-06, "loss": 0.1958, "num_input_tokens_seen": 31382832, "step": 51505 }, { "epoch": 15.982004343779089, "grad_norm": 12.147732734680176, "learning_rate": 1.1800658060302522e-06, "loss": 0.1347, "num_input_tokens_seen": 31385744, "step": 51510 }, { "epoch": 15.983555693453305, "grad_norm": 6.612238883972168, "learning_rate": 1.179192426066143e-06, "loss": 0.1424, "num_input_tokens_seen": 31389488, "step": 51515 }, { "epoch": 15.98510704312752, "grad_norm": 13.808357238769531, "learning_rate": 1.1783193262124725e-06, "loss": 0.1993, "num_input_tokens_seen": 31391920, "step": 51520 }, { "epoch": 15.986658392801738, "grad_norm": 5.636212348937988, "learning_rate": 1.1774465065332524e-06, "loss": 0.151, "num_input_tokens_seen": 31395728, "step": 51525 }, { "epoch": 15.988209742475954, "grad_norm": 7.263254165649414, "learning_rate": 1.1765739670924676e-06, "loss": 0.1248, "num_input_tokens_seen": 31398736, "step": 51530 }, { "epoch": 15.98976109215017, "grad_norm": 9.619714736938477, "learning_rate": 1.175701707954089e-06, "loss": 0.2087, "num_input_tokens_seen": 31401648, "step": 51535 }, { "epoch": 15.991312441824387, "grad_norm": 16.102157592773438, "learning_rate": 1.17482972918206e-06, "loss": 0.1228, "num_input_tokens_seen": 31404496, "step": 51540 }, { "epoch": 15.992863791498603, "grad_norm": 12.687246322631836, "learning_rate": 1.1739580308403097e-06, "loss": 0.2245, "num_input_tokens_seen": 31407056, "step": 51545 }, { "epoch": 15.994415141172821, "grad_norm": 5.57530403137207, "learning_rate": 1.173086612992742e-06, "loss": 0.1846, "num_input_tokens_seen": 31409872, "step": 51550 }, { "epoch": 15.995966490847037, "grad_norm": 15.323599815368652, "learning_rate": 1.1722154757032445e-06, "loss": 0.1597, "num_input_tokens_seen": 31413456, "step": 51555 }, { "epoch": 15.997517840521253, "grad_norm": 8.054615020751953, "learning_rate": 1.1713446190356786e-06, "loss": 0.1669, "num_input_tokens_seen": 31415536, "step": 51560 }, { "epoch": 15.99906919019547, "grad_norm": 12.585521697998047, "learning_rate": 1.1704740430538902e-06, "loss": 0.2247, "num_input_tokens_seen": 31418192, "step": 51565 }, { "epoch": 16.0, "eval_loss": 0.31593939661979675, "eval_runtime": 34.6417, "eval_samples_per_second": 93.038, "eval_steps_per_second": 23.267, "num_input_tokens_seen": 31419552, "step": 51568 }, { "epoch": 16.000620539869686, "grad_norm": 18.042043685913086, "learning_rate": 1.1696037478217043e-06, "loss": 0.2029, "num_input_tokens_seen": 31420928, "step": 51570 }, { "epoch": 16.002171889543902, "grad_norm": 4.211586952209473, "learning_rate": 1.1687337334029209e-06, "loss": 0.1478, "num_input_tokens_seen": 31423456, "step": 51575 }, { "epoch": 16.003723239218118, "grad_norm": 6.23471736907959, "learning_rate": 1.1678639998613256e-06, "loss": 0.1385, "num_input_tokens_seen": 31426624, "step": 51580 }, { "epoch": 16.005274588892338, "grad_norm": 4.385097503662109, "learning_rate": 1.1669945472606763e-06, "loss": 0.17, "num_input_tokens_seen": 31429568, "step": 51585 }, { "epoch": 16.006825938566553, "grad_norm": 5.8952789306640625, "learning_rate": 1.1661253756647184e-06, "loss": 0.1668, "num_input_tokens_seen": 31432384, "step": 51590 }, { "epoch": 16.00837728824077, "grad_norm": 8.019463539123535, "learning_rate": 1.1652564851371684e-06, "loss": 0.1581, "num_input_tokens_seen": 31435040, "step": 51595 }, { "epoch": 16.009928637914985, "grad_norm": 17.91953468322754, "learning_rate": 1.1643878757417298e-06, "loss": 0.1355, "num_input_tokens_seen": 31438400, "step": 51600 }, { "epoch": 16.0114799875892, "grad_norm": 18.405553817749023, "learning_rate": 1.1635195475420785e-06, "loss": 0.1453, "num_input_tokens_seen": 31441504, "step": 51605 }, { "epoch": 16.01303133726342, "grad_norm": 12.066207885742188, "learning_rate": 1.1626515006018774e-06, "loss": 0.1772, "num_input_tokens_seen": 31444448, "step": 51610 }, { "epoch": 16.014582686937636, "grad_norm": 22.157835006713867, "learning_rate": 1.16178373498476e-06, "loss": 0.2517, "num_input_tokens_seen": 31448448, "step": 51615 }, { "epoch": 16.016134036611852, "grad_norm": 15.337839126586914, "learning_rate": 1.1609162507543464e-06, "loss": 0.168, "num_input_tokens_seen": 31450848, "step": 51620 }, { "epoch": 16.017685386286068, "grad_norm": 6.196359157562256, "learning_rate": 1.160049047974235e-06, "loss": 0.1516, "num_input_tokens_seen": 31453024, "step": 51625 }, { "epoch": 16.019236735960284, "grad_norm": 9.456260681152344, "learning_rate": 1.1591821267079984e-06, "loss": 0.1191, "num_input_tokens_seen": 31455520, "step": 51630 }, { "epoch": 16.020788085634504, "grad_norm": 14.214640617370605, "learning_rate": 1.1583154870191954e-06, "loss": 0.1713, "num_input_tokens_seen": 31460000, "step": 51635 }, { "epoch": 16.02233943530872, "grad_norm": 6.773873329162598, "learning_rate": 1.1574491289713586e-06, "loss": 0.174, "num_input_tokens_seen": 31462560, "step": 51640 }, { "epoch": 16.023890784982935, "grad_norm": 2.31325364112854, "learning_rate": 1.1565830526280042e-06, "loss": 0.1315, "num_input_tokens_seen": 31465792, "step": 51645 }, { "epoch": 16.02544213465715, "grad_norm": 4.708544731140137, "learning_rate": 1.1557172580526233e-06, "loss": 0.1868, "num_input_tokens_seen": 31470848, "step": 51650 }, { "epoch": 16.026993484331367, "grad_norm": 7.289852619171143, "learning_rate": 1.1548517453086926e-06, "loss": 0.2209, "num_input_tokens_seen": 31474624, "step": 51655 }, { "epoch": 16.028544834005586, "grad_norm": 5.3450188636779785, "learning_rate": 1.15398651445966e-06, "loss": 0.2117, "num_input_tokens_seen": 31477632, "step": 51660 }, { "epoch": 16.030096183679802, "grad_norm": 12.947032928466797, "learning_rate": 1.153121565568962e-06, "loss": 0.1277, "num_input_tokens_seen": 31480320, "step": 51665 }, { "epoch": 16.03164753335402, "grad_norm": 11.389811515808105, "learning_rate": 1.1522568987000049e-06, "loss": 0.1754, "num_input_tokens_seen": 31484160, "step": 51670 }, { "epoch": 16.033198883028234, "grad_norm": 32.01719665527344, "learning_rate": 1.151392513916183e-06, "loss": 0.1605, "num_input_tokens_seen": 31487072, "step": 51675 }, { "epoch": 16.03475023270245, "grad_norm": 27.862703323364258, "learning_rate": 1.1505284112808624e-06, "loss": 0.1846, "num_input_tokens_seen": 31489792, "step": 51680 }, { "epoch": 16.03630158237667, "grad_norm": 5.646848201751709, "learning_rate": 1.1496645908573946e-06, "loss": 0.1361, "num_input_tokens_seen": 31492864, "step": 51685 }, { "epoch": 16.037852932050885, "grad_norm": 18.625144958496094, "learning_rate": 1.1488010527091075e-06, "loss": 0.2158, "num_input_tokens_seen": 31495456, "step": 51690 }, { "epoch": 16.0394042817251, "grad_norm": 8.551365852355957, "learning_rate": 1.147937796899307e-06, "loss": 0.1511, "num_input_tokens_seen": 31499328, "step": 51695 }, { "epoch": 16.040955631399317, "grad_norm": 15.994803428649902, "learning_rate": 1.1470748234912821e-06, "loss": 0.1767, "num_input_tokens_seen": 31502112, "step": 51700 }, { "epoch": 16.042506981073533, "grad_norm": 10.523021697998047, "learning_rate": 1.1462121325482972e-06, "loss": 0.1104, "num_input_tokens_seen": 31505312, "step": 51705 }, { "epoch": 16.04405833074775, "grad_norm": 8.04956340789795, "learning_rate": 1.1453497241336004e-06, "loss": 0.142, "num_input_tokens_seen": 31507968, "step": 51710 }, { "epoch": 16.04560968042197, "grad_norm": 8.44491195678711, "learning_rate": 1.1444875983104131e-06, "loss": 0.1631, "num_input_tokens_seen": 31510880, "step": 51715 }, { "epoch": 16.047161030096184, "grad_norm": 16.339405059814453, "learning_rate": 1.1436257551419427e-06, "loss": 0.2581, "num_input_tokens_seen": 31513312, "step": 51720 }, { "epoch": 16.0487123797704, "grad_norm": 8.507718086242676, "learning_rate": 1.1427641946913693e-06, "loss": 0.192, "num_input_tokens_seen": 31516512, "step": 51725 }, { "epoch": 16.050263729444616, "grad_norm": 12.111017227172852, "learning_rate": 1.1419029170218582e-06, "loss": 0.1006, "num_input_tokens_seen": 31519168, "step": 51730 }, { "epoch": 16.051815079118832, "grad_norm": 3.260624885559082, "learning_rate": 1.1410419221965485e-06, "loss": 0.1281, "num_input_tokens_seen": 31523104, "step": 51735 }, { "epoch": 16.05336642879305, "grad_norm": 5.446031093597412, "learning_rate": 1.1401812102785643e-06, "loss": 0.2229, "num_input_tokens_seen": 31527264, "step": 51740 }, { "epoch": 16.054917778467267, "grad_norm": 6.475614070892334, "learning_rate": 1.139320781331003e-06, "loss": 0.1543, "num_input_tokens_seen": 31529472, "step": 51745 }, { "epoch": 16.056469128141483, "grad_norm": 9.45405101776123, "learning_rate": 1.1384606354169475e-06, "loss": 0.1897, "num_input_tokens_seen": 31533056, "step": 51750 }, { "epoch": 16.0580204778157, "grad_norm": 2.7415456771850586, "learning_rate": 1.137600772599453e-06, "loss": 0.2158, "num_input_tokens_seen": 31535584, "step": 51755 }, { "epoch": 16.059571827489915, "grad_norm": 12.378911972045898, "learning_rate": 1.1367411929415612e-06, "loss": 0.1769, "num_input_tokens_seen": 31537856, "step": 51760 }, { "epoch": 16.061123177164134, "grad_norm": 9.205982208251953, "learning_rate": 1.1358818965062862e-06, "loss": 0.203, "num_input_tokens_seen": 31541152, "step": 51765 }, { "epoch": 16.06267452683835, "grad_norm": 13.922109603881836, "learning_rate": 1.1350228833566263e-06, "loss": 0.2885, "num_input_tokens_seen": 31545056, "step": 51770 }, { "epoch": 16.064225876512566, "grad_norm": 5.84218692779541, "learning_rate": 1.1341641535555587e-06, "loss": 0.0873, "num_input_tokens_seen": 31548384, "step": 51775 }, { "epoch": 16.065777226186782, "grad_norm": 4.795211315155029, "learning_rate": 1.1333057071660353e-06, "loss": 0.1356, "num_input_tokens_seen": 31551008, "step": 51780 }, { "epoch": 16.067328575860998, "grad_norm": 6.928465366363525, "learning_rate": 1.1324475442509935e-06, "loss": 0.1259, "num_input_tokens_seen": 31554496, "step": 51785 }, { "epoch": 16.068879925535217, "grad_norm": 7.848719596862793, "learning_rate": 1.1315896648733444e-06, "loss": 0.1716, "num_input_tokens_seen": 31558880, "step": 51790 }, { "epoch": 16.070431275209433, "grad_norm": 14.831829071044922, "learning_rate": 1.1307320690959828e-06, "loss": 0.1671, "num_input_tokens_seen": 31562912, "step": 51795 }, { "epoch": 16.07198262488365, "grad_norm": 14.931565284729004, "learning_rate": 1.129874756981778e-06, "loss": 0.1347, "num_input_tokens_seen": 31567296, "step": 51800 }, { "epoch": 16.073533974557865, "grad_norm": 8.994217872619629, "learning_rate": 1.1290177285935844e-06, "loss": 0.1326, "num_input_tokens_seen": 31570272, "step": 51805 }, { "epoch": 16.07508532423208, "grad_norm": 26.681591033935547, "learning_rate": 1.1281609839942291e-06, "loss": 0.1273, "num_input_tokens_seen": 31572992, "step": 51810 }, { "epoch": 16.0766366739063, "grad_norm": 14.506619453430176, "learning_rate": 1.127304523246524e-06, "loss": 0.1601, "num_input_tokens_seen": 31575552, "step": 51815 }, { "epoch": 16.078188023580516, "grad_norm": 8.48285961151123, "learning_rate": 1.1264483464132563e-06, "loss": 0.1334, "num_input_tokens_seen": 31578464, "step": 51820 }, { "epoch": 16.079739373254732, "grad_norm": 13.662510871887207, "learning_rate": 1.1255924535571939e-06, "loss": 0.1478, "num_input_tokens_seen": 31580480, "step": 51825 }, { "epoch": 16.081290722928948, "grad_norm": 8.072333335876465, "learning_rate": 1.124736844741086e-06, "loss": 0.1597, "num_input_tokens_seen": 31583072, "step": 51830 }, { "epoch": 16.082842072603164, "grad_norm": 9.947007179260254, "learning_rate": 1.1238815200276565e-06, "loss": 0.166, "num_input_tokens_seen": 31585280, "step": 51835 }, { "epoch": 16.08439342227738, "grad_norm": 12.084548950195312, "learning_rate": 1.1230264794796126e-06, "loss": 0.1323, "num_input_tokens_seen": 31587712, "step": 51840 }, { "epoch": 16.0859447719516, "grad_norm": 14.779526710510254, "learning_rate": 1.1221717231596368e-06, "loss": 0.1435, "num_input_tokens_seen": 31590080, "step": 51845 }, { "epoch": 16.087496121625815, "grad_norm": 7.782783508300781, "learning_rate": 1.1213172511303954e-06, "loss": 0.1459, "num_input_tokens_seen": 31592928, "step": 51850 }, { "epoch": 16.08904747130003, "grad_norm": 11.096383094787598, "learning_rate": 1.1204630634545283e-06, "loss": 0.1867, "num_input_tokens_seen": 31595520, "step": 51855 }, { "epoch": 16.090598820974247, "grad_norm": 3.8567028045654297, "learning_rate": 1.1196091601946607e-06, "loss": 0.1029, "num_input_tokens_seen": 31599872, "step": 51860 }, { "epoch": 16.092150170648463, "grad_norm": 13.55209732055664, "learning_rate": 1.1187555414133916e-06, "loss": 0.2309, "num_input_tokens_seen": 31602848, "step": 51865 }, { "epoch": 16.093701520322682, "grad_norm": 22.720184326171875, "learning_rate": 1.1179022071733025e-06, "loss": 0.1707, "num_input_tokens_seen": 31605152, "step": 51870 }, { "epoch": 16.095252869996898, "grad_norm": 5.48469877243042, "learning_rate": 1.117049157536952e-06, "loss": 0.2129, "num_input_tokens_seen": 31608000, "step": 51875 }, { "epoch": 16.096804219671114, "grad_norm": 21.253698348999023, "learning_rate": 1.11619639256688e-06, "loss": 0.1485, "num_input_tokens_seen": 31611072, "step": 51880 }, { "epoch": 16.09835556934533, "grad_norm": 4.235057830810547, "learning_rate": 1.1153439123256026e-06, "loss": 0.1565, "num_input_tokens_seen": 31614016, "step": 51885 }, { "epoch": 16.099906919019546, "grad_norm": 13.204566955566406, "learning_rate": 1.1144917168756175e-06, "loss": 0.1641, "num_input_tokens_seen": 31617600, "step": 51890 }, { "epoch": 16.101458268693765, "grad_norm": 20.873149871826172, "learning_rate": 1.1136398062794023e-06, "loss": 0.1754, "num_input_tokens_seen": 31620160, "step": 51895 }, { "epoch": 16.10300961836798, "grad_norm": 10.561491966247559, "learning_rate": 1.1127881805994095e-06, "loss": 0.2013, "num_input_tokens_seen": 31624672, "step": 51900 }, { "epoch": 16.104560968042197, "grad_norm": 6.343047142028809, "learning_rate": 1.1119368398980756e-06, "loss": 0.2513, "num_input_tokens_seen": 31627936, "step": 51905 }, { "epoch": 16.106112317716413, "grad_norm": 9.041720390319824, "learning_rate": 1.1110857842378114e-06, "loss": 0.1151, "num_input_tokens_seen": 31630752, "step": 51910 }, { "epoch": 16.10766366739063, "grad_norm": 19.957077026367188, "learning_rate": 1.1102350136810124e-06, "loss": 0.2391, "num_input_tokens_seen": 31633664, "step": 51915 }, { "epoch": 16.109215017064848, "grad_norm": 15.616147994995117, "learning_rate": 1.1093845282900473e-06, "loss": 0.1672, "num_input_tokens_seen": 31637312, "step": 51920 }, { "epoch": 16.110766366739064, "grad_norm": 13.85974407196045, "learning_rate": 1.1085343281272697e-06, "loss": 0.1677, "num_input_tokens_seen": 31639840, "step": 51925 }, { "epoch": 16.11231771641328, "grad_norm": 20.384601593017578, "learning_rate": 1.1076844132550057e-06, "loss": 0.1623, "num_input_tokens_seen": 31642656, "step": 51930 }, { "epoch": 16.113869066087496, "grad_norm": 15.032186508178711, "learning_rate": 1.106834783735568e-06, "loss": 0.1846, "num_input_tokens_seen": 31646752, "step": 51935 }, { "epoch": 16.11542041576171, "grad_norm": 4.935771942138672, "learning_rate": 1.1059854396312408e-06, "loss": 0.056, "num_input_tokens_seen": 31652000, "step": 51940 }, { "epoch": 16.11697176543593, "grad_norm": 17.486583709716797, "learning_rate": 1.1051363810042931e-06, "loss": 0.186, "num_input_tokens_seen": 31654720, "step": 51945 }, { "epoch": 16.118523115110147, "grad_norm": 8.83619499206543, "learning_rate": 1.1042876079169722e-06, "loss": 0.2489, "num_input_tokens_seen": 31657184, "step": 51950 }, { "epoch": 16.120074464784363, "grad_norm": 6.594125270843506, "learning_rate": 1.1034391204315003e-06, "loss": 0.1217, "num_input_tokens_seen": 31659872, "step": 51955 }, { "epoch": 16.12162581445858, "grad_norm": 12.641311645507812, "learning_rate": 1.1025909186100847e-06, "loss": 0.1491, "num_input_tokens_seen": 31663040, "step": 51960 }, { "epoch": 16.123177164132795, "grad_norm": 12.72672176361084, "learning_rate": 1.1017430025149057e-06, "loss": 0.0943, "num_input_tokens_seen": 31665824, "step": 51965 }, { "epoch": 16.12472851380701, "grad_norm": 31.321626663208008, "learning_rate": 1.1008953722081277e-06, "loss": 0.2091, "num_input_tokens_seen": 31671520, "step": 51970 }, { "epoch": 16.12627986348123, "grad_norm": 5.621146202087402, "learning_rate": 1.1000480277518905e-06, "loss": 0.204, "num_input_tokens_seen": 31674144, "step": 51975 }, { "epoch": 16.127831213155446, "grad_norm": 12.001412391662598, "learning_rate": 1.0992009692083161e-06, "loss": 0.1869, "num_input_tokens_seen": 31676992, "step": 51980 }, { "epoch": 16.12938256282966, "grad_norm": 8.109930038452148, "learning_rate": 1.0983541966395016e-06, "loss": 0.1426, "num_input_tokens_seen": 31679712, "step": 51985 }, { "epoch": 16.130933912503878, "grad_norm": 10.860960960388184, "learning_rate": 1.0975077101075288e-06, "loss": 0.2064, "num_input_tokens_seen": 31684000, "step": 51990 }, { "epoch": 16.132485262178093, "grad_norm": 8.4802885055542, "learning_rate": 1.0966615096744515e-06, "loss": 0.1119, "num_input_tokens_seen": 31687168, "step": 51995 }, { "epoch": 16.134036611852313, "grad_norm": 6.947931289672852, "learning_rate": 1.0958155954023092e-06, "loss": 0.1346, "num_input_tokens_seen": 31689824, "step": 52000 }, { "epoch": 16.13558796152653, "grad_norm": 10.002413749694824, "learning_rate": 1.094969967353115e-06, "loss": 0.2123, "num_input_tokens_seen": 31693440, "step": 52005 }, { "epoch": 16.137139311200745, "grad_norm": 18.653980255126953, "learning_rate": 1.0941246255888648e-06, "loss": 0.1837, "num_input_tokens_seen": 31696480, "step": 52010 }, { "epoch": 16.13869066087496, "grad_norm": 24.161151885986328, "learning_rate": 1.0932795701715333e-06, "loss": 0.1983, "num_input_tokens_seen": 31701056, "step": 52015 }, { "epoch": 16.140242010549176, "grad_norm": 7.067215919494629, "learning_rate": 1.0924348011630704e-06, "loss": 0.1133, "num_input_tokens_seen": 31705440, "step": 52020 }, { "epoch": 16.141793360223396, "grad_norm": 4.035342216491699, "learning_rate": 1.0915903186254107e-06, "loss": 0.21, "num_input_tokens_seen": 31708128, "step": 52025 }, { "epoch": 16.14334470989761, "grad_norm": 6.4360551834106445, "learning_rate": 1.0907461226204614e-06, "loss": 0.1316, "num_input_tokens_seen": 31710336, "step": 52030 }, { "epoch": 16.144896059571828, "grad_norm": 13.124000549316406, "learning_rate": 1.089902213210115e-06, "loss": 0.1965, "num_input_tokens_seen": 31714112, "step": 52035 }, { "epoch": 16.146447409246043, "grad_norm": 13.966588973999023, "learning_rate": 1.0890585904562378e-06, "loss": 0.1518, "num_input_tokens_seen": 31717344, "step": 52040 }, { "epoch": 16.14799875892026, "grad_norm": 19.04329490661621, "learning_rate": 1.0882152544206798e-06, "loss": 0.2063, "num_input_tokens_seen": 31720992, "step": 52045 }, { "epoch": 16.14955010859448, "grad_norm": 7.156733512878418, "learning_rate": 1.0873722051652646e-06, "loss": 0.1751, "num_input_tokens_seen": 31724640, "step": 52050 }, { "epoch": 16.151101458268695, "grad_norm": 6.7853007316589355, "learning_rate": 1.0865294427518008e-06, "loss": 0.1484, "num_input_tokens_seen": 31727648, "step": 52055 }, { "epoch": 16.15265280794291, "grad_norm": 15.928553581237793, "learning_rate": 1.08568696724207e-06, "loss": 0.1503, "num_input_tokens_seen": 31730656, "step": 52060 }, { "epoch": 16.154204157617126, "grad_norm": 20.03816795349121, "learning_rate": 1.0848447786978378e-06, "loss": 0.1435, "num_input_tokens_seen": 31732832, "step": 52065 }, { "epoch": 16.155755507291342, "grad_norm": 9.977319717407227, "learning_rate": 1.0840028771808448e-06, "loss": 0.1807, "num_input_tokens_seen": 31735168, "step": 52070 }, { "epoch": 16.157306856965562, "grad_norm": 10.381181716918945, "learning_rate": 1.0831612627528132e-06, "loss": 0.1598, "num_input_tokens_seen": 31737888, "step": 52075 }, { "epoch": 16.158858206639778, "grad_norm": 11.861640930175781, "learning_rate": 1.0823199354754442e-06, "loss": 0.163, "num_input_tokens_seen": 31740384, "step": 52080 }, { "epoch": 16.160409556313994, "grad_norm": 13.501373291015625, "learning_rate": 1.0814788954104156e-06, "loss": 0.1526, "num_input_tokens_seen": 31743360, "step": 52085 }, { "epoch": 16.16196090598821, "grad_norm": 13.724215507507324, "learning_rate": 1.0806381426193869e-06, "loss": 0.1752, "num_input_tokens_seen": 31746528, "step": 52090 }, { "epoch": 16.163512255662425, "grad_norm": 5.819864273071289, "learning_rate": 1.0797976771639934e-06, "loss": 0.1326, "num_input_tokens_seen": 31750112, "step": 52095 }, { "epoch": 16.16506360533664, "grad_norm": 9.88554573059082, "learning_rate": 1.0789574991058538e-06, "loss": 0.1715, "num_input_tokens_seen": 31753152, "step": 52100 }, { "epoch": 16.16661495501086, "grad_norm": 10.556657791137695, "learning_rate": 1.0781176085065598e-06, "loss": 0.1707, "num_input_tokens_seen": 31756032, "step": 52105 }, { "epoch": 16.168166304685077, "grad_norm": 12.878767967224121, "learning_rate": 1.0772780054276887e-06, "loss": 0.1443, "num_input_tokens_seen": 31759136, "step": 52110 }, { "epoch": 16.169717654359292, "grad_norm": 3.5521512031555176, "learning_rate": 1.076438689930791e-06, "loss": 0.1795, "num_input_tokens_seen": 31761728, "step": 52115 }, { "epoch": 16.17126900403351, "grad_norm": 16.096641540527344, "learning_rate": 1.0755996620774001e-06, "loss": 0.1471, "num_input_tokens_seen": 31764512, "step": 52120 }, { "epoch": 16.172820353707724, "grad_norm": 10.261061668395996, "learning_rate": 1.0747609219290245e-06, "loss": 0.2222, "num_input_tokens_seen": 31768896, "step": 52125 }, { "epoch": 16.174371703381944, "grad_norm": 18.403532028198242, "learning_rate": 1.073922469547155e-06, "loss": 0.1588, "num_input_tokens_seen": 31771712, "step": 52130 }, { "epoch": 16.17592305305616, "grad_norm": 1.7701032161712646, "learning_rate": 1.0730843049932615e-06, "loss": 0.1353, "num_input_tokens_seen": 31775328, "step": 52135 }, { "epoch": 16.177474402730375, "grad_norm": 8.843179702758789, "learning_rate": 1.0722464283287891e-06, "loss": 0.2255, "num_input_tokens_seen": 31777888, "step": 52140 }, { "epoch": 16.17902575240459, "grad_norm": 5.224101543426514, "learning_rate": 1.0714088396151662e-06, "loss": 0.0987, "num_input_tokens_seen": 31780896, "step": 52145 }, { "epoch": 16.180577102078807, "grad_norm": 19.343019485473633, "learning_rate": 1.0705715389137955e-06, "loss": 0.164, "num_input_tokens_seen": 31783360, "step": 52150 }, { "epoch": 16.182128451753027, "grad_norm": 7.144640922546387, "learning_rate": 1.0697345262860638e-06, "loss": 0.1791, "num_input_tokens_seen": 31785952, "step": 52155 }, { "epoch": 16.183679801427242, "grad_norm": 11.465349197387695, "learning_rate": 1.068897801793331e-06, "loss": 0.1439, "num_input_tokens_seen": 31788096, "step": 52160 }, { "epoch": 16.18523115110146, "grad_norm": 7.691533088684082, "learning_rate": 1.0680613654969423e-06, "loss": 0.126, "num_input_tokens_seen": 31791360, "step": 52165 }, { "epoch": 16.186782500775674, "grad_norm": 16.71139144897461, "learning_rate": 1.0672252174582148e-06, "loss": 0.1278, "num_input_tokens_seen": 31793600, "step": 52170 }, { "epoch": 16.18833385044989, "grad_norm": 29.223533630371094, "learning_rate": 1.0663893577384515e-06, "loss": 0.1682, "num_input_tokens_seen": 31796960, "step": 52175 }, { "epoch": 16.18988520012411, "grad_norm": 3.7938594818115234, "learning_rate": 1.0655537863989278e-06, "loss": 0.1087, "num_input_tokens_seen": 31800128, "step": 52180 }, { "epoch": 16.191436549798325, "grad_norm": 23.152103424072266, "learning_rate": 1.064718503500904e-06, "loss": 0.1688, "num_input_tokens_seen": 31803296, "step": 52185 }, { "epoch": 16.19298789947254, "grad_norm": 9.84509563446045, "learning_rate": 1.0638835091056132e-06, "loss": 0.2385, "num_input_tokens_seen": 31806016, "step": 52190 }, { "epoch": 16.194539249146757, "grad_norm": 14.907258033752441, "learning_rate": 1.0630488032742713e-06, "loss": 0.1945, "num_input_tokens_seen": 31809088, "step": 52195 }, { "epoch": 16.196090598820973, "grad_norm": 10.758974075317383, "learning_rate": 1.0622143860680744e-06, "loss": 0.1737, "num_input_tokens_seen": 31811968, "step": 52200 }, { "epoch": 16.197641948495193, "grad_norm": 22.673547744750977, "learning_rate": 1.0613802575481919e-06, "loss": 0.2011, "num_input_tokens_seen": 31814976, "step": 52205 }, { "epoch": 16.19919329816941, "grad_norm": 28.68759536743164, "learning_rate": 1.060546417775778e-06, "loss": 0.2955, "num_input_tokens_seen": 31817376, "step": 52210 }, { "epoch": 16.200744647843624, "grad_norm": 9.179291725158691, "learning_rate": 1.0597128668119606e-06, "loss": 0.1476, "num_input_tokens_seen": 31820192, "step": 52215 }, { "epoch": 16.20229599751784, "grad_norm": 11.929065704345703, "learning_rate": 1.0588796047178512e-06, "loss": 0.1685, "num_input_tokens_seen": 31823296, "step": 52220 }, { "epoch": 16.203847347192056, "grad_norm": 7.055545806884766, "learning_rate": 1.0580466315545357e-06, "loss": 0.1483, "num_input_tokens_seen": 31826816, "step": 52225 }, { "epoch": 16.205398696866272, "grad_norm": 21.842742919921875, "learning_rate": 1.0572139473830828e-06, "loss": 0.1981, "num_input_tokens_seen": 31829696, "step": 52230 }, { "epoch": 16.20695004654049, "grad_norm": 13.137151718139648, "learning_rate": 1.0563815522645353e-06, "loss": 0.2008, "num_input_tokens_seen": 31832992, "step": 52235 }, { "epoch": 16.208501396214707, "grad_norm": 10.463016510009766, "learning_rate": 1.055549446259922e-06, "loss": 0.2022, "num_input_tokens_seen": 31836992, "step": 52240 }, { "epoch": 16.210052745888923, "grad_norm": 19.95437240600586, "learning_rate": 1.0547176294302414e-06, "loss": 0.1264, "num_input_tokens_seen": 31840672, "step": 52245 }, { "epoch": 16.21160409556314, "grad_norm": 4.496696472167969, "learning_rate": 1.0538861018364777e-06, "loss": 0.1053, "num_input_tokens_seen": 31843808, "step": 52250 }, { "epoch": 16.213155445237355, "grad_norm": 10.538129806518555, "learning_rate": 1.0530548635395932e-06, "loss": 0.2079, "num_input_tokens_seen": 31846304, "step": 52255 }, { "epoch": 16.214706794911574, "grad_norm": 7.3651347160339355, "learning_rate": 1.0522239146005248e-06, "loss": 0.1781, "num_input_tokens_seen": 31849600, "step": 52260 }, { "epoch": 16.21625814458579, "grad_norm": 21.23454475402832, "learning_rate": 1.051393255080193e-06, "loss": 0.1149, "num_input_tokens_seen": 31852512, "step": 52265 }, { "epoch": 16.217809494260006, "grad_norm": 14.225690841674805, "learning_rate": 1.0505628850394934e-06, "loss": 0.1528, "num_input_tokens_seen": 31855776, "step": 52270 }, { "epoch": 16.219360843934222, "grad_norm": 8.62035846710205, "learning_rate": 1.0497328045393024e-06, "loss": 0.1155, "num_input_tokens_seen": 31858688, "step": 52275 }, { "epoch": 16.220912193608438, "grad_norm": 12.216837882995605, "learning_rate": 1.048903013640475e-06, "loss": 0.1461, "num_input_tokens_seen": 31861792, "step": 52280 }, { "epoch": 16.222463543282657, "grad_norm": 7.774298191070557, "learning_rate": 1.048073512403846e-06, "loss": 0.1547, "num_input_tokens_seen": 31865440, "step": 52285 }, { "epoch": 16.224014892956873, "grad_norm": 10.645712852478027, "learning_rate": 1.047244300890225e-06, "loss": 0.1758, "num_input_tokens_seen": 31868096, "step": 52290 }, { "epoch": 16.22556624263109, "grad_norm": 24.866146087646484, "learning_rate": 1.0464153791604054e-06, "loss": 0.212, "num_input_tokens_seen": 31870880, "step": 52295 }, { "epoch": 16.227117592305305, "grad_norm": 27.920106887817383, "learning_rate": 1.0455867472751551e-06, "loss": 0.1487, "num_input_tokens_seen": 31873632, "step": 52300 }, { "epoch": 16.22866894197952, "grad_norm": 25.472375869750977, "learning_rate": 1.0447584052952248e-06, "loss": 0.1818, "num_input_tokens_seen": 31876416, "step": 52305 }, { "epoch": 16.23022029165374, "grad_norm": 44.983428955078125, "learning_rate": 1.0439303532813388e-06, "loss": 0.2761, "num_input_tokens_seen": 31880736, "step": 52310 }, { "epoch": 16.231771641327956, "grad_norm": 26.238183975219727, "learning_rate": 1.043102591294206e-06, "loss": 0.1665, "num_input_tokens_seen": 31883680, "step": 52315 }, { "epoch": 16.233322991002172, "grad_norm": 17.455944061279297, "learning_rate": 1.042275119394509e-06, "loss": 0.2724, "num_input_tokens_seen": 31886752, "step": 52320 }, { "epoch": 16.234874340676388, "grad_norm": 12.349957466125488, "learning_rate": 1.0414479376429137e-06, "loss": 0.1883, "num_input_tokens_seen": 31889696, "step": 52325 }, { "epoch": 16.236425690350604, "grad_norm": 8.431102752685547, "learning_rate": 1.0406210461000587e-06, "loss": 0.1389, "num_input_tokens_seen": 31892384, "step": 52330 }, { "epoch": 16.237977040024823, "grad_norm": 34.786678314208984, "learning_rate": 1.039794444826568e-06, "loss": 0.1421, "num_input_tokens_seen": 31895552, "step": 52335 }, { "epoch": 16.23952838969904, "grad_norm": 6.375337600708008, "learning_rate": 1.0389681338830414e-06, "loss": 0.3123, "num_input_tokens_seen": 31899072, "step": 52340 }, { "epoch": 16.241079739373255, "grad_norm": 29.853199005126953, "learning_rate": 1.0381421133300545e-06, "loss": 0.1553, "num_input_tokens_seen": 31902272, "step": 52345 }, { "epoch": 16.24263108904747, "grad_norm": 15.764847755432129, "learning_rate": 1.037316383228168e-06, "loss": 0.2063, "num_input_tokens_seen": 31904832, "step": 52350 }, { "epoch": 16.244182438721687, "grad_norm": 14.9983491897583, "learning_rate": 1.0364909436379139e-06, "loss": 0.1756, "num_input_tokens_seen": 31907904, "step": 52355 }, { "epoch": 16.245733788395903, "grad_norm": 15.393498420715332, "learning_rate": 1.0356657946198107e-06, "loss": 0.2404, "num_input_tokens_seen": 31910624, "step": 52360 }, { "epoch": 16.247285138070122, "grad_norm": 14.057269096374512, "learning_rate": 1.0348409362343476e-06, "loss": 0.1732, "num_input_tokens_seen": 31913664, "step": 52365 }, { "epoch": 16.248836487744338, "grad_norm": 20.806262969970703, "learning_rate": 1.0340163685419997e-06, "loss": 0.251, "num_input_tokens_seen": 31915776, "step": 52370 }, { "epoch": 16.250387837418554, "grad_norm": 24.720048904418945, "learning_rate": 1.033192091603215e-06, "loss": 0.2117, "num_input_tokens_seen": 31918208, "step": 52375 }, { "epoch": 16.25193918709277, "grad_norm": 22.764873504638672, "learning_rate": 1.032368105478425e-06, "loss": 0.1533, "num_input_tokens_seen": 31921344, "step": 52380 }, { "epoch": 16.253490536766986, "grad_norm": 10.028773307800293, "learning_rate": 1.0315444102280358e-06, "loss": 0.1356, "num_input_tokens_seen": 31923616, "step": 52385 }, { "epoch": 16.255041886441205, "grad_norm": 2.4727470874786377, "learning_rate": 1.0307210059124362e-06, "loss": 0.1997, "num_input_tokens_seen": 31926144, "step": 52390 }, { "epoch": 16.25659323611542, "grad_norm": 38.87656021118164, "learning_rate": 1.0298978925919888e-06, "loss": 0.2837, "num_input_tokens_seen": 31928704, "step": 52395 }, { "epoch": 16.258144585789637, "grad_norm": 14.423454284667969, "learning_rate": 1.0290750703270392e-06, "loss": 0.2874, "num_input_tokens_seen": 31932000, "step": 52400 }, { "epoch": 16.259695935463853, "grad_norm": 13.04771614074707, "learning_rate": 1.0282525391779109e-06, "loss": 0.2021, "num_input_tokens_seen": 31935360, "step": 52405 }, { "epoch": 16.26124728513807, "grad_norm": 4.697226047515869, "learning_rate": 1.0274302992049024e-06, "loss": 0.1228, "num_input_tokens_seen": 31938336, "step": 52410 }, { "epoch": 16.262798634812288, "grad_norm": 17.262605667114258, "learning_rate": 1.0266083504682966e-06, "loss": 0.1337, "num_input_tokens_seen": 31940960, "step": 52415 }, { "epoch": 16.264349984486504, "grad_norm": 21.64548683166504, "learning_rate": 1.0257866930283494e-06, "loss": 0.2678, "num_input_tokens_seen": 31943264, "step": 52420 }, { "epoch": 16.26590133416072, "grad_norm": 13.826258659362793, "learning_rate": 1.0249653269453003e-06, "loss": 0.1399, "num_input_tokens_seen": 31946560, "step": 52425 }, { "epoch": 16.267452683834936, "grad_norm": 6.079192638397217, "learning_rate": 1.0241442522793632e-06, "loss": 0.1232, "num_input_tokens_seen": 31948896, "step": 52430 }, { "epoch": 16.26900403350915, "grad_norm": 15.567761421203613, "learning_rate": 1.0233234690907351e-06, "loss": 0.1847, "num_input_tokens_seen": 31951168, "step": 52435 }, { "epoch": 16.27055538318337, "grad_norm": 15.102669715881348, "learning_rate": 1.0225029774395862e-06, "loss": 0.0791, "num_input_tokens_seen": 31954368, "step": 52440 }, { "epoch": 16.272106732857587, "grad_norm": 12.89896297454834, "learning_rate": 1.021682777386071e-06, "loss": 0.1588, "num_input_tokens_seen": 31957536, "step": 52445 }, { "epoch": 16.273658082531803, "grad_norm": 14.467072486877441, "learning_rate": 1.020862868990317e-06, "loss": 0.2672, "num_input_tokens_seen": 31960416, "step": 52450 }, { "epoch": 16.27520943220602, "grad_norm": 24.70088768005371, "learning_rate": 1.0200432523124348e-06, "loss": 0.2244, "num_input_tokens_seen": 31963584, "step": 52455 }, { "epoch": 16.276760781880235, "grad_norm": 11.264205932617188, "learning_rate": 1.0192239274125137e-06, "loss": 0.2331, "num_input_tokens_seen": 31966848, "step": 52460 }, { "epoch": 16.278312131554454, "grad_norm": 34.577911376953125, "learning_rate": 1.0184048943506164e-06, "loss": 0.1753, "num_input_tokens_seen": 31969280, "step": 52465 }, { "epoch": 16.27986348122867, "grad_norm": 8.870546340942383, "learning_rate": 1.017586153186791e-06, "loss": 0.1006, "num_input_tokens_seen": 31972448, "step": 52470 }, { "epoch": 16.281414830902886, "grad_norm": 8.560447692871094, "learning_rate": 1.0167677039810581e-06, "loss": 0.1493, "num_input_tokens_seen": 31975392, "step": 52475 }, { "epoch": 16.2829661805771, "grad_norm": 16.096601486206055, "learning_rate": 1.0159495467934222e-06, "loss": 0.131, "num_input_tokens_seen": 31978848, "step": 52480 }, { "epoch": 16.284517530251318, "grad_norm": 8.512880325317383, "learning_rate": 1.0151316816838614e-06, "loss": 0.115, "num_input_tokens_seen": 31981568, "step": 52485 }, { "epoch": 16.286068879925534, "grad_norm": 20.137523651123047, "learning_rate": 1.0143141087123377e-06, "loss": 0.2381, "num_input_tokens_seen": 31984672, "step": 52490 }, { "epoch": 16.287620229599753, "grad_norm": 17.770708084106445, "learning_rate": 1.0134968279387858e-06, "loss": 0.1306, "num_input_tokens_seen": 31988256, "step": 52495 }, { "epoch": 16.28917157927397, "grad_norm": 14.51845645904541, "learning_rate": 1.0126798394231252e-06, "loss": 0.1598, "num_input_tokens_seen": 31991008, "step": 52500 }, { "epoch": 16.290722928948185, "grad_norm": 7.40975284576416, "learning_rate": 1.011863143225248e-06, "loss": 0.1359, "num_input_tokens_seen": 31993536, "step": 52505 }, { "epoch": 16.2922742786224, "grad_norm": 14.062667846679688, "learning_rate": 1.0110467394050306e-06, "loss": 0.1795, "num_input_tokens_seen": 31996832, "step": 52510 }, { "epoch": 16.293825628296617, "grad_norm": 15.3147554397583, "learning_rate": 1.0102306280223217e-06, "loss": 0.2629, "num_input_tokens_seen": 31999392, "step": 52515 }, { "epoch": 16.295376977970836, "grad_norm": 5.298707008361816, "learning_rate": 1.0094148091369539e-06, "loss": 0.1133, "num_input_tokens_seen": 32003008, "step": 52520 }, { "epoch": 16.296928327645052, "grad_norm": 6.086711406707764, "learning_rate": 1.008599282808737e-06, "loss": 0.2207, "num_input_tokens_seen": 32006304, "step": 52525 }, { "epoch": 16.298479677319268, "grad_norm": 14.189838409423828, "learning_rate": 1.0077840490974572e-06, "loss": 0.2176, "num_input_tokens_seen": 32009056, "step": 52530 }, { "epoch": 16.300031026993484, "grad_norm": 15.958395004272461, "learning_rate": 1.006969108062883e-06, "loss": 0.222, "num_input_tokens_seen": 32013376, "step": 52535 }, { "epoch": 16.3015823766677, "grad_norm": 24.28366470336914, "learning_rate": 1.006154459764756e-06, "loss": 0.2263, "num_input_tokens_seen": 32016192, "step": 52540 }, { "epoch": 16.30313372634192, "grad_norm": 10.177640914916992, "learning_rate": 1.0053401042628031e-06, "loss": 0.1572, "num_input_tokens_seen": 32018912, "step": 52545 }, { "epoch": 16.304685076016135, "grad_norm": 2.5246012210845947, "learning_rate": 1.0045260416167224e-06, "loss": 0.0901, "num_input_tokens_seen": 32021760, "step": 52550 }, { "epoch": 16.30623642569035, "grad_norm": 33.89997100830078, "learning_rate": 1.003712271886198e-06, "loss": 0.1989, "num_input_tokens_seen": 32024544, "step": 52555 }, { "epoch": 16.307787775364567, "grad_norm": 6.002432823181152, "learning_rate": 1.002898795130886e-06, "loss": 0.1872, "num_input_tokens_seen": 32027520, "step": 52560 }, { "epoch": 16.309339125038782, "grad_norm": 11.278554916381836, "learning_rate": 1.0020856114104261e-06, "loss": 0.0944, "num_input_tokens_seen": 32030496, "step": 52565 }, { "epoch": 16.310890474713002, "grad_norm": 18.44502067565918, "learning_rate": 1.0012727207844325e-06, "loss": 0.1906, "num_input_tokens_seen": 32033056, "step": 52570 }, { "epoch": 16.312441824387218, "grad_norm": 14.808247566223145, "learning_rate": 1.0004601233125e-06, "loss": 0.1692, "num_input_tokens_seen": 32036064, "step": 52575 }, { "epoch": 16.313993174061434, "grad_norm": 10.108349800109863, "learning_rate": 9.996478190542036e-07, "loss": 0.1671, "num_input_tokens_seen": 32038560, "step": 52580 }, { "epoch": 16.31554452373565, "grad_norm": 12.677628517150879, "learning_rate": 9.988358080690918e-07, "loss": 0.1558, "num_input_tokens_seen": 32041856, "step": 52585 }, { "epoch": 16.317095873409865, "grad_norm": 7.495939254760742, "learning_rate": 9.980240904166976e-07, "loss": 0.1229, "num_input_tokens_seen": 32044544, "step": 52590 }, { "epoch": 16.318647223084085, "grad_norm": 22.74471664428711, "learning_rate": 9.972126661565268e-07, "loss": 0.1422, "num_input_tokens_seen": 32047136, "step": 52595 }, { "epoch": 16.3201985727583, "grad_norm": 13.210247039794922, "learning_rate": 9.964015353480688e-07, "loss": 0.1707, "num_input_tokens_seen": 32049664, "step": 52600 }, { "epoch": 16.321749922432517, "grad_norm": 22.608577728271484, "learning_rate": 9.955906980507868e-07, "loss": 0.1738, "num_input_tokens_seen": 32052768, "step": 52605 }, { "epoch": 16.323301272106733, "grad_norm": 9.624375343322754, "learning_rate": 9.94780154324127e-07, "loss": 0.0935, "num_input_tokens_seen": 32056672, "step": 52610 }, { "epoch": 16.32485262178095, "grad_norm": 27.056997299194336, "learning_rate": 9.939699042275097e-07, "loss": 0.1749, "num_input_tokens_seen": 32059104, "step": 52615 }, { "epoch": 16.326403971455164, "grad_norm": 14.721463203430176, "learning_rate": 9.931599478203384e-07, "loss": 0.1723, "num_input_tokens_seen": 32061792, "step": 52620 }, { "epoch": 16.327955321129384, "grad_norm": 19.396331787109375, "learning_rate": 9.923502851619893e-07, "loss": 0.185, "num_input_tokens_seen": 32064416, "step": 52625 }, { "epoch": 16.3295066708036, "grad_norm": 24.8922061920166, "learning_rate": 9.915409163118235e-07, "loss": 0.2163, "num_input_tokens_seen": 32067552, "step": 52630 }, { "epoch": 16.331058020477816, "grad_norm": 6.191756248474121, "learning_rate": 9.907318413291745e-07, "loss": 0.2461, "num_input_tokens_seen": 32070368, "step": 52635 }, { "epoch": 16.33260937015203, "grad_norm": 15.387908935546875, "learning_rate": 9.899230602733583e-07, "loss": 0.1075, "num_input_tokens_seen": 32073696, "step": 52640 }, { "epoch": 16.334160719826247, "grad_norm": 7.985354900360107, "learning_rate": 9.891145732036695e-07, "loss": 0.1317, "num_input_tokens_seen": 32076992, "step": 52645 }, { "epoch": 16.335712069500467, "grad_norm": 17.379159927368164, "learning_rate": 9.883063801793774e-07, "loss": 0.1082, "num_input_tokens_seen": 32079744, "step": 52650 }, { "epoch": 16.337263419174683, "grad_norm": 7.681221008300781, "learning_rate": 9.874984812597344e-07, "loss": 0.2592, "num_input_tokens_seen": 32082112, "step": 52655 }, { "epoch": 16.3388147688489, "grad_norm": 7.885885715484619, "learning_rate": 9.866908765039661e-07, "loss": 0.1545, "num_input_tokens_seen": 32084736, "step": 52660 }, { "epoch": 16.340366118523114, "grad_norm": 9.337839126586914, "learning_rate": 9.858835659712829e-07, "loss": 0.2054, "num_input_tokens_seen": 32087584, "step": 52665 }, { "epoch": 16.34191746819733, "grad_norm": 12.896620750427246, "learning_rate": 9.850765497208674e-07, "loss": 0.2701, "num_input_tokens_seen": 32090848, "step": 52670 }, { "epoch": 16.34346881787155, "grad_norm": 15.702524185180664, "learning_rate": 9.842698278118857e-07, "loss": 0.2037, "num_input_tokens_seen": 32093952, "step": 52675 }, { "epoch": 16.345020167545766, "grad_norm": 7.276959419250488, "learning_rate": 9.834634003034777e-07, "loss": 0.0449, "num_input_tokens_seen": 32098560, "step": 52680 }, { "epoch": 16.34657151721998, "grad_norm": 11.441993713378906, "learning_rate": 9.826572672547668e-07, "loss": 0.2202, "num_input_tokens_seen": 32100800, "step": 52685 }, { "epoch": 16.348122866894197, "grad_norm": 33.269622802734375, "learning_rate": 9.818514287248494e-07, "loss": 0.1908, "num_input_tokens_seen": 32103840, "step": 52690 }, { "epoch": 16.349674216568413, "grad_norm": 9.016149520874023, "learning_rate": 9.810458847728039e-07, "loss": 0.1977, "num_input_tokens_seen": 32106432, "step": 52695 }, { "epoch": 16.351225566242633, "grad_norm": 18.617332458496094, "learning_rate": 9.802406354576882e-07, "loss": 0.1357, "num_input_tokens_seen": 32108960, "step": 52700 }, { "epoch": 16.35277691591685, "grad_norm": 6.883584499359131, "learning_rate": 9.794356808385335e-07, "loss": 0.1378, "num_input_tokens_seen": 32111584, "step": 52705 }, { "epoch": 16.354328265591064, "grad_norm": 9.00427532196045, "learning_rate": 9.786310209743555e-07, "loss": 0.22, "num_input_tokens_seen": 32114240, "step": 52710 }, { "epoch": 16.35587961526528, "grad_norm": 7.928815841674805, "learning_rate": 9.778266559241422e-07, "loss": 0.2076, "num_input_tokens_seen": 32117024, "step": 52715 }, { "epoch": 16.357430964939496, "grad_norm": 12.347667694091797, "learning_rate": 9.770225857468662e-07, "loss": 0.143, "num_input_tokens_seen": 32119616, "step": 52720 }, { "epoch": 16.358982314613716, "grad_norm": 34.9622917175293, "learning_rate": 9.76218810501473e-07, "loss": 0.2284, "num_input_tokens_seen": 32122464, "step": 52725 }, { "epoch": 16.36053366428793, "grad_norm": 7.544450759887695, "learning_rate": 9.754153302468906e-07, "loss": 0.2304, "num_input_tokens_seen": 32124992, "step": 52730 }, { "epoch": 16.362085013962147, "grad_norm": 8.901022911071777, "learning_rate": 9.74612145042021e-07, "loss": 0.2231, "num_input_tokens_seen": 32127136, "step": 52735 }, { "epoch": 16.363636363636363, "grad_norm": 7.947187423706055, "learning_rate": 9.738092549457506e-07, "loss": 0.1428, "num_input_tokens_seen": 32129408, "step": 52740 }, { "epoch": 16.36518771331058, "grad_norm": 7.4168219566345215, "learning_rate": 9.730066600169375e-07, "loss": 0.1961, "num_input_tokens_seen": 32132096, "step": 52745 }, { "epoch": 16.366739062984795, "grad_norm": 5.9944987297058105, "learning_rate": 9.722043603144243e-07, "loss": 0.1107, "num_input_tokens_seen": 32138432, "step": 52750 }, { "epoch": 16.368290412659015, "grad_norm": 8.859289169311523, "learning_rate": 9.71402355897027e-07, "loss": 0.1242, "num_input_tokens_seen": 32141632, "step": 52755 }, { "epoch": 16.36984176233323, "grad_norm": 10.296541213989258, "learning_rate": 9.706006468235425e-07, "loss": 0.1498, "num_input_tokens_seen": 32144320, "step": 52760 }, { "epoch": 16.371393112007446, "grad_norm": 9.508024215698242, "learning_rate": 9.697992331527468e-07, "loss": 0.1196, "num_input_tokens_seen": 32147264, "step": 52765 }, { "epoch": 16.372944461681662, "grad_norm": 9.715271949768066, "learning_rate": 9.689981149433909e-07, "loss": 0.1363, "num_input_tokens_seen": 32150368, "step": 52770 }, { "epoch": 16.374495811355878, "grad_norm": 15.851679801940918, "learning_rate": 9.681972922542082e-07, "loss": 0.1229, "num_input_tokens_seen": 32153216, "step": 52775 }, { "epoch": 16.376047161030097, "grad_norm": 12.62289810180664, "learning_rate": 9.67396765143907e-07, "loss": 0.15, "num_input_tokens_seen": 32155968, "step": 52780 }, { "epoch": 16.377598510704313, "grad_norm": 5.339049816131592, "learning_rate": 9.665965336711768e-07, "loss": 0.1461, "num_input_tokens_seen": 32159392, "step": 52785 }, { "epoch": 16.37914986037853, "grad_norm": 8.242219924926758, "learning_rate": 9.657965978946825e-07, "loss": 0.1655, "num_input_tokens_seen": 32162560, "step": 52790 }, { "epoch": 16.380701210052745, "grad_norm": 28.408239364624023, "learning_rate": 9.649969578730711e-07, "loss": 0.1518, "num_input_tokens_seen": 32165120, "step": 52795 }, { "epoch": 16.38225255972696, "grad_norm": 10.505114555358887, "learning_rate": 9.641976136649627e-07, "loss": 0.1365, "num_input_tokens_seen": 32167904, "step": 52800 }, { "epoch": 16.38380390940118, "grad_norm": 10.205896377563477, "learning_rate": 9.63398565328962e-07, "loss": 0.1283, "num_input_tokens_seen": 32170976, "step": 52805 }, { "epoch": 16.385355259075396, "grad_norm": 20.045225143432617, "learning_rate": 9.625998129236446e-07, "loss": 0.1552, "num_input_tokens_seen": 32173824, "step": 52810 }, { "epoch": 16.386906608749612, "grad_norm": 19.770565032958984, "learning_rate": 9.618013565075719e-07, "loss": 0.3604, "num_input_tokens_seen": 32176672, "step": 52815 }, { "epoch": 16.388457958423828, "grad_norm": 17.308359146118164, "learning_rate": 9.6100319613928e-07, "loss": 0.2003, "num_input_tokens_seen": 32179232, "step": 52820 }, { "epoch": 16.390009308098044, "grad_norm": 9.004673957824707, "learning_rate": 9.602053318772819e-07, "loss": 0.1939, "num_input_tokens_seen": 32182432, "step": 52825 }, { "epoch": 16.391560657772263, "grad_norm": 18.76241111755371, "learning_rate": 9.59407763780072e-07, "loss": 0.1852, "num_input_tokens_seen": 32185472, "step": 52830 }, { "epoch": 16.39311200744648, "grad_norm": 19.416513442993164, "learning_rate": 9.586104919061195e-07, "loss": 0.2482, "num_input_tokens_seen": 32189024, "step": 52835 }, { "epoch": 16.394663357120695, "grad_norm": 4.986040115356445, "learning_rate": 9.578135163138768e-07, "loss": 0.1967, "num_input_tokens_seen": 32191584, "step": 52840 }, { "epoch": 16.39621470679491, "grad_norm": 9.150303840637207, "learning_rate": 9.57016837061769e-07, "loss": 0.1458, "num_input_tokens_seen": 32194944, "step": 52845 }, { "epoch": 16.397766056469127, "grad_norm": 8.9520263671875, "learning_rate": 9.562204542082027e-07, "loss": 0.1375, "num_input_tokens_seen": 32198720, "step": 52850 }, { "epoch": 16.399317406143346, "grad_norm": 4.822501182556152, "learning_rate": 9.554243678115626e-07, "loss": 0.1773, "num_input_tokens_seen": 32202080, "step": 52855 }, { "epoch": 16.400868755817562, "grad_norm": 11.260645866394043, "learning_rate": 9.546285779302128e-07, "loss": 0.1731, "num_input_tokens_seen": 32204864, "step": 52860 }, { "epoch": 16.402420105491778, "grad_norm": 19.46819496154785, "learning_rate": 9.53833084622491e-07, "loss": 0.1825, "num_input_tokens_seen": 32208000, "step": 52865 }, { "epoch": 16.403971455165994, "grad_norm": 28.595956802368164, "learning_rate": 9.530378879467194e-07, "loss": 0.2248, "num_input_tokens_seen": 32210816, "step": 52870 }, { "epoch": 16.40552280484021, "grad_norm": 8.615880012512207, "learning_rate": 9.522429879611922e-07, "loss": 0.1823, "num_input_tokens_seen": 32214272, "step": 52875 }, { "epoch": 16.407074154514426, "grad_norm": 7.09793758392334, "learning_rate": 9.514483847241885e-07, "loss": 0.2179, "num_input_tokens_seen": 32216576, "step": 52880 }, { "epoch": 16.408625504188645, "grad_norm": 9.452460289001465, "learning_rate": 9.506540782939583e-07, "loss": 0.1465, "num_input_tokens_seen": 32219712, "step": 52885 }, { "epoch": 16.41017685386286, "grad_norm": 6.290846347808838, "learning_rate": 9.498600687287368e-07, "loss": 0.1768, "num_input_tokens_seen": 32222176, "step": 52890 }, { "epoch": 16.411728203537077, "grad_norm": 13.59374713897705, "learning_rate": 9.490663560867324e-07, "loss": 0.2077, "num_input_tokens_seen": 32225504, "step": 52895 }, { "epoch": 16.413279553211293, "grad_norm": 6.781517028808594, "learning_rate": 9.482729404261343e-07, "loss": 0.1938, "num_input_tokens_seen": 32228256, "step": 52900 }, { "epoch": 16.41483090288551, "grad_norm": 13.902864456176758, "learning_rate": 9.474798218051101e-07, "loss": 0.1795, "num_input_tokens_seen": 32231104, "step": 52905 }, { "epoch": 16.41638225255973, "grad_norm": 18.866107940673828, "learning_rate": 9.466870002818029e-07, "loss": 0.1976, "num_input_tokens_seen": 32234816, "step": 52910 }, { "epoch": 16.417933602233944, "grad_norm": 17.35329246520996, "learning_rate": 9.458944759143385e-07, "loss": 0.1504, "num_input_tokens_seen": 32237856, "step": 52915 }, { "epoch": 16.41948495190816, "grad_norm": 16.392040252685547, "learning_rate": 9.451022487608152e-07, "loss": 0.2303, "num_input_tokens_seen": 32240608, "step": 52920 }, { "epoch": 16.421036301582376, "grad_norm": 3.145270586013794, "learning_rate": 9.443103188793157e-07, "loss": 0.2162, "num_input_tokens_seen": 32243936, "step": 52925 }, { "epoch": 16.422587651256592, "grad_norm": 20.433813095092773, "learning_rate": 9.43518686327895e-07, "loss": 0.2187, "num_input_tokens_seen": 32246944, "step": 52930 }, { "epoch": 16.42413900093081, "grad_norm": 11.147234916687012, "learning_rate": 9.427273511645918e-07, "loss": 0.1248, "num_input_tokens_seen": 32249600, "step": 52935 }, { "epoch": 16.425690350605027, "grad_norm": 9.10817813873291, "learning_rate": 9.41936313447418e-07, "loss": 0.2328, "num_input_tokens_seen": 32251936, "step": 52940 }, { "epoch": 16.427241700279243, "grad_norm": 16.085609436035156, "learning_rate": 9.411455732343683e-07, "loss": 0.1556, "num_input_tokens_seen": 32254944, "step": 52945 }, { "epoch": 16.42879304995346, "grad_norm": 11.819046020507812, "learning_rate": 9.403551305834108e-07, "loss": 0.1347, "num_input_tokens_seen": 32258144, "step": 52950 }, { "epoch": 16.430344399627675, "grad_norm": 11.987957954406738, "learning_rate": 9.395649855524968e-07, "loss": 0.1443, "num_input_tokens_seen": 32261536, "step": 52955 }, { "epoch": 16.431895749301894, "grad_norm": 26.23387336730957, "learning_rate": 9.387751381995508e-07, "loss": 0.2496, "num_input_tokens_seen": 32264160, "step": 52960 }, { "epoch": 16.43344709897611, "grad_norm": 9.726268768310547, "learning_rate": 9.37985588582479e-07, "loss": 0.1593, "num_input_tokens_seen": 32267488, "step": 52965 }, { "epoch": 16.434998448650326, "grad_norm": 23.509830474853516, "learning_rate": 9.371963367591669e-07, "loss": 0.1598, "num_input_tokens_seen": 32270368, "step": 52970 }, { "epoch": 16.436549798324542, "grad_norm": 11.782602310180664, "learning_rate": 9.364073827874726e-07, "loss": 0.1452, "num_input_tokens_seen": 32273312, "step": 52975 }, { "epoch": 16.438101147998758, "grad_norm": 15.147836685180664, "learning_rate": 9.356187267252381e-07, "loss": 0.1784, "num_input_tokens_seen": 32275520, "step": 52980 }, { "epoch": 16.439652497672977, "grad_norm": 15.93250846862793, "learning_rate": 9.348303686302795e-07, "loss": 0.2121, "num_input_tokens_seen": 32279488, "step": 52985 }, { "epoch": 16.441203847347193, "grad_norm": 29.036357879638672, "learning_rate": 9.34042308560395e-07, "loss": 0.2639, "num_input_tokens_seen": 32281952, "step": 52990 }, { "epoch": 16.44275519702141, "grad_norm": 6.702205657958984, "learning_rate": 9.332545465733562e-07, "loss": 0.1257, "num_input_tokens_seen": 32284096, "step": 52995 }, { "epoch": 16.444306546695625, "grad_norm": 35.62173843383789, "learning_rate": 9.324670827269177e-07, "loss": 0.2363, "num_input_tokens_seen": 32287808, "step": 53000 }, { "epoch": 16.44585789636984, "grad_norm": 15.02415657043457, "learning_rate": 9.316799170788083e-07, "loss": 0.1879, "num_input_tokens_seen": 32290048, "step": 53005 }, { "epoch": 16.447409246044057, "grad_norm": 7.80753755569458, "learning_rate": 9.308930496867374e-07, "loss": 0.097, "num_input_tokens_seen": 32292800, "step": 53010 }, { "epoch": 16.448960595718276, "grad_norm": 24.468059539794922, "learning_rate": 9.301064806083904e-07, "loss": 0.177, "num_input_tokens_seen": 32295008, "step": 53015 }, { "epoch": 16.450511945392492, "grad_norm": 11.730134010314941, "learning_rate": 9.293202099014331e-07, "loss": 0.1582, "num_input_tokens_seen": 32298144, "step": 53020 }, { "epoch": 16.452063295066708, "grad_norm": 23.52755355834961, "learning_rate": 9.285342376235101e-07, "loss": 0.1867, "num_input_tokens_seen": 32301184, "step": 53025 }, { "epoch": 16.453614644740924, "grad_norm": 24.212451934814453, "learning_rate": 9.277485638322392e-07, "loss": 0.1818, "num_input_tokens_seen": 32304096, "step": 53030 }, { "epoch": 16.45516599441514, "grad_norm": 9.325994491577148, "learning_rate": 9.269631885852226e-07, "loss": 0.2165, "num_input_tokens_seen": 32306560, "step": 53035 }, { "epoch": 16.45671734408936, "grad_norm": 12.044154167175293, "learning_rate": 9.261781119400349e-07, "loss": 0.0813, "num_input_tokens_seen": 32309408, "step": 53040 }, { "epoch": 16.458268693763575, "grad_norm": 13.349821090698242, "learning_rate": 9.253933339542342e-07, "loss": 0.1706, "num_input_tokens_seen": 32312032, "step": 53045 }, { "epoch": 16.45982004343779, "grad_norm": 4.419226169586182, "learning_rate": 9.246088546853516e-07, "loss": 0.1089, "num_input_tokens_seen": 32315200, "step": 53050 }, { "epoch": 16.461371393112007, "grad_norm": 30.726367950439453, "learning_rate": 9.238246741909007e-07, "loss": 0.2378, "num_input_tokens_seen": 32317856, "step": 53055 }, { "epoch": 16.462922742786223, "grad_norm": 28.570791244506836, "learning_rate": 9.230407925283697e-07, "loss": 0.2326, "num_input_tokens_seen": 32320640, "step": 53060 }, { "epoch": 16.464474092460442, "grad_norm": 20.444801330566406, "learning_rate": 9.222572097552273e-07, "loss": 0.1294, "num_input_tokens_seen": 32323584, "step": 53065 }, { "epoch": 16.466025442134658, "grad_norm": 22.854440689086914, "learning_rate": 9.214739259289185e-07, "loss": 0.1984, "num_input_tokens_seen": 32326816, "step": 53070 }, { "epoch": 16.467576791808874, "grad_norm": 39.50490951538086, "learning_rate": 9.206909411068693e-07, "loss": 0.2623, "num_input_tokens_seen": 32329568, "step": 53075 }, { "epoch": 16.46912814148309, "grad_norm": 12.888699531555176, "learning_rate": 9.199082553464789e-07, "loss": 0.1463, "num_input_tokens_seen": 32333024, "step": 53080 }, { "epoch": 16.470679491157306, "grad_norm": 15.924697875976562, "learning_rate": 9.191258687051291e-07, "loss": 0.1584, "num_input_tokens_seen": 32336032, "step": 53085 }, { "epoch": 16.472230840831525, "grad_norm": 7.7469682693481445, "learning_rate": 9.183437812401786e-07, "loss": 0.2034, "num_input_tokens_seen": 32338528, "step": 53090 }, { "epoch": 16.47378219050574, "grad_norm": 14.290653228759766, "learning_rate": 9.175619930089625e-07, "loss": 0.1649, "num_input_tokens_seen": 32341024, "step": 53095 }, { "epoch": 16.475333540179957, "grad_norm": 13.085175514221191, "learning_rate": 9.167805040687961e-07, "loss": 0.135, "num_input_tokens_seen": 32344800, "step": 53100 }, { "epoch": 16.476884889854173, "grad_norm": 16.248886108398438, "learning_rate": 9.159993144769702e-07, "loss": 0.1771, "num_input_tokens_seen": 32347168, "step": 53105 }, { "epoch": 16.47843623952839, "grad_norm": 5.824944972991943, "learning_rate": 9.15218424290758e-07, "loss": 0.1636, "num_input_tokens_seen": 32352032, "step": 53110 }, { "epoch": 16.479987589202608, "grad_norm": 9.51728630065918, "learning_rate": 9.144378335674048e-07, "loss": 0.1928, "num_input_tokens_seen": 32354368, "step": 53115 }, { "epoch": 16.481538938876824, "grad_norm": 20.368457794189453, "learning_rate": 9.136575423641403e-07, "loss": 0.1641, "num_input_tokens_seen": 32358272, "step": 53120 }, { "epoch": 16.48309028855104, "grad_norm": 6.809435844421387, "learning_rate": 9.12877550738166e-07, "loss": 0.146, "num_input_tokens_seen": 32361120, "step": 53125 }, { "epoch": 16.484641638225256, "grad_norm": 5.972461700439453, "learning_rate": 9.120978587466673e-07, "loss": 0.2238, "num_input_tokens_seen": 32363808, "step": 53130 }, { "epoch": 16.48619298789947, "grad_norm": 15.407487869262695, "learning_rate": 9.113184664468017e-07, "loss": 0.1396, "num_input_tokens_seen": 32366592, "step": 53135 }, { "epoch": 16.487744337573687, "grad_norm": 12.220927238464355, "learning_rate": 9.105393738957119e-07, "loss": 0.2142, "num_input_tokens_seen": 32369792, "step": 53140 }, { "epoch": 16.489295687247907, "grad_norm": 8.922758102416992, "learning_rate": 9.097605811505106e-07, "loss": 0.0932, "num_input_tokens_seen": 32372512, "step": 53145 }, { "epoch": 16.490847036922123, "grad_norm": 1.813840389251709, "learning_rate": 9.089820882682949e-07, "loss": 0.0981, "num_input_tokens_seen": 32377216, "step": 53150 }, { "epoch": 16.49239838659634, "grad_norm": 14.994230270385742, "learning_rate": 9.082038953061378e-07, "loss": 0.2366, "num_input_tokens_seen": 32380096, "step": 53155 }, { "epoch": 16.493949736270554, "grad_norm": 9.302541732788086, "learning_rate": 9.074260023210879e-07, "loss": 0.2009, "num_input_tokens_seen": 32382208, "step": 53160 }, { "epoch": 16.49550108594477, "grad_norm": 9.068767547607422, "learning_rate": 9.066484093701772e-07, "loss": 0.1714, "num_input_tokens_seen": 32385792, "step": 53165 }, { "epoch": 16.49705243561899, "grad_norm": 14.197915077209473, "learning_rate": 9.05871116510409e-07, "loss": 0.1814, "num_input_tokens_seen": 32388768, "step": 53170 }, { "epoch": 16.498603785293206, "grad_norm": 19.74357795715332, "learning_rate": 9.050941237987709e-07, "loss": 0.1761, "num_input_tokens_seen": 32392128, "step": 53175 }, { "epoch": 16.50015513496742, "grad_norm": 27.990398406982422, "learning_rate": 9.043174312922237e-07, "loss": 0.1758, "num_input_tokens_seen": 32395872, "step": 53180 }, { "epoch": 16.501706484641637, "grad_norm": 10.809852600097656, "learning_rate": 9.035410390477095e-07, "loss": 0.1741, "num_input_tokens_seen": 32399040, "step": 53185 }, { "epoch": 16.503257834315853, "grad_norm": 12.958329200744629, "learning_rate": 9.027649471221456e-07, "loss": 0.2453, "num_input_tokens_seen": 32402432, "step": 53190 }, { "epoch": 16.504809183990073, "grad_norm": 10.188641548156738, "learning_rate": 9.019891555724308e-07, "loss": 0.1109, "num_input_tokens_seen": 32405056, "step": 53195 }, { "epoch": 16.50636053366429, "grad_norm": 13.78142261505127, "learning_rate": 9.012136644554376e-07, "loss": 0.1076, "num_input_tokens_seen": 32408032, "step": 53200 }, { "epoch": 16.507911883338505, "grad_norm": 5.334964275360107, "learning_rate": 9.004384738280192e-07, "loss": 0.1329, "num_input_tokens_seen": 32410560, "step": 53205 }, { "epoch": 16.50946323301272, "grad_norm": 11.698861122131348, "learning_rate": 8.996635837470086e-07, "loss": 0.1755, "num_input_tokens_seen": 32415520, "step": 53210 }, { "epoch": 16.511014582686936, "grad_norm": 14.844587326049805, "learning_rate": 8.988889942692109e-07, "loss": 0.1652, "num_input_tokens_seen": 32418208, "step": 53215 }, { "epoch": 16.512565932361156, "grad_norm": 8.54512882232666, "learning_rate": 8.981147054514155e-07, "loss": 0.1217, "num_input_tokens_seen": 32421024, "step": 53220 }, { "epoch": 16.51411728203537, "grad_norm": 3.8461153507232666, "learning_rate": 8.973407173503846e-07, "loss": 0.2, "num_input_tokens_seen": 32423680, "step": 53225 }, { "epoch": 16.515668631709588, "grad_norm": 4.294621467590332, "learning_rate": 8.96567030022863e-07, "loss": 0.1584, "num_input_tokens_seen": 32426112, "step": 53230 }, { "epoch": 16.517219981383803, "grad_norm": 13.51743221282959, "learning_rate": 8.957936435255693e-07, "loss": 0.1745, "num_input_tokens_seen": 32429952, "step": 53235 }, { "epoch": 16.51877133105802, "grad_norm": 11.311821937561035, "learning_rate": 8.950205579152033e-07, "loss": 0.1088, "num_input_tokens_seen": 32433088, "step": 53240 }, { "epoch": 16.52032268073224, "grad_norm": 15.970819473266602, "learning_rate": 8.942477732484394e-07, "loss": 0.1885, "num_input_tokens_seen": 32437088, "step": 53245 }, { "epoch": 16.521874030406455, "grad_norm": 8.354300498962402, "learning_rate": 8.934752895819349e-07, "loss": 0.1305, "num_input_tokens_seen": 32439616, "step": 53250 }, { "epoch": 16.52342538008067, "grad_norm": 30.714120864868164, "learning_rate": 8.927031069723185e-07, "loss": 0.1746, "num_input_tokens_seen": 32442560, "step": 53255 }, { "epoch": 16.524976729754886, "grad_norm": 13.33174991607666, "learning_rate": 8.91931225476203e-07, "loss": 0.1648, "num_input_tokens_seen": 32445184, "step": 53260 }, { "epoch": 16.526528079429102, "grad_norm": 18.708892822265625, "learning_rate": 8.911596451501747e-07, "loss": 0.1376, "num_input_tokens_seen": 32449184, "step": 53265 }, { "epoch": 16.528079429103318, "grad_norm": 11.283927917480469, "learning_rate": 8.903883660508006e-07, "loss": 0.1276, "num_input_tokens_seen": 32452320, "step": 53270 }, { "epoch": 16.529630778777538, "grad_norm": 5.6356048583984375, "learning_rate": 8.896173882346248e-07, "loss": 0.133, "num_input_tokens_seen": 32455168, "step": 53275 }, { "epoch": 16.531182128451753, "grad_norm": 15.668688774108887, "learning_rate": 8.888467117581684e-07, "loss": 0.1599, "num_input_tokens_seen": 32457696, "step": 53280 }, { "epoch": 16.53273347812597, "grad_norm": 5.7232513427734375, "learning_rate": 8.880763366779322e-07, "loss": 0.1783, "num_input_tokens_seen": 32459840, "step": 53285 }, { "epoch": 16.534284827800185, "grad_norm": 15.880388259887695, "learning_rate": 8.873062630503915e-07, "loss": 0.1657, "num_input_tokens_seen": 32462816, "step": 53290 }, { "epoch": 16.5358361774744, "grad_norm": 9.77112865447998, "learning_rate": 8.865364909320046e-07, "loss": 0.1464, "num_input_tokens_seen": 32466688, "step": 53295 }, { "epoch": 16.53738752714862, "grad_norm": 21.52865982055664, "learning_rate": 8.857670203792023e-07, "loss": 0.175, "num_input_tokens_seen": 32469888, "step": 53300 }, { "epoch": 16.538938876822836, "grad_norm": 15.012596130371094, "learning_rate": 8.849978514483986e-07, "loss": 0.1676, "num_input_tokens_seen": 32472448, "step": 53305 }, { "epoch": 16.540490226497052, "grad_norm": 10.748536109924316, "learning_rate": 8.842289841959801e-07, "loss": 0.2369, "num_input_tokens_seen": 32475040, "step": 53310 }, { "epoch": 16.54204157617127, "grad_norm": 13.50665283203125, "learning_rate": 8.834604186783164e-07, "loss": 0.1789, "num_input_tokens_seen": 32477248, "step": 53315 }, { "epoch": 16.543592925845484, "grad_norm": 18.929611206054688, "learning_rate": 8.826921549517498e-07, "loss": 0.1961, "num_input_tokens_seen": 32480224, "step": 53320 }, { "epoch": 16.545144275519704, "grad_norm": 12.096880912780762, "learning_rate": 8.819241930726041e-07, "loss": 0.1771, "num_input_tokens_seen": 32483168, "step": 53325 }, { "epoch": 16.54669562519392, "grad_norm": 2.0262157917022705, "learning_rate": 8.811565330971822e-07, "loss": 0.1608, "num_input_tokens_seen": 32488192, "step": 53330 }, { "epoch": 16.548246974868135, "grad_norm": 21.673906326293945, "learning_rate": 8.803891750817589e-07, "loss": 0.1478, "num_input_tokens_seen": 32491872, "step": 53335 }, { "epoch": 16.54979832454235, "grad_norm": 18.505992889404297, "learning_rate": 8.796221190825943e-07, "loss": 0.2017, "num_input_tokens_seen": 32494240, "step": 53340 }, { "epoch": 16.551349674216567, "grad_norm": 12.086984634399414, "learning_rate": 8.788553651559195e-07, "loss": 0.1464, "num_input_tokens_seen": 32498144, "step": 53345 }, { "epoch": 16.552901023890787, "grad_norm": 4.4280619621276855, "learning_rate": 8.780889133579496e-07, "loss": 0.1156, "num_input_tokens_seen": 32500448, "step": 53350 }, { "epoch": 16.554452373565002, "grad_norm": 14.9396390914917, "learning_rate": 8.773227637448717e-07, "loss": 0.1435, "num_input_tokens_seen": 32503168, "step": 53355 }, { "epoch": 16.55600372323922, "grad_norm": 16.639699935913086, "learning_rate": 8.76556916372856e-07, "loss": 0.2663, "num_input_tokens_seen": 32506080, "step": 53360 }, { "epoch": 16.557555072913434, "grad_norm": 88.21499633789062, "learning_rate": 8.757913712980465e-07, "loss": 0.2799, "num_input_tokens_seen": 32509408, "step": 53365 }, { "epoch": 16.55910642258765, "grad_norm": 24.161231994628906, "learning_rate": 8.750261285765682e-07, "loss": 0.1915, "num_input_tokens_seen": 32512160, "step": 53370 }, { "epoch": 16.56065777226187, "grad_norm": 5.62682580947876, "learning_rate": 8.742611882645207e-07, "loss": 0.1647, "num_input_tokens_seen": 32515168, "step": 53375 }, { "epoch": 16.562209121936085, "grad_norm": 20.518674850463867, "learning_rate": 8.734965504179854e-07, "loss": 0.1526, "num_input_tokens_seen": 32518016, "step": 53380 }, { "epoch": 16.5637604716103, "grad_norm": 5.579946994781494, "learning_rate": 8.727322150930167e-07, "loss": 0.1372, "num_input_tokens_seen": 32521344, "step": 53385 }, { "epoch": 16.565311821284517, "grad_norm": 8.318795204162598, "learning_rate": 8.719681823456505e-07, "loss": 0.269, "num_input_tokens_seen": 32523968, "step": 53390 }, { "epoch": 16.566863170958733, "grad_norm": 10.312539100646973, "learning_rate": 8.712044522319013e-07, "loss": 0.15, "num_input_tokens_seen": 32527360, "step": 53395 }, { "epoch": 16.56841452063295, "grad_norm": 31.68685531616211, "learning_rate": 8.704410248077572e-07, "loss": 0.1831, "num_input_tokens_seen": 32532256, "step": 53400 }, { "epoch": 16.56996587030717, "grad_norm": 31.141447067260742, "learning_rate": 8.696779001291878e-07, "loss": 0.1622, "num_input_tokens_seen": 32534464, "step": 53405 }, { "epoch": 16.571517219981384, "grad_norm": 24.723962783813477, "learning_rate": 8.689150782521377e-07, "loss": 0.1373, "num_input_tokens_seen": 32537120, "step": 53410 }, { "epoch": 16.5730685696556, "grad_norm": 8.235884666442871, "learning_rate": 8.68152559232533e-07, "loss": 0.2349, "num_input_tokens_seen": 32540992, "step": 53415 }, { "epoch": 16.574619919329816, "grad_norm": 13.464917182922363, "learning_rate": 8.673903431262726e-07, "loss": 0.1552, "num_input_tokens_seen": 32543360, "step": 53420 }, { "epoch": 16.576171269004032, "grad_norm": 5.153465270996094, "learning_rate": 8.66628429989238e-07, "loss": 0.1454, "num_input_tokens_seen": 32545280, "step": 53425 }, { "epoch": 16.57772261867825, "grad_norm": 17.423648834228516, "learning_rate": 8.658668198772857e-07, "loss": 0.1422, "num_input_tokens_seen": 32548128, "step": 53430 }, { "epoch": 16.579273968352467, "grad_norm": 8.838266372680664, "learning_rate": 8.651055128462521e-07, "loss": 0.2281, "num_input_tokens_seen": 32551072, "step": 53435 }, { "epoch": 16.580825318026683, "grad_norm": 30.800031661987305, "learning_rate": 8.643445089519475e-07, "loss": 0.1159, "num_input_tokens_seen": 32553504, "step": 53440 }, { "epoch": 16.5823766677009, "grad_norm": 18.14215087890625, "learning_rate": 8.635838082501651e-07, "loss": 0.1642, "num_input_tokens_seen": 32556544, "step": 53445 }, { "epoch": 16.583928017375115, "grad_norm": 10.932921409606934, "learning_rate": 8.628234107966709e-07, "loss": 0.16, "num_input_tokens_seen": 32558944, "step": 53450 }, { "epoch": 16.585479367049334, "grad_norm": 11.796477317810059, "learning_rate": 8.620633166472136e-07, "loss": 0.1556, "num_input_tokens_seen": 32562240, "step": 53455 }, { "epoch": 16.58703071672355, "grad_norm": 23.9306697845459, "learning_rate": 8.613035258575148e-07, "loss": 0.17, "num_input_tokens_seen": 32564704, "step": 53460 }, { "epoch": 16.588582066397766, "grad_norm": 13.964715003967285, "learning_rate": 8.605440384832775e-07, "loss": 0.1551, "num_input_tokens_seen": 32567584, "step": 53465 }, { "epoch": 16.590133416071982, "grad_norm": 9.643735885620117, "learning_rate": 8.597848545801801e-07, "loss": 0.1954, "num_input_tokens_seen": 32570240, "step": 53470 }, { "epoch": 16.591684765746198, "grad_norm": 6.869566917419434, "learning_rate": 8.590259742038798e-07, "loss": 0.1822, "num_input_tokens_seen": 32574560, "step": 53475 }, { "epoch": 16.593236115420417, "grad_norm": 10.283738136291504, "learning_rate": 8.582673974100136e-07, "loss": 0.1927, "num_input_tokens_seen": 32577504, "step": 53480 }, { "epoch": 16.594787465094633, "grad_norm": 7.22208309173584, "learning_rate": 8.575091242541911e-07, "loss": 0.0878, "num_input_tokens_seen": 32580960, "step": 53485 }, { "epoch": 16.59633881476885, "grad_norm": 51.82099914550781, "learning_rate": 8.56751154792006e-07, "loss": 0.2419, "num_input_tokens_seen": 32583520, "step": 53490 }, { "epoch": 16.597890164443065, "grad_norm": 9.213879585266113, "learning_rate": 8.559934890790228e-07, "loss": 0.1235, "num_input_tokens_seen": 32586784, "step": 53495 }, { "epoch": 16.59944151411728, "grad_norm": 7.395885467529297, "learning_rate": 8.552361271707909e-07, "loss": 0.189, "num_input_tokens_seen": 32590336, "step": 53500 }, { "epoch": 16.6009928637915, "grad_norm": 16.323455810546875, "learning_rate": 8.54479069122831e-07, "loss": 0.1695, "num_input_tokens_seen": 32593408, "step": 53505 }, { "epoch": 16.602544213465716, "grad_norm": 12.503372192382812, "learning_rate": 8.537223149906465e-07, "loss": 0.0956, "num_input_tokens_seen": 32595872, "step": 53510 }, { "epoch": 16.604095563139932, "grad_norm": 29.150165557861328, "learning_rate": 8.529658648297146e-07, "loss": 0.1175, "num_input_tokens_seen": 32599040, "step": 53515 }, { "epoch": 16.605646912814148, "grad_norm": 20.72993278503418, "learning_rate": 8.522097186954942e-07, "loss": 0.1769, "num_input_tokens_seen": 32601504, "step": 53520 }, { "epoch": 16.607198262488364, "grad_norm": 18.25034523010254, "learning_rate": 8.514538766434178e-07, "loss": 0.2365, "num_input_tokens_seen": 32604480, "step": 53525 }, { "epoch": 16.608749612162583, "grad_norm": 22.212055206298828, "learning_rate": 8.506983387288981e-07, "loss": 0.1747, "num_input_tokens_seen": 32608096, "step": 53530 }, { "epoch": 16.6103009618368, "grad_norm": 8.226358413696289, "learning_rate": 8.499431050073259e-07, "loss": 0.1649, "num_input_tokens_seen": 32611168, "step": 53535 }, { "epoch": 16.611852311511015, "grad_norm": 8.414624214172363, "learning_rate": 8.491881755340676e-07, "loss": 0.2107, "num_input_tokens_seen": 32614688, "step": 53540 }, { "epoch": 16.61340366118523, "grad_norm": 8.286848068237305, "learning_rate": 8.484335503644703e-07, "loss": 0.1157, "num_input_tokens_seen": 32618912, "step": 53545 }, { "epoch": 16.614955010859447, "grad_norm": 40.8893928527832, "learning_rate": 8.476792295538539e-07, "loss": 0.1952, "num_input_tokens_seen": 32621152, "step": 53550 }, { "epoch": 16.616506360533663, "grad_norm": 18.106689453125, "learning_rate": 8.469252131575223e-07, "loss": 0.2281, "num_input_tokens_seen": 32624544, "step": 53555 }, { "epoch": 16.618057710207882, "grad_norm": 9.577908515930176, "learning_rate": 8.461715012307508e-07, "loss": 0.1545, "num_input_tokens_seen": 32626848, "step": 53560 }, { "epoch": 16.619609059882098, "grad_norm": 34.55850601196289, "learning_rate": 8.454180938287987e-07, "loss": 0.1231, "num_input_tokens_seen": 32629952, "step": 53565 }, { "epoch": 16.621160409556314, "grad_norm": 17.166473388671875, "learning_rate": 8.446649910068965e-07, "loss": 0.1879, "num_input_tokens_seen": 32632576, "step": 53570 }, { "epoch": 16.62271175923053, "grad_norm": 7.413275241851807, "learning_rate": 8.439121928202582e-07, "loss": 0.1832, "num_input_tokens_seen": 32635488, "step": 53575 }, { "epoch": 16.624263108904746, "grad_norm": 11.985410690307617, "learning_rate": 8.431596993240704e-07, "loss": 0.1805, "num_input_tokens_seen": 32639328, "step": 53580 }, { "epoch": 16.625814458578965, "grad_norm": 22.244930267333984, "learning_rate": 8.424075105735024e-07, "loss": 0.1509, "num_input_tokens_seen": 32641472, "step": 53585 }, { "epoch": 16.62736580825318, "grad_norm": 12.19994831085205, "learning_rate": 8.416556266236959e-07, "loss": 0.2062, "num_input_tokens_seen": 32644416, "step": 53590 }, { "epoch": 16.628917157927397, "grad_norm": 19.78518295288086, "learning_rate": 8.409040475297736e-07, "loss": 0.1146, "num_input_tokens_seen": 32646656, "step": 53595 }, { "epoch": 16.630468507601613, "grad_norm": 8.099167823791504, "learning_rate": 8.401527733468373e-07, "loss": 0.1398, "num_input_tokens_seen": 32649696, "step": 53600 }, { "epoch": 16.63201985727583, "grad_norm": 11.282408714294434, "learning_rate": 8.394018041299612e-07, "loss": 0.2561, "num_input_tokens_seen": 32652064, "step": 53605 }, { "epoch": 16.633571206950048, "grad_norm": 5.822947978973389, "learning_rate": 8.386511399342034e-07, "loss": 0.1855, "num_input_tokens_seen": 32654752, "step": 53610 }, { "epoch": 16.635122556624264, "grad_norm": 23.88111114501953, "learning_rate": 8.379007808145934e-07, "loss": 0.1021, "num_input_tokens_seen": 32657280, "step": 53615 }, { "epoch": 16.63667390629848, "grad_norm": 21.364587783813477, "learning_rate": 8.371507268261436e-07, "loss": 0.1139, "num_input_tokens_seen": 32660128, "step": 53620 }, { "epoch": 16.638225255972696, "grad_norm": 6.9802727699279785, "learning_rate": 8.364009780238403e-07, "loss": 0.1132, "num_input_tokens_seen": 32664512, "step": 53625 }, { "epoch": 16.63977660564691, "grad_norm": 11.958935737609863, "learning_rate": 8.356515344626509e-07, "loss": 0.1983, "num_input_tokens_seen": 32666528, "step": 53630 }, { "epoch": 16.64132795532113, "grad_norm": 4.296799182891846, "learning_rate": 8.349023961975155e-07, "loss": 0.1043, "num_input_tokens_seen": 32669696, "step": 53635 }, { "epoch": 16.642879304995347, "grad_norm": 12.099654197692871, "learning_rate": 8.341535632833586e-07, "loss": 0.1786, "num_input_tokens_seen": 32672896, "step": 53640 }, { "epoch": 16.644430654669563, "grad_norm": 14.145516395568848, "learning_rate": 8.334050357750745e-07, "loss": 0.1935, "num_input_tokens_seen": 32675744, "step": 53645 }, { "epoch": 16.64598200434378, "grad_norm": 4.763348579406738, "learning_rate": 8.32656813727542e-07, "loss": 0.1178, "num_input_tokens_seen": 32679264, "step": 53650 }, { "epoch": 16.647533354017995, "grad_norm": 9.58795166015625, "learning_rate": 8.319088971956141e-07, "loss": 0.1905, "num_input_tokens_seen": 32681696, "step": 53655 }, { "epoch": 16.64908470369221, "grad_norm": 7.697866916656494, "learning_rate": 8.311612862341212e-07, "loss": 0.1728, "num_input_tokens_seen": 32684864, "step": 53660 }, { "epoch": 16.65063605336643, "grad_norm": 5.0668253898620605, "learning_rate": 8.304139808978734e-07, "loss": 0.1498, "num_input_tokens_seen": 32687552, "step": 53665 }, { "epoch": 16.652187403040646, "grad_norm": 16.851221084594727, "learning_rate": 8.296669812416546e-07, "loss": 0.1362, "num_input_tokens_seen": 32690560, "step": 53670 }, { "epoch": 16.65373875271486, "grad_norm": 17.6759090423584, "learning_rate": 8.289202873202317e-07, "loss": 0.1912, "num_input_tokens_seen": 32693568, "step": 53675 }, { "epoch": 16.655290102389078, "grad_norm": 28.90541648864746, "learning_rate": 8.281738991883431e-07, "loss": 0.1612, "num_input_tokens_seen": 32695424, "step": 53680 }, { "epoch": 16.656841452063293, "grad_norm": 3.857508897781372, "learning_rate": 8.27427816900711e-07, "loss": 0.1402, "num_input_tokens_seen": 32698272, "step": 53685 }, { "epoch": 16.658392801737513, "grad_norm": 9.072793960571289, "learning_rate": 8.266820405120296e-07, "loss": 0.1403, "num_input_tokens_seen": 32700768, "step": 53690 }, { "epoch": 16.65994415141173, "grad_norm": 11.192973136901855, "learning_rate": 8.259365700769751e-07, "loss": 0.1511, "num_input_tokens_seen": 32705664, "step": 53695 }, { "epoch": 16.661495501085945, "grad_norm": 15.915088653564453, "learning_rate": 8.25191405650197e-07, "loss": 0.1286, "num_input_tokens_seen": 32708352, "step": 53700 }, { "epoch": 16.66304685076016, "grad_norm": 13.180947303771973, "learning_rate": 8.244465472863278e-07, "loss": 0.167, "num_input_tokens_seen": 32711168, "step": 53705 }, { "epoch": 16.664598200434376, "grad_norm": 26.27939796447754, "learning_rate": 8.237019950399705e-07, "loss": 0.1379, "num_input_tokens_seen": 32713920, "step": 53710 }, { "epoch": 16.666149550108596, "grad_norm": 18.673873901367188, "learning_rate": 8.229577489657126e-07, "loss": 0.1515, "num_input_tokens_seen": 32717248, "step": 53715 }, { "epoch": 16.66770089978281, "grad_norm": 9.468900680541992, "learning_rate": 8.222138091181158e-07, "loss": 0.2621, "num_input_tokens_seen": 32719520, "step": 53720 }, { "epoch": 16.669252249457028, "grad_norm": 11.935286521911621, "learning_rate": 8.214701755517185e-07, "loss": 0.1731, "num_input_tokens_seen": 32723296, "step": 53725 }, { "epoch": 16.670803599131244, "grad_norm": 30.22046661376953, "learning_rate": 8.20726848321039e-07, "loss": 0.2155, "num_input_tokens_seen": 32725792, "step": 53730 }, { "epoch": 16.67235494880546, "grad_norm": 11.402661323547363, "learning_rate": 8.19983827480571e-07, "loss": 0.1219, "num_input_tokens_seen": 32729376, "step": 53735 }, { "epoch": 16.67390629847968, "grad_norm": 19.2641544342041, "learning_rate": 8.192411130847883e-07, "loss": 0.2135, "num_input_tokens_seen": 32731744, "step": 53740 }, { "epoch": 16.675457648153895, "grad_norm": 17.245412826538086, "learning_rate": 8.184987051881382e-07, "loss": 0.1733, "num_input_tokens_seen": 32734944, "step": 53745 }, { "epoch": 16.67700899782811, "grad_norm": 23.627748489379883, "learning_rate": 8.177566038450507e-07, "loss": 0.2824, "num_input_tokens_seen": 32737088, "step": 53750 }, { "epoch": 16.678560347502327, "grad_norm": 10.945590019226074, "learning_rate": 8.170148091099284e-07, "loss": 0.2229, "num_input_tokens_seen": 32740608, "step": 53755 }, { "epoch": 16.680111697176542, "grad_norm": 5.32966423034668, "learning_rate": 8.162733210371554e-07, "loss": 0.1529, "num_input_tokens_seen": 32744032, "step": 53760 }, { "epoch": 16.681663046850762, "grad_norm": 21.648395538330078, "learning_rate": 8.155321396810894e-07, "loss": 0.1612, "num_input_tokens_seen": 32747616, "step": 53765 }, { "epoch": 16.683214396524978, "grad_norm": 19.516538619995117, "learning_rate": 8.147912650960693e-07, "loss": 0.2794, "num_input_tokens_seen": 32750432, "step": 53770 }, { "epoch": 16.684765746199194, "grad_norm": 15.435648918151855, "learning_rate": 8.140506973364109e-07, "loss": 0.1428, "num_input_tokens_seen": 32753216, "step": 53775 }, { "epoch": 16.68631709587341, "grad_norm": 19.023174285888672, "learning_rate": 8.133104364564043e-07, "loss": 0.2009, "num_input_tokens_seen": 32756224, "step": 53780 }, { "epoch": 16.687868445547625, "grad_norm": 22.658599853515625, "learning_rate": 8.12570482510322e-07, "loss": 0.1667, "num_input_tokens_seen": 32758912, "step": 53785 }, { "epoch": 16.689419795221845, "grad_norm": 24.533653259277344, "learning_rate": 8.118308355524079e-07, "loss": 0.1641, "num_input_tokens_seen": 32761184, "step": 53790 }, { "epoch": 16.69097114489606, "grad_norm": 5.730476379394531, "learning_rate": 8.11091495636891e-07, "loss": 0.1876, "num_input_tokens_seen": 32763840, "step": 53795 }, { "epoch": 16.692522494570277, "grad_norm": 14.564080238342285, "learning_rate": 8.103524628179699e-07, "loss": 0.1274, "num_input_tokens_seen": 32768064, "step": 53800 }, { "epoch": 16.694073844244492, "grad_norm": 8.496302604675293, "learning_rate": 8.096137371498275e-07, "loss": 0.1244, "num_input_tokens_seen": 32771680, "step": 53805 }, { "epoch": 16.69562519391871, "grad_norm": 4.600185394287109, "learning_rate": 8.088753186866183e-07, "loss": 0.1703, "num_input_tokens_seen": 32773984, "step": 53810 }, { "epoch": 16.697176543592924, "grad_norm": 10.173927307128906, "learning_rate": 8.0813720748248e-07, "loss": 0.1179, "num_input_tokens_seen": 32776704, "step": 53815 }, { "epoch": 16.698727893267144, "grad_norm": 29.2541446685791, "learning_rate": 8.073994035915222e-07, "loss": 0.1811, "num_input_tokens_seen": 32779648, "step": 53820 }, { "epoch": 16.70027924294136, "grad_norm": 2.1225550174713135, "learning_rate": 8.066619070678372e-07, "loss": 0.1227, "num_input_tokens_seen": 32782400, "step": 53825 }, { "epoch": 16.701830592615575, "grad_norm": 3.7334365844726562, "learning_rate": 8.059247179654894e-07, "loss": 0.1087, "num_input_tokens_seen": 32785984, "step": 53830 }, { "epoch": 16.70338194228979, "grad_norm": 5.068108081817627, "learning_rate": 8.051878363385257e-07, "loss": 0.1188, "num_input_tokens_seen": 32788800, "step": 53835 }, { "epoch": 16.704933291964007, "grad_norm": 19.76468276977539, "learning_rate": 8.044512622409684e-07, "loss": 0.2443, "num_input_tokens_seen": 32792640, "step": 53840 }, { "epoch": 16.706484641638227, "grad_norm": 7.612569332122803, "learning_rate": 8.037149957268154e-07, "loss": 0.1673, "num_input_tokens_seen": 32794976, "step": 53845 }, { "epoch": 16.708035991312443, "grad_norm": 19.38054084777832, "learning_rate": 8.029790368500456e-07, "loss": 0.1799, "num_input_tokens_seen": 32798944, "step": 53850 }, { "epoch": 16.70958734098666, "grad_norm": 27.617755889892578, "learning_rate": 8.022433856646111e-07, "loss": 0.1448, "num_input_tokens_seen": 32802400, "step": 53855 }, { "epoch": 16.711138690660874, "grad_norm": 9.338907241821289, "learning_rate": 8.01508042224447e-07, "loss": 0.203, "num_input_tokens_seen": 32805600, "step": 53860 }, { "epoch": 16.71269004033509, "grad_norm": 25.44624137878418, "learning_rate": 8.007730065834596e-07, "loss": 0.142, "num_input_tokens_seen": 32809184, "step": 53865 }, { "epoch": 16.71424139000931, "grad_norm": 16.964637756347656, "learning_rate": 8.00038278795538e-07, "loss": 0.2575, "num_input_tokens_seen": 32812320, "step": 53870 }, { "epoch": 16.715792739683526, "grad_norm": 11.8615083694458, "learning_rate": 7.993038589145447e-07, "loss": 0.1096, "num_input_tokens_seen": 32815232, "step": 53875 }, { "epoch": 16.71734408935774, "grad_norm": 11.82829761505127, "learning_rate": 7.985697469943237e-07, "loss": 0.1154, "num_input_tokens_seen": 32817440, "step": 53880 }, { "epoch": 16.718895439031957, "grad_norm": 26.4464168548584, "learning_rate": 7.978359430886912e-07, "loss": 0.1367, "num_input_tokens_seen": 32820160, "step": 53885 }, { "epoch": 16.720446788706173, "grad_norm": 22.987258911132812, "learning_rate": 7.97102447251445e-07, "loss": 0.1822, "num_input_tokens_seen": 32823104, "step": 53890 }, { "epoch": 16.721998138380393, "grad_norm": 8.83200454711914, "learning_rate": 7.963692595363603e-07, "loss": 0.0998, "num_input_tokens_seen": 32826080, "step": 53895 }, { "epoch": 16.72354948805461, "grad_norm": 7.051533222198486, "learning_rate": 7.956363799971862e-07, "loss": 0.1745, "num_input_tokens_seen": 32830400, "step": 53900 }, { "epoch": 16.725100837728824, "grad_norm": 45.78134536743164, "learning_rate": 7.949038086876537e-07, "loss": 0.236, "num_input_tokens_seen": 32833568, "step": 53905 }, { "epoch": 16.72665218740304, "grad_norm": 30.785072326660156, "learning_rate": 7.941715456614668e-07, "loss": 0.1206, "num_input_tokens_seen": 32836064, "step": 53910 }, { "epoch": 16.728203537077256, "grad_norm": 19.115785598754883, "learning_rate": 7.934395909723109e-07, "loss": 0.2091, "num_input_tokens_seen": 32839200, "step": 53915 }, { "epoch": 16.729754886751472, "grad_norm": 15.299495697021484, "learning_rate": 7.927079446738451e-07, "loss": 0.1211, "num_input_tokens_seen": 32841824, "step": 53920 }, { "epoch": 16.73130623642569, "grad_norm": 28.85942840576172, "learning_rate": 7.919766068197099e-07, "loss": 0.2791, "num_input_tokens_seen": 32845376, "step": 53925 }, { "epoch": 16.732857586099907, "grad_norm": 24.7860050201416, "learning_rate": 7.912455774635192e-07, "loss": 0.2243, "num_input_tokens_seen": 32849280, "step": 53930 }, { "epoch": 16.734408935774123, "grad_norm": 12.932319641113281, "learning_rate": 7.905148566588672e-07, "loss": 0.1624, "num_input_tokens_seen": 32853152, "step": 53935 }, { "epoch": 16.73596028544834, "grad_norm": 24.18044662475586, "learning_rate": 7.897844444593234e-07, "loss": 0.1874, "num_input_tokens_seen": 32855616, "step": 53940 }, { "epoch": 16.737511635122555, "grad_norm": 10.218323707580566, "learning_rate": 7.89054340918437e-07, "loss": 0.1983, "num_input_tokens_seen": 32858400, "step": 53945 }, { "epoch": 16.739062984796774, "grad_norm": 27.23569107055664, "learning_rate": 7.883245460897315e-07, "loss": 0.1541, "num_input_tokens_seen": 32861472, "step": 53950 }, { "epoch": 16.74061433447099, "grad_norm": 27.859703063964844, "learning_rate": 7.875950600267102e-07, "loss": 0.2065, "num_input_tokens_seen": 32863520, "step": 53955 }, { "epoch": 16.742165684145206, "grad_norm": 17.061405181884766, "learning_rate": 7.868658827828551e-07, "loss": 0.1547, "num_input_tokens_seen": 32866048, "step": 53960 }, { "epoch": 16.743717033819422, "grad_norm": 10.496427536010742, "learning_rate": 7.861370144116198e-07, "loss": 0.1674, "num_input_tokens_seen": 32868704, "step": 53965 }, { "epoch": 16.745268383493638, "grad_norm": 26.483579635620117, "learning_rate": 7.854084549664426e-07, "loss": 0.2002, "num_input_tokens_seen": 32871488, "step": 53970 }, { "epoch": 16.746819733167857, "grad_norm": 10.448675155639648, "learning_rate": 7.846802045007329e-07, "loss": 0.1927, "num_input_tokens_seen": 32874944, "step": 53975 }, { "epoch": 16.748371082842073, "grad_norm": 6.972650051116943, "learning_rate": 7.839522630678819e-07, "loss": 0.1515, "num_input_tokens_seen": 32877920, "step": 53980 }, { "epoch": 16.74992243251629, "grad_norm": 45.82406997680664, "learning_rate": 7.832246307212543e-07, "loss": 0.181, "num_input_tokens_seen": 32881664, "step": 53985 }, { "epoch": 16.751473782190505, "grad_norm": 7.242868423461914, "learning_rate": 7.824973075141967e-07, "loss": 0.1899, "num_input_tokens_seen": 32884704, "step": 53990 }, { "epoch": 16.75302513186472, "grad_norm": 7.602902889251709, "learning_rate": 7.817702935000282e-07, "loss": 0.1403, "num_input_tokens_seen": 32887680, "step": 53995 }, { "epoch": 16.75457648153894, "grad_norm": 3.991825580596924, "learning_rate": 7.810435887320494e-07, "loss": 0.1726, "num_input_tokens_seen": 32890976, "step": 54000 }, { "epoch": 16.756127831213156, "grad_norm": 14.834966659545898, "learning_rate": 7.803171932635329e-07, "loss": 0.2004, "num_input_tokens_seen": 32893568, "step": 54005 }, { "epoch": 16.757679180887372, "grad_norm": 11.564543724060059, "learning_rate": 7.795911071477374e-07, "loss": 0.1588, "num_input_tokens_seen": 32898016, "step": 54010 }, { "epoch": 16.759230530561588, "grad_norm": 15.21138858795166, "learning_rate": 7.78865330437889e-07, "loss": 0.2177, "num_input_tokens_seen": 32900224, "step": 54015 }, { "epoch": 16.760781880235804, "grad_norm": 23.0147705078125, "learning_rate": 7.781398631871995e-07, "loss": 0.2053, "num_input_tokens_seen": 32903360, "step": 54020 }, { "epoch": 16.762333229910023, "grad_norm": 5.395733833312988, "learning_rate": 7.774147054488513e-07, "loss": 0.1693, "num_input_tokens_seen": 32906304, "step": 54025 }, { "epoch": 16.76388457958424, "grad_norm": 16.47547721862793, "learning_rate": 7.766898572760084e-07, "loss": 0.1804, "num_input_tokens_seen": 32908800, "step": 54030 }, { "epoch": 16.765435929258455, "grad_norm": 11.979626655578613, "learning_rate": 7.759653187218097e-07, "loss": 0.195, "num_input_tokens_seen": 32911680, "step": 54035 }, { "epoch": 16.76698727893267, "grad_norm": 6.483911991119385, "learning_rate": 7.752410898393736e-07, "loss": 0.1842, "num_input_tokens_seen": 32914304, "step": 54040 }, { "epoch": 16.768538628606887, "grad_norm": 3.281870126724243, "learning_rate": 7.74517170681795e-07, "loss": 0.1559, "num_input_tokens_seen": 32917888, "step": 54045 }, { "epoch": 16.770089978281106, "grad_norm": 8.063234329223633, "learning_rate": 7.737935613021436e-07, "loss": 0.2424, "num_input_tokens_seen": 32920704, "step": 54050 }, { "epoch": 16.771641327955322, "grad_norm": 10.678940773010254, "learning_rate": 7.730702617534713e-07, "loss": 0.1144, "num_input_tokens_seen": 32923872, "step": 54055 }, { "epoch": 16.773192677629538, "grad_norm": 12.905359268188477, "learning_rate": 7.72347272088802e-07, "loss": 0.1503, "num_input_tokens_seen": 32928192, "step": 54060 }, { "epoch": 16.774744027303754, "grad_norm": 22.078472137451172, "learning_rate": 7.71624592361142e-07, "loss": 0.1656, "num_input_tokens_seen": 32931136, "step": 54065 }, { "epoch": 16.77629537697797, "grad_norm": 34.59852600097656, "learning_rate": 7.709022226234697e-07, "loss": 0.1927, "num_input_tokens_seen": 32933760, "step": 54070 }, { "epoch": 16.777846726652186, "grad_norm": 30.978435516357422, "learning_rate": 7.701801629287454e-07, "loss": 0.231, "num_input_tokens_seen": 32937088, "step": 54075 }, { "epoch": 16.779398076326405, "grad_norm": 6.106910228729248, "learning_rate": 7.694584133299021e-07, "loss": 0.2403, "num_input_tokens_seen": 32940800, "step": 54080 }, { "epoch": 16.78094942600062, "grad_norm": 10.173287391662598, "learning_rate": 7.687369738798561e-07, "loss": 0.1778, "num_input_tokens_seen": 32943424, "step": 54085 }, { "epoch": 16.782500775674837, "grad_norm": 25.240222930908203, "learning_rate": 7.68015844631494e-07, "loss": 0.1173, "num_input_tokens_seen": 32946816, "step": 54090 }, { "epoch": 16.784052125349053, "grad_norm": 12.182961463928223, "learning_rate": 7.67295025637686e-07, "loss": 0.1161, "num_input_tokens_seen": 32950336, "step": 54095 }, { "epoch": 16.78560347502327, "grad_norm": 14.019761085510254, "learning_rate": 7.665745169512739e-07, "loss": 0.2048, "num_input_tokens_seen": 32953408, "step": 54100 }, { "epoch": 16.787154824697488, "grad_norm": 12.266664505004883, "learning_rate": 7.658543186250817e-07, "loss": 0.1355, "num_input_tokens_seen": 32956160, "step": 54105 }, { "epoch": 16.788706174371704, "grad_norm": 9.171443939208984, "learning_rate": 7.65134430711908e-07, "loss": 0.0943, "num_input_tokens_seen": 32959136, "step": 54110 }, { "epoch": 16.79025752404592, "grad_norm": 8.139135360717773, "learning_rate": 7.644148532645284e-07, "loss": 0.277, "num_input_tokens_seen": 32963616, "step": 54115 }, { "epoch": 16.791808873720136, "grad_norm": 6.152019023895264, "learning_rate": 7.636955863356977e-07, "loss": 0.1519, "num_input_tokens_seen": 32966496, "step": 54120 }, { "epoch": 16.79336022339435, "grad_norm": 15.883769989013672, "learning_rate": 7.629766299781449e-07, "loss": 0.1307, "num_input_tokens_seen": 32969952, "step": 54125 }, { "epoch": 16.79491157306857, "grad_norm": 9.522964477539062, "learning_rate": 7.622579842445799e-07, "loss": 0.1749, "num_input_tokens_seen": 32973216, "step": 54130 }, { "epoch": 16.796462922742787, "grad_norm": 18.154556274414062, "learning_rate": 7.615396491876859e-07, "loss": 0.2373, "num_input_tokens_seen": 32976384, "step": 54135 }, { "epoch": 16.798014272417003, "grad_norm": 12.133612632751465, "learning_rate": 7.608216248601274e-07, "loss": 0.1734, "num_input_tokens_seen": 32979744, "step": 54140 }, { "epoch": 16.79956562209122, "grad_norm": 5.761214733123779, "learning_rate": 7.601039113145425e-07, "loss": 0.1209, "num_input_tokens_seen": 32985504, "step": 54145 }, { "epoch": 16.801116971765435, "grad_norm": 6.908786296844482, "learning_rate": 7.593865086035501e-07, "loss": 0.1517, "num_input_tokens_seen": 32988736, "step": 54150 }, { "epoch": 16.802668321439654, "grad_norm": 6.5272135734558105, "learning_rate": 7.586694167797415e-07, "loss": 0.1527, "num_input_tokens_seen": 32991136, "step": 54155 }, { "epoch": 16.80421967111387, "grad_norm": 8.98961067199707, "learning_rate": 7.57952635895689e-07, "loss": 0.1057, "num_input_tokens_seen": 32993984, "step": 54160 }, { "epoch": 16.805771020788086, "grad_norm": 6.798081398010254, "learning_rate": 7.572361660039434e-07, "loss": 0.1293, "num_input_tokens_seen": 32998912, "step": 54165 }, { "epoch": 16.807322370462302, "grad_norm": 12.638233184814453, "learning_rate": 7.565200071570278e-07, "loss": 0.1462, "num_input_tokens_seen": 33002304, "step": 54170 }, { "epoch": 16.808873720136518, "grad_norm": 7.896702289581299, "learning_rate": 7.558041594074466e-07, "loss": 0.1431, "num_input_tokens_seen": 33006304, "step": 54175 }, { "epoch": 16.810425069810734, "grad_norm": 19.487939834594727, "learning_rate": 7.550886228076787e-07, "loss": 0.201, "num_input_tokens_seen": 33008864, "step": 54180 }, { "epoch": 16.811976419484953, "grad_norm": 20.64238166809082, "learning_rate": 7.543733974101825e-07, "loss": 0.1821, "num_input_tokens_seen": 33011232, "step": 54185 }, { "epoch": 16.81352776915917, "grad_norm": 4.6253437995910645, "learning_rate": 7.53658483267391e-07, "loss": 0.1306, "num_input_tokens_seen": 33016288, "step": 54190 }, { "epoch": 16.815079118833385, "grad_norm": 9.962931632995605, "learning_rate": 7.529438804317185e-07, "loss": 0.1941, "num_input_tokens_seen": 33018720, "step": 54195 }, { "epoch": 16.8166304685076, "grad_norm": 8.444969177246094, "learning_rate": 7.522295889555508e-07, "loss": 0.2147, "num_input_tokens_seen": 33020960, "step": 54200 }, { "epoch": 16.818181818181817, "grad_norm": 4.729205131530762, "learning_rate": 7.515156088912567e-07, "loss": 0.1716, "num_input_tokens_seen": 33023872, "step": 54205 }, { "epoch": 16.819733167856036, "grad_norm": 12.607025146484375, "learning_rate": 7.508019402911765e-07, "loss": 0.2481, "num_input_tokens_seen": 33026592, "step": 54210 }, { "epoch": 16.821284517530252, "grad_norm": 23.081472396850586, "learning_rate": 7.500885832076332e-07, "loss": 0.1502, "num_input_tokens_seen": 33029408, "step": 54215 }, { "epoch": 16.822835867204468, "grad_norm": 8.045926094055176, "learning_rate": 7.493755376929224e-07, "loss": 0.1179, "num_input_tokens_seen": 33032608, "step": 54220 }, { "epoch": 16.824387216878684, "grad_norm": 15.771688461303711, "learning_rate": 7.486628037993199e-07, "loss": 0.1918, "num_input_tokens_seen": 33035392, "step": 54225 }, { "epoch": 16.8259385665529, "grad_norm": 15.340289115905762, "learning_rate": 7.479503815790779e-07, "loss": 0.151, "num_input_tokens_seen": 33037824, "step": 54230 }, { "epoch": 16.82748991622712, "grad_norm": 33.00305938720703, "learning_rate": 7.472382710844239e-07, "loss": 0.2081, "num_input_tokens_seen": 33040000, "step": 54235 }, { "epoch": 16.829041265901335, "grad_norm": 16.33702850341797, "learning_rate": 7.465264723675658e-07, "loss": 0.1358, "num_input_tokens_seen": 33042880, "step": 54240 }, { "epoch": 16.83059261557555, "grad_norm": 7.414628505706787, "learning_rate": 7.458149854806846e-07, "loss": 0.1378, "num_input_tokens_seen": 33047008, "step": 54245 }, { "epoch": 16.832143965249767, "grad_norm": 6.148200988769531, "learning_rate": 7.451038104759434e-07, "loss": 0.1352, "num_input_tokens_seen": 33049888, "step": 54250 }, { "epoch": 16.833695314923983, "grad_norm": 20.983055114746094, "learning_rate": 7.443929474054773e-07, "loss": 0.2769, "num_input_tokens_seen": 33052736, "step": 54255 }, { "epoch": 16.835246664598202, "grad_norm": 11.475358963012695, "learning_rate": 7.436823963214035e-07, "loss": 0.1277, "num_input_tokens_seen": 33056416, "step": 54260 }, { "epoch": 16.836798014272418, "grad_norm": 6.753276348114014, "learning_rate": 7.429721572758108e-07, "loss": 0.1276, "num_input_tokens_seen": 33059872, "step": 54265 }, { "epoch": 16.838349363946634, "grad_norm": 15.669981002807617, "learning_rate": 7.42262230320771e-07, "loss": 0.2235, "num_input_tokens_seen": 33063072, "step": 54270 }, { "epoch": 16.83990071362085, "grad_norm": 51.94801330566406, "learning_rate": 7.415526155083281e-07, "loss": 0.204, "num_input_tokens_seen": 33066304, "step": 54275 }, { "epoch": 16.841452063295065, "grad_norm": 25.79756736755371, "learning_rate": 7.408433128905057e-07, "loss": 0.2688, "num_input_tokens_seen": 33069120, "step": 54280 }, { "epoch": 16.843003412969285, "grad_norm": 10.053542137145996, "learning_rate": 7.401343225193053e-07, "loss": 0.0982, "num_input_tokens_seen": 33071680, "step": 54285 }, { "epoch": 16.8445547626435, "grad_norm": 10.021265029907227, "learning_rate": 7.394256444467024e-07, "loss": 0.1746, "num_input_tokens_seen": 33074432, "step": 54290 }, { "epoch": 16.846106112317717, "grad_norm": 27.701452255249023, "learning_rate": 7.387172787246539e-07, "loss": 0.1935, "num_input_tokens_seen": 33076768, "step": 54295 }, { "epoch": 16.847657461991933, "grad_norm": 25.259883880615234, "learning_rate": 7.380092254050891e-07, "loss": 0.2101, "num_input_tokens_seen": 33079200, "step": 54300 }, { "epoch": 16.84920881166615, "grad_norm": 19.166332244873047, "learning_rate": 7.373014845399185e-07, "loss": 0.1416, "num_input_tokens_seen": 33082624, "step": 54305 }, { "epoch": 16.850760161340368, "grad_norm": 21.93181610107422, "learning_rate": 7.365940561810253e-07, "loss": 0.1927, "num_input_tokens_seen": 33086304, "step": 54310 }, { "epoch": 16.852311511014584, "grad_norm": 21.84471321105957, "learning_rate": 7.358869403802759e-07, "loss": 0.1818, "num_input_tokens_seen": 33090080, "step": 54315 }, { "epoch": 16.8538628606888, "grad_norm": 34.087501525878906, "learning_rate": 7.351801371895068e-07, "loss": 0.2153, "num_input_tokens_seen": 33092448, "step": 54320 }, { "epoch": 16.855414210363016, "grad_norm": 10.267565727233887, "learning_rate": 7.344736466605385e-07, "loss": 0.1739, "num_input_tokens_seen": 33095104, "step": 54325 }, { "epoch": 16.85696556003723, "grad_norm": 25.728967666625977, "learning_rate": 7.337674688451612e-07, "loss": 0.186, "num_input_tokens_seen": 33099104, "step": 54330 }, { "epoch": 16.858516909711447, "grad_norm": 9.80080795288086, "learning_rate": 7.330616037951499e-07, "loss": 0.115, "num_input_tokens_seen": 33102080, "step": 54335 }, { "epoch": 16.860068259385667, "grad_norm": 7.806148529052734, "learning_rate": 7.323560515622502e-07, "loss": 0.2173, "num_input_tokens_seen": 33104736, "step": 54340 }, { "epoch": 16.861619609059883, "grad_norm": 24.101768493652344, "learning_rate": 7.31650812198188e-07, "loss": 0.2343, "num_input_tokens_seen": 33107776, "step": 54345 }, { "epoch": 16.8631709587341, "grad_norm": 6.846682548522949, "learning_rate": 7.309458857546675e-07, "loss": 0.1504, "num_input_tokens_seen": 33110880, "step": 54350 }, { "epoch": 16.864722308408314, "grad_norm": 24.864635467529297, "learning_rate": 7.302412722833651e-07, "loss": 0.1706, "num_input_tokens_seen": 33113984, "step": 54355 }, { "epoch": 16.86627365808253, "grad_norm": 6.453105926513672, "learning_rate": 7.295369718359408e-07, "loss": 0.1227, "num_input_tokens_seen": 33117376, "step": 54360 }, { "epoch": 16.86782500775675, "grad_norm": 18.785173416137695, "learning_rate": 7.288329844640246e-07, "loss": 0.155, "num_input_tokens_seen": 33121568, "step": 54365 }, { "epoch": 16.869376357430966, "grad_norm": 15.175326347351074, "learning_rate": 7.281293102192299e-07, "loss": 0.2135, "num_input_tokens_seen": 33124224, "step": 54370 }, { "epoch": 16.87092770710518, "grad_norm": 6.26269006729126, "learning_rate": 7.274259491531427e-07, "loss": 0.142, "num_input_tokens_seen": 33127072, "step": 54375 }, { "epoch": 16.872479056779397, "grad_norm": 7.064802646636963, "learning_rate": 7.267229013173294e-07, "loss": 0.104, "num_input_tokens_seen": 33130304, "step": 54380 }, { "epoch": 16.874030406453613, "grad_norm": 12.519808769226074, "learning_rate": 7.260201667633287e-07, "loss": 0.2408, "num_input_tokens_seen": 33133760, "step": 54385 }, { "epoch": 16.875581756127833, "grad_norm": 14.346015930175781, "learning_rate": 7.253177455426629e-07, "loss": 0.2718, "num_input_tokens_seen": 33136416, "step": 54390 }, { "epoch": 16.87713310580205, "grad_norm": 19.55630874633789, "learning_rate": 7.246156377068253e-07, "loss": 0.2005, "num_input_tokens_seen": 33139456, "step": 54395 }, { "epoch": 16.878684455476265, "grad_norm": 5.082944869995117, "learning_rate": 7.239138433072889e-07, "loss": 0.1473, "num_input_tokens_seen": 33143104, "step": 54400 }, { "epoch": 16.88023580515048, "grad_norm": 11.877448081970215, "learning_rate": 7.232123623955062e-07, "loss": 0.1976, "num_input_tokens_seen": 33146272, "step": 54405 }, { "epoch": 16.881787154824696, "grad_norm": 9.459155082702637, "learning_rate": 7.225111950229002e-07, "loss": 0.1535, "num_input_tokens_seen": 33148832, "step": 54410 }, { "epoch": 16.883338504498916, "grad_norm": 18.550668716430664, "learning_rate": 7.218103412408783e-07, "loss": 0.2388, "num_input_tokens_seen": 33151392, "step": 54415 }, { "epoch": 16.88488985417313, "grad_norm": 20.883094787597656, "learning_rate": 7.211098011008183e-07, "loss": 0.2591, "num_input_tokens_seen": 33154176, "step": 54420 }, { "epoch": 16.886441203847347, "grad_norm": 4.862800598144531, "learning_rate": 7.204095746540807e-07, "loss": 0.1913, "num_input_tokens_seen": 33157472, "step": 54425 }, { "epoch": 16.887992553521563, "grad_norm": 21.565542221069336, "learning_rate": 7.197096619519983e-07, "loss": 0.1032, "num_input_tokens_seen": 33159872, "step": 54430 }, { "epoch": 16.88954390319578, "grad_norm": 14.032123565673828, "learning_rate": 7.190100630458846e-07, "loss": 0.1315, "num_input_tokens_seen": 33162208, "step": 54435 }, { "epoch": 16.891095252869995, "grad_norm": 9.915553092956543, "learning_rate": 7.183107779870268e-07, "loss": 0.1707, "num_input_tokens_seen": 33164832, "step": 54440 }, { "epoch": 16.892646602544215, "grad_norm": 10.54118537902832, "learning_rate": 7.17611806826693e-07, "loss": 0.1672, "num_input_tokens_seen": 33166976, "step": 54445 }, { "epoch": 16.89419795221843, "grad_norm": 4.246453762054443, "learning_rate": 7.169131496161235e-07, "loss": 0.1521, "num_input_tokens_seen": 33171456, "step": 54450 }, { "epoch": 16.895749301892646, "grad_norm": 12.601846694946289, "learning_rate": 7.16214806406541e-07, "loss": 0.1172, "num_input_tokens_seen": 33174784, "step": 54455 }, { "epoch": 16.897300651566862, "grad_norm": 10.929109573364258, "learning_rate": 7.155167772491394e-07, "loss": 0.1651, "num_input_tokens_seen": 33177120, "step": 54460 }, { "epoch": 16.898852001241078, "grad_norm": 14.554478645324707, "learning_rate": 7.14819062195094e-07, "loss": 0.1822, "num_input_tokens_seen": 33179456, "step": 54465 }, { "epoch": 16.900403350915298, "grad_norm": 14.149786949157715, "learning_rate": 7.141216612955565e-07, "loss": 0.1463, "num_input_tokens_seen": 33182176, "step": 54470 }, { "epoch": 16.901954700589513, "grad_norm": 20.70265769958496, "learning_rate": 7.134245746016527e-07, "loss": 0.2703, "num_input_tokens_seen": 33185120, "step": 54475 }, { "epoch": 16.90350605026373, "grad_norm": 16.78388786315918, "learning_rate": 7.12727802164489e-07, "loss": 0.2329, "num_input_tokens_seen": 33188704, "step": 54480 }, { "epoch": 16.905057399937945, "grad_norm": 16.864038467407227, "learning_rate": 7.120313440351456e-07, "loss": 0.1703, "num_input_tokens_seen": 33191680, "step": 54485 }, { "epoch": 16.90660874961216, "grad_norm": 11.399428367614746, "learning_rate": 7.113352002646828e-07, "loss": 0.1569, "num_input_tokens_seen": 33194240, "step": 54490 }, { "epoch": 16.90816009928638, "grad_norm": 1.2409908771514893, "learning_rate": 7.106393709041337e-07, "loss": 0.1534, "num_input_tokens_seen": 33196320, "step": 54495 }, { "epoch": 16.909711448960596, "grad_norm": 20.892662048339844, "learning_rate": 7.099438560045141e-07, "loss": 0.1913, "num_input_tokens_seen": 33198912, "step": 54500 }, { "epoch": 16.911262798634812, "grad_norm": 10.2742919921875, "learning_rate": 7.092486556168099e-07, "loss": 0.1471, "num_input_tokens_seen": 33201472, "step": 54505 }, { "epoch": 16.912814148309028, "grad_norm": 5.328560829162598, "learning_rate": 7.085537697919908e-07, "loss": 0.2052, "num_input_tokens_seen": 33203968, "step": 54510 }, { "epoch": 16.914365497983244, "grad_norm": 16.714113235473633, "learning_rate": 7.078591985809979e-07, "loss": 0.1272, "num_input_tokens_seen": 33206880, "step": 54515 }, { "epoch": 16.915916847657464, "grad_norm": 5.7163987159729, "learning_rate": 7.07164942034752e-07, "loss": 0.1347, "num_input_tokens_seen": 33210528, "step": 54520 }, { "epoch": 16.91746819733168, "grad_norm": 11.551216125488281, "learning_rate": 7.064710002041514e-07, "loss": 0.2169, "num_input_tokens_seen": 33212768, "step": 54525 }, { "epoch": 16.919019547005895, "grad_norm": 20.97125816345215, "learning_rate": 7.057773731400691e-07, "loss": 0.1617, "num_input_tokens_seen": 33215328, "step": 54530 }, { "epoch": 16.92057089668011, "grad_norm": 3.972663402557373, "learning_rate": 7.050840608933568e-07, "loss": 0.2148, "num_input_tokens_seen": 33217952, "step": 54535 }, { "epoch": 16.922122246354327, "grad_norm": 8.676715850830078, "learning_rate": 7.043910635148422e-07, "loss": 0.1031, "num_input_tokens_seen": 33220896, "step": 54540 }, { "epoch": 16.923673596028546, "grad_norm": 14.811476707458496, "learning_rate": 7.036983810553305e-07, "loss": 0.1518, "num_input_tokens_seen": 33224288, "step": 54545 }, { "epoch": 16.925224945702762, "grad_norm": 29.21444320678711, "learning_rate": 7.030060135656025e-07, "loss": 0.1477, "num_input_tokens_seen": 33226976, "step": 54550 }, { "epoch": 16.92677629537698, "grad_norm": 20.38582992553711, "learning_rate": 7.023139610964186e-07, "loss": 0.2312, "num_input_tokens_seen": 33229696, "step": 54555 }, { "epoch": 16.928327645051194, "grad_norm": 11.19494342803955, "learning_rate": 7.016222236985126e-07, "loss": 0.1532, "num_input_tokens_seen": 33234048, "step": 54560 }, { "epoch": 16.92987899472541, "grad_norm": 6.1155290603637695, "learning_rate": 7.009308014225991e-07, "loss": 0.1455, "num_input_tokens_seen": 33237600, "step": 54565 }, { "epoch": 16.93143034439963, "grad_norm": 7.855299949645996, "learning_rate": 7.002396943193657e-07, "loss": 0.1817, "num_input_tokens_seen": 33240832, "step": 54570 }, { "epoch": 16.932981694073845, "grad_norm": 18.62069320678711, "learning_rate": 6.995489024394803e-07, "loss": 0.2007, "num_input_tokens_seen": 33244960, "step": 54575 }, { "epoch": 16.93453304374806, "grad_norm": 20.743804931640625, "learning_rate": 6.98858425833584e-07, "loss": 0.1726, "num_input_tokens_seen": 33248288, "step": 54580 }, { "epoch": 16.936084393422277, "grad_norm": 6.037178993225098, "learning_rate": 6.981682645522985e-07, "loss": 0.1336, "num_input_tokens_seen": 33251328, "step": 54585 }, { "epoch": 16.937635743096493, "grad_norm": 15.770865440368652, "learning_rate": 6.974784186462202e-07, "loss": 0.0969, "num_input_tokens_seen": 33255520, "step": 54590 }, { "epoch": 16.93918709277071, "grad_norm": 14.750219345092773, "learning_rate": 6.967888881659251e-07, "loss": 0.2892, "num_input_tokens_seen": 33257696, "step": 54595 }, { "epoch": 16.94073844244493, "grad_norm": 7.057910919189453, "learning_rate": 6.960996731619607e-07, "loss": 0.1912, "num_input_tokens_seen": 33261120, "step": 54600 }, { "epoch": 16.942289792119144, "grad_norm": 16.274795532226562, "learning_rate": 6.954107736848558e-07, "loss": 0.2031, "num_input_tokens_seen": 33264704, "step": 54605 }, { "epoch": 16.94384114179336, "grad_norm": 16.842741012573242, "learning_rate": 6.947221897851164e-07, "loss": 0.247, "num_input_tokens_seen": 33267200, "step": 54610 }, { "epoch": 16.945392491467576, "grad_norm": 19.631607055664062, "learning_rate": 6.940339215132213e-07, "loss": 0.1099, "num_input_tokens_seen": 33271264, "step": 54615 }, { "epoch": 16.946943841141792, "grad_norm": 8.552692413330078, "learning_rate": 6.933459689196315e-07, "loss": 0.2303, "num_input_tokens_seen": 33273792, "step": 54620 }, { "epoch": 16.94849519081601, "grad_norm": 12.624003410339355, "learning_rate": 6.926583320547792e-07, "loss": 0.1665, "num_input_tokens_seen": 33276704, "step": 54625 }, { "epoch": 16.950046540490227, "grad_norm": 17.245485305786133, "learning_rate": 6.919710109690792e-07, "loss": 0.2376, "num_input_tokens_seen": 33279264, "step": 54630 }, { "epoch": 16.951597890164443, "grad_norm": 6.177955150604248, "learning_rate": 6.912840057129172e-07, "loss": 0.1172, "num_input_tokens_seen": 33282656, "step": 54635 }, { "epoch": 16.95314923983866, "grad_norm": 5.254965305328369, "learning_rate": 6.905973163366619e-07, "loss": 0.1225, "num_input_tokens_seen": 33285824, "step": 54640 }, { "epoch": 16.954700589512875, "grad_norm": 13.15177059173584, "learning_rate": 6.899109428906531e-07, "loss": 0.2942, "num_input_tokens_seen": 33290336, "step": 54645 }, { "epoch": 16.956251939187094, "grad_norm": 22.693552017211914, "learning_rate": 6.892248854252121e-07, "loss": 0.1782, "num_input_tokens_seen": 33293440, "step": 54650 }, { "epoch": 16.95780328886131, "grad_norm": 29.085264205932617, "learning_rate": 6.88539143990633e-07, "loss": 0.143, "num_input_tokens_seen": 33297312, "step": 54655 }, { "epoch": 16.959354638535526, "grad_norm": 44.9654655456543, "learning_rate": 6.878537186371914e-07, "loss": 0.2001, "num_input_tokens_seen": 33301152, "step": 54660 }, { "epoch": 16.960905988209742, "grad_norm": 10.560364723205566, "learning_rate": 6.871686094151348e-07, "loss": 0.1827, "num_input_tokens_seen": 33303328, "step": 54665 }, { "epoch": 16.962457337883958, "grad_norm": 2.8401923179626465, "learning_rate": 6.864838163746906e-07, "loss": 0.1021, "num_input_tokens_seen": 33306752, "step": 54670 }, { "epoch": 16.964008687558177, "grad_norm": 15.985645294189453, "learning_rate": 6.857993395660633e-07, "loss": 0.2622, "num_input_tokens_seen": 33311520, "step": 54675 }, { "epoch": 16.965560037232393, "grad_norm": 17.05475425720215, "learning_rate": 6.851151790394311e-07, "loss": 0.1404, "num_input_tokens_seen": 33315168, "step": 54680 }, { "epoch": 16.96711138690661, "grad_norm": 23.181320190429688, "learning_rate": 6.844313348449533e-07, "loss": 0.2888, "num_input_tokens_seen": 33317984, "step": 54685 }, { "epoch": 16.968662736580825, "grad_norm": 7.308253765106201, "learning_rate": 6.837478070327613e-07, "loss": 0.0997, "num_input_tokens_seen": 33320672, "step": 54690 }, { "epoch": 16.97021408625504, "grad_norm": 9.826055526733398, "learning_rate": 6.83064595652968e-07, "loss": 0.2364, "num_input_tokens_seen": 33324032, "step": 54695 }, { "epoch": 16.971765435929257, "grad_norm": 12.166563987731934, "learning_rate": 6.823817007556594e-07, "loss": 0.162, "num_input_tokens_seen": 33327200, "step": 54700 }, { "epoch": 16.973316785603476, "grad_norm": 11.0170316696167, "learning_rate": 6.816991223909014e-07, "loss": 0.1707, "num_input_tokens_seen": 33329920, "step": 54705 }, { "epoch": 16.974868135277692, "grad_norm": 5.08280611038208, "learning_rate": 6.81016860608733e-07, "loss": 0.1535, "num_input_tokens_seen": 33333408, "step": 54710 }, { "epoch": 16.976419484951908, "grad_norm": 12.619932174682617, "learning_rate": 6.803349154591743e-07, "loss": 0.1699, "num_input_tokens_seen": 33337152, "step": 54715 }, { "epoch": 16.977970834626124, "grad_norm": 67.3972396850586, "learning_rate": 6.796532869922173e-07, "loss": 0.1543, "num_input_tokens_seen": 33339872, "step": 54720 }, { "epoch": 16.97952218430034, "grad_norm": 3.2405502796173096, "learning_rate": 6.789719752578355e-07, "loss": 0.1551, "num_input_tokens_seen": 33343936, "step": 54725 }, { "epoch": 16.98107353397456, "grad_norm": 8.294309616088867, "learning_rate": 6.782909803059772e-07, "loss": 0.1602, "num_input_tokens_seen": 33346880, "step": 54730 }, { "epoch": 16.982624883648775, "grad_norm": 10.915047645568848, "learning_rate": 6.776103021865654e-07, "loss": 0.138, "num_input_tokens_seen": 33349728, "step": 54735 }, { "epoch": 16.98417623332299, "grad_norm": 42.48564147949219, "learning_rate": 6.76929940949505e-07, "loss": 0.2957, "num_input_tokens_seen": 33352640, "step": 54740 }, { "epoch": 16.985727582997207, "grad_norm": 10.762860298156738, "learning_rate": 6.762498966446712e-07, "loss": 0.2194, "num_input_tokens_seen": 33355520, "step": 54745 }, { "epoch": 16.987278932671423, "grad_norm": 9.975157737731934, "learning_rate": 6.755701693219219e-07, "loss": 0.1779, "num_input_tokens_seen": 33358528, "step": 54750 }, { "epoch": 16.988830282345642, "grad_norm": 15.16777229309082, "learning_rate": 6.748907590310871e-07, "loss": 0.179, "num_input_tokens_seen": 33362144, "step": 54755 }, { "epoch": 16.990381632019858, "grad_norm": 16.279935836791992, "learning_rate": 6.742116658219777e-07, "loss": 0.1682, "num_input_tokens_seen": 33364544, "step": 54760 }, { "epoch": 16.991932981694074, "grad_norm": 90.62940979003906, "learning_rate": 6.735328897443772e-07, "loss": 0.2815, "num_input_tokens_seen": 33367424, "step": 54765 }, { "epoch": 16.99348433136829, "grad_norm": 30.632156372070312, "learning_rate": 6.7285443084805e-07, "loss": 0.2382, "num_input_tokens_seen": 33369856, "step": 54770 }, { "epoch": 16.995035681042506, "grad_norm": 9.468358039855957, "learning_rate": 6.721762891827327e-07, "loss": 0.1609, "num_input_tokens_seen": 33373760, "step": 54775 }, { "epoch": 16.996587030716725, "grad_norm": 6.911561489105225, "learning_rate": 6.714984647981443e-07, "loss": 0.1426, "num_input_tokens_seen": 33377024, "step": 54780 }, { "epoch": 16.99813838039094, "grad_norm": 9.503079414367676, "learning_rate": 6.708209577439739e-07, "loss": 0.1365, "num_input_tokens_seen": 33379296, "step": 54785 }, { "epoch": 16.999689730065157, "grad_norm": 13.60358715057373, "learning_rate": 6.701437680698925e-07, "loss": 0.1068, "num_input_tokens_seen": 33382432, "step": 54790 }, { "epoch": 17.001241079739373, "grad_norm": 13.5029935836792, "learning_rate": 6.694668958255473e-07, "loss": 0.1778, "num_input_tokens_seen": 33385872, "step": 54795 }, { "epoch": 17.00279242941359, "grad_norm": 7.042308330535889, "learning_rate": 6.687903410605584e-07, "loss": 0.1436, "num_input_tokens_seen": 33388336, "step": 54800 }, { "epoch": 17.004343779087808, "grad_norm": 4.090226173400879, "learning_rate": 6.681141038245282e-07, "loss": 0.1373, "num_input_tokens_seen": 33390608, "step": 54805 }, { "epoch": 17.005895128762024, "grad_norm": 20.612926483154297, "learning_rate": 6.674381841670302e-07, "loss": 0.142, "num_input_tokens_seen": 33393264, "step": 54810 }, { "epoch": 17.00744647843624, "grad_norm": 20.476070404052734, "learning_rate": 6.667625821376195e-07, "loss": 0.1086, "num_input_tokens_seen": 33396752, "step": 54815 }, { "epoch": 17.008997828110456, "grad_norm": 6.064044952392578, "learning_rate": 6.660872977858235e-07, "loss": 0.1625, "num_input_tokens_seen": 33399184, "step": 54820 }, { "epoch": 17.01054917778467, "grad_norm": 39.92815017700195, "learning_rate": 6.65412331161151e-07, "loss": 0.155, "num_input_tokens_seen": 33402160, "step": 54825 }, { "epoch": 17.01210052745889, "grad_norm": 13.609739303588867, "learning_rate": 6.64737682313083e-07, "loss": 0.1597, "num_input_tokens_seen": 33404336, "step": 54830 }, { "epoch": 17.013651877133107, "grad_norm": 7.51938009262085, "learning_rate": 6.640633512910805e-07, "loss": 0.1476, "num_input_tokens_seen": 33407664, "step": 54835 }, { "epoch": 17.015203226807323, "grad_norm": 7.749235153198242, "learning_rate": 6.633893381445788e-07, "loss": 0.1179, "num_input_tokens_seen": 33409936, "step": 54840 }, { "epoch": 17.01675457648154, "grad_norm": 17.594621658325195, "learning_rate": 6.627156429229919e-07, "loss": 0.1333, "num_input_tokens_seen": 33413200, "step": 54845 }, { "epoch": 17.018305926155755, "grad_norm": 32.974388122558594, "learning_rate": 6.620422656757109e-07, "loss": 0.2189, "num_input_tokens_seen": 33415728, "step": 54850 }, { "epoch": 17.01985727582997, "grad_norm": 18.133872985839844, "learning_rate": 6.613692064520993e-07, "loss": 0.1963, "num_input_tokens_seen": 33418352, "step": 54855 }, { "epoch": 17.02140862550419, "grad_norm": 17.633745193481445, "learning_rate": 6.60696465301503e-07, "loss": 0.1901, "num_input_tokens_seen": 33421200, "step": 54860 }, { "epoch": 17.022959975178406, "grad_norm": 13.436540603637695, "learning_rate": 6.600240422732401e-07, "loss": 0.2418, "num_input_tokens_seen": 33424336, "step": 54865 }, { "epoch": 17.02451132485262, "grad_norm": 31.651390075683594, "learning_rate": 6.59351937416609e-07, "loss": 0.2717, "num_input_tokens_seen": 33426416, "step": 54870 }, { "epoch": 17.026062674526838, "grad_norm": 11.795173645019531, "learning_rate": 6.586801507808804e-07, "loss": 0.1453, "num_input_tokens_seen": 33429168, "step": 54875 }, { "epoch": 17.027614024201053, "grad_norm": 21.641279220581055, "learning_rate": 6.580086824153071e-07, "loss": 0.2087, "num_input_tokens_seen": 33432048, "step": 54880 }, { "epoch": 17.029165373875273, "grad_norm": 10.713119506835938, "learning_rate": 6.573375323691128e-07, "loss": 0.1908, "num_input_tokens_seen": 33434672, "step": 54885 }, { "epoch": 17.03071672354949, "grad_norm": 12.762710571289062, "learning_rate": 6.566667006915034e-07, "loss": 0.1689, "num_input_tokens_seen": 33437264, "step": 54890 }, { "epoch": 17.032268073223705, "grad_norm": 2.845114231109619, "learning_rate": 6.559961874316568e-07, "loss": 0.1737, "num_input_tokens_seen": 33439760, "step": 54895 }, { "epoch": 17.03381942289792, "grad_norm": 5.787626266479492, "learning_rate": 6.553259926387312e-07, "loss": 0.1281, "num_input_tokens_seen": 33442032, "step": 54900 }, { "epoch": 17.035370772572136, "grad_norm": 21.24957275390625, "learning_rate": 6.546561163618581e-07, "loss": 0.2134, "num_input_tokens_seen": 33444464, "step": 54905 }, { "epoch": 17.036922122246356, "grad_norm": 13.402544975280762, "learning_rate": 6.539865586501481e-07, "loss": 0.1659, "num_input_tokens_seen": 33447568, "step": 54910 }, { "epoch": 17.03847347192057, "grad_norm": 10.739789009094238, "learning_rate": 6.533173195526888e-07, "loss": 0.1615, "num_input_tokens_seen": 33450992, "step": 54915 }, { "epoch": 17.040024821594788, "grad_norm": 9.741133689880371, "learning_rate": 6.526483991185411e-07, "loss": 0.1397, "num_input_tokens_seen": 33452944, "step": 54920 }, { "epoch": 17.041576171269003, "grad_norm": 12.623457908630371, "learning_rate": 6.519797973967478e-07, "loss": 0.1014, "num_input_tokens_seen": 33456592, "step": 54925 }, { "epoch": 17.04312752094322, "grad_norm": 9.681035041809082, "learning_rate": 6.513115144363224e-07, "loss": 0.1547, "num_input_tokens_seen": 33458992, "step": 54930 }, { "epoch": 17.04467887061744, "grad_norm": 32.005950927734375, "learning_rate": 6.506435502862602e-07, "loss": 0.2117, "num_input_tokens_seen": 33462480, "step": 54935 }, { "epoch": 17.046230220291655, "grad_norm": 9.041827201843262, "learning_rate": 6.499759049955284e-07, "loss": 0.1728, "num_input_tokens_seen": 33464944, "step": 54940 }, { "epoch": 17.04778156996587, "grad_norm": 8.035686492919922, "learning_rate": 6.49308578613076e-07, "loss": 0.1433, "num_input_tokens_seen": 33467792, "step": 54945 }, { "epoch": 17.049332919640086, "grad_norm": 10.436553955078125, "learning_rate": 6.486415711878236e-07, "loss": 0.165, "num_input_tokens_seen": 33469936, "step": 54950 }, { "epoch": 17.050884269314302, "grad_norm": 18.535076141357422, "learning_rate": 6.479748827686732e-07, "loss": 0.1866, "num_input_tokens_seen": 33472400, "step": 54955 }, { "epoch": 17.052435618988522, "grad_norm": 11.969820022583008, "learning_rate": 6.473085134044981e-07, "loss": 0.1758, "num_input_tokens_seen": 33475792, "step": 54960 }, { "epoch": 17.053986968662738, "grad_norm": 10.321818351745605, "learning_rate": 6.466424631441531e-07, "loss": 0.1316, "num_input_tokens_seen": 33478576, "step": 54965 }, { "epoch": 17.055538318336954, "grad_norm": 12.386362075805664, "learning_rate": 6.459767320364673e-07, "loss": 0.1451, "num_input_tokens_seen": 33481296, "step": 54970 }, { "epoch": 17.05708966801117, "grad_norm": 8.12673568725586, "learning_rate": 6.453113201302458e-07, "loss": 0.0646, "num_input_tokens_seen": 33484464, "step": 54975 }, { "epoch": 17.058641017685385, "grad_norm": 12.105936050415039, "learning_rate": 6.446462274742731e-07, "loss": 0.181, "num_input_tokens_seen": 33486800, "step": 54980 }, { "epoch": 17.0601923673596, "grad_norm": 7.954194068908691, "learning_rate": 6.439814541173057e-07, "loss": 0.1478, "num_input_tokens_seen": 33489328, "step": 54985 }, { "epoch": 17.06174371703382, "grad_norm": 13.217495918273926, "learning_rate": 6.433170001080818e-07, "loss": 0.1736, "num_input_tokens_seen": 33492336, "step": 54990 }, { "epoch": 17.063295066708037, "grad_norm": 18.44855308532715, "learning_rate": 6.426528654953112e-07, "loss": 0.1125, "num_input_tokens_seen": 33495120, "step": 54995 }, { "epoch": 17.064846416382252, "grad_norm": 5.9760236740112305, "learning_rate": 6.419890503276854e-07, "loss": 0.1942, "num_input_tokens_seen": 33498320, "step": 55000 }, { "epoch": 17.06639776605647, "grad_norm": 9.39997386932373, "learning_rate": 6.413255546538683e-07, "loss": 0.1677, "num_input_tokens_seen": 33501328, "step": 55005 }, { "epoch": 17.067949115730684, "grad_norm": 14.167855262756348, "learning_rate": 6.40662378522503e-07, "loss": 0.1453, "num_input_tokens_seen": 33504944, "step": 55010 }, { "epoch": 17.069500465404904, "grad_norm": 9.45926284790039, "learning_rate": 6.39999521982207e-07, "loss": 0.1632, "num_input_tokens_seen": 33508240, "step": 55015 }, { "epoch": 17.07105181507912, "grad_norm": 10.489165306091309, "learning_rate": 6.393369850815767e-07, "loss": 0.1112, "num_input_tokens_seen": 33511760, "step": 55020 }, { "epoch": 17.072603164753335, "grad_norm": 9.503874778747559, "learning_rate": 6.386747678691829e-07, "loss": 0.1763, "num_input_tokens_seen": 33514704, "step": 55025 }, { "epoch": 17.07415451442755, "grad_norm": 8.022862434387207, "learning_rate": 6.380128703935739e-07, "loss": 0.1165, "num_input_tokens_seen": 33517488, "step": 55030 }, { "epoch": 17.075705864101767, "grad_norm": 16.97579002380371, "learning_rate": 6.373512927032766e-07, "loss": 0.1339, "num_input_tokens_seen": 33519664, "step": 55035 }, { "epoch": 17.077257213775987, "grad_norm": 12.391475677490234, "learning_rate": 6.366900348467897e-07, "loss": 0.1475, "num_input_tokens_seen": 33523408, "step": 55040 }, { "epoch": 17.078808563450202, "grad_norm": 16.12511444091797, "learning_rate": 6.360290968725936e-07, "loss": 0.1385, "num_input_tokens_seen": 33526224, "step": 55045 }, { "epoch": 17.08035991312442, "grad_norm": 14.731135368347168, "learning_rate": 6.353684788291408e-07, "loss": 0.125, "num_input_tokens_seen": 33529296, "step": 55050 }, { "epoch": 17.081911262798634, "grad_norm": 9.295530319213867, "learning_rate": 6.347081807648648e-07, "loss": 0.1149, "num_input_tokens_seen": 33531920, "step": 55055 }, { "epoch": 17.08346261247285, "grad_norm": 7.376482963562012, "learning_rate": 6.340482027281708e-07, "loss": 0.1665, "num_input_tokens_seen": 33536816, "step": 55060 }, { "epoch": 17.08501396214707, "grad_norm": 22.902780532836914, "learning_rate": 6.333885447674448e-07, "loss": 0.1349, "num_input_tokens_seen": 33541488, "step": 55065 }, { "epoch": 17.086565311821285, "grad_norm": 16.914596557617188, "learning_rate": 6.327292069310465e-07, "loss": 0.2041, "num_input_tokens_seen": 33544368, "step": 55070 }, { "epoch": 17.0881166614955, "grad_norm": 4.127713680267334, "learning_rate": 6.320701892673142e-07, "loss": 0.1615, "num_input_tokens_seen": 33546928, "step": 55075 }, { "epoch": 17.089668011169717, "grad_norm": 9.930213928222656, "learning_rate": 6.314114918245601e-07, "loss": 0.1009, "num_input_tokens_seen": 33551728, "step": 55080 }, { "epoch": 17.091219360843933, "grad_norm": 31.808835983276367, "learning_rate": 6.307531146510754e-07, "loss": 0.1729, "num_input_tokens_seen": 33554960, "step": 55085 }, { "epoch": 17.092770710518153, "grad_norm": 10.654351234436035, "learning_rate": 6.300950577951281e-07, "loss": 0.2017, "num_input_tokens_seen": 33558384, "step": 55090 }, { "epoch": 17.09432206019237, "grad_norm": 18.144363403320312, "learning_rate": 6.294373213049593e-07, "loss": 0.2432, "num_input_tokens_seen": 33561488, "step": 55095 }, { "epoch": 17.095873409866584, "grad_norm": 7.073260307312012, "learning_rate": 6.287799052287913e-07, "loss": 0.1959, "num_input_tokens_seen": 33563920, "step": 55100 }, { "epoch": 17.0974247595408, "grad_norm": 14.501107215881348, "learning_rate": 6.281228096148178e-07, "loss": 0.2115, "num_input_tokens_seen": 33567024, "step": 55105 }, { "epoch": 17.098976109215016, "grad_norm": 4.965603828430176, "learning_rate": 6.274660345112149e-07, "loss": 0.0793, "num_input_tokens_seen": 33570000, "step": 55110 }, { "epoch": 17.100527458889232, "grad_norm": 12.099825859069824, "learning_rate": 6.26809579966129e-07, "loss": 0.1787, "num_input_tokens_seen": 33573296, "step": 55115 }, { "epoch": 17.10207880856345, "grad_norm": 21.841875076293945, "learning_rate": 6.261534460276881e-07, "loss": 0.2024, "num_input_tokens_seen": 33576144, "step": 55120 }, { "epoch": 17.103630158237667, "grad_norm": 7.70176887512207, "learning_rate": 6.254976327439921e-07, "loss": 0.1405, "num_input_tokens_seen": 33580560, "step": 55125 }, { "epoch": 17.105181507911883, "grad_norm": 4.743083953857422, "learning_rate": 6.248421401631233e-07, "loss": 0.1096, "num_input_tokens_seen": 33582736, "step": 55130 }, { "epoch": 17.1067328575861, "grad_norm": 11.801583290100098, "learning_rate": 6.241869683331337e-07, "loss": 0.1525, "num_input_tokens_seen": 33585584, "step": 55135 }, { "epoch": 17.108284207260315, "grad_norm": 6.290579795837402, "learning_rate": 6.235321173020581e-07, "loss": 0.1431, "num_input_tokens_seen": 33587728, "step": 55140 }, { "epoch": 17.109835556934534, "grad_norm": 2.991724729537964, "learning_rate": 6.228775871179021e-07, "loss": 0.1147, "num_input_tokens_seen": 33590032, "step": 55145 }, { "epoch": 17.11138690660875, "grad_norm": 6.632840633392334, "learning_rate": 6.222233778286518e-07, "loss": 0.2117, "num_input_tokens_seen": 33593264, "step": 55150 }, { "epoch": 17.112938256282966, "grad_norm": 14.1129732131958, "learning_rate": 6.215694894822699e-07, "loss": 0.0872, "num_input_tokens_seen": 33597072, "step": 55155 }, { "epoch": 17.114489605957182, "grad_norm": 7.073339939117432, "learning_rate": 6.209159221266919e-07, "loss": 0.062, "num_input_tokens_seen": 33600144, "step": 55160 }, { "epoch": 17.116040955631398, "grad_norm": 4.170267105102539, "learning_rate": 6.202626758098324e-07, "loss": 0.095, "num_input_tokens_seen": 33602992, "step": 55165 }, { "epoch": 17.117592305305617, "grad_norm": 19.50310707092285, "learning_rate": 6.19609750579584e-07, "loss": 0.1542, "num_input_tokens_seen": 33606480, "step": 55170 }, { "epoch": 17.119143654979833, "grad_norm": 31.41714859008789, "learning_rate": 6.189571464838112e-07, "loss": 0.1979, "num_input_tokens_seen": 33609680, "step": 55175 }, { "epoch": 17.12069500465405, "grad_norm": 12.527815818786621, "learning_rate": 6.183048635703592e-07, "loss": 0.1097, "num_input_tokens_seen": 33612304, "step": 55180 }, { "epoch": 17.122246354328265, "grad_norm": 21.782207489013672, "learning_rate": 6.176529018870487e-07, "loss": 0.1898, "num_input_tokens_seen": 33614832, "step": 55185 }, { "epoch": 17.12379770400248, "grad_norm": 8.55066204071045, "learning_rate": 6.17001261481674e-07, "loss": 0.1043, "num_input_tokens_seen": 33617520, "step": 55190 }, { "epoch": 17.1253490536767, "grad_norm": 31.778606414794922, "learning_rate": 6.163499424020103e-07, "loss": 0.1981, "num_input_tokens_seen": 33620848, "step": 55195 }, { "epoch": 17.126900403350916, "grad_norm": 15.801312446594238, "learning_rate": 6.15698944695805e-07, "loss": 0.2363, "num_input_tokens_seen": 33623536, "step": 55200 }, { "epoch": 17.128451753025132, "grad_norm": 7.8293890953063965, "learning_rate": 6.150482684107861e-07, "loss": 0.1298, "num_input_tokens_seen": 33626544, "step": 55205 }, { "epoch": 17.130003102699348, "grad_norm": 26.723581314086914, "learning_rate": 6.143979135946537e-07, "loss": 0.18, "num_input_tokens_seen": 33629584, "step": 55210 }, { "epoch": 17.131554452373564, "grad_norm": 4.9956955909729, "learning_rate": 6.137478802950886e-07, "loss": 0.2434, "num_input_tokens_seen": 33632016, "step": 55215 }, { "epoch": 17.133105802047783, "grad_norm": 15.144429206848145, "learning_rate": 6.130981685597436e-07, "loss": 0.1872, "num_input_tokens_seen": 33635920, "step": 55220 }, { "epoch": 17.134657151722, "grad_norm": 17.53485107421875, "learning_rate": 6.124487784362526e-07, "loss": 0.1546, "num_input_tokens_seen": 33639696, "step": 55225 }, { "epoch": 17.136208501396215, "grad_norm": 4.125802993774414, "learning_rate": 6.117997099722217e-07, "loss": 0.1252, "num_input_tokens_seen": 33642640, "step": 55230 }, { "epoch": 17.13775985107043, "grad_norm": 11.172329902648926, "learning_rate": 6.111509632152362e-07, "loss": 0.1683, "num_input_tokens_seen": 33645808, "step": 55235 }, { "epoch": 17.139311200744647, "grad_norm": 4.451254844665527, "learning_rate": 6.105025382128577e-07, "loss": 0.1052, "num_input_tokens_seen": 33648592, "step": 55240 }, { "epoch": 17.140862550418863, "grad_norm": 8.866399765014648, "learning_rate": 6.098544350126212e-07, "loss": 0.1345, "num_input_tokens_seen": 33651440, "step": 55245 }, { "epoch": 17.142413900093082, "grad_norm": 8.055886268615723, "learning_rate": 6.092066536620433e-07, "loss": 0.1367, "num_input_tokens_seen": 33654160, "step": 55250 }, { "epoch": 17.143965249767298, "grad_norm": 7.394303798675537, "learning_rate": 6.085591942086111e-07, "loss": 0.1255, "num_input_tokens_seen": 33656976, "step": 55255 }, { "epoch": 17.145516599441514, "grad_norm": 19.088937759399414, "learning_rate": 6.079120566997937e-07, "loss": 0.1887, "num_input_tokens_seen": 33661136, "step": 55260 }, { "epoch": 17.14706794911573, "grad_norm": 17.716609954833984, "learning_rate": 6.072652411830315e-07, "loss": 0.1785, "num_input_tokens_seen": 33663920, "step": 55265 }, { "epoch": 17.148619298789946, "grad_norm": 15.3451509475708, "learning_rate": 6.066187477057456e-07, "loss": 0.1422, "num_input_tokens_seen": 33666768, "step": 55270 }, { "epoch": 17.150170648464165, "grad_norm": 35.266319274902344, "learning_rate": 6.059725763153301e-07, "loss": 0.1194, "num_input_tokens_seen": 33669552, "step": 55275 }, { "epoch": 17.15172199813838, "grad_norm": 9.668907165527344, "learning_rate": 6.053267270591589e-07, "loss": 0.0976, "num_input_tokens_seen": 33672624, "step": 55280 }, { "epoch": 17.153273347812597, "grad_norm": 25.217411041259766, "learning_rate": 6.046811999845786e-07, "loss": 0.2904, "num_input_tokens_seen": 33676496, "step": 55285 }, { "epoch": 17.154824697486813, "grad_norm": 28.141592025756836, "learning_rate": 6.040359951389157e-07, "loss": 0.2183, "num_input_tokens_seen": 33679440, "step": 55290 }, { "epoch": 17.15637604716103, "grad_norm": 21.404897689819336, "learning_rate": 6.033911125694691e-07, "loss": 0.1233, "num_input_tokens_seen": 33683024, "step": 55295 }, { "epoch": 17.157927396835248, "grad_norm": 26.898130416870117, "learning_rate": 6.027465523235171e-07, "loss": 0.2418, "num_input_tokens_seen": 33685936, "step": 55300 }, { "epoch": 17.159478746509464, "grad_norm": 19.280101776123047, "learning_rate": 6.021023144483156e-07, "loss": 0.1918, "num_input_tokens_seen": 33688528, "step": 55305 }, { "epoch": 17.16103009618368, "grad_norm": 23.37433624267578, "learning_rate": 6.014583989910927e-07, "loss": 0.2225, "num_input_tokens_seen": 33692912, "step": 55310 }, { "epoch": 17.162581445857896, "grad_norm": 4.314333915710449, "learning_rate": 6.008148059990559e-07, "loss": 0.1192, "num_input_tokens_seen": 33695344, "step": 55315 }, { "epoch": 17.16413279553211, "grad_norm": 9.211883544921875, "learning_rate": 6.001715355193876e-07, "loss": 0.1901, "num_input_tokens_seen": 33697776, "step": 55320 }, { "epoch": 17.16568414520633, "grad_norm": 27.994647979736328, "learning_rate": 5.995285875992485e-07, "loss": 0.1634, "num_input_tokens_seen": 33700496, "step": 55325 }, { "epoch": 17.167235494880547, "grad_norm": 20.630556106567383, "learning_rate": 5.988859622857718e-07, "loss": 0.1281, "num_input_tokens_seen": 33703056, "step": 55330 }, { "epoch": 17.168786844554763, "grad_norm": 12.503730773925781, "learning_rate": 5.982436596260721e-07, "loss": 0.1456, "num_input_tokens_seen": 33705520, "step": 55335 }, { "epoch": 17.17033819422898, "grad_norm": 14.571191787719727, "learning_rate": 5.976016796672363e-07, "loss": 0.1492, "num_input_tokens_seen": 33708784, "step": 55340 }, { "epoch": 17.171889543903195, "grad_norm": 9.9715576171875, "learning_rate": 5.969600224563304e-07, "loss": 0.146, "num_input_tokens_seen": 33711504, "step": 55345 }, { "epoch": 17.173440893577414, "grad_norm": 16.36704444885254, "learning_rate": 5.963186880403931e-07, "loss": 0.1568, "num_input_tokens_seen": 33714224, "step": 55350 }, { "epoch": 17.17499224325163, "grad_norm": 24.601627349853516, "learning_rate": 5.95677676466444e-07, "loss": 0.1225, "num_input_tokens_seen": 33717968, "step": 55355 }, { "epoch": 17.176543592925846, "grad_norm": 23.179080963134766, "learning_rate": 5.950369877814771e-07, "loss": 0.141, "num_input_tokens_seen": 33720432, "step": 55360 }, { "epoch": 17.17809494260006, "grad_norm": 20.464847564697266, "learning_rate": 5.943966220324604e-07, "loss": 0.1946, "num_input_tokens_seen": 33722448, "step": 55365 }, { "epoch": 17.179646292274278, "grad_norm": 9.003707885742188, "learning_rate": 5.937565792663425e-07, "loss": 0.1402, "num_input_tokens_seen": 33725424, "step": 55370 }, { "epoch": 17.181197641948494, "grad_norm": 13.510730743408203, "learning_rate": 5.931168595300435e-07, "loss": 0.2542, "num_input_tokens_seen": 33728688, "step": 55375 }, { "epoch": 17.182748991622713, "grad_norm": 11.33646011352539, "learning_rate": 5.92477462870466e-07, "loss": 0.1408, "num_input_tokens_seen": 33731888, "step": 55380 }, { "epoch": 17.18430034129693, "grad_norm": 14.023069381713867, "learning_rate": 5.918383893344815e-07, "loss": 0.1516, "num_input_tokens_seen": 33735376, "step": 55385 }, { "epoch": 17.185851690971145, "grad_norm": 23.255475997924805, "learning_rate": 5.91199638968945e-07, "loss": 0.1123, "num_input_tokens_seen": 33739536, "step": 55390 }, { "epoch": 17.18740304064536, "grad_norm": 12.464590072631836, "learning_rate": 5.905612118206822e-07, "loss": 0.1083, "num_input_tokens_seen": 33742608, "step": 55395 }, { "epoch": 17.188954390319577, "grad_norm": 31.024370193481445, "learning_rate": 5.899231079364986e-07, "loss": 0.1751, "num_input_tokens_seen": 33745360, "step": 55400 }, { "epoch": 17.190505739993796, "grad_norm": 13.206257820129395, "learning_rate": 5.892853273631733e-07, "loss": 0.1438, "num_input_tokens_seen": 33748336, "step": 55405 }, { "epoch": 17.192057089668012, "grad_norm": 8.964436531066895, "learning_rate": 5.886478701474658e-07, "loss": 0.1588, "num_input_tokens_seen": 33752592, "step": 55410 }, { "epoch": 17.193608439342228, "grad_norm": 8.761587142944336, "learning_rate": 5.88010736336106e-07, "loss": 0.1146, "num_input_tokens_seen": 33755792, "step": 55415 }, { "epoch": 17.195159789016444, "grad_norm": 7.352877616882324, "learning_rate": 5.873739259758049e-07, "loss": 0.084, "num_input_tokens_seen": 33759600, "step": 55420 }, { "epoch": 17.19671113869066, "grad_norm": 13.012247085571289, "learning_rate": 5.867374391132497e-07, "loss": 0.1374, "num_input_tokens_seen": 33762768, "step": 55425 }, { "epoch": 17.19826248836488, "grad_norm": 40.617218017578125, "learning_rate": 5.861012757951001e-07, "loss": 0.1093, "num_input_tokens_seen": 33765680, "step": 55430 }, { "epoch": 17.199813838039095, "grad_norm": 16.00798797607422, "learning_rate": 5.854654360679962e-07, "loss": 0.1412, "num_input_tokens_seen": 33768464, "step": 55435 }, { "epoch": 17.20136518771331, "grad_norm": 8.686214447021484, "learning_rate": 5.848299199785512e-07, "loss": 0.0965, "num_input_tokens_seen": 33771344, "step": 55440 }, { "epoch": 17.202916537387527, "grad_norm": 6.712869644165039, "learning_rate": 5.841947275733567e-07, "loss": 0.181, "num_input_tokens_seen": 33774352, "step": 55445 }, { "epoch": 17.204467887061742, "grad_norm": 8.832058906555176, "learning_rate": 5.835598588989793e-07, "loss": 0.1464, "num_input_tokens_seen": 33776944, "step": 55450 }, { "epoch": 17.206019236735962, "grad_norm": 10.08288288116455, "learning_rate": 5.829253140019636e-07, "loss": 0.1331, "num_input_tokens_seen": 33780752, "step": 55455 }, { "epoch": 17.207570586410178, "grad_norm": 8.527203559875488, "learning_rate": 5.822910929288272e-07, "loss": 0.0994, "num_input_tokens_seen": 33784336, "step": 55460 }, { "epoch": 17.209121936084394, "grad_norm": 9.636481285095215, "learning_rate": 5.816571957260681e-07, "loss": 0.1521, "num_input_tokens_seen": 33786384, "step": 55465 }, { "epoch": 17.21067328575861, "grad_norm": 8.093779563903809, "learning_rate": 5.810236224401566e-07, "loss": 0.0956, "num_input_tokens_seen": 33789424, "step": 55470 }, { "epoch": 17.212224635432825, "grad_norm": 14.531661033630371, "learning_rate": 5.803903731175426e-07, "loss": 0.0731, "num_input_tokens_seen": 33792368, "step": 55475 }, { "epoch": 17.213775985107045, "grad_norm": 12.747076988220215, "learning_rate": 5.797574478046502e-07, "loss": 0.2346, "num_input_tokens_seen": 33794864, "step": 55480 }, { "epoch": 17.21532733478126, "grad_norm": 7.998279094696045, "learning_rate": 5.791248465478805e-07, "loss": 0.2528, "num_input_tokens_seen": 33798000, "step": 55485 }, { "epoch": 17.216878684455477, "grad_norm": 4.555981159210205, "learning_rate": 5.784925693936111e-07, "loss": 0.131, "num_input_tokens_seen": 33800528, "step": 55490 }, { "epoch": 17.218430034129693, "grad_norm": 8.148415565490723, "learning_rate": 5.778606163881934e-07, "loss": 0.1344, "num_input_tokens_seen": 33802864, "step": 55495 }, { "epoch": 17.21998138380391, "grad_norm": 10.220385551452637, "learning_rate": 5.772289875779602e-07, "loss": 0.2585, "num_input_tokens_seen": 33805360, "step": 55500 }, { "epoch": 17.221532733478124, "grad_norm": 6.604091167449951, "learning_rate": 5.76597683009214e-07, "loss": 0.0903, "num_input_tokens_seen": 33808496, "step": 55505 }, { "epoch": 17.223084083152344, "grad_norm": 26.981876373291016, "learning_rate": 5.759667027282401e-07, "loss": 0.2393, "num_input_tokens_seen": 33811280, "step": 55510 }, { "epoch": 17.22463543282656, "grad_norm": 7.4933180809021, "learning_rate": 5.753360467812941e-07, "loss": 0.1281, "num_input_tokens_seen": 33814800, "step": 55515 }, { "epoch": 17.226186782500776, "grad_norm": 21.744468688964844, "learning_rate": 5.747057152146129e-07, "loss": 0.1515, "num_input_tokens_seen": 33817488, "step": 55520 }, { "epoch": 17.22773813217499, "grad_norm": 14.067315101623535, "learning_rate": 5.740757080744048e-07, "loss": 0.1936, "num_input_tokens_seen": 33820784, "step": 55525 }, { "epoch": 17.229289481849207, "grad_norm": 38.43425750732422, "learning_rate": 5.734460254068591e-07, "loss": 0.1287, "num_input_tokens_seen": 33824752, "step": 55530 }, { "epoch": 17.230840831523427, "grad_norm": 8.099596977233887, "learning_rate": 5.72816667258137e-07, "loss": 0.1669, "num_input_tokens_seen": 33827952, "step": 55535 }, { "epoch": 17.232392181197643, "grad_norm": 2.265180826187134, "learning_rate": 5.721876336743793e-07, "loss": 0.1696, "num_input_tokens_seen": 33830512, "step": 55540 }, { "epoch": 17.23394353087186, "grad_norm": 22.67376136779785, "learning_rate": 5.715589247017017e-07, "loss": 0.2249, "num_input_tokens_seen": 33833232, "step": 55545 }, { "epoch": 17.235494880546074, "grad_norm": 45.31955337524414, "learning_rate": 5.709305403861948e-07, "loss": 0.1594, "num_input_tokens_seen": 33835504, "step": 55550 }, { "epoch": 17.23704623022029, "grad_norm": 19.888809204101562, "learning_rate": 5.703024807739277e-07, "loss": 0.1506, "num_input_tokens_seen": 33839696, "step": 55555 }, { "epoch": 17.23859757989451, "grad_norm": 4.098745822906494, "learning_rate": 5.696747459109436e-07, "loss": 0.0883, "num_input_tokens_seen": 33842672, "step": 55560 }, { "epoch": 17.240148929568726, "grad_norm": 9.579668998718262, "learning_rate": 5.690473358432647e-07, "loss": 0.1291, "num_input_tokens_seen": 33845104, "step": 55565 }, { "epoch": 17.24170027924294, "grad_norm": 14.682978630065918, "learning_rate": 5.684202506168856e-07, "loss": 0.1591, "num_input_tokens_seen": 33847920, "step": 55570 }, { "epoch": 17.243251628917157, "grad_norm": 8.341487884521484, "learning_rate": 5.677934902777804e-07, "loss": 0.1914, "num_input_tokens_seen": 33850352, "step": 55575 }, { "epoch": 17.244802978591373, "grad_norm": 16.457252502441406, "learning_rate": 5.671670548718971e-07, "loss": 0.1578, "num_input_tokens_seen": 33852880, "step": 55580 }, { "epoch": 17.246354328265593, "grad_norm": 0.9914004802703857, "learning_rate": 5.66540944445162e-07, "loss": 0.1425, "num_input_tokens_seen": 33856176, "step": 55585 }, { "epoch": 17.24790567793981, "grad_norm": 8.742581367492676, "learning_rate": 5.659151590434742e-07, "loss": 0.1195, "num_input_tokens_seen": 33859632, "step": 55590 }, { "epoch": 17.249457027614024, "grad_norm": 23.164234161376953, "learning_rate": 5.652896987127132e-07, "loss": 0.1128, "num_input_tokens_seen": 33862576, "step": 55595 }, { "epoch": 17.25100837728824, "grad_norm": 9.153133392333984, "learning_rate": 5.64664563498733e-07, "loss": 0.1205, "num_input_tokens_seen": 33865456, "step": 55600 }, { "epoch": 17.252559726962456, "grad_norm": 29.663877487182617, "learning_rate": 5.640397534473613e-07, "loss": 0.1428, "num_input_tokens_seen": 33868464, "step": 55605 }, { "epoch": 17.254111076636676, "grad_norm": 32.933135986328125, "learning_rate": 5.63415268604407e-07, "loss": 0.3165, "num_input_tokens_seen": 33871216, "step": 55610 }, { "epoch": 17.25566242631089, "grad_norm": 19.678495407104492, "learning_rate": 5.62791109015649e-07, "loss": 0.181, "num_input_tokens_seen": 33874064, "step": 55615 }, { "epoch": 17.257213775985107, "grad_norm": 10.011101722717285, "learning_rate": 5.62167274726848e-07, "loss": 0.1233, "num_input_tokens_seen": 33876048, "step": 55620 }, { "epoch": 17.258765125659323, "grad_norm": 17.240890502929688, "learning_rate": 5.615437657837369e-07, "loss": 0.2077, "num_input_tokens_seen": 33879408, "step": 55625 }, { "epoch": 17.26031647533354, "grad_norm": 24.66307830810547, "learning_rate": 5.609205822320274e-07, "loss": 0.1673, "num_input_tokens_seen": 33882192, "step": 55630 }, { "epoch": 17.261867825007755, "grad_norm": 25.399112701416016, "learning_rate": 5.602977241174051e-07, "loss": 0.1653, "num_input_tokens_seen": 33884752, "step": 55635 }, { "epoch": 17.263419174681975, "grad_norm": 9.125555038452148, "learning_rate": 5.596751914855348e-07, "loss": 0.1707, "num_input_tokens_seen": 33887856, "step": 55640 }, { "epoch": 17.26497052435619, "grad_norm": 10.238929748535156, "learning_rate": 5.59052984382053e-07, "loss": 0.1309, "num_input_tokens_seen": 33891728, "step": 55645 }, { "epoch": 17.266521874030406, "grad_norm": 17.065654754638672, "learning_rate": 5.584311028525774e-07, "loss": 0.2458, "num_input_tokens_seen": 33896432, "step": 55650 }, { "epoch": 17.268073223704622, "grad_norm": 9.435322761535645, "learning_rate": 5.578095469426969e-07, "loss": 0.1705, "num_input_tokens_seen": 33898736, "step": 55655 }, { "epoch": 17.269624573378838, "grad_norm": 17.848602294921875, "learning_rate": 5.571883166979797e-07, "loss": 0.1641, "num_input_tokens_seen": 33901296, "step": 55660 }, { "epoch": 17.271175923053057, "grad_norm": 20.06504249572754, "learning_rate": 5.565674121639713e-07, "loss": 0.2227, "num_input_tokens_seen": 33903728, "step": 55665 }, { "epoch": 17.272727272727273, "grad_norm": 12.193310737609863, "learning_rate": 5.559468333861884e-07, "loss": 0.1788, "num_input_tokens_seen": 33907408, "step": 55670 }, { "epoch": 17.27427862240149, "grad_norm": 10.802773475646973, "learning_rate": 5.553265804101288e-07, "loss": 0.0848, "num_input_tokens_seen": 33910160, "step": 55675 }, { "epoch": 17.275829972075705, "grad_norm": 12.702367782592773, "learning_rate": 5.547066532812629e-07, "loss": 0.1438, "num_input_tokens_seen": 33913072, "step": 55680 }, { "epoch": 17.27738132174992, "grad_norm": 42.89191436767578, "learning_rate": 5.540870520450403e-07, "loss": 0.2533, "num_input_tokens_seen": 33915984, "step": 55685 }, { "epoch": 17.27893267142414, "grad_norm": 23.027183532714844, "learning_rate": 5.534677767468832e-07, "loss": 0.1163, "num_input_tokens_seen": 33918864, "step": 55690 }, { "epoch": 17.280484021098356, "grad_norm": 36.628753662109375, "learning_rate": 5.528488274321941e-07, "loss": 0.109, "num_input_tokens_seen": 33921616, "step": 55695 }, { "epoch": 17.282035370772572, "grad_norm": 24.52200698852539, "learning_rate": 5.522302041463473e-07, "loss": 0.2613, "num_input_tokens_seen": 33924240, "step": 55700 }, { "epoch": 17.283586720446788, "grad_norm": 16.05150604248047, "learning_rate": 5.516119069346964e-07, "loss": 0.2057, "num_input_tokens_seen": 33927216, "step": 55705 }, { "epoch": 17.285138070121004, "grad_norm": 39.30282974243164, "learning_rate": 5.509939358425692e-07, "loss": 0.2304, "num_input_tokens_seen": 33929680, "step": 55710 }, { "epoch": 17.286689419795223, "grad_norm": 28.455127716064453, "learning_rate": 5.503762909152705e-07, "loss": 0.2241, "num_input_tokens_seen": 33933008, "step": 55715 }, { "epoch": 17.28824076946944, "grad_norm": 6.798691272735596, "learning_rate": 5.497589721980817e-07, "loss": 0.1833, "num_input_tokens_seen": 33935696, "step": 55720 }, { "epoch": 17.289792119143655, "grad_norm": 6.76125431060791, "learning_rate": 5.49141979736258e-07, "loss": 0.1202, "num_input_tokens_seen": 33938928, "step": 55725 }, { "epoch": 17.29134346881787, "grad_norm": 7.232562065124512, "learning_rate": 5.485253135750346e-07, "loss": 0.1519, "num_input_tokens_seen": 33941584, "step": 55730 }, { "epoch": 17.292894818492087, "grad_norm": 12.127813339233398, "learning_rate": 5.479089737596177e-07, "loss": 0.1648, "num_input_tokens_seen": 33944720, "step": 55735 }, { "epoch": 17.294446168166306, "grad_norm": 16.571239471435547, "learning_rate": 5.472929603351939e-07, "loss": 0.1876, "num_input_tokens_seen": 33948240, "step": 55740 }, { "epoch": 17.295997517840522, "grad_norm": 16.231657028198242, "learning_rate": 5.466772733469239e-07, "loss": 0.1701, "num_input_tokens_seen": 33951536, "step": 55745 }, { "epoch": 17.297548867514738, "grad_norm": 17.849241256713867, "learning_rate": 5.460619128399464e-07, "loss": 0.1709, "num_input_tokens_seen": 33953776, "step": 55750 }, { "epoch": 17.299100217188954, "grad_norm": 12.848162651062012, "learning_rate": 5.45446878859372e-07, "loss": 0.1846, "num_input_tokens_seen": 33956464, "step": 55755 }, { "epoch": 17.30065156686317, "grad_norm": 7.899366855621338, "learning_rate": 5.448321714502919e-07, "loss": 0.181, "num_input_tokens_seen": 33959888, "step": 55760 }, { "epoch": 17.302202916537386, "grad_norm": 14.312220573425293, "learning_rate": 5.442177906577701e-07, "loss": 0.1512, "num_input_tokens_seen": 33962576, "step": 55765 }, { "epoch": 17.303754266211605, "grad_norm": 18.660776138305664, "learning_rate": 5.436037365268493e-07, "loss": 0.1476, "num_input_tokens_seen": 33965168, "step": 55770 }, { "epoch": 17.30530561588582, "grad_norm": 12.131587028503418, "learning_rate": 5.429900091025453e-07, "loss": 0.3088, "num_input_tokens_seen": 33967824, "step": 55775 }, { "epoch": 17.306856965560037, "grad_norm": 7.43838357925415, "learning_rate": 5.423766084298532e-07, "loss": 0.1429, "num_input_tokens_seen": 33970128, "step": 55780 }, { "epoch": 17.308408315234253, "grad_norm": 17.92671775817871, "learning_rate": 5.417635345537414e-07, "loss": 0.1353, "num_input_tokens_seen": 33973776, "step": 55785 }, { "epoch": 17.30995966490847, "grad_norm": 7.029942035675049, "learning_rate": 5.411507875191569e-07, "loss": 0.0789, "num_input_tokens_seen": 33976816, "step": 55790 }, { "epoch": 17.31151101458269, "grad_norm": 30.393274307250977, "learning_rate": 5.405383673710191e-07, "loss": 0.1512, "num_input_tokens_seen": 33980144, "step": 55795 }, { "epoch": 17.313062364256904, "grad_norm": 30.778446197509766, "learning_rate": 5.399262741542266e-07, "loss": 0.1279, "num_input_tokens_seen": 33982800, "step": 55800 }, { "epoch": 17.31461371393112, "grad_norm": 30.37190818786621, "learning_rate": 5.393145079136552e-07, "loss": 0.1364, "num_input_tokens_seen": 33985872, "step": 55805 }, { "epoch": 17.316165063605336, "grad_norm": 22.275415420532227, "learning_rate": 5.387030686941514e-07, "loss": 0.1437, "num_input_tokens_seen": 33990224, "step": 55810 }, { "epoch": 17.317716413279552, "grad_norm": 26.017375946044922, "learning_rate": 5.380919565405429e-07, "loss": 0.1069, "num_input_tokens_seen": 33992944, "step": 55815 }, { "epoch": 17.31926776295377, "grad_norm": 21.007061004638672, "learning_rate": 5.374811714976302e-07, "loss": 0.1743, "num_input_tokens_seen": 33995472, "step": 55820 }, { "epoch": 17.320819112627987, "grad_norm": 29.426301956176758, "learning_rate": 5.368707136101931e-07, "loss": 0.1865, "num_input_tokens_seen": 33998096, "step": 55825 }, { "epoch": 17.322370462302203, "grad_norm": 20.725814819335938, "learning_rate": 5.362605829229828e-07, "loss": 0.1395, "num_input_tokens_seen": 34000976, "step": 55830 }, { "epoch": 17.32392181197642, "grad_norm": 19.07278060913086, "learning_rate": 5.356507794807314e-07, "loss": 0.1253, "num_input_tokens_seen": 34003728, "step": 55835 }, { "epoch": 17.325473161650635, "grad_norm": 11.811506271362305, "learning_rate": 5.350413033281426e-07, "loss": 0.0894, "num_input_tokens_seen": 34006288, "step": 55840 }, { "epoch": 17.327024511324854, "grad_norm": 7.120311737060547, "learning_rate": 5.344321545099002e-07, "loss": 0.0923, "num_input_tokens_seen": 34009040, "step": 55845 }, { "epoch": 17.32857586099907, "grad_norm": 6.576910495758057, "learning_rate": 5.338233330706599e-07, "loss": 0.1288, "num_input_tokens_seen": 34012016, "step": 55850 }, { "epoch": 17.330127210673286, "grad_norm": 10.341978073120117, "learning_rate": 5.332148390550585e-07, "loss": 0.1167, "num_input_tokens_seen": 34016048, "step": 55855 }, { "epoch": 17.331678560347502, "grad_norm": 19.539674758911133, "learning_rate": 5.326066725077023e-07, "loss": 0.1604, "num_input_tokens_seen": 34019120, "step": 55860 }, { "epoch": 17.333229910021718, "grad_norm": 14.61286449432373, "learning_rate": 5.31998833473179e-07, "loss": 0.1446, "num_input_tokens_seen": 34021648, "step": 55865 }, { "epoch": 17.334781259695937, "grad_norm": 48.29151916503906, "learning_rate": 5.313913219960515e-07, "loss": 0.3281, "num_input_tokens_seen": 34024528, "step": 55870 }, { "epoch": 17.336332609370153, "grad_norm": 26.581342697143555, "learning_rate": 5.307841381208551e-07, "loss": 0.1381, "num_input_tokens_seen": 34027056, "step": 55875 }, { "epoch": 17.33788395904437, "grad_norm": 18.861268997192383, "learning_rate": 5.301772818921058e-07, "loss": 0.0767, "num_input_tokens_seen": 34030672, "step": 55880 }, { "epoch": 17.339435308718585, "grad_norm": 13.688194274902344, "learning_rate": 5.295707533542915e-07, "loss": 0.1504, "num_input_tokens_seen": 34033744, "step": 55885 }, { "epoch": 17.3409866583928, "grad_norm": 7.108911514282227, "learning_rate": 5.2896455255188e-07, "loss": 0.0493, "num_input_tokens_seen": 34037328, "step": 55890 }, { "epoch": 17.342538008067017, "grad_norm": 9.709129333496094, "learning_rate": 5.283586795293105e-07, "loss": 0.2479, "num_input_tokens_seen": 34040144, "step": 55895 }, { "epoch": 17.344089357741236, "grad_norm": 20.113460540771484, "learning_rate": 5.277531343310033e-07, "loss": 0.1933, "num_input_tokens_seen": 34042864, "step": 55900 }, { "epoch": 17.345640707415452, "grad_norm": 12.342440605163574, "learning_rate": 5.271479170013494e-07, "loss": 0.1834, "num_input_tokens_seen": 34045136, "step": 55905 }, { "epoch": 17.347192057089668, "grad_norm": 7.670170307159424, "learning_rate": 5.265430275847206e-07, "loss": 0.1561, "num_input_tokens_seen": 34047664, "step": 55910 }, { "epoch": 17.348743406763884, "grad_norm": 6.501486778259277, "learning_rate": 5.259384661254602e-07, "loss": 0.0929, "num_input_tokens_seen": 34050160, "step": 55915 }, { "epoch": 17.3502947564381, "grad_norm": 14.815003395080566, "learning_rate": 5.253342326678918e-07, "loss": 0.134, "num_input_tokens_seen": 34052816, "step": 55920 }, { "epoch": 17.35184610611232, "grad_norm": 18.803049087524414, "learning_rate": 5.247303272563125e-07, "loss": 0.1734, "num_input_tokens_seen": 34057136, "step": 55925 }, { "epoch": 17.353397455786535, "grad_norm": 38.110374450683594, "learning_rate": 5.241267499349945e-07, "loss": 0.1296, "num_input_tokens_seen": 34060400, "step": 55930 }, { "epoch": 17.35494880546075, "grad_norm": 6.986344814300537, "learning_rate": 5.235235007481892e-07, "loss": 0.1058, "num_input_tokens_seen": 34062864, "step": 55935 }, { "epoch": 17.356500155134967, "grad_norm": 18.849634170532227, "learning_rate": 5.229205797401193e-07, "loss": 0.1562, "num_input_tokens_seen": 34064976, "step": 55940 }, { "epoch": 17.358051504809183, "grad_norm": 4.459113597869873, "learning_rate": 5.223179869549888e-07, "loss": 0.1887, "num_input_tokens_seen": 34067952, "step": 55945 }, { "epoch": 17.359602854483402, "grad_norm": 7.118607521057129, "learning_rate": 5.217157224369728e-07, "loss": 0.1334, "num_input_tokens_seen": 34071248, "step": 55950 }, { "epoch": 17.361154204157618, "grad_norm": 11.699061393737793, "learning_rate": 5.211137862302257e-07, "loss": 0.2136, "num_input_tokens_seen": 34074832, "step": 55955 }, { "epoch": 17.362705553831834, "grad_norm": 12.412652969360352, "learning_rate": 5.205121783788752e-07, "loss": 0.0935, "num_input_tokens_seen": 34078320, "step": 55960 }, { "epoch": 17.36425690350605, "grad_norm": 16.75954246520996, "learning_rate": 5.199108989270279e-07, "loss": 0.1495, "num_input_tokens_seen": 34081168, "step": 55965 }, { "epoch": 17.365808253180266, "grad_norm": 8.66010570526123, "learning_rate": 5.19309947918763e-07, "loss": 0.1915, "num_input_tokens_seen": 34084336, "step": 55970 }, { "epoch": 17.367359602854485, "grad_norm": 15.773009300231934, "learning_rate": 5.187093253981395e-07, "loss": 0.2419, "num_input_tokens_seen": 34087216, "step": 55975 }, { "epoch": 17.3689109525287, "grad_norm": 16.93256378173828, "learning_rate": 5.181090314091874e-07, "loss": 0.142, "num_input_tokens_seen": 34089968, "step": 55980 }, { "epoch": 17.370462302202917, "grad_norm": 7.283370018005371, "learning_rate": 5.175090659959176e-07, "loss": 0.1553, "num_input_tokens_seen": 34093232, "step": 55985 }, { "epoch": 17.372013651877133, "grad_norm": 20.42075538635254, "learning_rate": 5.169094292023141e-07, "loss": 0.1804, "num_input_tokens_seen": 34095280, "step": 55990 }, { "epoch": 17.37356500155135, "grad_norm": 13.934206008911133, "learning_rate": 5.163101210723365e-07, "loss": 0.1263, "num_input_tokens_seen": 34099056, "step": 55995 }, { "epoch": 17.375116351225568, "grad_norm": 4.6500563621521, "learning_rate": 5.157111416499227e-07, "loss": 0.0873, "num_input_tokens_seen": 34102704, "step": 56000 }, { "epoch": 17.376667700899784, "grad_norm": 6.2331695556640625, "learning_rate": 5.151124909789835e-07, "loss": 0.0965, "num_input_tokens_seen": 34105712, "step": 56005 }, { "epoch": 17.378219050574, "grad_norm": 6.4180474281311035, "learning_rate": 5.145141691034084e-07, "loss": 0.1692, "num_input_tokens_seen": 34109648, "step": 56010 }, { "epoch": 17.379770400248216, "grad_norm": 25.090456008911133, "learning_rate": 5.139161760670597e-07, "loss": 0.231, "num_input_tokens_seen": 34112880, "step": 56015 }, { "epoch": 17.38132174992243, "grad_norm": 9.163402557373047, "learning_rate": 5.133185119137796e-07, "loss": 0.1078, "num_input_tokens_seen": 34117424, "step": 56020 }, { "epoch": 17.382873099596647, "grad_norm": 22.84565544128418, "learning_rate": 5.127211766873819e-07, "loss": 0.1304, "num_input_tokens_seen": 34120432, "step": 56025 }, { "epoch": 17.384424449270867, "grad_norm": 27.198644638061523, "learning_rate": 5.121241704316604e-07, "loss": 0.2007, "num_input_tokens_seen": 34124112, "step": 56030 }, { "epoch": 17.385975798945083, "grad_norm": 54.19589614868164, "learning_rate": 5.115274931903807e-07, "loss": 0.2495, "num_input_tokens_seen": 34127536, "step": 56035 }, { "epoch": 17.3875271486193, "grad_norm": 22.971078872680664, "learning_rate": 5.109311450072868e-07, "loss": 0.1102, "num_input_tokens_seen": 34130704, "step": 56040 }, { "epoch": 17.389078498293514, "grad_norm": 26.981855392456055, "learning_rate": 5.103351259260997e-07, "loss": 0.2594, "num_input_tokens_seen": 34133168, "step": 56045 }, { "epoch": 17.39062984796773, "grad_norm": 18.457782745361328, "learning_rate": 5.097394359905128e-07, "loss": 0.1712, "num_input_tokens_seen": 34137328, "step": 56050 }, { "epoch": 17.39218119764195, "grad_norm": 37.51485824584961, "learning_rate": 5.091440752441984e-07, "loss": 0.192, "num_input_tokens_seen": 34140592, "step": 56055 }, { "epoch": 17.393732547316166, "grad_norm": 8.213968276977539, "learning_rate": 5.085490437308021e-07, "loss": 0.1056, "num_input_tokens_seen": 34143440, "step": 56060 }, { "epoch": 17.39528389699038, "grad_norm": 10.279356002807617, "learning_rate": 5.079543414939487e-07, "loss": 0.1637, "num_input_tokens_seen": 34145904, "step": 56065 }, { "epoch": 17.396835246664597, "grad_norm": 18.435482025146484, "learning_rate": 5.073599685772346e-07, "loss": 0.1258, "num_input_tokens_seen": 34149712, "step": 56070 }, { "epoch": 17.398386596338813, "grad_norm": 31.359569549560547, "learning_rate": 5.067659250242368e-07, "loss": 0.2704, "num_input_tokens_seen": 34152240, "step": 56075 }, { "epoch": 17.399937946013033, "grad_norm": 25.748897552490234, "learning_rate": 5.061722108785033e-07, "loss": 0.1974, "num_input_tokens_seen": 34155984, "step": 56080 }, { "epoch": 17.40148929568725, "grad_norm": 23.89807891845703, "learning_rate": 5.055788261835631e-07, "loss": 0.1619, "num_input_tokens_seen": 34158576, "step": 56085 }, { "epoch": 17.403040645361465, "grad_norm": 17.02804946899414, "learning_rate": 5.049857709829159e-07, "loss": 0.138, "num_input_tokens_seen": 34161840, "step": 56090 }, { "epoch": 17.40459199503568, "grad_norm": 19.504152297973633, "learning_rate": 5.043930453200413e-07, "loss": 0.1321, "num_input_tokens_seen": 34164144, "step": 56095 }, { "epoch": 17.406143344709896, "grad_norm": 12.839460372924805, "learning_rate": 5.038006492383913e-07, "loss": 0.184, "num_input_tokens_seen": 34166896, "step": 56100 }, { "epoch": 17.407694694384116, "grad_norm": 19.771472930908203, "learning_rate": 5.03208582781397e-07, "loss": 0.3044, "num_input_tokens_seen": 34169968, "step": 56105 }, { "epoch": 17.40924604405833, "grad_norm": 44.3944206237793, "learning_rate": 5.02616845992464e-07, "loss": 0.2576, "num_input_tokens_seen": 34173552, "step": 56110 }, { "epoch": 17.410797393732548, "grad_norm": 13.627335548400879, "learning_rate": 5.020254389149726e-07, "loss": 0.1809, "num_input_tokens_seen": 34176688, "step": 56115 }, { "epoch": 17.412348743406763, "grad_norm": 18.26422691345215, "learning_rate": 5.014343615922818e-07, "loss": 0.1961, "num_input_tokens_seen": 34180016, "step": 56120 }, { "epoch": 17.41390009308098, "grad_norm": 9.922134399414062, "learning_rate": 5.008436140677219e-07, "loss": 0.0803, "num_input_tokens_seen": 34183152, "step": 56125 }, { "epoch": 17.4154514427552, "grad_norm": 13.47953987121582, "learning_rate": 5.002531963846041e-07, "loss": 0.1199, "num_input_tokens_seen": 34186416, "step": 56130 }, { "epoch": 17.417002792429415, "grad_norm": 27.225648880004883, "learning_rate": 4.996631085862108e-07, "loss": 0.1393, "num_input_tokens_seen": 34189680, "step": 56135 }, { "epoch": 17.41855414210363, "grad_norm": 29.132230758666992, "learning_rate": 4.990733507158052e-07, "loss": 0.274, "num_input_tokens_seen": 34192176, "step": 56140 }, { "epoch": 17.420105491777846, "grad_norm": 16.424654006958008, "learning_rate": 4.984839228166205e-07, "loss": 0.268, "num_input_tokens_seen": 34195088, "step": 56145 }, { "epoch": 17.421656841452062, "grad_norm": 10.4478120803833, "learning_rate": 4.97894824931871e-07, "loss": 0.1601, "num_input_tokens_seen": 34197872, "step": 56150 }, { "epoch": 17.423208191126278, "grad_norm": 1.1319007873535156, "learning_rate": 4.973060571047433e-07, "loss": 0.1661, "num_input_tokens_seen": 34201200, "step": 56155 }, { "epoch": 17.424759540800498, "grad_norm": 6.111582279205322, "learning_rate": 4.967176193784013e-07, "loss": 0.0772, "num_input_tokens_seen": 34204304, "step": 56160 }, { "epoch": 17.426310890474713, "grad_norm": 22.958782196044922, "learning_rate": 4.961295117959852e-07, "loss": 0.1634, "num_input_tokens_seen": 34206544, "step": 56165 }, { "epoch": 17.42786224014893, "grad_norm": 8.6909818649292, "learning_rate": 4.955417344006091e-07, "loss": 0.0893, "num_input_tokens_seen": 34210096, "step": 56170 }, { "epoch": 17.429413589823145, "grad_norm": 23.460952758789062, "learning_rate": 4.949542872353658e-07, "loss": 0.2208, "num_input_tokens_seen": 34213040, "step": 56175 }, { "epoch": 17.43096493949736, "grad_norm": 9.812808990478516, "learning_rate": 4.943671703433195e-07, "loss": 0.0627, "num_input_tokens_seen": 34215952, "step": 56180 }, { "epoch": 17.43251628917158, "grad_norm": 10.458890914916992, "learning_rate": 4.937803837675153e-07, "loss": 0.1657, "num_input_tokens_seen": 34218960, "step": 56185 }, { "epoch": 17.434067638845796, "grad_norm": 6.024328708648682, "learning_rate": 4.9319392755097e-07, "loss": 0.1151, "num_input_tokens_seen": 34222352, "step": 56190 }, { "epoch": 17.435618988520012, "grad_norm": 17.8143310546875, "learning_rate": 4.926078017366793e-07, "loss": 0.1917, "num_input_tokens_seen": 34225360, "step": 56195 }, { "epoch": 17.43717033819423, "grad_norm": 3.8244998455047607, "learning_rate": 4.920220063676106e-07, "loss": 0.094, "num_input_tokens_seen": 34228304, "step": 56200 }, { "epoch": 17.438721687868444, "grad_norm": 7.909757614135742, "learning_rate": 4.914365414867128e-07, "loss": 0.1507, "num_input_tokens_seen": 34231120, "step": 56205 }, { "epoch": 17.440273037542664, "grad_norm": 29.5856990814209, "learning_rate": 4.908514071369052e-07, "loss": 0.1834, "num_input_tokens_seen": 34233904, "step": 56210 }, { "epoch": 17.44182438721688, "grad_norm": 4.259586811065674, "learning_rate": 4.902666033610864e-07, "loss": 0.1162, "num_input_tokens_seen": 34237456, "step": 56215 }, { "epoch": 17.443375736891095, "grad_norm": 3.2178237438201904, "learning_rate": 4.896821302021276e-07, "loss": 0.2167, "num_input_tokens_seen": 34240624, "step": 56220 }, { "epoch": 17.44492708656531, "grad_norm": 8.871238708496094, "learning_rate": 4.890979877028795e-07, "loss": 0.1657, "num_input_tokens_seen": 34243280, "step": 56225 }, { "epoch": 17.446478436239527, "grad_norm": 12.201215744018555, "learning_rate": 4.885141759061663e-07, "loss": 0.2358, "num_input_tokens_seen": 34246672, "step": 56230 }, { "epoch": 17.448029785913747, "grad_norm": 9.196966171264648, "learning_rate": 4.879306948547874e-07, "loss": 0.1289, "num_input_tokens_seen": 34249680, "step": 56235 }, { "epoch": 17.449581135587962, "grad_norm": 8.622425079345703, "learning_rate": 4.873475445915199e-07, "loss": 0.1844, "num_input_tokens_seen": 34252464, "step": 56240 }, { "epoch": 17.45113248526218, "grad_norm": 19.60797691345215, "learning_rate": 4.867647251591146e-07, "loss": 0.2291, "num_input_tokens_seen": 34255696, "step": 56245 }, { "epoch": 17.452683834936394, "grad_norm": 31.765605926513672, "learning_rate": 4.861822366003011e-07, "loss": 0.1622, "num_input_tokens_seen": 34259600, "step": 56250 }, { "epoch": 17.45423518461061, "grad_norm": 4.634382724761963, "learning_rate": 4.856000789577797e-07, "loss": 0.1128, "num_input_tokens_seen": 34264368, "step": 56255 }, { "epoch": 17.45578653428483, "grad_norm": 8.310803413391113, "learning_rate": 4.850182522742319e-07, "loss": 0.0928, "num_input_tokens_seen": 34267472, "step": 56260 }, { "epoch": 17.457337883959045, "grad_norm": 33.18933868408203, "learning_rate": 4.844367565923109e-07, "loss": 0.1641, "num_input_tokens_seen": 34270800, "step": 56265 }, { "epoch": 17.45888923363326, "grad_norm": 15.53725814819336, "learning_rate": 4.838555919546484e-07, "loss": 0.1554, "num_input_tokens_seen": 34274000, "step": 56270 }, { "epoch": 17.460440583307477, "grad_norm": 14.924432754516602, "learning_rate": 4.8327475840385e-07, "loss": 0.1235, "num_input_tokens_seen": 34276816, "step": 56275 }, { "epoch": 17.461991932981693, "grad_norm": 10.861270904541016, "learning_rate": 4.826942559824982e-07, "loss": 0.1646, "num_input_tokens_seen": 34279504, "step": 56280 }, { "epoch": 17.46354328265591, "grad_norm": 36.16598892211914, "learning_rate": 4.821140847331495e-07, "loss": 0.1958, "num_input_tokens_seen": 34282160, "step": 56285 }, { "epoch": 17.46509463233013, "grad_norm": 14.597040176391602, "learning_rate": 4.815342446983379e-07, "loss": 0.1007, "num_input_tokens_seen": 34284976, "step": 56290 }, { "epoch": 17.466645982004344, "grad_norm": 17.69719123840332, "learning_rate": 4.809547359205741e-07, "loss": 0.1007, "num_input_tokens_seen": 34287792, "step": 56295 }, { "epoch": 17.46819733167856, "grad_norm": 45.01285171508789, "learning_rate": 4.803755584423409e-07, "loss": 0.2354, "num_input_tokens_seen": 34290832, "step": 56300 }, { "epoch": 17.469748681352776, "grad_norm": 20.397062301635742, "learning_rate": 4.797967123061003e-07, "loss": 0.1622, "num_input_tokens_seen": 34293680, "step": 56305 }, { "epoch": 17.471300031026992, "grad_norm": 19.030527114868164, "learning_rate": 4.79218197554287e-07, "loss": 0.1375, "num_input_tokens_seen": 34296016, "step": 56310 }, { "epoch": 17.47285138070121, "grad_norm": 22.110780715942383, "learning_rate": 4.78640014229314e-07, "loss": 0.1555, "num_input_tokens_seen": 34298928, "step": 56315 }, { "epoch": 17.474402730375427, "grad_norm": 33.42340850830078, "learning_rate": 4.780621623735687e-07, "loss": 0.2212, "num_input_tokens_seen": 34301072, "step": 56320 }, { "epoch": 17.475954080049643, "grad_norm": 27.67230224609375, "learning_rate": 4.774846420294155e-07, "loss": 0.1955, "num_input_tokens_seen": 34303920, "step": 56325 }, { "epoch": 17.47750542972386, "grad_norm": 18.27765464782715, "learning_rate": 4.769074532391921e-07, "loss": 0.1466, "num_input_tokens_seen": 34306576, "step": 56330 }, { "epoch": 17.479056779398075, "grad_norm": 8.04917049407959, "learning_rate": 4.7633059604521416e-07, "loss": 0.1509, "num_input_tokens_seen": 34308976, "step": 56335 }, { "epoch": 17.480608129072294, "grad_norm": 13.407383918762207, "learning_rate": 4.757540704897712e-07, "loss": 0.1326, "num_input_tokens_seen": 34311920, "step": 56340 }, { "epoch": 17.48215947874651, "grad_norm": 35.18724822998047, "learning_rate": 4.75177876615131e-07, "loss": 0.235, "num_input_tokens_seen": 34315472, "step": 56345 }, { "epoch": 17.483710828420726, "grad_norm": 8.827468872070312, "learning_rate": 4.7460201446353325e-07, "loss": 0.3208, "num_input_tokens_seen": 34319728, "step": 56350 }, { "epoch": 17.485262178094942, "grad_norm": 3.4543092250823975, "learning_rate": 4.7402648407719744e-07, "loss": 0.1542, "num_input_tokens_seen": 34322160, "step": 56355 }, { "epoch": 17.486813527769158, "grad_norm": 21.29958724975586, "learning_rate": 4.734512854983153e-07, "loss": 0.207, "num_input_tokens_seen": 34325040, "step": 56360 }, { "epoch": 17.488364877443377, "grad_norm": 12.35628604888916, "learning_rate": 4.7287641876905654e-07, "loss": 0.1756, "num_input_tokens_seen": 34329648, "step": 56365 }, { "epoch": 17.489916227117593, "grad_norm": 23.811498641967773, "learning_rate": 4.7230188393156516e-07, "loss": 0.2544, "num_input_tokens_seen": 34333616, "step": 56370 }, { "epoch": 17.49146757679181, "grad_norm": 12.338488578796387, "learning_rate": 4.717276810279614e-07, "loss": 0.0942, "num_input_tokens_seen": 34337008, "step": 56375 }, { "epoch": 17.493018926466025, "grad_norm": 10.849371910095215, "learning_rate": 4.711538101003427e-07, "loss": 0.0877, "num_input_tokens_seen": 34340080, "step": 56380 }, { "epoch": 17.49457027614024, "grad_norm": 22.19483757019043, "learning_rate": 4.7058027119077755e-07, "loss": 0.1379, "num_input_tokens_seen": 34343280, "step": 56385 }, { "epoch": 17.49612162581446, "grad_norm": 2.1608340740203857, "learning_rate": 4.7000706434131627e-07, "loss": 0.0799, "num_input_tokens_seen": 34346960, "step": 56390 }, { "epoch": 17.497672975488676, "grad_norm": 19.502904891967773, "learning_rate": 4.694341895939797e-07, "loss": 0.2702, "num_input_tokens_seen": 34350032, "step": 56395 }, { "epoch": 17.499224325162892, "grad_norm": 18.86509132385254, "learning_rate": 4.688616469907675e-07, "loss": 0.0957, "num_input_tokens_seen": 34352720, "step": 56400 }, { "epoch": 17.500775674837108, "grad_norm": 7.8000993728637695, "learning_rate": 4.682894365736523e-07, "loss": 0.1618, "num_input_tokens_seen": 34357552, "step": 56405 }, { "epoch": 17.502327024511324, "grad_norm": 25.342655181884766, "learning_rate": 4.677175583845861e-07, "loss": 0.2191, "num_input_tokens_seen": 34360016, "step": 56410 }, { "epoch": 17.50387837418554, "grad_norm": 9.751956939697266, "learning_rate": 4.6714601246549197e-07, "loss": 0.1436, "num_input_tokens_seen": 34362288, "step": 56415 }, { "epoch": 17.50542972385976, "grad_norm": 15.811490058898926, "learning_rate": 4.6657479885827304e-07, "loss": 0.1153, "num_input_tokens_seen": 34365776, "step": 56420 }, { "epoch": 17.506981073533975, "grad_norm": 5.3131422996521, "learning_rate": 4.660039176048042e-07, "loss": 0.1599, "num_input_tokens_seen": 34368592, "step": 56425 }, { "epoch": 17.50853242320819, "grad_norm": 11.759368896484375, "learning_rate": 4.654333687469392e-07, "loss": 0.0806, "num_input_tokens_seen": 34371056, "step": 56430 }, { "epoch": 17.510083772882407, "grad_norm": 9.493461608886719, "learning_rate": 4.6486315232650626e-07, "loss": 0.1618, "num_input_tokens_seen": 34374864, "step": 56435 }, { "epoch": 17.511635122556623, "grad_norm": 9.786113739013672, "learning_rate": 4.6429326838530685e-07, "loss": 0.1937, "num_input_tokens_seen": 34378448, "step": 56440 }, { "epoch": 17.513186472230842, "grad_norm": 40.19910430908203, "learning_rate": 4.637237169651232e-07, "loss": 0.1049, "num_input_tokens_seen": 34382544, "step": 56445 }, { "epoch": 17.514737821905058, "grad_norm": 19.299331665039062, "learning_rate": 4.631544981077074e-07, "loss": 0.3705, "num_input_tokens_seen": 34385840, "step": 56450 }, { "epoch": 17.516289171579274, "grad_norm": 24.48691749572754, "learning_rate": 4.625856118547922e-07, "loss": 0.1624, "num_input_tokens_seen": 34389584, "step": 56455 }, { "epoch": 17.51784052125349, "grad_norm": 19.463594436645508, "learning_rate": 4.6201705824808205e-07, "loss": 0.2311, "num_input_tokens_seen": 34392464, "step": 56460 }, { "epoch": 17.519391870927706, "grad_norm": 13.16199779510498, "learning_rate": 4.614488373292597e-07, "loss": 0.2143, "num_input_tokens_seen": 34395216, "step": 56465 }, { "epoch": 17.520943220601925, "grad_norm": 16.9934024810791, "learning_rate": 4.608809491399818e-07, "loss": 0.162, "num_input_tokens_seen": 34398032, "step": 56470 }, { "epoch": 17.52249457027614, "grad_norm": 34.59870910644531, "learning_rate": 4.6031339372188176e-07, "loss": 0.2052, "num_input_tokens_seen": 34400528, "step": 56475 }, { "epoch": 17.524045919950357, "grad_norm": 6.651086807250977, "learning_rate": 4.597461711165674e-07, "loss": 0.1688, "num_input_tokens_seen": 34406544, "step": 56480 }, { "epoch": 17.525597269624573, "grad_norm": 12.240654945373535, "learning_rate": 4.591792813656243e-07, "loss": 0.1792, "num_input_tokens_seen": 34409424, "step": 56485 }, { "epoch": 17.52714861929879, "grad_norm": 29.407377243041992, "learning_rate": 4.5861272451060976e-07, "loss": 0.0973, "num_input_tokens_seen": 34412688, "step": 56490 }, { "epoch": 17.528699968973008, "grad_norm": 15.773229598999023, "learning_rate": 4.580465005930612e-07, "loss": 0.1471, "num_input_tokens_seen": 34415696, "step": 56495 }, { "epoch": 17.530251318647224, "grad_norm": 26.449975967407227, "learning_rate": 4.5748060965448984e-07, "loss": 0.1042, "num_input_tokens_seen": 34418928, "step": 56500 }, { "epoch": 17.53180266832144, "grad_norm": 7.299683570861816, "learning_rate": 4.569150517363802e-07, "loss": 0.0856, "num_input_tokens_seen": 34422736, "step": 56505 }, { "epoch": 17.533354017995656, "grad_norm": 7.671117305755615, "learning_rate": 4.563498268801958e-07, "loss": 0.1477, "num_input_tokens_seen": 34424912, "step": 56510 }, { "epoch": 17.53490536766987, "grad_norm": 20.010723114013672, "learning_rate": 4.5578493512737356e-07, "loss": 0.1598, "num_input_tokens_seen": 34428048, "step": 56515 }, { "epoch": 17.53645671734409, "grad_norm": 8.188984870910645, "learning_rate": 4.5522037651932803e-07, "loss": 0.152, "num_input_tokens_seen": 34430672, "step": 56520 }, { "epoch": 17.538008067018307, "grad_norm": 10.060876846313477, "learning_rate": 4.546561510974462e-07, "loss": 0.1629, "num_input_tokens_seen": 34434064, "step": 56525 }, { "epoch": 17.539559416692523, "grad_norm": 5.354006290435791, "learning_rate": 4.540922589030944e-07, "loss": 0.0709, "num_input_tokens_seen": 34437680, "step": 56530 }, { "epoch": 17.54111076636674, "grad_norm": 9.895539283752441, "learning_rate": 4.535286999776106e-07, "loss": 0.2078, "num_input_tokens_seen": 34440880, "step": 56535 }, { "epoch": 17.542662116040955, "grad_norm": 20.489770889282227, "learning_rate": 4.529654743623124e-07, "loss": 0.244, "num_input_tokens_seen": 34444080, "step": 56540 }, { "epoch": 17.54421346571517, "grad_norm": 19.156110763549805, "learning_rate": 4.5240258209848896e-07, "loss": 0.1688, "num_input_tokens_seen": 34446992, "step": 56545 }, { "epoch": 17.54576481538939, "grad_norm": 13.173300743103027, "learning_rate": 4.5184002322740784e-07, "loss": 0.0881, "num_input_tokens_seen": 34450000, "step": 56550 }, { "epoch": 17.547316165063606, "grad_norm": 7.585719585418701, "learning_rate": 4.5127779779031213e-07, "loss": 0.1595, "num_input_tokens_seen": 34452720, "step": 56555 }, { "epoch": 17.54886751473782, "grad_norm": 33.209693908691406, "learning_rate": 4.5071590582841773e-07, "loss": 0.1569, "num_input_tokens_seen": 34456464, "step": 56560 }, { "epoch": 17.550418864412038, "grad_norm": 17.109100341796875, "learning_rate": 4.501543473829201e-07, "loss": 0.1691, "num_input_tokens_seen": 34459472, "step": 56565 }, { "epoch": 17.551970214086253, "grad_norm": 8.622034072875977, "learning_rate": 4.495931224949862e-07, "loss": 0.1861, "num_input_tokens_seen": 34461648, "step": 56570 }, { "epoch": 17.553521563760473, "grad_norm": 12.168288230895996, "learning_rate": 4.4903223120576213e-07, "loss": 0.2149, "num_input_tokens_seen": 34463984, "step": 56575 }, { "epoch": 17.55507291343469, "grad_norm": 17.55065155029297, "learning_rate": 4.484716735563666e-07, "loss": 0.2735, "num_input_tokens_seen": 34466864, "step": 56580 }, { "epoch": 17.556624263108905, "grad_norm": 8.213205337524414, "learning_rate": 4.4791144958789556e-07, "loss": 0.1657, "num_input_tokens_seen": 34471280, "step": 56585 }, { "epoch": 17.55817561278312, "grad_norm": 11.6044921875, "learning_rate": 4.473515593414196e-07, "loss": 0.2238, "num_input_tokens_seen": 34474608, "step": 56590 }, { "epoch": 17.559726962457336, "grad_norm": 15.868155479431152, "learning_rate": 4.4679200285798694e-07, "loss": 0.1953, "num_input_tokens_seen": 34477424, "step": 56595 }, { "epoch": 17.561278312131556, "grad_norm": 11.767951965332031, "learning_rate": 4.4623278017861703e-07, "loss": 0.2447, "num_input_tokens_seen": 34480144, "step": 56600 }, { "epoch": 17.56282966180577, "grad_norm": 12.218220710754395, "learning_rate": 4.456738913443104e-07, "loss": 0.1602, "num_input_tokens_seen": 34482864, "step": 56605 }, { "epoch": 17.564381011479988, "grad_norm": 23.4257869720459, "learning_rate": 4.4511533639603753e-07, "loss": 0.0993, "num_input_tokens_seen": 34486096, "step": 56610 }, { "epoch": 17.565932361154204, "grad_norm": 11.237746238708496, "learning_rate": 4.4455711537474857e-07, "loss": 0.1328, "num_input_tokens_seen": 34489648, "step": 56615 }, { "epoch": 17.56748371082842, "grad_norm": 28.483434677124023, "learning_rate": 4.4399922832136844e-07, "loss": 0.2465, "num_input_tokens_seen": 34492720, "step": 56620 }, { "epoch": 17.56903506050264, "grad_norm": 10.100713729858398, "learning_rate": 4.434416752767956e-07, "loss": 0.1352, "num_input_tokens_seen": 34496336, "step": 56625 }, { "epoch": 17.570586410176855, "grad_norm": 4.780144214630127, "learning_rate": 4.4288445628190566e-07, "loss": 0.0711, "num_input_tokens_seen": 34501808, "step": 56630 }, { "epoch": 17.57213775985107, "grad_norm": 8.874780654907227, "learning_rate": 4.423275713775493e-07, "loss": 0.0993, "num_input_tokens_seen": 34504912, "step": 56635 }, { "epoch": 17.573689109525287, "grad_norm": 10.017447471618652, "learning_rate": 4.4177102060455337e-07, "loss": 0.1481, "num_input_tokens_seen": 34507856, "step": 56640 }, { "epoch": 17.575240459199502, "grad_norm": 6.836297988891602, "learning_rate": 4.412148040037184e-07, "loss": 0.1373, "num_input_tokens_seen": 34511856, "step": 56645 }, { "epoch": 17.576791808873722, "grad_norm": 24.471086502075195, "learning_rate": 4.40658921615823e-07, "loss": 0.1229, "num_input_tokens_seen": 34514576, "step": 56650 }, { "epoch": 17.578343158547938, "grad_norm": 13.296150207519531, "learning_rate": 4.4010337348161847e-07, "loss": 0.1375, "num_input_tokens_seen": 34516944, "step": 56655 }, { "epoch": 17.579894508222154, "grad_norm": 9.204270362854004, "learning_rate": 4.3954815964183496e-07, "loss": 0.2111, "num_input_tokens_seen": 34520336, "step": 56660 }, { "epoch": 17.58144585789637, "grad_norm": 3.4605960845947266, "learning_rate": 4.3899328013717437e-07, "loss": 0.1082, "num_input_tokens_seen": 34522960, "step": 56665 }, { "epoch": 17.582997207570585, "grad_norm": 12.712038040161133, "learning_rate": 4.3843873500831693e-07, "loss": 0.1489, "num_input_tokens_seen": 34525296, "step": 56670 }, { "epoch": 17.5845485572448, "grad_norm": 21.644149780273438, "learning_rate": 4.378845242959179e-07, "loss": 0.2239, "num_input_tokens_seen": 34528112, "step": 56675 }, { "epoch": 17.58609990691902, "grad_norm": 30.826778411865234, "learning_rate": 4.373306480406059e-07, "loss": 0.1404, "num_input_tokens_seen": 34530768, "step": 56680 }, { "epoch": 17.587651256593237, "grad_norm": 39.92688751220703, "learning_rate": 4.3677710628298784e-07, "loss": 0.1285, "num_input_tokens_seen": 34534288, "step": 56685 }, { "epoch": 17.589202606267452, "grad_norm": 26.974496841430664, "learning_rate": 4.3622389906364413e-07, "loss": 0.1555, "num_input_tokens_seen": 34536368, "step": 56690 }, { "epoch": 17.59075395594167, "grad_norm": 16.33828353881836, "learning_rate": 4.3567102642313287e-07, "loss": 0.2074, "num_input_tokens_seen": 34539088, "step": 56695 }, { "epoch": 17.592305305615884, "grad_norm": 13.862784385681152, "learning_rate": 4.351184884019838e-07, "loss": 0.1796, "num_input_tokens_seen": 34541680, "step": 56700 }, { "epoch": 17.593856655290104, "grad_norm": 56.50477981567383, "learning_rate": 4.3456628504070674e-07, "loss": 0.1531, "num_input_tokens_seen": 34544784, "step": 56705 }, { "epoch": 17.59540800496432, "grad_norm": 18.905813217163086, "learning_rate": 4.3401441637978325e-07, "loss": 0.2031, "num_input_tokens_seen": 34548176, "step": 56710 }, { "epoch": 17.596959354638535, "grad_norm": 25.77705955505371, "learning_rate": 4.334628824596732e-07, "loss": 0.2607, "num_input_tokens_seen": 34550704, "step": 56715 }, { "epoch": 17.59851070431275, "grad_norm": 12.477155685424805, "learning_rate": 4.329116833208086e-07, "loss": 0.1718, "num_input_tokens_seen": 34553392, "step": 56720 }, { "epoch": 17.600062053986967, "grad_norm": 24.439035415649414, "learning_rate": 4.3236081900360105e-07, "loss": 0.1093, "num_input_tokens_seen": 34556976, "step": 56725 }, { "epoch": 17.601613403661187, "grad_norm": 17.71774673461914, "learning_rate": 4.3181028954843386e-07, "loss": 0.2604, "num_input_tokens_seen": 34559344, "step": 56730 }, { "epoch": 17.603164753335403, "grad_norm": 14.903728485107422, "learning_rate": 4.3126009499566747e-07, "loss": 0.1707, "num_input_tokens_seen": 34562960, "step": 56735 }, { "epoch": 17.60471610300962, "grad_norm": 12.97741413116455, "learning_rate": 4.3071023538563904e-07, "loss": 0.1008, "num_input_tokens_seen": 34565136, "step": 56740 }, { "epoch": 17.606267452683834, "grad_norm": 34.03636932373047, "learning_rate": 4.3016071075865805e-07, "loss": 0.1541, "num_input_tokens_seen": 34567920, "step": 56745 }, { "epoch": 17.60781880235805, "grad_norm": 13.261603355407715, "learning_rate": 4.296115211550128e-07, "loss": 0.186, "num_input_tokens_seen": 34570928, "step": 56750 }, { "epoch": 17.60937015203227, "grad_norm": 7.451942443847656, "learning_rate": 4.2906266661496385e-07, "loss": 0.1887, "num_input_tokens_seen": 34573776, "step": 56755 }, { "epoch": 17.610921501706486, "grad_norm": 41.916831970214844, "learning_rate": 4.285141471787502e-07, "loss": 0.1236, "num_input_tokens_seen": 34577744, "step": 56760 }, { "epoch": 17.6124728513807, "grad_norm": 17.612377166748047, "learning_rate": 4.279659628865829e-07, "loss": 0.1171, "num_input_tokens_seen": 34580176, "step": 56765 }, { "epoch": 17.614024201054917, "grad_norm": 25.490638732910156, "learning_rate": 4.274181137786526e-07, "loss": 0.1816, "num_input_tokens_seen": 34583920, "step": 56770 }, { "epoch": 17.615575550729133, "grad_norm": 30.697124481201172, "learning_rate": 4.2687059989512114e-07, "loss": 0.1688, "num_input_tokens_seen": 34586512, "step": 56775 }, { "epoch": 17.617126900403353, "grad_norm": 6.449120044708252, "learning_rate": 4.263234212761297e-07, "loss": 0.1258, "num_input_tokens_seen": 34589104, "step": 56780 }, { "epoch": 17.61867825007757, "grad_norm": 18.590412139892578, "learning_rate": 4.2577657796179116e-07, "loss": 0.2312, "num_input_tokens_seen": 34592432, "step": 56785 }, { "epoch": 17.620229599751784, "grad_norm": 4.36862850189209, "learning_rate": 4.2523006999219575e-07, "loss": 0.1504, "num_input_tokens_seen": 34595600, "step": 56790 }, { "epoch": 17.621780949426, "grad_norm": 24.71119499206543, "learning_rate": 4.246838974074108e-07, "loss": 0.2211, "num_input_tokens_seen": 34598448, "step": 56795 }, { "epoch": 17.623332299100216, "grad_norm": 3.7962143421173096, "learning_rate": 4.241380602474754e-07, "loss": 0.1748, "num_input_tokens_seen": 34600560, "step": 56800 }, { "epoch": 17.624883648774436, "grad_norm": 13.236885070800781, "learning_rate": 4.23592558552407e-07, "loss": 0.1186, "num_input_tokens_seen": 34603280, "step": 56805 }, { "epoch": 17.62643499844865, "grad_norm": 88.71864318847656, "learning_rate": 4.230473923621964e-07, "loss": 0.2526, "num_input_tokens_seen": 34605712, "step": 56810 }, { "epoch": 17.627986348122867, "grad_norm": 1.755495548248291, "learning_rate": 4.225025617168116e-07, "loss": 0.1971, "num_input_tokens_seen": 34608528, "step": 56815 }, { "epoch": 17.629537697797083, "grad_norm": 15.083396911621094, "learning_rate": 4.2195806665619396e-07, "loss": 0.1606, "num_input_tokens_seen": 34611280, "step": 56820 }, { "epoch": 17.6310890474713, "grad_norm": 21.782840728759766, "learning_rate": 4.2141390722026276e-07, "loss": 0.128, "num_input_tokens_seen": 34613936, "step": 56825 }, { "epoch": 17.632640397145515, "grad_norm": 27.942007064819336, "learning_rate": 4.2087008344891043e-07, "loss": 0.1066, "num_input_tokens_seen": 34617136, "step": 56830 }, { "epoch": 17.634191746819734, "grad_norm": 4.878615379333496, "learning_rate": 4.203265953820063e-07, "loss": 0.1065, "num_input_tokens_seen": 34620944, "step": 56835 }, { "epoch": 17.63574309649395, "grad_norm": 38.1640739440918, "learning_rate": 4.197834430593939e-07, "loss": 0.188, "num_input_tokens_seen": 34623184, "step": 56840 }, { "epoch": 17.637294446168166, "grad_norm": 8.229436874389648, "learning_rate": 4.1924062652089316e-07, "loss": 0.2131, "num_input_tokens_seen": 34625360, "step": 56845 }, { "epoch": 17.638845795842382, "grad_norm": 4.505960941314697, "learning_rate": 4.186981458062983e-07, "loss": 0.1433, "num_input_tokens_seen": 34628464, "step": 56850 }, { "epoch": 17.640397145516598, "grad_norm": 14.049263000488281, "learning_rate": 4.1815600095538035e-07, "loss": 0.1799, "num_input_tokens_seen": 34632336, "step": 56855 }, { "epoch": 17.641948495190817, "grad_norm": 22.873050689697266, "learning_rate": 4.1761419200788525e-07, "loss": 0.1571, "num_input_tokens_seen": 34635504, "step": 56860 }, { "epoch": 17.643499844865033, "grad_norm": 6.459552764892578, "learning_rate": 4.1707271900353285e-07, "loss": 0.1602, "num_input_tokens_seen": 34638800, "step": 56865 }, { "epoch": 17.64505119453925, "grad_norm": 27.734630584716797, "learning_rate": 4.1653158198202036e-07, "loss": 0.2039, "num_input_tokens_seen": 34641200, "step": 56870 }, { "epoch": 17.646602544213465, "grad_norm": 12.981200218200684, "learning_rate": 4.159907809830188e-07, "loss": 0.1772, "num_input_tokens_seen": 34643920, "step": 56875 }, { "epoch": 17.64815389388768, "grad_norm": 12.591630935668945, "learning_rate": 4.15450316046177e-07, "loss": 0.1337, "num_input_tokens_seen": 34646896, "step": 56880 }, { "epoch": 17.6497052435619, "grad_norm": 21.23206329345703, "learning_rate": 4.1491018721111496e-07, "loss": 0.1314, "num_input_tokens_seen": 34650416, "step": 56885 }, { "epoch": 17.651256593236116, "grad_norm": 15.705211639404297, "learning_rate": 4.1437039451743265e-07, "loss": 0.1108, "num_input_tokens_seen": 34654064, "step": 56890 }, { "epoch": 17.652807942910332, "grad_norm": 70.54531860351562, "learning_rate": 4.138309380047006e-07, "loss": 0.3572, "num_input_tokens_seen": 34656432, "step": 56895 }, { "epoch": 17.654359292584548, "grad_norm": 17.974306106567383, "learning_rate": 4.1329181771247117e-07, "loss": 0.2364, "num_input_tokens_seen": 34659376, "step": 56900 }, { "epoch": 17.655910642258764, "grad_norm": 17.001150131225586, "learning_rate": 4.127530336802654e-07, "loss": 0.1336, "num_input_tokens_seen": 34662704, "step": 56905 }, { "epoch": 17.657461991932983, "grad_norm": 8.043366432189941, "learning_rate": 4.1221458594758404e-07, "loss": 0.1805, "num_input_tokens_seen": 34666416, "step": 56910 }, { "epoch": 17.6590133416072, "grad_norm": 21.3090763092041, "learning_rate": 4.1167647455390037e-07, "loss": 0.136, "num_input_tokens_seen": 34669264, "step": 56915 }, { "epoch": 17.660564691281415, "grad_norm": 10.461921691894531, "learning_rate": 4.111386995386657e-07, "loss": 0.1697, "num_input_tokens_seen": 34671920, "step": 56920 }, { "epoch": 17.66211604095563, "grad_norm": 26.02138900756836, "learning_rate": 4.10601260941304e-07, "loss": 0.1662, "num_input_tokens_seen": 34676208, "step": 56925 }, { "epoch": 17.663667390629847, "grad_norm": 23.56995391845703, "learning_rate": 4.100641588012172e-07, "loss": 0.115, "num_input_tokens_seen": 34678800, "step": 56930 }, { "epoch": 17.665218740304063, "grad_norm": 1.9880173206329346, "learning_rate": 4.095273931577792e-07, "loss": 0.1162, "num_input_tokens_seen": 34682256, "step": 56935 }, { "epoch": 17.666770089978282, "grad_norm": 17.621868133544922, "learning_rate": 4.089909640503431e-07, "loss": 0.1649, "num_input_tokens_seen": 34687248, "step": 56940 }, { "epoch": 17.668321439652498, "grad_norm": 16.209476470947266, "learning_rate": 4.084548715182357e-07, "loss": 0.1468, "num_input_tokens_seen": 34690512, "step": 56945 }, { "epoch": 17.669872789326714, "grad_norm": 13.611366271972656, "learning_rate": 4.0791911560075783e-07, "loss": 0.1182, "num_input_tokens_seen": 34693232, "step": 56950 }, { "epoch": 17.67142413900093, "grad_norm": 25.746034622192383, "learning_rate": 4.0738369633718746e-07, "loss": 0.1485, "num_input_tokens_seen": 34696496, "step": 56955 }, { "epoch": 17.672975488675146, "grad_norm": 15.471651077270508, "learning_rate": 4.0684861376677596e-07, "loss": 0.1853, "num_input_tokens_seen": 34698736, "step": 56960 }, { "epoch": 17.674526838349365, "grad_norm": 23.960290908813477, "learning_rate": 4.0631386792875314e-07, "loss": 0.1741, "num_input_tokens_seen": 34702160, "step": 56965 }, { "epoch": 17.67607818802358, "grad_norm": 26.929719924926758, "learning_rate": 4.0577945886232096e-07, "loss": 0.1006, "num_input_tokens_seen": 34705360, "step": 56970 }, { "epoch": 17.677629537697797, "grad_norm": 14.558968544006348, "learning_rate": 4.05245386606658e-07, "loss": 0.2207, "num_input_tokens_seen": 34709328, "step": 56975 }, { "epoch": 17.679180887372013, "grad_norm": 25.600540161132812, "learning_rate": 4.0471165120091805e-07, "loss": 0.1841, "num_input_tokens_seen": 34711696, "step": 56980 }, { "epoch": 17.68073223704623, "grad_norm": 13.409120559692383, "learning_rate": 4.0417825268423085e-07, "loss": 0.1286, "num_input_tokens_seen": 34716400, "step": 56985 }, { "epoch": 17.682283586720448, "grad_norm": 16.017723083496094, "learning_rate": 4.0364519109570013e-07, "loss": 0.1779, "num_input_tokens_seen": 34719728, "step": 56990 }, { "epoch": 17.683834936394664, "grad_norm": 12.88037395477295, "learning_rate": 4.0311246647440517e-07, "loss": 0.1062, "num_input_tokens_seen": 34723536, "step": 56995 }, { "epoch": 17.68538628606888, "grad_norm": 42.17904281616211, "learning_rate": 4.025800788594031e-07, "loss": 0.2343, "num_input_tokens_seen": 34726416, "step": 57000 }, { "epoch": 17.686937635743096, "grad_norm": 9.330808639526367, "learning_rate": 4.0204802828972156e-07, "loss": 0.1526, "num_input_tokens_seen": 34730672, "step": 57005 }, { "epoch": 17.68848898541731, "grad_norm": 12.212447166442871, "learning_rate": 4.0151631480436883e-07, "loss": 0.1554, "num_input_tokens_seen": 34733648, "step": 57010 }, { "epoch": 17.69004033509153, "grad_norm": 37.848243713378906, "learning_rate": 4.009849384423231e-07, "loss": 0.2387, "num_input_tokens_seen": 34736880, "step": 57015 }, { "epoch": 17.691591684765747, "grad_norm": 3.2486846446990967, "learning_rate": 4.004538992425427e-07, "loss": 0.1803, "num_input_tokens_seen": 34739376, "step": 57020 }, { "epoch": 17.693143034439963, "grad_norm": 12.115188598632812, "learning_rate": 3.9992319724395767e-07, "loss": 0.1331, "num_input_tokens_seen": 34742320, "step": 57025 }, { "epoch": 17.69469438411418, "grad_norm": 30.50433349609375, "learning_rate": 3.9939283248547623e-07, "loss": 0.1487, "num_input_tokens_seen": 34745008, "step": 57030 }, { "epoch": 17.696245733788395, "grad_norm": 13.553616523742676, "learning_rate": 3.9886280500597896e-07, "loss": 0.1223, "num_input_tokens_seen": 34751248, "step": 57035 }, { "epoch": 17.697797083462614, "grad_norm": 19.062013626098633, "learning_rate": 3.9833311484432367e-07, "loss": 0.1505, "num_input_tokens_seen": 34754704, "step": 57040 }, { "epoch": 17.69934843313683, "grad_norm": 17.802501678466797, "learning_rate": 3.978037620393427e-07, "loss": 0.172, "num_input_tokens_seen": 34757488, "step": 57045 }, { "epoch": 17.700899782811046, "grad_norm": 24.574556350708008, "learning_rate": 3.972747466298449e-07, "loss": 0.1575, "num_input_tokens_seen": 34759632, "step": 57050 }, { "epoch": 17.702451132485262, "grad_norm": 53.90842819213867, "learning_rate": 3.967460686546115e-07, "loss": 0.2505, "num_input_tokens_seen": 34762736, "step": 57055 }, { "epoch": 17.704002482159478, "grad_norm": 13.402093887329102, "learning_rate": 3.9621772815240214e-07, "loss": 0.2105, "num_input_tokens_seen": 34765680, "step": 57060 }, { "epoch": 17.705553831833697, "grad_norm": 29.2674617767334, "learning_rate": 3.9568972516195124e-07, "loss": 0.1613, "num_input_tokens_seen": 34768144, "step": 57065 }, { "epoch": 17.707105181507913, "grad_norm": 10.333110809326172, "learning_rate": 3.9516205972196573e-07, "loss": 0.1591, "num_input_tokens_seen": 34770768, "step": 57070 }, { "epoch": 17.70865653118213, "grad_norm": 4.198931694030762, "learning_rate": 3.9463473187113134e-07, "loss": 0.1454, "num_input_tokens_seen": 34773200, "step": 57075 }, { "epoch": 17.710207880856345, "grad_norm": 17.72171401977539, "learning_rate": 3.941077416481065e-07, "loss": 0.1531, "num_input_tokens_seen": 34777648, "step": 57080 }, { "epoch": 17.71175923053056, "grad_norm": 24.25237464904785, "learning_rate": 3.935810890915265e-07, "loss": 0.2546, "num_input_tokens_seen": 34780752, "step": 57085 }, { "epoch": 17.713310580204777, "grad_norm": 22.2567195892334, "learning_rate": 3.9305477423999985e-07, "loss": 0.2105, "num_input_tokens_seen": 34784656, "step": 57090 }, { "epoch": 17.714861929878996, "grad_norm": 20.86514663696289, "learning_rate": 3.9252879713211343e-07, "loss": 0.1973, "num_input_tokens_seen": 34787984, "step": 57095 }, { "epoch": 17.716413279553212, "grad_norm": 12.326534271240234, "learning_rate": 3.920031578064265e-07, "loss": 0.0652, "num_input_tokens_seen": 34790800, "step": 57100 }, { "epoch": 17.717964629227428, "grad_norm": 9.452362060546875, "learning_rate": 3.914778563014754e-07, "loss": 0.1457, "num_input_tokens_seen": 34793488, "step": 57105 }, { "epoch": 17.719515978901644, "grad_norm": 15.916180610656738, "learning_rate": 3.9095289265576984e-07, "loss": 0.0943, "num_input_tokens_seen": 34797296, "step": 57110 }, { "epoch": 17.72106732857586, "grad_norm": 10.550666809082031, "learning_rate": 3.9042826690779633e-07, "loss": 0.119, "num_input_tokens_seen": 34799344, "step": 57115 }, { "epoch": 17.72261867825008, "grad_norm": 9.937376022338867, "learning_rate": 3.8990397909601743e-07, "loss": 0.1133, "num_input_tokens_seen": 34802896, "step": 57120 }, { "epoch": 17.724170027924295, "grad_norm": 25.92085838317871, "learning_rate": 3.893800292588673e-07, "loss": 0.2125, "num_input_tokens_seen": 34805904, "step": 57125 }, { "epoch": 17.72572137759851, "grad_norm": 18.22899627685547, "learning_rate": 3.8885641743476034e-07, "loss": 0.1484, "num_input_tokens_seen": 34808432, "step": 57130 }, { "epoch": 17.727272727272727, "grad_norm": 8.554403305053711, "learning_rate": 3.8833314366208077e-07, "loss": 0.1004, "num_input_tokens_seen": 34813840, "step": 57135 }, { "epoch": 17.728824076946943, "grad_norm": 12.102032661437988, "learning_rate": 3.878102079791934e-07, "loss": 0.1467, "num_input_tokens_seen": 34816912, "step": 57140 }, { "epoch": 17.730375426621162, "grad_norm": 42.724449157714844, "learning_rate": 3.872876104244333e-07, "loss": 0.2331, "num_input_tokens_seen": 34819856, "step": 57145 }, { "epoch": 17.731926776295378, "grad_norm": 10.817952156066895, "learning_rate": 3.8676535103611466e-07, "loss": 0.1456, "num_input_tokens_seen": 34822256, "step": 57150 }, { "epoch": 17.733478125969594, "grad_norm": 4.168444633483887, "learning_rate": 3.862434298525236e-07, "loss": 0.1027, "num_input_tokens_seen": 34825040, "step": 57155 }, { "epoch": 17.73502947564381, "grad_norm": 12.239198684692383, "learning_rate": 3.857218469119256e-07, "loss": 0.1458, "num_input_tokens_seen": 34827984, "step": 57160 }, { "epoch": 17.736580825318025, "grad_norm": 9.084012985229492, "learning_rate": 3.8520060225255616e-07, "loss": 0.1075, "num_input_tokens_seen": 34831408, "step": 57165 }, { "epoch": 17.738132174992245, "grad_norm": 25.044851303100586, "learning_rate": 3.8467969591263145e-07, "loss": 0.18, "num_input_tokens_seen": 34834480, "step": 57170 }, { "epoch": 17.73968352466646, "grad_norm": 11.336255073547363, "learning_rate": 3.841591279303369e-07, "loss": 0.1837, "num_input_tokens_seen": 34836720, "step": 57175 }, { "epoch": 17.741234874340677, "grad_norm": 14.562357902526855, "learning_rate": 3.836388983438383e-07, "loss": 0.1982, "num_input_tokens_seen": 34838992, "step": 57180 }, { "epoch": 17.742786224014893, "grad_norm": 46.288551330566406, "learning_rate": 3.831190071912755e-07, "loss": 0.1902, "num_input_tokens_seen": 34841424, "step": 57185 }, { "epoch": 17.74433757368911, "grad_norm": 7.245555877685547, "learning_rate": 3.825994545107603e-07, "loss": 0.1038, "num_input_tokens_seen": 34843824, "step": 57190 }, { "epoch": 17.745888923363324, "grad_norm": 19.775733947753906, "learning_rate": 3.820802403403845e-07, "loss": 0.1043, "num_input_tokens_seen": 34847312, "step": 57195 }, { "epoch": 17.747440273037544, "grad_norm": 23.219940185546875, "learning_rate": 3.8156136471821036e-07, "loss": 0.1671, "num_input_tokens_seen": 34849776, "step": 57200 }, { "epoch": 17.74899162271176, "grad_norm": 35.541587829589844, "learning_rate": 3.8104282768227976e-07, "loss": 0.1247, "num_input_tokens_seen": 34852272, "step": 57205 }, { "epoch": 17.750542972385976, "grad_norm": 7.329939842224121, "learning_rate": 3.805246292706055e-07, "loss": 0.1909, "num_input_tokens_seen": 34854640, "step": 57210 }, { "epoch": 17.75209432206019, "grad_norm": 28.12418556213379, "learning_rate": 3.8000676952117954e-07, "loss": 0.1822, "num_input_tokens_seen": 34857392, "step": 57215 }, { "epoch": 17.753645671734407, "grad_norm": 12.504204750061035, "learning_rate": 3.794892484719653e-07, "loss": 0.1417, "num_input_tokens_seen": 34860112, "step": 57220 }, { "epoch": 17.755197021408627, "grad_norm": 23.323579788208008, "learning_rate": 3.7897206616090533e-07, "loss": 0.1783, "num_input_tokens_seen": 34863344, "step": 57225 }, { "epoch": 17.756748371082843, "grad_norm": 8.317821502685547, "learning_rate": 3.7845522262591303e-07, "loss": 0.1179, "num_input_tokens_seen": 34865936, "step": 57230 }, { "epoch": 17.75829972075706, "grad_norm": 19.263124465942383, "learning_rate": 3.77938717904881e-07, "loss": 0.0856, "num_input_tokens_seen": 34868720, "step": 57235 }, { "epoch": 17.759851070431274, "grad_norm": 16.36478614807129, "learning_rate": 3.7742255203567337e-07, "loss": 0.1127, "num_input_tokens_seen": 34871280, "step": 57240 }, { "epoch": 17.76140242010549, "grad_norm": 18.064363479614258, "learning_rate": 3.769067250561326e-07, "loss": 0.141, "num_input_tokens_seen": 34874992, "step": 57245 }, { "epoch": 17.76295376977971, "grad_norm": 9.936606407165527, "learning_rate": 3.763912370040751e-07, "loss": 0.1376, "num_input_tokens_seen": 34877488, "step": 57250 }, { "epoch": 17.764505119453926, "grad_norm": 30.972732543945312, "learning_rate": 3.7587608791729067e-07, "loss": 0.1449, "num_input_tokens_seen": 34881488, "step": 57255 }, { "epoch": 17.76605646912814, "grad_norm": 33.4295539855957, "learning_rate": 3.7536127783354746e-07, "loss": 0.1893, "num_input_tokens_seen": 34884720, "step": 57260 }, { "epoch": 17.767607818802357, "grad_norm": 9.413824081420898, "learning_rate": 3.7484680679058636e-07, "loss": 0.1186, "num_input_tokens_seen": 34887664, "step": 57265 }, { "epoch": 17.769159168476573, "grad_norm": 17.491458892822266, "learning_rate": 3.7433267482612447e-07, "loss": 0.1775, "num_input_tokens_seen": 34890672, "step": 57270 }, { "epoch": 17.770710518150793, "grad_norm": 35.84800720214844, "learning_rate": 3.7381888197785323e-07, "loss": 0.1692, "num_input_tokens_seen": 34893904, "step": 57275 }, { "epoch": 17.77226186782501, "grad_norm": 22.782058715820312, "learning_rate": 3.7330542828344087e-07, "loss": 0.109, "num_input_tokens_seen": 34896944, "step": 57280 }, { "epoch": 17.773813217499224, "grad_norm": 8.71550178527832, "learning_rate": 3.727923137805278e-07, "loss": 0.0742, "num_input_tokens_seen": 34900080, "step": 57285 }, { "epoch": 17.77536456717344, "grad_norm": 5.699965000152588, "learning_rate": 3.722795385067335e-07, "loss": 0.1448, "num_input_tokens_seen": 34903536, "step": 57290 }, { "epoch": 17.776915916847656, "grad_norm": 27.939136505126953, "learning_rate": 3.717671024996489e-07, "loss": 0.1606, "num_input_tokens_seen": 34906160, "step": 57295 }, { "epoch": 17.778467266521876, "grad_norm": 10.146563529968262, "learning_rate": 3.712550057968417e-07, "loss": 0.0905, "num_input_tokens_seen": 34909360, "step": 57300 }, { "epoch": 17.78001861619609, "grad_norm": 35.98589324951172, "learning_rate": 3.707432484358564e-07, "loss": 0.1892, "num_input_tokens_seen": 34913648, "step": 57305 }, { "epoch": 17.781569965870307, "grad_norm": 13.633458137512207, "learning_rate": 3.7023183045420794e-07, "loss": 0.1049, "num_input_tokens_seen": 34917488, "step": 57310 }, { "epoch": 17.783121315544523, "grad_norm": 16.440149307250977, "learning_rate": 3.697207518893925e-07, "loss": 0.1968, "num_input_tokens_seen": 34920784, "step": 57315 }, { "epoch": 17.78467266521874, "grad_norm": 38.200313568115234, "learning_rate": 3.692100127788756e-07, "loss": 0.2588, "num_input_tokens_seen": 34923376, "step": 57320 }, { "epoch": 17.78622401489296, "grad_norm": 17.878446578979492, "learning_rate": 3.686996131601028e-07, "loss": 0.1413, "num_input_tokens_seen": 34926672, "step": 57325 }, { "epoch": 17.787775364567175, "grad_norm": 15.959549903869629, "learning_rate": 3.6818955307048985e-07, "loss": 0.1874, "num_input_tokens_seen": 34929072, "step": 57330 }, { "epoch": 17.78932671424139, "grad_norm": 12.43004322052002, "learning_rate": 3.6767983254743236e-07, "loss": 0.191, "num_input_tokens_seen": 34932368, "step": 57335 }, { "epoch": 17.790878063915606, "grad_norm": 36.14088821411133, "learning_rate": 3.6717045162829703e-07, "loss": 0.2308, "num_input_tokens_seen": 34935216, "step": 57340 }, { "epoch": 17.792429413589822, "grad_norm": 20.183732986450195, "learning_rate": 3.6666141035042956e-07, "loss": 0.1609, "num_input_tokens_seen": 34939088, "step": 57345 }, { "epoch": 17.793980763264038, "grad_norm": 23.12883758544922, "learning_rate": 3.661527087511468e-07, "loss": 0.1653, "num_input_tokens_seen": 34941680, "step": 57350 }, { "epoch": 17.795532112938258, "grad_norm": 10.379937171936035, "learning_rate": 3.656443468677445e-07, "loss": 0.2104, "num_input_tokens_seen": 34943664, "step": 57355 }, { "epoch": 17.797083462612473, "grad_norm": 15.600332260131836, "learning_rate": 3.651363247374895e-07, "loss": 0.1316, "num_input_tokens_seen": 34947152, "step": 57360 }, { "epoch": 17.79863481228669, "grad_norm": 15.200695037841797, "learning_rate": 3.6462864239762697e-07, "loss": 0.2692, "num_input_tokens_seen": 34949744, "step": 57365 }, { "epoch": 17.800186161960905, "grad_norm": 19.18225860595703, "learning_rate": 3.641212998853766e-07, "loss": 0.0949, "num_input_tokens_seen": 34953168, "step": 57370 }, { "epoch": 17.80173751163512, "grad_norm": 11.081559181213379, "learning_rate": 3.636142972379314e-07, "loss": 0.1032, "num_input_tokens_seen": 34955856, "step": 57375 }, { "epoch": 17.80328886130934, "grad_norm": 4.132098197937012, "learning_rate": 3.631076344924617e-07, "loss": 0.1227, "num_input_tokens_seen": 34959600, "step": 57380 }, { "epoch": 17.804840210983556, "grad_norm": 40.099365234375, "learning_rate": 3.62601311686111e-07, "loss": 0.1481, "num_input_tokens_seen": 34963376, "step": 57385 }, { "epoch": 17.806391560657772, "grad_norm": 26.344970703125, "learning_rate": 3.6209532885599973e-07, "loss": 0.1356, "num_input_tokens_seen": 34966416, "step": 57390 }, { "epoch": 17.807942910331988, "grad_norm": 21.10992431640625, "learning_rate": 3.6158968603922093e-07, "loss": 0.2017, "num_input_tokens_seen": 34969520, "step": 57395 }, { "epoch": 17.809494260006204, "grad_norm": 8.935370445251465, "learning_rate": 3.610843832728461e-07, "loss": 0.108, "num_input_tokens_seen": 34972208, "step": 57400 }, { "epoch": 17.811045609680423, "grad_norm": 9.568388938903809, "learning_rate": 3.6057942059391836e-07, "loss": 0.196, "num_input_tokens_seen": 34974928, "step": 57405 }, { "epoch": 17.81259695935464, "grad_norm": 14.85582160949707, "learning_rate": 3.6007479803945867e-07, "loss": 0.1499, "num_input_tokens_seen": 34978320, "step": 57410 }, { "epoch": 17.814148309028855, "grad_norm": 28.552448272705078, "learning_rate": 3.5957051564646015e-07, "loss": 0.1887, "num_input_tokens_seen": 34980656, "step": 57415 }, { "epoch": 17.81569965870307, "grad_norm": 13.805197715759277, "learning_rate": 3.590665734518944e-07, "loss": 0.215, "num_input_tokens_seen": 34983312, "step": 57420 }, { "epoch": 17.817251008377287, "grad_norm": 12.411458015441895, "learning_rate": 3.585629714927058e-07, "loss": 0.1609, "num_input_tokens_seen": 34986896, "step": 57425 }, { "epoch": 17.818802358051506, "grad_norm": 12.358670234680176, "learning_rate": 3.580597098058136e-07, "loss": 0.1772, "num_input_tokens_seen": 34990640, "step": 57430 }, { "epoch": 17.820353707725722, "grad_norm": 9.142446517944336, "learning_rate": 3.5755678842811446e-07, "loss": 0.1751, "num_input_tokens_seen": 34993520, "step": 57435 }, { "epoch": 17.82190505739994, "grad_norm": 2.96977162361145, "learning_rate": 3.570542073964767e-07, "loss": 0.2269, "num_input_tokens_seen": 34996048, "step": 57440 }, { "epoch": 17.823456407074154, "grad_norm": 34.88006591796875, "learning_rate": 3.5655196674774685e-07, "loss": 0.2555, "num_input_tokens_seen": 34998544, "step": 57445 }, { "epoch": 17.82500775674837, "grad_norm": 27.27159309387207, "learning_rate": 3.560500665187439e-07, "loss": 0.3269, "num_input_tokens_seen": 35001904, "step": 57450 }, { "epoch": 17.826559106422586, "grad_norm": 17.832290649414062, "learning_rate": 3.555485067462644e-07, "loss": 0.1838, "num_input_tokens_seen": 35004688, "step": 57455 }, { "epoch": 17.828110456096805, "grad_norm": 13.945172309875488, "learning_rate": 3.550472874670774e-07, "loss": 0.261, "num_input_tokens_seen": 35006864, "step": 57460 }, { "epoch": 17.82966180577102, "grad_norm": 9.680551528930664, "learning_rate": 3.54546408717929e-07, "loss": 0.1923, "num_input_tokens_seen": 35009840, "step": 57465 }, { "epoch": 17.831213155445237, "grad_norm": 16.1650390625, "learning_rate": 3.540458705355376e-07, "loss": 0.191, "num_input_tokens_seen": 35012048, "step": 57470 }, { "epoch": 17.832764505119453, "grad_norm": 8.645601272583008, "learning_rate": 3.5354567295660223e-07, "loss": 0.1695, "num_input_tokens_seen": 35016240, "step": 57475 }, { "epoch": 17.83431585479367, "grad_norm": 43.7364616394043, "learning_rate": 3.530458160177902e-07, "loss": 0.229, "num_input_tokens_seen": 35019280, "step": 57480 }, { "epoch": 17.83586720446789, "grad_norm": 15.739485740661621, "learning_rate": 3.5254629975574883e-07, "loss": 0.1612, "num_input_tokens_seen": 35022896, "step": 57485 }, { "epoch": 17.837418554142104, "grad_norm": 7.646528720855713, "learning_rate": 3.520471242070972e-07, "loss": 0.1495, "num_input_tokens_seen": 35026224, "step": 57490 }, { "epoch": 17.83896990381632, "grad_norm": 32.883262634277344, "learning_rate": 3.515482894084321e-07, "loss": 0.2264, "num_input_tokens_seen": 35029296, "step": 57495 }, { "epoch": 17.840521253490536, "grad_norm": 26.181255340576172, "learning_rate": 3.510497953963221e-07, "loss": 0.1555, "num_input_tokens_seen": 35032208, "step": 57500 }, { "epoch": 17.842072603164752, "grad_norm": 14.267004013061523, "learning_rate": 3.5055164220731407e-07, "loss": 0.0876, "num_input_tokens_seen": 35035312, "step": 57505 }, { "epoch": 17.84362395283897, "grad_norm": 7.344637393951416, "learning_rate": 3.5005382987792933e-07, "loss": 0.1679, "num_input_tokens_seen": 35038768, "step": 57510 }, { "epoch": 17.845175302513187, "grad_norm": 26.834901809692383, "learning_rate": 3.495563584446615e-07, "loss": 0.1609, "num_input_tokens_seen": 35042256, "step": 57515 }, { "epoch": 17.846726652187403, "grad_norm": 5.399068832397461, "learning_rate": 3.4905922794398305e-07, "loss": 0.1462, "num_input_tokens_seen": 35046032, "step": 57520 }, { "epoch": 17.84827800186162, "grad_norm": 8.312942504882812, "learning_rate": 3.485624384123382e-07, "loss": 0.09, "num_input_tokens_seen": 35048432, "step": 57525 }, { "epoch": 17.849829351535835, "grad_norm": 49.444793701171875, "learning_rate": 3.480659898861477e-07, "loss": 0.2124, "num_input_tokens_seen": 35051088, "step": 57530 }, { "epoch": 17.851380701210054, "grad_norm": 8.49355411529541, "learning_rate": 3.47569882401807e-07, "loss": 0.1933, "num_input_tokens_seen": 35053360, "step": 57535 }, { "epoch": 17.85293205088427, "grad_norm": 41.46420669555664, "learning_rate": 3.4707411599568807e-07, "loss": 0.2239, "num_input_tokens_seen": 35055952, "step": 57540 }, { "epoch": 17.854483400558486, "grad_norm": 8.230188369750977, "learning_rate": 3.465786907041341e-07, "loss": 0.2296, "num_input_tokens_seen": 35058448, "step": 57545 }, { "epoch": 17.856034750232702, "grad_norm": 3.6788933277130127, "learning_rate": 3.4608360656346764e-07, "loss": 0.1142, "num_input_tokens_seen": 35060944, "step": 57550 }, { "epoch": 17.857586099906918, "grad_norm": 13.213239669799805, "learning_rate": 3.45588863609983e-07, "loss": 0.161, "num_input_tokens_seen": 35064272, "step": 57555 }, { "epoch": 17.859137449581137, "grad_norm": 9.273462295532227, "learning_rate": 3.450944618799512e-07, "loss": 0.1857, "num_input_tokens_seen": 35067440, "step": 57560 }, { "epoch": 17.860688799255353, "grad_norm": 9.160481452941895, "learning_rate": 3.446004014096177e-07, "loss": 0.0879, "num_input_tokens_seen": 35070928, "step": 57565 }, { "epoch": 17.86224014892957, "grad_norm": 35.731571197509766, "learning_rate": 3.4410668223520237e-07, "loss": 0.2091, "num_input_tokens_seen": 35073680, "step": 57570 }, { "epoch": 17.863791498603785, "grad_norm": 7.822336673736572, "learning_rate": 3.4361330439290244e-07, "loss": 0.1394, "num_input_tokens_seen": 35076784, "step": 57575 }, { "epoch": 17.865342848278, "grad_norm": 14.875774383544922, "learning_rate": 3.431202679188861e-07, "loss": 0.1079, "num_input_tokens_seen": 35079696, "step": 57580 }, { "epoch": 17.86689419795222, "grad_norm": 28.93674087524414, "learning_rate": 3.426275728493006e-07, "loss": 0.2731, "num_input_tokens_seen": 35082320, "step": 57585 }, { "epoch": 17.868445547626436, "grad_norm": 23.59620475769043, "learning_rate": 3.421352192202648e-07, "loss": 0.1788, "num_input_tokens_seen": 35085360, "step": 57590 }, { "epoch": 17.869996897300652, "grad_norm": 14.227628707885742, "learning_rate": 3.41643207067876e-07, "loss": 0.2317, "num_input_tokens_seen": 35088816, "step": 57595 }, { "epoch": 17.871548246974868, "grad_norm": 13.094264030456543, "learning_rate": 3.4115153642820196e-07, "loss": 0.1322, "num_input_tokens_seen": 35091216, "step": 57600 }, { "epoch": 17.873099596649084, "grad_norm": 37.028648376464844, "learning_rate": 3.406602073372906e-07, "loss": 0.192, "num_input_tokens_seen": 35097296, "step": 57605 }, { "epoch": 17.8746509463233, "grad_norm": 16.204259872436523, "learning_rate": 3.401692198311596e-07, "loss": 0.2794, "num_input_tokens_seen": 35102064, "step": 57610 }, { "epoch": 17.87620229599752, "grad_norm": 12.120784759521484, "learning_rate": 3.396785739458064e-07, "loss": 0.1658, "num_input_tokens_seen": 35104912, "step": 57615 }, { "epoch": 17.877753645671735, "grad_norm": 18.401473999023438, "learning_rate": 3.3918826971719945e-07, "loss": 0.1068, "num_input_tokens_seen": 35107664, "step": 57620 }, { "epoch": 17.87930499534595, "grad_norm": 23.744277954101562, "learning_rate": 3.3869830718128494e-07, "loss": 0.1946, "num_input_tokens_seen": 35110160, "step": 57625 }, { "epoch": 17.880856345020167, "grad_norm": 57.66792678833008, "learning_rate": 3.3820868637398305e-07, "loss": 0.2568, "num_input_tokens_seen": 35113808, "step": 57630 }, { "epoch": 17.882407694694383, "grad_norm": 11.13736343383789, "learning_rate": 3.3771940733118734e-07, "loss": 0.1368, "num_input_tokens_seen": 35116240, "step": 57635 }, { "epoch": 17.883959044368602, "grad_norm": 5.621951103210449, "learning_rate": 3.3723047008876966e-07, "loss": 0.0626, "num_input_tokens_seen": 35119536, "step": 57640 }, { "epoch": 17.885510394042818, "grad_norm": 11.022455215454102, "learning_rate": 3.3674187468257346e-07, "loss": 0.1252, "num_input_tokens_seen": 35122928, "step": 57645 }, { "epoch": 17.887061743717034, "grad_norm": 21.37006187438965, "learning_rate": 3.362536211484196e-07, "loss": 0.1893, "num_input_tokens_seen": 35125456, "step": 57650 }, { "epoch": 17.88861309339125, "grad_norm": 22.860877990722656, "learning_rate": 3.357657095221012e-07, "loss": 0.1379, "num_input_tokens_seen": 35127856, "step": 57655 }, { "epoch": 17.890164443065466, "grad_norm": 55.465126037597656, "learning_rate": 3.3527813983939064e-07, "loss": 0.1811, "num_input_tokens_seen": 35130640, "step": 57660 }, { "epoch": 17.891715792739685, "grad_norm": 20.50558853149414, "learning_rate": 3.3479091213602986e-07, "loss": 0.1576, "num_input_tokens_seen": 35133040, "step": 57665 }, { "epoch": 17.8932671424139, "grad_norm": 4.767606258392334, "learning_rate": 3.343040264477404e-07, "loss": 0.162, "num_input_tokens_seen": 35136080, "step": 57670 }, { "epoch": 17.894818492088117, "grad_norm": 25.564775466918945, "learning_rate": 3.3381748281021473e-07, "loss": 0.1911, "num_input_tokens_seen": 35138800, "step": 57675 }, { "epoch": 17.896369841762333, "grad_norm": 23.511594772338867, "learning_rate": 3.333312812591244e-07, "loss": 0.1329, "num_input_tokens_seen": 35142064, "step": 57680 }, { "epoch": 17.89792119143655, "grad_norm": 32.77305603027344, "learning_rate": 3.328454218301125e-07, "loss": 0.1304, "num_input_tokens_seen": 35144272, "step": 57685 }, { "epoch": 17.899472541110768, "grad_norm": 5.2335524559021, "learning_rate": 3.323599045587983e-07, "loss": 0.1092, "num_input_tokens_seen": 35147184, "step": 57690 }, { "epoch": 17.901023890784984, "grad_norm": 14.519418716430664, "learning_rate": 3.318747294807767e-07, "loss": 0.1938, "num_input_tokens_seen": 35149232, "step": 57695 }, { "epoch": 17.9025752404592, "grad_norm": 4.419269561767578, "learning_rate": 3.313898966316159e-07, "loss": 0.1397, "num_input_tokens_seen": 35151952, "step": 57700 }, { "epoch": 17.904126590133416, "grad_norm": 21.60875701904297, "learning_rate": 3.3090540604686083e-07, "loss": 0.1531, "num_input_tokens_seen": 35154448, "step": 57705 }, { "epoch": 17.90567793980763, "grad_norm": 50.12838363647461, "learning_rate": 3.3042125776202914e-07, "loss": 0.1785, "num_input_tokens_seen": 35157136, "step": 57710 }, { "epoch": 17.907229289481847, "grad_norm": 16.410255432128906, "learning_rate": 3.2993745181261585e-07, "loss": 0.2078, "num_input_tokens_seen": 35159568, "step": 57715 }, { "epoch": 17.908780639156067, "grad_norm": 11.738033294677734, "learning_rate": 3.294539882340886e-07, "loss": 0.1202, "num_input_tokens_seen": 35162096, "step": 57720 }, { "epoch": 17.910331988830283, "grad_norm": 9.526700973510742, "learning_rate": 3.289708670618924e-07, "loss": 0.112, "num_input_tokens_seen": 35164816, "step": 57725 }, { "epoch": 17.9118833385045, "grad_norm": 6.246157646179199, "learning_rate": 3.28488088331444e-07, "loss": 0.1477, "num_input_tokens_seen": 35168816, "step": 57730 }, { "epoch": 17.913434688178715, "grad_norm": 10.817432403564453, "learning_rate": 3.2800565207813883e-07, "loss": 0.1724, "num_input_tokens_seen": 35171888, "step": 57735 }, { "epoch": 17.91498603785293, "grad_norm": 11.392635345458984, "learning_rate": 3.2752355833734315e-07, "loss": 0.1968, "num_input_tokens_seen": 35174320, "step": 57740 }, { "epoch": 17.91653738752715, "grad_norm": 8.51973819732666, "learning_rate": 3.2704180714440095e-07, "loss": 0.1017, "num_input_tokens_seen": 35178416, "step": 57745 }, { "epoch": 17.918088737201366, "grad_norm": 8.565264701843262, "learning_rate": 3.2656039853463104e-07, "loss": 0.1017, "num_input_tokens_seen": 35181008, "step": 57750 }, { "epoch": 17.91964008687558, "grad_norm": 5.064403533935547, "learning_rate": 3.260793325433248e-07, "loss": 0.2008, "num_input_tokens_seen": 35183248, "step": 57755 }, { "epoch": 17.921191436549798, "grad_norm": 37.7206916809082, "learning_rate": 3.255986092057523e-07, "loss": 0.3103, "num_input_tokens_seen": 35187248, "step": 57760 }, { "epoch": 17.922742786224013, "grad_norm": 22.166597366333008, "learning_rate": 3.2511822855715357e-07, "loss": 0.1086, "num_input_tokens_seen": 35189872, "step": 57765 }, { "epoch": 17.924294135898233, "grad_norm": 32.192909240722656, "learning_rate": 3.2463819063274894e-07, "loss": 0.1371, "num_input_tokens_seen": 35192080, "step": 57770 }, { "epoch": 17.92584548557245, "grad_norm": 7.72288703918457, "learning_rate": 3.2415849546772795e-07, "loss": 0.124, "num_input_tokens_seen": 35196304, "step": 57775 }, { "epoch": 17.927396835246665, "grad_norm": 18.771385192871094, "learning_rate": 3.236791430972608e-07, "loss": 0.147, "num_input_tokens_seen": 35200976, "step": 57780 }, { "epoch": 17.92894818492088, "grad_norm": 10.590792655944824, "learning_rate": 3.232001335564877e-07, "loss": 0.1175, "num_input_tokens_seen": 35204784, "step": 57785 }, { "epoch": 17.930499534595096, "grad_norm": 16.288265228271484, "learning_rate": 3.227214668805273e-07, "loss": 0.1285, "num_input_tokens_seen": 35208208, "step": 57790 }, { "epoch": 17.932050884269316, "grad_norm": 14.224517822265625, "learning_rate": 3.2224314310446926e-07, "loss": 0.1665, "num_input_tokens_seen": 35210512, "step": 57795 }, { "epoch": 17.93360223394353, "grad_norm": 11.648685455322266, "learning_rate": 3.217651622633827e-07, "loss": 0.1451, "num_input_tokens_seen": 35214768, "step": 57800 }, { "epoch": 17.935153583617748, "grad_norm": 9.806020736694336, "learning_rate": 3.21287524392308e-07, "loss": 0.1117, "num_input_tokens_seen": 35217200, "step": 57805 }, { "epoch": 17.936704933291963, "grad_norm": 11.102093696594238, "learning_rate": 3.2081022952626163e-07, "loss": 0.2006, "num_input_tokens_seen": 35219408, "step": 57810 }, { "epoch": 17.93825628296618, "grad_norm": 18.560049057006836, "learning_rate": 3.2033327770023604e-07, "loss": 0.1558, "num_input_tokens_seen": 35222992, "step": 57815 }, { "epoch": 17.9398076326404, "grad_norm": 7.987030982971191, "learning_rate": 3.1985666894919665e-07, "loss": 0.1789, "num_input_tokens_seen": 35226992, "step": 57820 }, { "epoch": 17.941358982314615, "grad_norm": 11.436240196228027, "learning_rate": 3.193804033080844e-07, "loss": 0.1266, "num_input_tokens_seen": 35229712, "step": 57825 }, { "epoch": 17.94291033198883, "grad_norm": 13.733969688415527, "learning_rate": 3.1890448081181525e-07, "loss": 0.1446, "num_input_tokens_seen": 35233104, "step": 57830 }, { "epoch": 17.944461681663046, "grad_norm": 93.4449234008789, "learning_rate": 3.184289014952807e-07, "loss": 0.1898, "num_input_tokens_seen": 35236560, "step": 57835 }, { "epoch": 17.946013031337262, "grad_norm": 2.3514885902404785, "learning_rate": 3.179536653933452e-07, "loss": 0.1358, "num_input_tokens_seen": 35239888, "step": 57840 }, { "epoch": 17.94756438101148, "grad_norm": 13.70854663848877, "learning_rate": 3.174787725408501e-07, "loss": 0.1931, "num_input_tokens_seen": 35242736, "step": 57845 }, { "epoch": 17.949115730685698, "grad_norm": 18.268434524536133, "learning_rate": 3.1700422297261004e-07, "loss": 0.1079, "num_input_tokens_seen": 35246000, "step": 57850 }, { "epoch": 17.950667080359914, "grad_norm": 17.65376091003418, "learning_rate": 3.165300167234159e-07, "loss": 0.1405, "num_input_tokens_seen": 35248336, "step": 57855 }, { "epoch": 17.95221843003413, "grad_norm": 12.1726713180542, "learning_rate": 3.1605615382803157e-07, "loss": 0.195, "num_input_tokens_seen": 35250480, "step": 57860 }, { "epoch": 17.953769779708345, "grad_norm": 27.364013671875, "learning_rate": 3.1558263432119706e-07, "loss": 0.193, "num_input_tokens_seen": 35253200, "step": 57865 }, { "epoch": 17.95532112938256, "grad_norm": 5.57034158706665, "learning_rate": 3.1510945823762796e-07, "loss": 0.1325, "num_input_tokens_seen": 35256112, "step": 57870 }, { "epoch": 17.95687247905678, "grad_norm": 14.93406867980957, "learning_rate": 3.146366256120126e-07, "loss": 0.1994, "num_input_tokens_seen": 35259600, "step": 57875 }, { "epoch": 17.958423828730997, "grad_norm": 29.727680206298828, "learning_rate": 3.14164136479016e-07, "loss": 0.1736, "num_input_tokens_seen": 35263920, "step": 57880 }, { "epoch": 17.959975178405212, "grad_norm": 10.084163665771484, "learning_rate": 3.1369199087327664e-07, "loss": 0.133, "num_input_tokens_seen": 35266992, "step": 57885 }, { "epoch": 17.96152652807943, "grad_norm": 28.190256118774414, "learning_rate": 3.132201888294084e-07, "loss": 0.16, "num_input_tokens_seen": 35269712, "step": 57890 }, { "epoch": 17.963077877753644, "grad_norm": 6.178745746612549, "learning_rate": 3.1274873038199983e-07, "loss": 0.0952, "num_input_tokens_seen": 35274064, "step": 57895 }, { "epoch": 17.964629227427864, "grad_norm": 6.297580718994141, "learning_rate": 3.1227761556561543e-07, "loss": 0.1682, "num_input_tokens_seen": 35276464, "step": 57900 }, { "epoch": 17.96618057710208, "grad_norm": 5.1167073249816895, "learning_rate": 3.11806844414792e-07, "loss": 0.1906, "num_input_tokens_seen": 35279888, "step": 57905 }, { "epoch": 17.967731926776295, "grad_norm": 18.315446853637695, "learning_rate": 3.1133641696404425e-07, "loss": 0.2032, "num_input_tokens_seen": 35282352, "step": 57910 }, { "epoch": 17.96928327645051, "grad_norm": 11.056862831115723, "learning_rate": 3.108663332478584e-07, "loss": 0.1695, "num_input_tokens_seen": 35286032, "step": 57915 }, { "epoch": 17.970834626124727, "grad_norm": 21.371801376342773, "learning_rate": 3.103965933006986e-07, "loss": 0.1765, "num_input_tokens_seen": 35288656, "step": 57920 }, { "epoch": 17.972385975798947, "grad_norm": 14.908611297607422, "learning_rate": 3.099271971570006e-07, "loss": 0.1404, "num_input_tokens_seen": 35291120, "step": 57925 }, { "epoch": 17.973937325473162, "grad_norm": 8.783519744873047, "learning_rate": 3.0945814485117855e-07, "loss": 0.0806, "num_input_tokens_seen": 35294832, "step": 57930 }, { "epoch": 17.97548867514738, "grad_norm": 8.81584644317627, "learning_rate": 3.0898943641761936e-07, "loss": 0.1742, "num_input_tokens_seen": 35298704, "step": 57935 }, { "epoch": 17.977040024821594, "grad_norm": 2.5013411045074463, "learning_rate": 3.0852107189068334e-07, "loss": 0.0902, "num_input_tokens_seen": 35301104, "step": 57940 }, { "epoch": 17.97859137449581, "grad_norm": 44.3726806640625, "learning_rate": 3.080530513047092e-07, "loss": 0.1164, "num_input_tokens_seen": 35304080, "step": 57945 }, { "epoch": 17.98014272417003, "grad_norm": 5.992341995239258, "learning_rate": 3.075853746940061e-07, "loss": 0.2091, "num_input_tokens_seen": 35307216, "step": 57950 }, { "epoch": 17.981694073844245, "grad_norm": 5.08557653427124, "learning_rate": 3.0711804209286266e-07, "loss": 0.2563, "num_input_tokens_seen": 35311088, "step": 57955 }, { "epoch": 17.98324542351846, "grad_norm": 5.2330641746521, "learning_rate": 3.0665105353553824e-07, "loss": 0.1451, "num_input_tokens_seen": 35316432, "step": 57960 }, { "epoch": 17.984796773192677, "grad_norm": 56.1864128112793, "learning_rate": 3.0618440905626936e-07, "loss": 0.2422, "num_input_tokens_seen": 35318992, "step": 57965 }, { "epoch": 17.986348122866893, "grad_norm": 65.31607055664062, "learning_rate": 3.057181086892663e-07, "loss": 0.2846, "num_input_tokens_seen": 35322000, "step": 57970 }, { "epoch": 17.98789947254111, "grad_norm": 17.591135025024414, "learning_rate": 3.0525215246871466e-07, "loss": 0.1639, "num_input_tokens_seen": 35324432, "step": 57975 }, { "epoch": 17.98945082221533, "grad_norm": 5.246479034423828, "learning_rate": 3.047865404287742e-07, "loss": 0.1952, "num_input_tokens_seen": 35327344, "step": 57980 }, { "epoch": 17.991002171889544, "grad_norm": 9.587512016296387, "learning_rate": 3.0432127260357934e-07, "loss": 0.0997, "num_input_tokens_seen": 35330416, "step": 57985 }, { "epoch": 17.99255352156376, "grad_norm": 22.88665008544922, "learning_rate": 3.038563490272417e-07, "loss": 0.1955, "num_input_tokens_seen": 35333264, "step": 57990 }, { "epoch": 17.994104871237976, "grad_norm": 9.547006607055664, "learning_rate": 3.0339176973384285e-07, "loss": 0.1643, "num_input_tokens_seen": 35335792, "step": 57995 }, { "epoch": 17.995656220912192, "grad_norm": 31.660404205322266, "learning_rate": 3.0292753475744505e-07, "loss": 0.2029, "num_input_tokens_seen": 35338928, "step": 58000 }, { "epoch": 17.99720757058641, "grad_norm": 29.600000381469727, "learning_rate": 3.024636441320794e-07, "loss": 0.3168, "num_input_tokens_seen": 35341936, "step": 58005 }, { "epoch": 17.998758920260627, "grad_norm": 16.221723556518555, "learning_rate": 3.0200009789175646e-07, "loss": 0.1999, "num_input_tokens_seen": 35344304, "step": 58010 }, { "epoch": 18.0, "eval_loss": 0.38111791014671326, "eval_runtime": 34.3762, "eval_samples_per_second": 93.757, "eval_steps_per_second": 23.446, "num_input_tokens_seen": 35346160, "step": 58014 }, { "epoch": 18.000310269934843, "grad_norm": 13.696685791015625, "learning_rate": 3.015368960704584e-07, "loss": 0.2293, "num_input_tokens_seen": 35346608, "step": 58015 }, { "epoch": 18.00186161960906, "grad_norm": 7.7429351806640625, "learning_rate": 3.010740387021449e-07, "loss": 0.0788, "num_input_tokens_seen": 35350064, "step": 58020 }, { "epoch": 18.003412969283275, "grad_norm": 6.995372772216797, "learning_rate": 3.0061152582074704e-07, "loss": 0.1163, "num_input_tokens_seen": 35356464, "step": 58025 }, { "epoch": 18.004964318957494, "grad_norm": 9.49075984954834, "learning_rate": 3.0014935746017383e-07, "loss": 0.1638, "num_input_tokens_seen": 35359504, "step": 58030 }, { "epoch": 18.00651566863171, "grad_norm": 7.964413642883301, "learning_rate": 2.99687533654307e-07, "loss": 0.1531, "num_input_tokens_seen": 35363952, "step": 58035 }, { "epoch": 18.008067018305926, "grad_norm": 10.95512580871582, "learning_rate": 2.9922605443700457e-07, "loss": 0.2054, "num_input_tokens_seen": 35366224, "step": 58040 }, { "epoch": 18.009618367980142, "grad_norm": 29.132686614990234, "learning_rate": 2.987649198420972e-07, "loss": 0.1819, "num_input_tokens_seen": 35369616, "step": 58045 }, { "epoch": 18.011169717654358, "grad_norm": 14.052583694458008, "learning_rate": 2.9830412990339173e-07, "loss": 0.1047, "num_input_tokens_seen": 35372688, "step": 58050 }, { "epoch": 18.012721067328577, "grad_norm": 15.835728645324707, "learning_rate": 2.978436846546706e-07, "loss": 0.1621, "num_input_tokens_seen": 35375312, "step": 58055 }, { "epoch": 18.014272417002793, "grad_norm": 46.317867279052734, "learning_rate": 2.973835841296896e-07, "loss": 0.1444, "num_input_tokens_seen": 35378064, "step": 58060 }, { "epoch": 18.01582376667701, "grad_norm": 32.49501037597656, "learning_rate": 2.969238283621784e-07, "loss": 0.154, "num_input_tokens_seen": 35381264, "step": 58065 }, { "epoch": 18.017375116351225, "grad_norm": 25.52741241455078, "learning_rate": 2.964644173858433e-07, "loss": 0.1205, "num_input_tokens_seen": 35383888, "step": 58070 }, { "epoch": 18.01892646602544, "grad_norm": 7.287841320037842, "learning_rate": 2.960053512343658e-07, "loss": 0.1713, "num_input_tokens_seen": 35386768, "step": 58075 }, { "epoch": 18.02047781569966, "grad_norm": 37.41915512084961, "learning_rate": 2.9554662994139837e-07, "loss": 0.2023, "num_input_tokens_seen": 35391120, "step": 58080 }, { "epoch": 18.022029165373876, "grad_norm": 29.230573654174805, "learning_rate": 2.9508825354057303e-07, "loss": 0.2364, "num_input_tokens_seen": 35393680, "step": 58085 }, { "epoch": 18.023580515048092, "grad_norm": 9.069544792175293, "learning_rate": 2.946302220654923e-07, "loss": 0.1638, "num_input_tokens_seen": 35396880, "step": 58090 }, { "epoch": 18.025131864722308, "grad_norm": 21.148658752441406, "learning_rate": 2.941725355497371e-07, "loss": 0.1499, "num_input_tokens_seen": 35400080, "step": 58095 }, { "epoch": 18.026683214396524, "grad_norm": 19.125200271606445, "learning_rate": 2.937151940268601e-07, "loss": 0.1693, "num_input_tokens_seen": 35403152, "step": 58100 }, { "epoch": 18.028234564070743, "grad_norm": 13.781351089477539, "learning_rate": 2.9325819753039e-07, "loss": 0.1713, "num_input_tokens_seen": 35406032, "step": 58105 }, { "epoch": 18.02978591374496, "grad_norm": 9.758749008178711, "learning_rate": 2.9280154609382994e-07, "loss": 0.1072, "num_input_tokens_seen": 35408848, "step": 58110 }, { "epoch": 18.031337263419175, "grad_norm": 14.215448379516602, "learning_rate": 2.9234523975065874e-07, "loss": 0.1447, "num_input_tokens_seen": 35411344, "step": 58115 }, { "epoch": 18.03288861309339, "grad_norm": 7.447743892669678, "learning_rate": 2.91889278534328e-07, "loss": 0.1829, "num_input_tokens_seen": 35416592, "step": 58120 }, { "epoch": 18.034439962767607, "grad_norm": 9.07252311706543, "learning_rate": 2.91433662478266e-07, "loss": 0.1811, "num_input_tokens_seen": 35420144, "step": 58125 }, { "epoch": 18.035991312441823, "grad_norm": 19.451087951660156, "learning_rate": 2.9097839161587317e-07, "loss": 0.1298, "num_input_tokens_seen": 35423152, "step": 58130 }, { "epoch": 18.037542662116042, "grad_norm": 8.427504539489746, "learning_rate": 2.9052346598052783e-07, "loss": 0.0943, "num_input_tokens_seen": 35425968, "step": 58135 }, { "epoch": 18.039094011790258, "grad_norm": 7.773961067199707, "learning_rate": 2.900688856055817e-07, "loss": 0.1061, "num_input_tokens_seen": 35428912, "step": 58140 }, { "epoch": 18.040645361464474, "grad_norm": 10.10461711883545, "learning_rate": 2.8961465052435965e-07, "loss": 0.1449, "num_input_tokens_seen": 35432080, "step": 58145 }, { "epoch": 18.04219671113869, "grad_norm": 18.032785415649414, "learning_rate": 2.891607607701635e-07, "loss": 0.1504, "num_input_tokens_seen": 35435312, "step": 58150 }, { "epoch": 18.043748060812906, "grad_norm": 3.4137871265411377, "learning_rate": 2.8870721637626777e-07, "loss": 0.09, "num_input_tokens_seen": 35437808, "step": 58155 }, { "epoch": 18.045299410487125, "grad_norm": 6.917595386505127, "learning_rate": 2.882540173759235e-07, "loss": 0.1228, "num_input_tokens_seen": 35440880, "step": 58160 }, { "epoch": 18.04685076016134, "grad_norm": 26.100431442260742, "learning_rate": 2.8780116380235424e-07, "loss": 0.1806, "num_input_tokens_seen": 35443888, "step": 58165 }, { "epoch": 18.048402109835557, "grad_norm": 14.934503555297852, "learning_rate": 2.873486556887617e-07, "loss": 0.2042, "num_input_tokens_seen": 35446480, "step": 58170 }, { "epoch": 18.049953459509773, "grad_norm": 14.27291488647461, "learning_rate": 2.8689649306831823e-07, "loss": 0.1347, "num_input_tokens_seen": 35449936, "step": 58175 }, { "epoch": 18.05150480918399, "grad_norm": 13.24429702758789, "learning_rate": 2.8644467597417347e-07, "loss": 0.1603, "num_input_tokens_seen": 35452080, "step": 58180 }, { "epoch": 18.053056158858208, "grad_norm": 14.315691947937012, "learning_rate": 2.8599320443945034e-07, "loss": 0.2761, "num_input_tokens_seen": 35454768, "step": 58185 }, { "epoch": 18.054607508532424, "grad_norm": 15.348645210266113, "learning_rate": 2.855420784972479e-07, "loss": 0.0918, "num_input_tokens_seen": 35457264, "step": 58190 }, { "epoch": 18.05615885820664, "grad_norm": 32.78551483154297, "learning_rate": 2.8509129818063863e-07, "loss": 0.177, "num_input_tokens_seen": 35460976, "step": 58195 }, { "epoch": 18.057710207880856, "grad_norm": 16.78851318359375, "learning_rate": 2.8464086352266993e-07, "loss": 0.1753, "num_input_tokens_seen": 35463568, "step": 58200 }, { "epoch": 18.05926155755507, "grad_norm": 50.27164840698242, "learning_rate": 2.8419077455636433e-07, "loss": 0.2157, "num_input_tokens_seen": 35467568, "step": 58205 }, { "epoch": 18.06081290722929, "grad_norm": 13.032510757446289, "learning_rate": 2.837410313147182e-07, "loss": 0.1881, "num_input_tokens_seen": 35471600, "step": 58210 }, { "epoch": 18.062364256903507, "grad_norm": 4.951250076293945, "learning_rate": 2.83291633830704e-07, "loss": 0.1225, "num_input_tokens_seen": 35474128, "step": 58215 }, { "epoch": 18.063915606577723, "grad_norm": 9.34562873840332, "learning_rate": 2.8284258213726657e-07, "loss": 0.1252, "num_input_tokens_seen": 35478288, "step": 58220 }, { "epoch": 18.06546695625194, "grad_norm": 3.520627021789551, "learning_rate": 2.8239387626732784e-07, "loss": 0.192, "num_input_tokens_seen": 35480528, "step": 58225 }, { "epoch": 18.067018305926155, "grad_norm": 11.342947006225586, "learning_rate": 2.8194551625378266e-07, "loss": 0.1269, "num_input_tokens_seen": 35484464, "step": 58230 }, { "epoch": 18.068569655600374, "grad_norm": 9.6372709274292, "learning_rate": 2.8149750212950135e-07, "loss": 0.2021, "num_input_tokens_seen": 35487632, "step": 58235 }, { "epoch": 18.07012100527459, "grad_norm": 2.8324639797210693, "learning_rate": 2.810498339273282e-07, "loss": 0.0794, "num_input_tokens_seen": 35489904, "step": 58240 }, { "epoch": 18.071672354948806, "grad_norm": 7.421505928039551, "learning_rate": 2.8060251168008314e-07, "loss": 0.1128, "num_input_tokens_seen": 35492528, "step": 58245 }, { "epoch": 18.07322370462302, "grad_norm": 15.159996032714844, "learning_rate": 2.8015553542055984e-07, "loss": 0.1557, "num_input_tokens_seen": 35494768, "step": 58250 }, { "epoch": 18.074775054297238, "grad_norm": 5.645684242248535, "learning_rate": 2.797089051815266e-07, "loss": 0.1626, "num_input_tokens_seen": 35498672, "step": 58255 }, { "epoch": 18.076326403971454, "grad_norm": 10.909431457519531, "learning_rate": 2.792626209957283e-07, "loss": 0.1513, "num_input_tokens_seen": 35501680, "step": 58260 }, { "epoch": 18.077877753645673, "grad_norm": 37.483943939208984, "learning_rate": 2.7881668289588106e-07, "loss": 0.2442, "num_input_tokens_seen": 35504144, "step": 58265 }, { "epoch": 18.07942910331989, "grad_norm": 5.322713851928711, "learning_rate": 2.783710909146792e-07, "loss": 0.1429, "num_input_tokens_seen": 35507248, "step": 58270 }, { "epoch": 18.080980452994105, "grad_norm": 10.908114433288574, "learning_rate": 2.779258450847877e-07, "loss": 0.0966, "num_input_tokens_seen": 35509616, "step": 58275 }, { "epoch": 18.08253180266832, "grad_norm": 11.785149574279785, "learning_rate": 2.7748094543884995e-07, "loss": 0.1088, "num_input_tokens_seen": 35512208, "step": 58280 }, { "epoch": 18.084083152342536, "grad_norm": 12.087581634521484, "learning_rate": 2.770363920094815e-07, "loss": 0.1243, "num_input_tokens_seen": 35515728, "step": 58285 }, { "epoch": 18.085634502016756, "grad_norm": 27.400012969970703, "learning_rate": 2.7659218482927464e-07, "loss": 0.1209, "num_input_tokens_seen": 35518640, "step": 58290 }, { "epoch": 18.087185851690972, "grad_norm": 3.4245564937591553, "learning_rate": 2.761483239307933e-07, "loss": 0.0892, "num_input_tokens_seen": 35523504, "step": 58295 }, { "epoch": 18.088737201365188, "grad_norm": 11.24815559387207, "learning_rate": 2.757048093465792e-07, "loss": 0.1628, "num_input_tokens_seen": 35526192, "step": 58300 }, { "epoch": 18.090288551039404, "grad_norm": 4.773723602294922, "learning_rate": 2.7526164110914577e-07, "loss": 0.126, "num_input_tokens_seen": 35529104, "step": 58305 }, { "epoch": 18.09183990071362, "grad_norm": 18.711984634399414, "learning_rate": 2.748188192509843e-07, "loss": 0.1781, "num_input_tokens_seen": 35532400, "step": 58310 }, { "epoch": 18.09339125038784, "grad_norm": 10.050068855285645, "learning_rate": 2.743763438045566e-07, "loss": 0.1552, "num_input_tokens_seen": 35535056, "step": 58315 }, { "epoch": 18.094942600062055, "grad_norm": 6.250933647155762, "learning_rate": 2.739342148023033e-07, "loss": 0.1553, "num_input_tokens_seen": 35537424, "step": 58320 }, { "epoch": 18.09649394973627, "grad_norm": 29.740535736083984, "learning_rate": 2.7349243227663744e-07, "loss": 0.1525, "num_input_tokens_seen": 35540752, "step": 58325 }, { "epoch": 18.098045299410487, "grad_norm": 49.688655853271484, "learning_rate": 2.7305099625994593e-07, "loss": 0.2009, "num_input_tokens_seen": 35543696, "step": 58330 }, { "epoch": 18.099596649084702, "grad_norm": 25.306175231933594, "learning_rate": 2.726099067845928e-07, "loss": 0.1339, "num_input_tokens_seen": 35546480, "step": 58335 }, { "epoch": 18.101147998758922, "grad_norm": 14.996966361999512, "learning_rate": 2.721691638829133e-07, "loss": 0.1586, "num_input_tokens_seen": 35549104, "step": 58340 }, { "epoch": 18.102699348433138, "grad_norm": 3.766075611114502, "learning_rate": 2.7172876758722045e-07, "loss": 0.1743, "num_input_tokens_seen": 35552784, "step": 58345 }, { "epoch": 18.104250698107354, "grad_norm": 31.182802200317383, "learning_rate": 2.712887179297996e-07, "loss": 0.1704, "num_input_tokens_seen": 35555568, "step": 58350 }, { "epoch": 18.10580204778157, "grad_norm": 15.080538749694824, "learning_rate": 2.708490149429127e-07, "loss": 0.1218, "num_input_tokens_seen": 35558256, "step": 58355 }, { "epoch": 18.107353397455785, "grad_norm": 26.63644027709961, "learning_rate": 2.704096586587934e-07, "loss": 0.2137, "num_input_tokens_seen": 35562544, "step": 58360 }, { "epoch": 18.108904747130005, "grad_norm": 14.426830291748047, "learning_rate": 2.699706491096543e-07, "loss": 0.2082, "num_input_tokens_seen": 35565712, "step": 58365 }, { "epoch": 18.11045609680422, "grad_norm": 13.8352632522583, "learning_rate": 2.695319863276774e-07, "loss": 0.0848, "num_input_tokens_seen": 35568656, "step": 58370 }, { "epoch": 18.112007446478437, "grad_norm": 16.7192325592041, "learning_rate": 2.690936703450231e-07, "loss": 0.0899, "num_input_tokens_seen": 35571536, "step": 58375 }, { "epoch": 18.113558796152653, "grad_norm": 22.75383949279785, "learning_rate": 2.6865570119382564e-07, "loss": 0.113, "num_input_tokens_seen": 35575184, "step": 58380 }, { "epoch": 18.11511014582687, "grad_norm": 20.472551345825195, "learning_rate": 2.6821807890619223e-07, "loss": 0.2196, "num_input_tokens_seen": 35577872, "step": 58385 }, { "epoch": 18.116661495501084, "grad_norm": 24.33823013305664, "learning_rate": 2.677808035142071e-07, "loss": 0.2601, "num_input_tokens_seen": 35580368, "step": 58390 }, { "epoch": 18.118212845175304, "grad_norm": 12.020730018615723, "learning_rate": 2.6734387504992633e-07, "loss": 0.1056, "num_input_tokens_seen": 35582928, "step": 58395 }, { "epoch": 18.11976419484952, "grad_norm": 26.5606746673584, "learning_rate": 2.6690729354538315e-07, "loss": 0.1897, "num_input_tokens_seen": 35586608, "step": 58400 }, { "epoch": 18.121315544523735, "grad_norm": 7.66979455947876, "learning_rate": 2.664710590325825e-07, "loss": 0.1855, "num_input_tokens_seen": 35589040, "step": 58405 }, { "epoch": 18.12286689419795, "grad_norm": 12.639813423156738, "learning_rate": 2.6603517154350777e-07, "loss": 0.1725, "num_input_tokens_seen": 35592208, "step": 58410 }, { "epoch": 18.124418243872167, "grad_norm": 12.787498474121094, "learning_rate": 2.655996311101122e-07, "loss": 0.0816, "num_input_tokens_seen": 35595664, "step": 58415 }, { "epoch": 18.125969593546387, "grad_norm": 6.600351333618164, "learning_rate": 2.651644377643287e-07, "loss": 0.1723, "num_input_tokens_seen": 35599024, "step": 58420 }, { "epoch": 18.127520943220603, "grad_norm": 13.263644218444824, "learning_rate": 2.647295915380599e-07, "loss": 0.127, "num_input_tokens_seen": 35601808, "step": 58425 }, { "epoch": 18.12907229289482, "grad_norm": 5.382325172424316, "learning_rate": 2.642950924631865e-07, "loss": 0.139, "num_input_tokens_seen": 35606000, "step": 58430 }, { "epoch": 18.130623642569034, "grad_norm": 46.40374755859375, "learning_rate": 2.638609405715614e-07, "loss": 0.1542, "num_input_tokens_seen": 35608784, "step": 58435 }, { "epoch": 18.13217499224325, "grad_norm": 9.003377914428711, "learning_rate": 2.6342713589501356e-07, "loss": 0.0764, "num_input_tokens_seen": 35611184, "step": 58440 }, { "epoch": 18.13372634191747, "grad_norm": 26.0997257232666, "learning_rate": 2.6299367846534693e-07, "loss": 0.1487, "num_input_tokens_seen": 35613616, "step": 58445 }, { "epoch": 18.135277691591686, "grad_norm": 12.249058723449707, "learning_rate": 2.625605683143373e-07, "loss": 0.1964, "num_input_tokens_seen": 35616112, "step": 58450 }, { "epoch": 18.1368290412659, "grad_norm": 8.360177040100098, "learning_rate": 2.621278054737386e-07, "loss": 0.1312, "num_input_tokens_seen": 35618832, "step": 58455 }, { "epoch": 18.138380390940117, "grad_norm": 21.130510330200195, "learning_rate": 2.6169538997527556e-07, "loss": 0.1103, "num_input_tokens_seen": 35622096, "step": 58460 }, { "epoch": 18.139931740614333, "grad_norm": 5.065833568572998, "learning_rate": 2.612633218506516e-07, "loss": 0.1927, "num_input_tokens_seen": 35626992, "step": 58465 }, { "epoch": 18.141483090288553, "grad_norm": 21.930606842041016, "learning_rate": 2.6083160113153985e-07, "loss": 0.162, "num_input_tokens_seen": 35629648, "step": 58470 }, { "epoch": 18.14303443996277, "grad_norm": 4.974485397338867, "learning_rate": 2.6040022784959316e-07, "loss": 0.1824, "num_input_tokens_seen": 35632240, "step": 58475 }, { "epoch": 18.144585789636984, "grad_norm": 6.225344657897949, "learning_rate": 2.599692020364336e-07, "loss": 0.1412, "num_input_tokens_seen": 35635664, "step": 58480 }, { "epoch": 18.1461371393112, "grad_norm": 19.353302001953125, "learning_rate": 2.5953852372366307e-07, "loss": 0.173, "num_input_tokens_seen": 35638448, "step": 58485 }, { "epoch": 18.147688488985416, "grad_norm": 21.80459213256836, "learning_rate": 2.591081929428535e-07, "loss": 0.1804, "num_input_tokens_seen": 35641872, "step": 58490 }, { "epoch": 18.149239838659636, "grad_norm": 10.889976501464844, "learning_rate": 2.5867820972555413e-07, "loss": 0.1192, "num_input_tokens_seen": 35644432, "step": 58495 }, { "epoch": 18.15079118833385, "grad_norm": 28.352855682373047, "learning_rate": 2.582485741032881e-07, "loss": 0.179, "num_input_tokens_seen": 35647120, "step": 58500 }, { "epoch": 18.152342538008067, "grad_norm": 17.01265525817871, "learning_rate": 2.578192861075518e-07, "loss": 0.1689, "num_input_tokens_seen": 35649648, "step": 58505 }, { "epoch": 18.153893887682283, "grad_norm": 18.668447494506836, "learning_rate": 2.5739034576981794e-07, "loss": 0.1128, "num_input_tokens_seen": 35652752, "step": 58510 }, { "epoch": 18.1554452373565, "grad_norm": 16.95527458190918, "learning_rate": 2.569617531215324e-07, "loss": 0.2178, "num_input_tokens_seen": 35655824, "step": 58515 }, { "epoch": 18.156996587030715, "grad_norm": 12.036088943481445, "learning_rate": 2.565335081941167e-07, "loss": 0.1406, "num_input_tokens_seen": 35658704, "step": 58520 }, { "epoch": 18.158547936704935, "grad_norm": 6.181137561798096, "learning_rate": 2.561056110189653e-07, "loss": 0.141, "num_input_tokens_seen": 35662448, "step": 58525 }, { "epoch": 18.16009928637915, "grad_norm": 15.847577095031738, "learning_rate": 2.556780616274496e-07, "loss": 0.0891, "num_input_tokens_seen": 35665296, "step": 58530 }, { "epoch": 18.161650636053366, "grad_norm": 11.566193580627441, "learning_rate": 2.5525086005091235e-07, "loss": 0.1302, "num_input_tokens_seen": 35668240, "step": 58535 }, { "epoch": 18.163201985727582, "grad_norm": 10.114180564880371, "learning_rate": 2.5482400632067415e-07, "loss": 0.1375, "num_input_tokens_seen": 35671152, "step": 58540 }, { "epoch": 18.164753335401798, "grad_norm": 37.244346618652344, "learning_rate": 2.5439750046802656e-07, "loss": 0.1924, "num_input_tokens_seen": 35673744, "step": 58545 }, { "epoch": 18.166304685076017, "grad_norm": 9.676309585571289, "learning_rate": 2.5397134252423906e-07, "loss": 0.1079, "num_input_tokens_seen": 35677552, "step": 58550 }, { "epoch": 18.167856034750233, "grad_norm": 2.399851083755493, "learning_rate": 2.5354553252055324e-07, "loss": 0.1305, "num_input_tokens_seen": 35680336, "step": 58555 }, { "epoch": 18.16940738442445, "grad_norm": 7.908320903778076, "learning_rate": 2.5312007048818646e-07, "loss": 0.1449, "num_input_tokens_seen": 35682640, "step": 58560 }, { "epoch": 18.170958734098665, "grad_norm": 9.573356628417969, "learning_rate": 2.526949564583303e-07, "loss": 0.1455, "num_input_tokens_seen": 35685520, "step": 58565 }, { "epoch": 18.17251008377288, "grad_norm": 15.540846824645996, "learning_rate": 2.5227019046214993e-07, "loss": 0.1765, "num_input_tokens_seen": 35688560, "step": 58570 }, { "epoch": 18.1740614334471, "grad_norm": 11.918977737426758, "learning_rate": 2.5184577253078656e-07, "loss": 0.1195, "num_input_tokens_seen": 35691312, "step": 58575 }, { "epoch": 18.175612783121316, "grad_norm": 15.866782188415527, "learning_rate": 2.5142170269535417e-07, "loss": 0.2438, "num_input_tokens_seen": 35693840, "step": 58580 }, { "epoch": 18.177164132795532, "grad_norm": 15.428861618041992, "learning_rate": 2.509979809869428e-07, "loss": 0.1927, "num_input_tokens_seen": 35696656, "step": 58585 }, { "epoch": 18.178715482469748, "grad_norm": 9.584329605102539, "learning_rate": 2.5057460743661556e-07, "loss": 0.1725, "num_input_tokens_seen": 35699952, "step": 58590 }, { "epoch": 18.180266832143964, "grad_norm": 10.089816093444824, "learning_rate": 2.501515820754119e-07, "loss": 0.1701, "num_input_tokens_seen": 35702736, "step": 58595 }, { "epoch": 18.181818181818183, "grad_norm": 8.929512023925781, "learning_rate": 2.497289049343438e-07, "loss": 0.1843, "num_input_tokens_seen": 35706064, "step": 58600 }, { "epoch": 18.1833695314924, "grad_norm": 15.424642562866211, "learning_rate": 2.493065760443986e-07, "loss": 0.1159, "num_input_tokens_seen": 35709008, "step": 58605 }, { "epoch": 18.184920881166615, "grad_norm": 7.373140335083008, "learning_rate": 2.4888459543653763e-07, "loss": 0.0753, "num_input_tokens_seen": 35711920, "step": 58610 }, { "epoch": 18.18647223084083, "grad_norm": 38.0463981628418, "learning_rate": 2.484629631416968e-07, "loss": 0.1268, "num_input_tokens_seen": 35716272, "step": 58615 }, { "epoch": 18.188023580515047, "grad_norm": 10.798398971557617, "learning_rate": 2.480416791907886e-07, "loss": 0.1526, "num_input_tokens_seen": 35718800, "step": 58620 }, { "epoch": 18.189574930189266, "grad_norm": 27.179746627807617, "learning_rate": 2.4762074361469656e-07, "loss": 0.1338, "num_input_tokens_seen": 35721872, "step": 58625 }, { "epoch": 18.191126279863482, "grad_norm": 14.332606315612793, "learning_rate": 2.4720015644428e-07, "loss": 0.1288, "num_input_tokens_seen": 35724752, "step": 58630 }, { "epoch": 18.192677629537698, "grad_norm": 17.511537551879883, "learning_rate": 2.4677991771037467e-07, "loss": 0.1629, "num_input_tokens_seen": 35727120, "step": 58635 }, { "epoch": 18.194228979211914, "grad_norm": 18.998090744018555, "learning_rate": 2.4636002744378664e-07, "loss": 0.1651, "num_input_tokens_seen": 35730032, "step": 58640 }, { "epoch": 18.19578032888613, "grad_norm": 15.696934700012207, "learning_rate": 2.459404856753006e-07, "loss": 0.165, "num_input_tokens_seen": 35733936, "step": 58645 }, { "epoch": 18.197331678560346, "grad_norm": 27.331388473510742, "learning_rate": 2.455212924356742e-07, "loss": 0.2145, "num_input_tokens_seen": 35736624, "step": 58650 }, { "epoch": 18.198883028234565, "grad_norm": 10.329652786254883, "learning_rate": 2.4510244775563743e-07, "loss": 0.0991, "num_input_tokens_seen": 35739536, "step": 58655 }, { "epoch": 18.20043437790878, "grad_norm": 14.8436279296875, "learning_rate": 2.4468395166589885e-07, "loss": 0.1223, "num_input_tokens_seen": 35742736, "step": 58660 }, { "epoch": 18.201985727582997, "grad_norm": 7.990160942077637, "learning_rate": 2.4426580419713684e-07, "loss": 0.1303, "num_input_tokens_seen": 35748048, "step": 58665 }, { "epoch": 18.203537077257213, "grad_norm": 15.809405326843262, "learning_rate": 2.4384800538000854e-07, "loss": 0.2679, "num_input_tokens_seen": 35750960, "step": 58670 }, { "epoch": 18.20508842693143, "grad_norm": 14.037792205810547, "learning_rate": 2.4343055524514214e-07, "loss": 0.2138, "num_input_tokens_seen": 35753872, "step": 58675 }, { "epoch": 18.20663977660565, "grad_norm": 12.110166549682617, "learning_rate": 2.430134538231427e-07, "loss": 0.1583, "num_input_tokens_seen": 35756944, "step": 58680 }, { "epoch": 18.208191126279864, "grad_norm": 65.22653198242188, "learning_rate": 2.425967011445879e-07, "loss": 0.2204, "num_input_tokens_seen": 35759184, "step": 58685 }, { "epoch": 18.20974247595408, "grad_norm": 37.90193557739258, "learning_rate": 2.4218029724003165e-07, "loss": 0.1027, "num_input_tokens_seen": 35761936, "step": 58690 }, { "epoch": 18.211293825628296, "grad_norm": 6.354044437408447, "learning_rate": 2.417642421399996e-07, "loss": 0.1108, "num_input_tokens_seen": 35765072, "step": 58695 }, { "epoch": 18.212845175302512, "grad_norm": 16.583051681518555, "learning_rate": 2.413485358749945e-07, "loss": 0.1289, "num_input_tokens_seen": 35767664, "step": 58700 }, { "epoch": 18.21439652497673, "grad_norm": 17.2268123626709, "learning_rate": 2.409331784754937e-07, "loss": 0.1901, "num_input_tokens_seen": 35771440, "step": 58705 }, { "epoch": 18.215947874650947, "grad_norm": 22.896869659423828, "learning_rate": 2.4051816997194555e-07, "loss": 0.1745, "num_input_tokens_seen": 35773808, "step": 58710 }, { "epoch": 18.217499224325163, "grad_norm": 49.97030258178711, "learning_rate": 2.401035103947774e-07, "loss": 0.1278, "num_input_tokens_seen": 35778064, "step": 58715 }, { "epoch": 18.21905057399938, "grad_norm": 11.590950012207031, "learning_rate": 2.3968919977438664e-07, "loss": 0.1823, "num_input_tokens_seen": 35780336, "step": 58720 }, { "epoch": 18.220601923673595, "grad_norm": 33.241512298583984, "learning_rate": 2.392752381411484e-07, "loss": 0.2182, "num_input_tokens_seen": 35782928, "step": 58725 }, { "epoch": 18.222153273347814, "grad_norm": 28.101612091064453, "learning_rate": 2.388616255254106e-07, "loss": 0.1958, "num_input_tokens_seen": 35786384, "step": 58730 }, { "epoch": 18.22370462302203, "grad_norm": 16.492706298828125, "learning_rate": 2.384483619574962e-07, "loss": 0.1342, "num_input_tokens_seen": 35789328, "step": 58735 }, { "epoch": 18.225255972696246, "grad_norm": 19.20598030090332, "learning_rate": 2.3803544746770158e-07, "loss": 0.18, "num_input_tokens_seen": 35791888, "step": 58740 }, { "epoch": 18.226807322370462, "grad_norm": 23.031871795654297, "learning_rate": 2.3762288208629914e-07, "loss": 0.1926, "num_input_tokens_seen": 35794960, "step": 58745 }, { "epoch": 18.228358672044678, "grad_norm": 16.937965393066406, "learning_rate": 2.3721066584353414e-07, "loss": 0.0734, "num_input_tokens_seen": 35798160, "step": 58750 }, { "epoch": 18.229910021718897, "grad_norm": 52.92826461791992, "learning_rate": 2.367987987696274e-07, "loss": 0.31, "num_input_tokens_seen": 35800560, "step": 58755 }, { "epoch": 18.231461371393113, "grad_norm": 5.51541805267334, "learning_rate": 2.3638728089477315e-07, "loss": 0.1618, "num_input_tokens_seen": 35803504, "step": 58760 }, { "epoch": 18.23301272106733, "grad_norm": 11.296818733215332, "learning_rate": 2.359761122491411e-07, "loss": 0.1291, "num_input_tokens_seen": 35808560, "step": 58765 }, { "epoch": 18.234564070741545, "grad_norm": 10.476773262023926, "learning_rate": 2.3556529286287488e-07, "loss": 0.0863, "num_input_tokens_seen": 35811088, "step": 58770 }, { "epoch": 18.23611542041576, "grad_norm": 15.931147575378418, "learning_rate": 2.3515482276609104e-07, "loss": 0.2353, "num_input_tokens_seen": 35815120, "step": 58775 }, { "epoch": 18.237666770089977, "grad_norm": 52.68217086791992, "learning_rate": 2.3474470198888378e-07, "loss": 0.1729, "num_input_tokens_seen": 35817936, "step": 58780 }, { "epoch": 18.239218119764196, "grad_norm": 12.691405296325684, "learning_rate": 2.3433493056131851e-07, "loss": 0.2007, "num_input_tokens_seen": 35820240, "step": 58785 }, { "epoch": 18.240769469438412, "grad_norm": 13.889822959899902, "learning_rate": 2.3392550851343732e-07, "loss": 0.1045, "num_input_tokens_seen": 35823280, "step": 58790 }, { "epoch": 18.242320819112628, "grad_norm": 31.030656814575195, "learning_rate": 2.3351643587525397e-07, "loss": 0.1465, "num_input_tokens_seen": 35825776, "step": 58795 }, { "epoch": 18.243872168786844, "grad_norm": 54.69828414916992, "learning_rate": 2.3310771267676057e-07, "loss": 0.1789, "num_input_tokens_seen": 35828784, "step": 58800 }, { "epoch": 18.24542351846106, "grad_norm": 3.915635585784912, "learning_rate": 2.3269933894791986e-07, "loss": 0.1424, "num_input_tokens_seen": 35831184, "step": 58805 }, { "epoch": 18.24697486813528, "grad_norm": 12.077703475952148, "learning_rate": 2.3229131471867117e-07, "loss": 0.1276, "num_input_tokens_seen": 35835280, "step": 58810 }, { "epoch": 18.248526217809495, "grad_norm": 11.499444961547852, "learning_rate": 2.3188364001892672e-07, "loss": 0.1755, "num_input_tokens_seen": 35837520, "step": 58815 }, { "epoch": 18.25007756748371, "grad_norm": 22.977323532104492, "learning_rate": 2.3147631487857426e-07, "loss": 0.2715, "num_input_tokens_seen": 35840528, "step": 58820 }, { "epoch": 18.251628917157927, "grad_norm": 63.27473068237305, "learning_rate": 2.3106933932747654e-07, "loss": 0.1508, "num_input_tokens_seen": 35842896, "step": 58825 }, { "epoch": 18.253180266832143, "grad_norm": 3.458049774169922, "learning_rate": 2.3066271339546809e-07, "loss": 0.1071, "num_input_tokens_seen": 35845904, "step": 58830 }, { "epoch": 18.254731616506362, "grad_norm": 10.394940376281738, "learning_rate": 2.3025643711236055e-07, "loss": 0.1006, "num_input_tokens_seen": 35848912, "step": 58835 }, { "epoch": 18.256282966180578, "grad_norm": 9.161194801330566, "learning_rate": 2.2985051050793795e-07, "loss": 0.1435, "num_input_tokens_seen": 35851632, "step": 58840 }, { "epoch": 18.257834315854794, "grad_norm": 13.252434730529785, "learning_rate": 2.2944493361196084e-07, "loss": 0.1928, "num_input_tokens_seen": 35856080, "step": 58845 }, { "epoch": 18.25938566552901, "grad_norm": 7.676934242248535, "learning_rate": 2.2903970645416108e-07, "loss": 0.128, "num_input_tokens_seen": 35860464, "step": 58850 }, { "epoch": 18.260937015203226, "grad_norm": 7.728517532348633, "learning_rate": 2.2863482906424816e-07, "loss": 0.0789, "num_input_tokens_seen": 35863344, "step": 58855 }, { "epoch": 18.262488364877445, "grad_norm": 17.766014099121094, "learning_rate": 2.2823030147190284e-07, "loss": 0.1647, "num_input_tokens_seen": 35866416, "step": 58860 }, { "epoch": 18.26403971455166, "grad_norm": 7.471001625061035, "learning_rate": 2.2782612370678358e-07, "loss": 0.1751, "num_input_tokens_seen": 35869904, "step": 58865 }, { "epoch": 18.265591064225877, "grad_norm": 18.25959587097168, "learning_rate": 2.274222957985195e-07, "loss": 0.2174, "num_input_tokens_seen": 35872272, "step": 58870 }, { "epoch": 18.267142413900093, "grad_norm": 7.9031548500061035, "learning_rate": 2.270188177767174e-07, "loss": 0.2069, "num_input_tokens_seen": 35875248, "step": 58875 }, { "epoch": 18.26869376357431, "grad_norm": 17.4116268157959, "learning_rate": 2.2661568967095648e-07, "loss": 0.1213, "num_input_tokens_seen": 35878224, "step": 58880 }, { "epoch": 18.270245113248528, "grad_norm": 3.797008991241455, "learning_rate": 2.2621291151079029e-07, "loss": 0.1326, "num_input_tokens_seen": 35881008, "step": 58885 }, { "epoch": 18.271796462922744, "grad_norm": 14.507168769836426, "learning_rate": 2.258104833257485e-07, "loss": 0.1451, "num_input_tokens_seen": 35883984, "step": 58890 }, { "epoch": 18.27334781259696, "grad_norm": 33.8006706237793, "learning_rate": 2.2540840514533258e-07, "loss": 0.149, "num_input_tokens_seen": 35886704, "step": 58895 }, { "epoch": 18.274899162271176, "grad_norm": 22.074172973632812, "learning_rate": 2.250066769990211e-07, "loss": 0.1259, "num_input_tokens_seen": 35889552, "step": 58900 }, { "epoch": 18.27645051194539, "grad_norm": 22.84682273864746, "learning_rate": 2.2460529891626393e-07, "loss": 0.1628, "num_input_tokens_seen": 35894288, "step": 58905 }, { "epoch": 18.278001861619607, "grad_norm": 21.71010398864746, "learning_rate": 2.2420427092648743e-07, "loss": 0.137, "num_input_tokens_seen": 35896336, "step": 58910 }, { "epoch": 18.279553211293827, "grad_norm": 37.30110168457031, "learning_rate": 2.2380359305909205e-07, "loss": 0.2066, "num_input_tokens_seen": 35899248, "step": 58915 }, { "epoch": 18.281104560968043, "grad_norm": 34.4993896484375, "learning_rate": 2.2340326534345202e-07, "loss": 0.2089, "num_input_tokens_seen": 35901808, "step": 58920 }, { "epoch": 18.28265591064226, "grad_norm": 14.074066162109375, "learning_rate": 2.2300328780891555e-07, "loss": 0.1939, "num_input_tokens_seen": 35905264, "step": 58925 }, { "epoch": 18.284207260316474, "grad_norm": 32.68290328979492, "learning_rate": 2.226036604848064e-07, "loss": 0.2087, "num_input_tokens_seen": 35908880, "step": 58930 }, { "epoch": 18.28575860999069, "grad_norm": 4.867822170257568, "learning_rate": 2.2220438340042173e-07, "loss": 0.1383, "num_input_tokens_seen": 35912592, "step": 58935 }, { "epoch": 18.28730995966491, "grad_norm": 28.894502639770508, "learning_rate": 2.2180545658503306e-07, "loss": 0.1901, "num_input_tokens_seen": 35914960, "step": 58940 }, { "epoch": 18.288861309339126, "grad_norm": 31.943218231201172, "learning_rate": 2.2140688006788702e-07, "loss": 0.2826, "num_input_tokens_seen": 35918192, "step": 58945 }, { "epoch": 18.29041265901334, "grad_norm": 26.264028549194336, "learning_rate": 2.2100865387820358e-07, "loss": 0.1484, "num_input_tokens_seen": 35920592, "step": 58950 }, { "epoch": 18.291964008687557, "grad_norm": 15.57768726348877, "learning_rate": 2.2061077804517772e-07, "loss": 0.1221, "num_input_tokens_seen": 35923376, "step": 58955 }, { "epoch": 18.293515358361773, "grad_norm": 24.126291275024414, "learning_rate": 2.2021325259797776e-07, "loss": 0.1675, "num_input_tokens_seen": 35925936, "step": 58960 }, { "epoch": 18.295066708035993, "grad_norm": 2.0945262908935547, "learning_rate": 2.1981607756574874e-07, "loss": 0.255, "num_input_tokens_seen": 35929712, "step": 58965 }, { "epoch": 18.29661805771021, "grad_norm": 7.9402923583984375, "learning_rate": 2.194192529776057e-07, "loss": 0.1339, "num_input_tokens_seen": 35932208, "step": 58970 }, { "epoch": 18.298169407384425, "grad_norm": 23.830036163330078, "learning_rate": 2.190227788626431e-07, "loss": 0.1331, "num_input_tokens_seen": 35934832, "step": 58975 }, { "epoch": 18.29972075705864, "grad_norm": 32.07362747192383, "learning_rate": 2.18626655249925e-07, "loss": 0.2266, "num_input_tokens_seen": 35937808, "step": 58980 }, { "epoch": 18.301272106732856, "grad_norm": 17.152170181274414, "learning_rate": 2.1823088216849363e-07, "loss": 0.2512, "num_input_tokens_seen": 35940656, "step": 58985 }, { "epoch": 18.302823456407076, "grad_norm": 10.81831169128418, "learning_rate": 2.1783545964736308e-07, "loss": 0.1856, "num_input_tokens_seen": 35945040, "step": 58990 }, { "epoch": 18.30437480608129, "grad_norm": 17.375511169433594, "learning_rate": 2.174403877155229e-07, "loss": 0.166, "num_input_tokens_seen": 35947312, "step": 58995 }, { "epoch": 18.305926155755508, "grad_norm": 10.856179237365723, "learning_rate": 2.1704566640193548e-07, "loss": 0.1766, "num_input_tokens_seen": 35950672, "step": 59000 }, { "epoch": 18.307477505429723, "grad_norm": 9.41356372833252, "learning_rate": 2.166512957355399e-07, "loss": 0.0995, "num_input_tokens_seen": 35954160, "step": 59005 }, { "epoch": 18.30902885510394, "grad_norm": 5.469070911407471, "learning_rate": 2.16257275745248e-07, "loss": 0.13, "num_input_tokens_seen": 35957296, "step": 59010 }, { "epoch": 18.31058020477816, "grad_norm": 23.47401237487793, "learning_rate": 2.158636064599451e-07, "loss": 0.1335, "num_input_tokens_seen": 35960624, "step": 59015 }, { "epoch": 18.312131554452375, "grad_norm": 25.55999183654785, "learning_rate": 2.1547028790849301e-07, "loss": 0.1824, "num_input_tokens_seen": 35964336, "step": 59020 }, { "epoch": 18.31368290412659, "grad_norm": 10.462577819824219, "learning_rate": 2.1507732011972592e-07, "loss": 0.172, "num_input_tokens_seen": 35967600, "step": 59025 }, { "epoch": 18.315234253800806, "grad_norm": 30.014015197753906, "learning_rate": 2.146847031224536e-07, "loss": 0.1572, "num_input_tokens_seen": 35970768, "step": 59030 }, { "epoch": 18.316785603475022, "grad_norm": 11.147017478942871, "learning_rate": 2.1429243694545854e-07, "loss": 0.1067, "num_input_tokens_seen": 35972944, "step": 59035 }, { "epoch": 18.318336953149238, "grad_norm": 6.9890265464782715, "learning_rate": 2.1390052161749942e-07, "loss": 0.2268, "num_input_tokens_seen": 35975696, "step": 59040 }, { "epoch": 18.319888302823458, "grad_norm": 20.089420318603516, "learning_rate": 2.1350895716730768e-07, "loss": 0.167, "num_input_tokens_seen": 35979856, "step": 59045 }, { "epoch": 18.321439652497673, "grad_norm": 12.948580741882324, "learning_rate": 2.1311774362359038e-07, "loss": 0.1164, "num_input_tokens_seen": 35983504, "step": 59050 }, { "epoch": 18.32299100217189, "grad_norm": 9.848217964172363, "learning_rate": 2.1272688101502736e-07, "loss": 0.1812, "num_input_tokens_seen": 35985904, "step": 59055 }, { "epoch": 18.324542351846105, "grad_norm": 57.346832275390625, "learning_rate": 2.1233636937027346e-07, "loss": 0.1643, "num_input_tokens_seen": 35988528, "step": 59060 }, { "epoch": 18.32609370152032, "grad_norm": 6.251698970794678, "learning_rate": 2.1194620871795857e-07, "loss": 0.1549, "num_input_tokens_seen": 35990960, "step": 59065 }, { "epoch": 18.32764505119454, "grad_norm": 35.50924301147461, "learning_rate": 2.1155639908668536e-07, "loss": 0.1847, "num_input_tokens_seen": 35993392, "step": 59070 }, { "epoch": 18.329196400868756, "grad_norm": 40.76069641113281, "learning_rate": 2.1116694050503206e-07, "loss": 0.2174, "num_input_tokens_seen": 35996016, "step": 59075 }, { "epoch": 18.330747750542972, "grad_norm": 12.250717163085938, "learning_rate": 2.1077783300154974e-07, "loss": 0.1496, "num_input_tokens_seen": 35998768, "step": 59080 }, { "epoch": 18.33229910021719, "grad_norm": 31.863615036010742, "learning_rate": 2.1038907660476615e-07, "loss": 0.2041, "num_input_tokens_seen": 36000816, "step": 59085 }, { "epoch": 18.333850449891404, "grad_norm": 17.61249542236328, "learning_rate": 2.1000067134317958e-07, "loss": 0.1377, "num_input_tokens_seen": 36003280, "step": 59090 }, { "epoch": 18.335401799565624, "grad_norm": 41.92609405517578, "learning_rate": 2.0961261724526673e-07, "loss": 0.1519, "num_input_tokens_seen": 36007376, "step": 59095 }, { "epoch": 18.33695314923984, "grad_norm": 16.31584930419922, "learning_rate": 2.0922491433947535e-07, "loss": 0.1803, "num_input_tokens_seen": 36011728, "step": 59100 }, { "epoch": 18.338504498914055, "grad_norm": 3.1934170722961426, "learning_rate": 2.0883756265422938e-07, "loss": 0.0961, "num_input_tokens_seen": 36014640, "step": 59105 }, { "epoch": 18.34005584858827, "grad_norm": 3.7287790775299072, "learning_rate": 2.0845056221792502e-07, "loss": 0.0893, "num_input_tokens_seen": 36017392, "step": 59110 }, { "epoch": 18.341607198262487, "grad_norm": 7.7202582359313965, "learning_rate": 2.0806391305893568e-07, "loss": 0.1208, "num_input_tokens_seen": 36022384, "step": 59115 }, { "epoch": 18.343158547936707, "grad_norm": 15.591760635375977, "learning_rate": 2.0767761520560591e-07, "loss": 0.1674, "num_input_tokens_seen": 36026064, "step": 59120 }, { "epoch": 18.344709897610922, "grad_norm": 6.365888595581055, "learning_rate": 2.0729166868625695e-07, "loss": 0.2053, "num_input_tokens_seen": 36028944, "step": 59125 }, { "epoch": 18.34626124728514, "grad_norm": 6.651760101318359, "learning_rate": 2.069060735291828e-07, "loss": 0.081, "num_input_tokens_seen": 36032272, "step": 59130 }, { "epoch": 18.347812596959354, "grad_norm": 21.545236587524414, "learning_rate": 2.0652082976265196e-07, "loss": 0.1492, "num_input_tokens_seen": 36035632, "step": 59135 }, { "epoch": 18.34936394663357, "grad_norm": 9.961524963378906, "learning_rate": 2.06135937414908e-07, "loss": 0.1214, "num_input_tokens_seen": 36038320, "step": 59140 }, { "epoch": 18.35091529630779, "grad_norm": 26.630352020263672, "learning_rate": 2.057513965141672e-07, "loss": 0.2607, "num_input_tokens_seen": 36042160, "step": 59145 }, { "epoch": 18.352466645982005, "grad_norm": 33.31993103027344, "learning_rate": 2.05367207088622e-07, "loss": 0.1748, "num_input_tokens_seen": 36045872, "step": 59150 }, { "epoch": 18.35401799565622, "grad_norm": 11.417014122009277, "learning_rate": 2.0498336916643712e-07, "loss": 0.1482, "num_input_tokens_seen": 36049648, "step": 59155 }, { "epoch": 18.355569345330437, "grad_norm": 49.6906852722168, "learning_rate": 2.0459988277575337e-07, "loss": 0.1588, "num_input_tokens_seen": 36052720, "step": 59160 }, { "epoch": 18.357120695004653, "grad_norm": 20.51811408996582, "learning_rate": 2.0421674794468326e-07, "loss": 0.0846, "num_input_tokens_seen": 36055984, "step": 59165 }, { "epoch": 18.35867204467887, "grad_norm": 6.667943954467773, "learning_rate": 2.0383396470131654e-07, "loss": 0.0826, "num_input_tokens_seen": 36058800, "step": 59170 }, { "epoch": 18.36022339435309, "grad_norm": 3.9670021533966064, "learning_rate": 2.0345153307371523e-07, "loss": 0.1192, "num_input_tokens_seen": 36061872, "step": 59175 }, { "epoch": 18.361774744027304, "grad_norm": 38.5084114074707, "learning_rate": 2.0306945308991578e-07, "loss": 0.1509, "num_input_tokens_seen": 36065872, "step": 59180 }, { "epoch": 18.36332609370152, "grad_norm": 1.7889597415924072, "learning_rate": 2.0268772477793075e-07, "loss": 0.1555, "num_input_tokens_seen": 36070096, "step": 59185 }, { "epoch": 18.364877443375736, "grad_norm": 7.333957195281982, "learning_rate": 2.023063481657428e-07, "loss": 0.1425, "num_input_tokens_seen": 36072592, "step": 59190 }, { "epoch": 18.366428793049952, "grad_norm": 19.05514144897461, "learning_rate": 2.0192532328131397e-07, "loss": 0.0936, "num_input_tokens_seen": 36076048, "step": 59195 }, { "epoch": 18.36798014272417, "grad_norm": 20.221792221069336, "learning_rate": 2.0154465015257586e-07, "loss": 0.1497, "num_input_tokens_seen": 36078736, "step": 59200 }, { "epoch": 18.369531492398387, "grad_norm": 8.831972122192383, "learning_rate": 2.0116432880743663e-07, "loss": 0.1302, "num_input_tokens_seen": 36081808, "step": 59205 }, { "epoch": 18.371082842072603, "grad_norm": 25.713865280151367, "learning_rate": 2.007843592737796e-07, "loss": 0.2373, "num_input_tokens_seen": 36084816, "step": 59210 }, { "epoch": 18.37263419174682, "grad_norm": 11.977892875671387, "learning_rate": 2.004047415794602e-07, "loss": 0.176, "num_input_tokens_seen": 36087568, "step": 59215 }, { "epoch": 18.374185541421035, "grad_norm": 5.5950822830200195, "learning_rate": 2.0002547575230845e-07, "loss": 0.0967, "num_input_tokens_seen": 36090416, "step": 59220 }, { "epoch": 18.375736891095254, "grad_norm": 13.616537094116211, "learning_rate": 1.9964656182013042e-07, "loss": 0.0808, "num_input_tokens_seen": 36093136, "step": 59225 }, { "epoch": 18.37728824076947, "grad_norm": 16.243955612182617, "learning_rate": 1.9926799981070334e-07, "loss": 0.0827, "num_input_tokens_seen": 36095632, "step": 59230 }, { "epoch": 18.378839590443686, "grad_norm": 65.44770050048828, "learning_rate": 1.9888978975178164e-07, "loss": 0.2373, "num_input_tokens_seen": 36098672, "step": 59235 }, { "epoch": 18.380390940117902, "grad_norm": 5.639974594116211, "learning_rate": 1.985119316710915e-07, "loss": 0.2279, "num_input_tokens_seen": 36101456, "step": 59240 }, { "epoch": 18.381942289792118, "grad_norm": 6.335256099700928, "learning_rate": 1.9813442559633523e-07, "loss": 0.1252, "num_input_tokens_seen": 36104496, "step": 59245 }, { "epoch": 18.383493639466337, "grad_norm": 10.893733978271484, "learning_rate": 1.9775727155518787e-07, "loss": 0.0806, "num_input_tokens_seen": 36109328, "step": 59250 }, { "epoch": 18.385044989140553, "grad_norm": 1.697224736213684, "learning_rate": 1.973804695753001e-07, "loss": 0.1013, "num_input_tokens_seen": 36112272, "step": 59255 }, { "epoch": 18.38659633881477, "grad_norm": 23.685251235961914, "learning_rate": 1.9700401968429483e-07, "loss": 0.1973, "num_input_tokens_seen": 36115216, "step": 59260 }, { "epoch": 18.388147688488985, "grad_norm": 8.305006980895996, "learning_rate": 1.9662792190977166e-07, "loss": 0.1692, "num_input_tokens_seen": 36118032, "step": 59265 }, { "epoch": 18.3896990381632, "grad_norm": 15.423949241638184, "learning_rate": 1.9625217627930126e-07, "loss": 0.2166, "num_input_tokens_seen": 36120368, "step": 59270 }, { "epoch": 18.39125038783742, "grad_norm": 7.810300827026367, "learning_rate": 1.9587678282043164e-07, "loss": 0.1585, "num_input_tokens_seen": 36123504, "step": 59275 }, { "epoch": 18.392801737511636, "grad_norm": 34.87105941772461, "learning_rate": 1.9550174156068302e-07, "loss": 0.1347, "num_input_tokens_seen": 36126576, "step": 59280 }, { "epoch": 18.394353087185852, "grad_norm": 21.59481430053711, "learning_rate": 1.951270525275506e-07, "loss": 0.2491, "num_input_tokens_seen": 36128720, "step": 59285 }, { "epoch": 18.395904436860068, "grad_norm": 19.54717445373535, "learning_rate": 1.9475271574850409e-07, "loss": 0.1851, "num_input_tokens_seen": 36132912, "step": 59290 }, { "epoch": 18.397455786534284, "grad_norm": 9.717788696289062, "learning_rate": 1.943787312509854e-07, "loss": 0.1177, "num_input_tokens_seen": 36136016, "step": 59295 }, { "epoch": 18.3990071362085, "grad_norm": 8.020023345947266, "learning_rate": 1.9400509906241316e-07, "loss": 0.122, "num_input_tokens_seen": 36139120, "step": 59300 }, { "epoch": 18.40055848588272, "grad_norm": 40.439048767089844, "learning_rate": 1.9363181921017826e-07, "loss": 0.1437, "num_input_tokens_seen": 36142800, "step": 59305 }, { "epoch": 18.402109835556935, "grad_norm": 5.603060245513916, "learning_rate": 1.9325889172164714e-07, "loss": 0.189, "num_input_tokens_seen": 36145392, "step": 59310 }, { "epoch": 18.40366118523115, "grad_norm": 3.002889394760132, "learning_rate": 1.9288631662415958e-07, "loss": 0.1215, "num_input_tokens_seen": 36148208, "step": 59315 }, { "epoch": 18.405212534905367, "grad_norm": 21.29116439819336, "learning_rate": 1.9251409394502983e-07, "loss": 0.1574, "num_input_tokens_seen": 36150704, "step": 59320 }, { "epoch": 18.406763884579583, "grad_norm": 27.53178596496582, "learning_rate": 1.9214222371154613e-07, "loss": 0.3086, "num_input_tokens_seen": 36153776, "step": 59325 }, { "epoch": 18.408315234253802, "grad_norm": 14.698678970336914, "learning_rate": 1.9177070595097047e-07, "loss": 0.1191, "num_input_tokens_seen": 36156528, "step": 59330 }, { "epoch": 18.409866583928018, "grad_norm": 15.079916954040527, "learning_rate": 1.9139954069054113e-07, "loss": 0.0965, "num_input_tokens_seen": 36159280, "step": 59335 }, { "epoch": 18.411417933602234, "grad_norm": 4.987203121185303, "learning_rate": 1.9102872795746685e-07, "loss": 0.0891, "num_input_tokens_seen": 36162928, "step": 59340 }, { "epoch": 18.41296928327645, "grad_norm": 22.79470443725586, "learning_rate": 1.9065826777893425e-07, "loss": 0.1257, "num_input_tokens_seen": 36165360, "step": 59345 }, { "epoch": 18.414520632950666, "grad_norm": 25.910350799560547, "learning_rate": 1.9028816018210106e-07, "loss": 0.2173, "num_input_tokens_seen": 36168112, "step": 59350 }, { "epoch": 18.416071982624885, "grad_norm": 32.97419357299805, "learning_rate": 1.8991840519410166e-07, "loss": 0.1214, "num_input_tokens_seen": 36171408, "step": 59355 }, { "epoch": 18.4176233322991, "grad_norm": 24.64226722717285, "learning_rate": 1.8954900284204269e-07, "loss": 0.124, "num_input_tokens_seen": 36174608, "step": 59360 }, { "epoch": 18.419174681973317, "grad_norm": 27.0919246673584, "learning_rate": 1.891799531530064e-07, "loss": 0.1874, "num_input_tokens_seen": 36177776, "step": 59365 }, { "epoch": 18.420726031647533, "grad_norm": 36.67798614501953, "learning_rate": 1.8881125615404783e-07, "loss": 0.1883, "num_input_tokens_seen": 36180592, "step": 59370 }, { "epoch": 18.42227738132175, "grad_norm": 6.368759632110596, "learning_rate": 1.884429118721981e-07, "loss": 0.1578, "num_input_tokens_seen": 36184976, "step": 59375 }, { "epoch": 18.423828730995968, "grad_norm": 16.20309066772461, "learning_rate": 1.8807492033445895e-07, "loss": 0.1859, "num_input_tokens_seen": 36187504, "step": 59380 }, { "epoch": 18.425380080670184, "grad_norm": 10.829508781433105, "learning_rate": 1.8770728156781104e-07, "loss": 0.1187, "num_input_tokens_seen": 36190384, "step": 59385 }, { "epoch": 18.4269314303444, "grad_norm": 10.98802661895752, "learning_rate": 1.8733999559920446e-07, "loss": 0.1919, "num_input_tokens_seen": 36193104, "step": 59390 }, { "epoch": 18.428482780018616, "grad_norm": 34.69750213623047, "learning_rate": 1.8697306245556712e-07, "loss": 0.1217, "num_input_tokens_seen": 36195632, "step": 59395 }, { "epoch": 18.43003412969283, "grad_norm": 20.906553268432617, "learning_rate": 1.8660648216379918e-07, "loss": 0.1522, "num_input_tokens_seen": 36197936, "step": 59400 }, { "epoch": 18.43158547936705, "grad_norm": 12.695391654968262, "learning_rate": 1.8624025475077522e-07, "loss": 0.1918, "num_input_tokens_seen": 36200176, "step": 59405 }, { "epoch": 18.433136829041267, "grad_norm": 23.645910263061523, "learning_rate": 1.8587438024334382e-07, "loss": 0.1801, "num_input_tokens_seen": 36203344, "step": 59410 }, { "epoch": 18.434688178715483, "grad_norm": 12.579638481140137, "learning_rate": 1.855088586683279e-07, "loss": 0.1889, "num_input_tokens_seen": 36209104, "step": 59415 }, { "epoch": 18.4362395283897, "grad_norm": 13.80590534210205, "learning_rate": 1.8514369005252554e-07, "loss": 0.1474, "num_input_tokens_seen": 36213424, "step": 59420 }, { "epoch": 18.437790878063915, "grad_norm": 15.879817962646484, "learning_rate": 1.8477887442270638e-07, "loss": 0.1145, "num_input_tokens_seen": 36216048, "step": 59425 }, { "epoch": 18.43934222773813, "grad_norm": 12.772867202758789, "learning_rate": 1.844144118056168e-07, "loss": 0.1223, "num_input_tokens_seen": 36219344, "step": 59430 }, { "epoch": 18.44089357741235, "grad_norm": 15.000960350036621, "learning_rate": 1.8405030222797605e-07, "loss": 0.1944, "num_input_tokens_seen": 36221456, "step": 59435 }, { "epoch": 18.442444927086566, "grad_norm": 27.93984031677246, "learning_rate": 1.8368654571647715e-07, "loss": 0.1487, "num_input_tokens_seen": 36223952, "step": 59440 }, { "epoch": 18.44399627676078, "grad_norm": 8.748086929321289, "learning_rate": 1.8332314229778824e-07, "loss": 0.0943, "num_input_tokens_seen": 36227312, "step": 59445 }, { "epoch": 18.445547626434998, "grad_norm": 26.20298957824707, "learning_rate": 1.8296009199855081e-07, "loss": 0.0869, "num_input_tokens_seen": 36229872, "step": 59450 }, { "epoch": 18.447098976109213, "grad_norm": 5.723023414611816, "learning_rate": 1.8259739484538132e-07, "loss": 0.1233, "num_input_tokens_seen": 36232624, "step": 59455 }, { "epoch": 18.448650325783433, "grad_norm": 20.297380447387695, "learning_rate": 1.8223505086486904e-07, "loss": 0.0956, "num_input_tokens_seen": 36235152, "step": 59460 }, { "epoch": 18.45020167545765, "grad_norm": 26.117198944091797, "learning_rate": 1.8187306008357887e-07, "loss": 0.2271, "num_input_tokens_seen": 36238160, "step": 59465 }, { "epoch": 18.451753025131865, "grad_norm": 32.911720275878906, "learning_rate": 1.8151142252804787e-07, "loss": 0.1382, "num_input_tokens_seen": 36241232, "step": 59470 }, { "epoch": 18.45330437480608, "grad_norm": 8.787016868591309, "learning_rate": 1.811501382247899e-07, "loss": 0.1329, "num_input_tokens_seen": 36243728, "step": 59475 }, { "epoch": 18.454855724480296, "grad_norm": 18.084583282470703, "learning_rate": 1.807892072002898e-07, "loss": 0.1826, "num_input_tokens_seen": 36246832, "step": 59480 }, { "epoch": 18.456407074154516, "grad_norm": 10.679671287536621, "learning_rate": 1.8042862948100924e-07, "loss": 0.1778, "num_input_tokens_seen": 36250896, "step": 59485 }, { "epoch": 18.45795842382873, "grad_norm": 35.215579986572266, "learning_rate": 1.8006840509338208e-07, "loss": 0.2128, "num_input_tokens_seen": 36253424, "step": 59490 }, { "epoch": 18.459509773502948, "grad_norm": 30.357410430908203, "learning_rate": 1.7970853406381773e-07, "loss": 0.1118, "num_input_tokens_seen": 36255888, "step": 59495 }, { "epoch": 18.461061123177164, "grad_norm": 14.131919860839844, "learning_rate": 1.7934901641869784e-07, "loss": 0.0655, "num_input_tokens_seen": 36260528, "step": 59500 }, { "epoch": 18.46261247285138, "grad_norm": 10.657034873962402, "learning_rate": 1.7898985218438082e-07, "loss": 0.2752, "num_input_tokens_seen": 36263632, "step": 59505 }, { "epoch": 18.4641638225256, "grad_norm": 45.281288146972656, "learning_rate": 1.7863104138719668e-07, "loss": 0.2131, "num_input_tokens_seen": 36268336, "step": 59510 }, { "epoch": 18.465715172199815, "grad_norm": 9.539610862731934, "learning_rate": 1.782725840534505e-07, "loss": 0.1678, "num_input_tokens_seen": 36271312, "step": 59515 }, { "epoch": 18.46726652187403, "grad_norm": 14.2949800491333, "learning_rate": 1.7791448020942237e-07, "loss": 0.1798, "num_input_tokens_seen": 36273968, "step": 59520 }, { "epoch": 18.468817871548247, "grad_norm": 10.445182800292969, "learning_rate": 1.7755672988136407e-07, "loss": 0.1027, "num_input_tokens_seen": 36276976, "step": 59525 }, { "epoch": 18.470369221222462, "grad_norm": 28.696622848510742, "learning_rate": 1.7719933309550462e-07, "loss": 0.1569, "num_input_tokens_seen": 36279728, "step": 59530 }, { "epoch": 18.471920570896682, "grad_norm": 7.759103298187256, "learning_rate": 1.768422898780442e-07, "loss": 0.0718, "num_input_tokens_seen": 36283984, "step": 59535 }, { "epoch": 18.473471920570898, "grad_norm": 16.763145446777344, "learning_rate": 1.7648560025515847e-07, "loss": 0.1653, "num_input_tokens_seen": 36287184, "step": 59540 }, { "epoch": 18.475023270245114, "grad_norm": 19.591724395751953, "learning_rate": 1.7612926425299715e-07, "loss": 0.188, "num_input_tokens_seen": 36290576, "step": 59545 }, { "epoch": 18.47657461991933, "grad_norm": 5.837666034698486, "learning_rate": 1.7577328189768484e-07, "loss": 0.1208, "num_input_tokens_seen": 36294832, "step": 59550 }, { "epoch": 18.478125969593545, "grad_norm": 37.5677604675293, "learning_rate": 1.7541765321531734e-07, "loss": 0.1228, "num_input_tokens_seen": 36297680, "step": 59555 }, { "epoch": 18.47967731926776, "grad_norm": 20.76202964782715, "learning_rate": 1.750623782319677e-07, "loss": 0.183, "num_input_tokens_seen": 36300208, "step": 59560 }, { "epoch": 18.48122866894198, "grad_norm": 6.064446926116943, "learning_rate": 1.747074569736812e-07, "loss": 0.1787, "num_input_tokens_seen": 36302896, "step": 59565 }, { "epoch": 18.482780018616197, "grad_norm": 19.285764694213867, "learning_rate": 1.7435288946647867e-07, "loss": 0.1152, "num_input_tokens_seen": 36305744, "step": 59570 }, { "epoch": 18.484331368290412, "grad_norm": 22.629322052001953, "learning_rate": 1.7399867573635375e-07, "loss": 0.1013, "num_input_tokens_seen": 36307760, "step": 59575 }, { "epoch": 18.48588271796463, "grad_norm": 14.120819091796875, "learning_rate": 1.736448158092735e-07, "loss": 0.1745, "num_input_tokens_seen": 36310992, "step": 59580 }, { "epoch": 18.487434067638844, "grad_norm": 14.196952819824219, "learning_rate": 1.7329130971118156e-07, "loss": 0.1572, "num_input_tokens_seen": 36313296, "step": 59585 }, { "epoch": 18.488985417313064, "grad_norm": 5.064320087432861, "learning_rate": 1.729381574679928e-07, "loss": 0.1339, "num_input_tokens_seen": 36315920, "step": 59590 }, { "epoch": 18.49053676698728, "grad_norm": 12.664732933044434, "learning_rate": 1.725853591055987e-07, "loss": 0.1732, "num_input_tokens_seen": 36318736, "step": 59595 }, { "epoch": 18.492088116661495, "grad_norm": 37.505943298339844, "learning_rate": 1.7223291464986248e-07, "loss": 0.18, "num_input_tokens_seen": 36321360, "step": 59600 }, { "epoch": 18.49363946633571, "grad_norm": 4.499234676361084, "learning_rate": 1.7188082412662343e-07, "loss": 0.1038, "num_input_tokens_seen": 36324112, "step": 59605 }, { "epoch": 18.495190816009927, "grad_norm": 9.617837905883789, "learning_rate": 1.715290875616926e-07, "loss": 0.1342, "num_input_tokens_seen": 36326800, "step": 59610 }, { "epoch": 18.496742165684147, "grad_norm": 7.644891262054443, "learning_rate": 1.711777049808583e-07, "loss": 0.1574, "num_input_tokens_seen": 36330000, "step": 59615 }, { "epoch": 18.498293515358363, "grad_norm": 31.892358779907227, "learning_rate": 1.7082667640987926e-07, "loss": 0.1667, "num_input_tokens_seen": 36333168, "step": 59620 }, { "epoch": 18.49984486503258, "grad_norm": 9.555974960327148, "learning_rate": 1.7047600187449108e-07, "loss": 0.1679, "num_input_tokens_seen": 36336720, "step": 59625 }, { "epoch": 18.501396214706794, "grad_norm": 15.463289260864258, "learning_rate": 1.701256814004021e-07, "loss": 0.2193, "num_input_tokens_seen": 36339280, "step": 59630 }, { "epoch": 18.50294756438101, "grad_norm": 21.17239761352539, "learning_rate": 1.697757150132945e-07, "loss": 0.2256, "num_input_tokens_seen": 36342544, "step": 59635 }, { "epoch": 18.50449891405523, "grad_norm": 6.826837539672852, "learning_rate": 1.694261027388261e-07, "loss": 0.1618, "num_input_tokens_seen": 36345936, "step": 59640 }, { "epoch": 18.506050263729446, "grad_norm": 10.425597190856934, "learning_rate": 1.6907684460262642e-07, "loss": 0.1503, "num_input_tokens_seen": 36348144, "step": 59645 }, { "epoch": 18.50760161340366, "grad_norm": 11.55500602722168, "learning_rate": 1.6872794063030106e-07, "loss": 0.3058, "num_input_tokens_seen": 36351408, "step": 59650 }, { "epoch": 18.509152963077877, "grad_norm": 18.51527976989746, "learning_rate": 1.683793908474285e-07, "loss": 0.1393, "num_input_tokens_seen": 36355088, "step": 59655 }, { "epoch": 18.510704312752093, "grad_norm": 43.652530670166016, "learning_rate": 1.6803119527956158e-07, "loss": 0.1501, "num_input_tokens_seen": 36358128, "step": 59660 }, { "epoch": 18.512255662426313, "grad_norm": 26.73436737060547, "learning_rate": 1.6768335395222657e-07, "loss": 0.2615, "num_input_tokens_seen": 36360624, "step": 59665 }, { "epoch": 18.51380701210053, "grad_norm": 7.376336574554443, "learning_rate": 1.6733586689092585e-07, "loss": 0.1028, "num_input_tokens_seen": 36363184, "step": 59670 }, { "epoch": 18.515358361774744, "grad_norm": 21.024898529052734, "learning_rate": 1.6698873412113236e-07, "loss": 0.1187, "num_input_tokens_seen": 36365872, "step": 59675 }, { "epoch": 18.51690971144896, "grad_norm": 17.569320678710938, "learning_rate": 1.6664195566829689e-07, "loss": 0.1081, "num_input_tokens_seen": 36369168, "step": 59680 }, { "epoch": 18.518461061123176, "grad_norm": 3.1069958209991455, "learning_rate": 1.6629553155784072e-07, "loss": 0.1211, "num_input_tokens_seen": 36372272, "step": 59685 }, { "epoch": 18.520012410797392, "grad_norm": 8.228904724121094, "learning_rate": 1.659494618151619e-07, "loss": 0.1003, "num_input_tokens_seen": 36375344, "step": 59690 }, { "epoch": 18.52156376047161, "grad_norm": 5.769486904144287, "learning_rate": 1.656037464656318e-07, "loss": 0.116, "num_input_tokens_seen": 36378736, "step": 59695 }, { "epoch": 18.523115110145827, "grad_norm": 22.248485565185547, "learning_rate": 1.652583855345946e-07, "loss": 0.1195, "num_input_tokens_seen": 36381232, "step": 59700 }, { "epoch": 18.524666459820043, "grad_norm": 18.45879364013672, "learning_rate": 1.6491337904737004e-07, "loss": 0.1573, "num_input_tokens_seen": 36383536, "step": 59705 }, { "epoch": 18.52621780949426, "grad_norm": 15.984626770019531, "learning_rate": 1.645687270292501e-07, "loss": 0.1556, "num_input_tokens_seen": 36386032, "step": 59710 }, { "epoch": 18.527769159168475, "grad_norm": 9.60702133178711, "learning_rate": 1.6422442950550344e-07, "loss": 0.0701, "num_input_tokens_seen": 36388880, "step": 59715 }, { "epoch": 18.529320508842694, "grad_norm": 10.50247573852539, "learning_rate": 1.6388048650136933e-07, "loss": 0.1308, "num_input_tokens_seen": 36391856, "step": 59720 }, { "epoch": 18.53087185851691, "grad_norm": 2.7526514530181885, "learning_rate": 1.635368980420643e-07, "loss": 0.179, "num_input_tokens_seen": 36394928, "step": 59725 }, { "epoch": 18.532423208191126, "grad_norm": 3.1715261936187744, "learning_rate": 1.631936641527765e-07, "loss": 0.1821, "num_input_tokens_seen": 36397904, "step": 59730 }, { "epoch": 18.533974557865342, "grad_norm": 7.808292865753174, "learning_rate": 1.6285078485867022e-07, "loss": 0.1771, "num_input_tokens_seen": 36400784, "step": 59735 }, { "epoch": 18.535525907539558, "grad_norm": 17.23137855529785, "learning_rate": 1.6250826018488096e-07, "loss": 0.1084, "num_input_tokens_seen": 36403600, "step": 59740 }, { "epoch": 18.537077257213777, "grad_norm": 6.665896415710449, "learning_rate": 1.6216609015652195e-07, "loss": 0.0758, "num_input_tokens_seen": 36406288, "step": 59745 }, { "epoch": 18.538628606887993, "grad_norm": 7.852985382080078, "learning_rate": 1.618242747986759e-07, "loss": 0.1071, "num_input_tokens_seen": 36408912, "step": 59750 }, { "epoch": 18.54017995656221, "grad_norm": 23.441923141479492, "learning_rate": 1.6148281413640278e-07, "loss": 0.1825, "num_input_tokens_seen": 36411280, "step": 59755 }, { "epoch": 18.541731306236425, "grad_norm": 12.11843490600586, "learning_rate": 1.6114170819473695e-07, "loss": 0.0945, "num_input_tokens_seen": 36415152, "step": 59760 }, { "epoch": 18.54328265591064, "grad_norm": 120.36473083496094, "learning_rate": 1.6080095699868404e-07, "loss": 0.3391, "num_input_tokens_seen": 36417712, "step": 59765 }, { "epoch": 18.54483400558486, "grad_norm": 23.980392456054688, "learning_rate": 1.6046056057322623e-07, "loss": 0.2037, "num_input_tokens_seen": 36420688, "step": 59770 }, { "epoch": 18.546385355259076, "grad_norm": 13.432653427124023, "learning_rate": 1.601205189433175e-07, "loss": 0.1527, "num_input_tokens_seen": 36424336, "step": 59775 }, { "epoch": 18.547936704933292, "grad_norm": 12.049635887145996, "learning_rate": 1.5978083213388784e-07, "loss": 0.1607, "num_input_tokens_seen": 36426800, "step": 59780 }, { "epoch": 18.549488054607508, "grad_norm": 11.517887115478516, "learning_rate": 1.5944150016983907e-07, "loss": 0.2318, "num_input_tokens_seen": 36430448, "step": 59785 }, { "epoch": 18.551039404281724, "grad_norm": 35.86468505859375, "learning_rate": 1.5910252307605012e-07, "loss": 0.2383, "num_input_tokens_seen": 36433200, "step": 59790 }, { "epoch": 18.552590753955943, "grad_norm": 15.57723617553711, "learning_rate": 1.5876390087737058e-07, "loss": 0.1599, "num_input_tokens_seen": 36436048, "step": 59795 }, { "epoch": 18.55414210363016, "grad_norm": 59.35821533203125, "learning_rate": 1.5842563359862617e-07, "loss": 0.1529, "num_input_tokens_seen": 36438928, "step": 59800 }, { "epoch": 18.555693453304375, "grad_norm": 16.68646812438965, "learning_rate": 1.5808772126461537e-07, "loss": 0.1383, "num_input_tokens_seen": 36441520, "step": 59805 }, { "epoch": 18.55724480297859, "grad_norm": 27.57924461364746, "learning_rate": 1.5775016390011166e-07, "loss": 0.1869, "num_input_tokens_seen": 36443728, "step": 59810 }, { "epoch": 18.558796152652807, "grad_norm": 22.482423782348633, "learning_rate": 1.5741296152986196e-07, "loss": 0.2423, "num_input_tokens_seen": 36446032, "step": 59815 }, { "epoch": 18.560347502327023, "grad_norm": 9.14972972869873, "learning_rate": 1.5707611417858704e-07, "loss": 0.0895, "num_input_tokens_seen": 36449072, "step": 59820 }, { "epoch": 18.561898852001242, "grad_norm": 43.2728385925293, "learning_rate": 1.5673962187098102e-07, "loss": 0.1126, "num_input_tokens_seen": 36451952, "step": 59825 }, { "epoch": 18.563450201675458, "grad_norm": 14.663214683532715, "learning_rate": 1.5640348463171416e-07, "loss": 0.1152, "num_input_tokens_seen": 36454704, "step": 59830 }, { "epoch": 18.565001551349674, "grad_norm": 13.769981384277344, "learning_rate": 1.560677024854279e-07, "loss": 0.2047, "num_input_tokens_seen": 36457456, "step": 59835 }, { "epoch": 18.56655290102389, "grad_norm": 18.29257583618164, "learning_rate": 1.557322754567403e-07, "loss": 0.1251, "num_input_tokens_seen": 36460848, "step": 59840 }, { "epoch": 18.568104250698106, "grad_norm": 16.092632293701172, "learning_rate": 1.5539720357024168e-07, "loss": 0.2089, "num_input_tokens_seen": 36463088, "step": 59845 }, { "epoch": 18.569655600372325, "grad_norm": 15.978107452392578, "learning_rate": 1.5506248685049628e-07, "loss": 0.1341, "num_input_tokens_seen": 36466192, "step": 59850 }, { "epoch": 18.57120695004654, "grad_norm": 11.553655624389648, "learning_rate": 1.5472812532204395e-07, "loss": 0.129, "num_input_tokens_seen": 36468880, "step": 59855 }, { "epoch": 18.572758299720757, "grad_norm": 26.006145477294922, "learning_rate": 1.5439411900939617e-07, "loss": 0.2279, "num_input_tokens_seen": 36472368, "step": 59860 }, { "epoch": 18.574309649394973, "grad_norm": 46.42609786987305, "learning_rate": 1.5406046793704054e-07, "loss": 0.0772, "num_input_tokens_seen": 36474896, "step": 59865 }, { "epoch": 18.57586099906919, "grad_norm": 11.323487281799316, "learning_rate": 1.5372717212943645e-07, "loss": 0.132, "num_input_tokens_seen": 36477808, "step": 59870 }, { "epoch": 18.577412348743408, "grad_norm": 25.81727409362793, "learning_rate": 1.5339423161101986e-07, "loss": 0.096, "num_input_tokens_seen": 36481296, "step": 59875 }, { "epoch": 18.578963698417624, "grad_norm": 15.821341514587402, "learning_rate": 1.5306164640619736e-07, "loss": 0.1275, "num_input_tokens_seen": 36483600, "step": 59880 }, { "epoch": 18.58051504809184, "grad_norm": 31.351125717163086, "learning_rate": 1.5272941653935336e-07, "loss": 0.1474, "num_input_tokens_seen": 36487472, "step": 59885 }, { "epoch": 18.582066397766056, "grad_norm": 28.107025146484375, "learning_rate": 1.5239754203484335e-07, "loss": 0.1397, "num_input_tokens_seen": 36490064, "step": 59890 }, { "epoch": 18.58361774744027, "grad_norm": 7.267383098602295, "learning_rate": 1.5206602291699735e-07, "loss": 0.1225, "num_input_tokens_seen": 36493040, "step": 59895 }, { "epoch": 18.58516909711449, "grad_norm": 14.226799964904785, "learning_rate": 1.5173485921012033e-07, "loss": 0.131, "num_input_tokens_seen": 36497008, "step": 59900 }, { "epoch": 18.586720446788707, "grad_norm": 14.073065757751465, "learning_rate": 1.5140405093848952e-07, "loss": 0.1106, "num_input_tokens_seen": 36499760, "step": 59905 }, { "epoch": 18.588271796462923, "grad_norm": 24.763648986816406, "learning_rate": 1.510735981263589e-07, "loss": 0.1205, "num_input_tokens_seen": 36502832, "step": 59910 }, { "epoch": 18.58982314613714, "grad_norm": 27.36668586730957, "learning_rate": 1.5074350079795242e-07, "loss": 0.1276, "num_input_tokens_seen": 36505360, "step": 59915 }, { "epoch": 18.591374495811355, "grad_norm": 10.598271369934082, "learning_rate": 1.504137589774718e-07, "loss": 0.1381, "num_input_tokens_seen": 36508112, "step": 59920 }, { "epoch": 18.592925845485574, "grad_norm": 14.355326652526855, "learning_rate": 1.5008437268908992e-07, "loss": 0.1059, "num_input_tokens_seen": 36510768, "step": 59925 }, { "epoch": 18.59447719515979, "grad_norm": 11.559263229370117, "learning_rate": 1.497553419569553e-07, "loss": 0.1141, "num_input_tokens_seen": 36513360, "step": 59930 }, { "epoch": 18.596028544834006, "grad_norm": 18.972673416137695, "learning_rate": 1.4942666680518913e-07, "loss": 0.1198, "num_input_tokens_seen": 36515664, "step": 59935 }, { "epoch": 18.597579894508222, "grad_norm": 9.913509368896484, "learning_rate": 1.490983472578883e-07, "loss": 0.099, "num_input_tokens_seen": 36518032, "step": 59940 }, { "epoch": 18.599131244182438, "grad_norm": 5.742598533630371, "learning_rate": 1.4877038333912186e-07, "loss": 0.1063, "num_input_tokens_seen": 36520976, "step": 59945 }, { "epoch": 18.600682593856654, "grad_norm": 37.23353958129883, "learning_rate": 1.4844277507293335e-07, "loss": 0.2101, "num_input_tokens_seen": 36523280, "step": 59950 }, { "epoch": 18.602233943530873, "grad_norm": 12.764375686645508, "learning_rate": 1.4811552248334028e-07, "loss": 0.092, "num_input_tokens_seen": 36526032, "step": 59955 }, { "epoch": 18.60378529320509, "grad_norm": 27.146791458129883, "learning_rate": 1.4778862559433395e-07, "loss": 0.2402, "num_input_tokens_seen": 36529040, "step": 59960 }, { "epoch": 18.605336642879305, "grad_norm": 20.960174560546875, "learning_rate": 1.4746208442988075e-07, "loss": 0.1303, "num_input_tokens_seen": 36531920, "step": 59965 }, { "epoch": 18.60688799255352, "grad_norm": 22.02143096923828, "learning_rate": 1.4713589901391932e-07, "loss": 0.2096, "num_input_tokens_seen": 36535728, "step": 59970 }, { "epoch": 18.608439342227737, "grad_norm": 18.51492691040039, "learning_rate": 1.468100693703628e-07, "loss": 0.2301, "num_input_tokens_seen": 36538000, "step": 59975 }, { "epoch": 18.609990691901956, "grad_norm": 16.128280639648438, "learning_rate": 1.464845955230987e-07, "loss": 0.2598, "num_input_tokens_seen": 36540688, "step": 59980 }, { "epoch": 18.611542041576172, "grad_norm": 23.892656326293945, "learning_rate": 1.4615947749598847e-07, "loss": 0.1722, "num_input_tokens_seen": 36544080, "step": 59985 }, { "epoch": 18.613093391250388, "grad_norm": 6.709380626678467, "learning_rate": 1.4583471531286587e-07, "loss": 0.0907, "num_input_tokens_seen": 36547152, "step": 59990 }, { "epoch": 18.614644740924604, "grad_norm": 10.223126411437988, "learning_rate": 1.455103089975407e-07, "loss": 0.1575, "num_input_tokens_seen": 36549680, "step": 59995 }, { "epoch": 18.61619609059882, "grad_norm": 5.769312858581543, "learning_rate": 1.4518625857379565e-07, "loss": 0.1138, "num_input_tokens_seen": 36551888, "step": 60000 }, { "epoch": 18.61774744027304, "grad_norm": 33.58192825317383, "learning_rate": 1.448625640653878e-07, "loss": 0.136, "num_input_tokens_seen": 36554704, "step": 60005 }, { "epoch": 18.619298789947255, "grad_norm": 9.088196754455566, "learning_rate": 1.4453922549604705e-07, "loss": 0.1339, "num_input_tokens_seen": 36557904, "step": 60010 }, { "epoch": 18.62085013962147, "grad_norm": 9.516327857971191, "learning_rate": 1.4421624288947777e-07, "loss": 0.2331, "num_input_tokens_seen": 36560336, "step": 60015 }, { "epoch": 18.622401489295687, "grad_norm": 2.8696563243865967, "learning_rate": 1.4389361626935993e-07, "loss": 0.12, "num_input_tokens_seen": 36563216, "step": 60020 }, { "epoch": 18.623952838969903, "grad_norm": 3.841172456741333, "learning_rate": 1.435713456593446e-07, "loss": 0.086, "num_input_tokens_seen": 36566064, "step": 60025 }, { "epoch": 18.625504188644122, "grad_norm": 6.2046942710876465, "learning_rate": 1.4324943108305845e-07, "loss": 0.144, "num_input_tokens_seen": 36569296, "step": 60030 }, { "epoch": 18.627055538318338, "grad_norm": 24.774660110473633, "learning_rate": 1.4292787256410145e-07, "loss": 0.0668, "num_input_tokens_seen": 36571600, "step": 60035 }, { "epoch": 18.628606887992554, "grad_norm": 33.3709716796875, "learning_rate": 1.426066701260481e-07, "loss": 0.1602, "num_input_tokens_seen": 36575024, "step": 60040 }, { "epoch": 18.63015823766677, "grad_norm": 10.673916816711426, "learning_rate": 1.4228582379244514e-07, "loss": 0.0831, "num_input_tokens_seen": 36577456, "step": 60045 }, { "epoch": 18.631709587340985, "grad_norm": 8.105241775512695, "learning_rate": 1.4196533358681596e-07, "loss": 0.1818, "num_input_tokens_seen": 36580656, "step": 60050 }, { "epoch": 18.633260937015205, "grad_norm": 12.69533634185791, "learning_rate": 1.416451995326551e-07, "loss": 0.1314, "num_input_tokens_seen": 36583504, "step": 60055 }, { "epoch": 18.63481228668942, "grad_norm": 27.385881423950195, "learning_rate": 1.413254216534332e-07, "loss": 0.1415, "num_input_tokens_seen": 36586288, "step": 60060 }, { "epoch": 18.636363636363637, "grad_norm": 13.809076309204102, "learning_rate": 1.4100599997259267e-07, "loss": 0.175, "num_input_tokens_seen": 36589008, "step": 60065 }, { "epoch": 18.637914986037853, "grad_norm": 21.085289001464844, "learning_rate": 1.4068693451355197e-07, "loss": 0.1707, "num_input_tokens_seen": 36591344, "step": 60070 }, { "epoch": 18.63946633571207, "grad_norm": 31.443187713623047, "learning_rate": 1.403682252997013e-07, "loss": 0.1923, "num_input_tokens_seen": 36594064, "step": 60075 }, { "epoch": 18.641017685386288, "grad_norm": 41.745765686035156, "learning_rate": 1.4004987235440637e-07, "loss": 0.1268, "num_input_tokens_seen": 36597360, "step": 60080 }, { "epoch": 18.642569035060504, "grad_norm": 5.895366191864014, "learning_rate": 1.3973187570100687e-07, "loss": 0.1713, "num_input_tokens_seen": 36600432, "step": 60085 }, { "epoch": 18.64412038473472, "grad_norm": 20.18265151977539, "learning_rate": 1.394142353628142e-07, "loss": 0.1452, "num_input_tokens_seen": 36603152, "step": 60090 }, { "epoch": 18.645671734408936, "grad_norm": 12.00400447845459, "learning_rate": 1.3909695136311686e-07, "loss": 0.1715, "num_input_tokens_seen": 36607024, "step": 60095 }, { "epoch": 18.64722308408315, "grad_norm": 33.629276275634766, "learning_rate": 1.3878002372517408e-07, "loss": 0.2069, "num_input_tokens_seen": 36609712, "step": 60100 }, { "epoch": 18.648774433757367, "grad_norm": 29.189308166503906, "learning_rate": 1.3846345247222115e-07, "loss": 0.2857, "num_input_tokens_seen": 36613552, "step": 60105 }, { "epoch": 18.650325783431587, "grad_norm": 16.798274993896484, "learning_rate": 1.381472376274662e-07, "loss": 0.1659, "num_input_tokens_seen": 36616656, "step": 60110 }, { "epoch": 18.651877133105803, "grad_norm": 57.03988265991211, "learning_rate": 1.378313792140923e-07, "loss": 0.3452, "num_input_tokens_seen": 36619280, "step": 60115 }, { "epoch": 18.65342848278002, "grad_norm": 30.047893524169922, "learning_rate": 1.3751587725525428e-07, "loss": 0.2006, "num_input_tokens_seen": 36621584, "step": 60120 }, { "epoch": 18.654979832454234, "grad_norm": 25.354639053344727, "learning_rate": 1.3720073177408365e-07, "loss": 0.1388, "num_input_tokens_seen": 36623888, "step": 60125 }, { "epoch": 18.65653118212845, "grad_norm": 16.95403289794922, "learning_rate": 1.368859427936825e-07, "loss": 0.1195, "num_input_tokens_seen": 36628208, "step": 60130 }, { "epoch": 18.65808253180267, "grad_norm": 17.342397689819336, "learning_rate": 1.365715103371301e-07, "loss": 0.0763, "num_input_tokens_seen": 36631632, "step": 60135 }, { "epoch": 18.659633881476886, "grad_norm": 38.29863357543945, "learning_rate": 1.3625743442747809e-07, "loss": 0.1487, "num_input_tokens_seen": 36633744, "step": 60140 }, { "epoch": 18.6611852311511, "grad_norm": 7.8396315574646, "learning_rate": 1.3594371508775072e-07, "loss": 0.174, "num_input_tokens_seen": 36636656, "step": 60145 }, { "epoch": 18.662736580825317, "grad_norm": 32.63570785522461, "learning_rate": 1.3563035234094856e-07, "loss": 0.2223, "num_input_tokens_seen": 36640016, "step": 60150 }, { "epoch": 18.664287930499533, "grad_norm": 16.914302825927734, "learning_rate": 1.3531734621004432e-07, "loss": 0.1152, "num_input_tokens_seen": 36642960, "step": 60155 }, { "epoch": 18.665839280173753, "grad_norm": 14.04721450805664, "learning_rate": 1.3500469671798522e-07, "loss": 0.1706, "num_input_tokens_seen": 36646480, "step": 60160 }, { "epoch": 18.66739062984797, "grad_norm": 35.700355529785156, "learning_rate": 1.3469240388769122e-07, "loss": 0.1927, "num_input_tokens_seen": 36649200, "step": 60165 }, { "epoch": 18.668941979522184, "grad_norm": 14.317817687988281, "learning_rate": 1.3438046774205903e-07, "loss": 0.1631, "num_input_tokens_seen": 36651568, "step": 60170 }, { "epoch": 18.6704933291964, "grad_norm": 9.272855758666992, "learning_rate": 1.340688883039548e-07, "loss": 0.1126, "num_input_tokens_seen": 36654064, "step": 60175 }, { "epoch": 18.672044678870616, "grad_norm": 7.923762321472168, "learning_rate": 1.337576655962236e-07, "loss": 0.1593, "num_input_tokens_seen": 36656464, "step": 60180 }, { "epoch": 18.673596028544836, "grad_norm": 15.348626136779785, "learning_rate": 1.334467996416794e-07, "loss": 0.173, "num_input_tokens_seen": 36659664, "step": 60185 }, { "epoch": 18.67514737821905, "grad_norm": 34.25638961791992, "learning_rate": 1.331362904631145e-07, "loss": 0.3027, "num_input_tokens_seen": 36661936, "step": 60190 }, { "epoch": 18.676698727893267, "grad_norm": 22.525423049926758, "learning_rate": 1.328261380832907e-07, "loss": 0.1319, "num_input_tokens_seen": 36664848, "step": 60195 }, { "epoch": 18.678250077567483, "grad_norm": 53.37530517578125, "learning_rate": 1.3251634252494704e-07, "loss": 0.1983, "num_input_tokens_seen": 36667760, "step": 60200 }, { "epoch": 18.6798014272417, "grad_norm": 10.265433311462402, "learning_rate": 1.3220690381079593e-07, "loss": 0.1565, "num_input_tokens_seen": 36670416, "step": 60205 }, { "epoch": 18.681352776915915, "grad_norm": 6.39400577545166, "learning_rate": 1.3189782196352086e-07, "loss": 0.1461, "num_input_tokens_seen": 36673424, "step": 60210 }, { "epoch": 18.682904126590135, "grad_norm": 6.322885990142822, "learning_rate": 1.315890970057837e-07, "loss": 0.1171, "num_input_tokens_seen": 36676304, "step": 60215 }, { "epoch": 18.68445547626435, "grad_norm": 20.256305694580078, "learning_rate": 1.3128072896021525e-07, "loss": 0.1183, "num_input_tokens_seen": 36678448, "step": 60220 }, { "epoch": 18.686006825938566, "grad_norm": 25.06610679626465, "learning_rate": 1.3097271784942467e-07, "loss": 0.1424, "num_input_tokens_seen": 36680784, "step": 60225 }, { "epoch": 18.687558175612782, "grad_norm": 59.29566955566406, "learning_rate": 1.3066506369599107e-07, "loss": 0.1359, "num_input_tokens_seen": 36686672, "step": 60230 }, { "epoch": 18.689109525286998, "grad_norm": 14.001605987548828, "learning_rate": 1.3035776652247034e-07, "loss": 0.137, "num_input_tokens_seen": 36689456, "step": 60235 }, { "epoch": 18.690660874961218, "grad_norm": 40.294315338134766, "learning_rate": 1.3005082635138999e-07, "loss": 0.2537, "num_input_tokens_seen": 36691696, "step": 60240 }, { "epoch": 18.692212224635433, "grad_norm": 12.260052680969238, "learning_rate": 1.2974424320525314e-07, "loss": 0.1297, "num_input_tokens_seen": 36694192, "step": 60245 }, { "epoch": 18.69376357430965, "grad_norm": 5.384273529052734, "learning_rate": 1.2943801710653515e-07, "loss": 0.1832, "num_input_tokens_seen": 36697264, "step": 60250 }, { "epoch": 18.695314923983865, "grad_norm": 35.27592468261719, "learning_rate": 1.2913214807768693e-07, "loss": 0.2774, "num_input_tokens_seen": 36699952, "step": 60255 }, { "epoch": 18.69686627365808, "grad_norm": 23.972375869750977, "learning_rate": 1.2882663614113278e-07, "loss": 0.1611, "num_input_tokens_seen": 36703280, "step": 60260 }, { "epoch": 18.6984176233323, "grad_norm": 13.793485641479492, "learning_rate": 1.2852148131926868e-07, "loss": 0.1622, "num_input_tokens_seen": 36706448, "step": 60265 }, { "epoch": 18.699968973006516, "grad_norm": 8.386748313903809, "learning_rate": 1.2821668363446725e-07, "loss": 0.1234, "num_input_tokens_seen": 36710096, "step": 60270 }, { "epoch": 18.701520322680732, "grad_norm": 10.424104690551758, "learning_rate": 1.279122431090729e-07, "loss": 0.0928, "num_input_tokens_seen": 36712528, "step": 60275 }, { "epoch": 18.703071672354948, "grad_norm": 17.400056838989258, "learning_rate": 1.2760815976540608e-07, "loss": 0.2128, "num_input_tokens_seen": 36715248, "step": 60280 }, { "epoch": 18.704623022029164, "grad_norm": 9.276450157165527, "learning_rate": 1.273044336257584e-07, "loss": 0.1435, "num_input_tokens_seen": 36717456, "step": 60285 }, { "epoch": 18.706174371703383, "grad_norm": 17.95660400390625, "learning_rate": 1.270010647123976e-07, "loss": 0.3027, "num_input_tokens_seen": 36719632, "step": 60290 }, { "epoch": 18.7077257213776, "grad_norm": 12.559737205505371, "learning_rate": 1.2669805304756312e-07, "loss": 0.1472, "num_input_tokens_seen": 36723632, "step": 60295 }, { "epoch": 18.709277071051815, "grad_norm": 4.329929828643799, "learning_rate": 1.2639539865346996e-07, "loss": 0.1211, "num_input_tokens_seen": 36728080, "step": 60300 }, { "epoch": 18.71082842072603, "grad_norm": 24.283926010131836, "learning_rate": 1.2609310155230647e-07, "loss": 0.1797, "num_input_tokens_seen": 36730576, "step": 60305 }, { "epoch": 18.712379770400247, "grad_norm": 1.279309630393982, "learning_rate": 1.2579116176623441e-07, "loss": 0.0966, "num_input_tokens_seen": 36733584, "step": 60310 }, { "epoch": 18.713931120074466, "grad_norm": 8.848593711853027, "learning_rate": 1.254895793173888e-07, "loss": 0.1624, "num_input_tokens_seen": 36737296, "step": 60315 }, { "epoch": 18.715482469748682, "grad_norm": 2.9425177574157715, "learning_rate": 1.2518835422788033e-07, "loss": 0.1199, "num_input_tokens_seen": 36739984, "step": 60320 }, { "epoch": 18.7170338194229, "grad_norm": 11.186511993408203, "learning_rate": 1.2488748651979187e-07, "loss": 0.2065, "num_input_tokens_seen": 36745136, "step": 60325 }, { "epoch": 18.718585169097114, "grad_norm": 57.044612884521484, "learning_rate": 1.2458697621518024e-07, "loss": 0.1707, "num_input_tokens_seen": 36748048, "step": 60330 }, { "epoch": 18.72013651877133, "grad_norm": 8.21105670928955, "learning_rate": 1.2428682333607777e-07, "loss": 0.1551, "num_input_tokens_seen": 36751472, "step": 60335 }, { "epoch": 18.72168786844555, "grad_norm": 19.63849639892578, "learning_rate": 1.2398702790448746e-07, "loss": 0.0941, "num_input_tokens_seen": 36754224, "step": 60340 }, { "epoch": 18.723239218119765, "grad_norm": 18.87235450744629, "learning_rate": 1.2368758994238893e-07, "loss": 0.1188, "num_input_tokens_seen": 36758096, "step": 60345 }, { "epoch": 18.72479056779398, "grad_norm": 23.45882797241211, "learning_rate": 1.2338850947173352e-07, "loss": 0.1915, "num_input_tokens_seen": 36761712, "step": 60350 }, { "epoch": 18.726341917468197, "grad_norm": 18.138694763183594, "learning_rate": 1.2308978651444926e-07, "loss": 0.2227, "num_input_tokens_seen": 36765232, "step": 60355 }, { "epoch": 18.727893267142413, "grad_norm": 9.775099754333496, "learning_rate": 1.2279142109243358e-07, "loss": 0.1145, "num_input_tokens_seen": 36768080, "step": 60360 }, { "epoch": 18.72944461681663, "grad_norm": 14.35134220123291, "learning_rate": 1.2249341322756236e-07, "loss": 0.1811, "num_input_tokens_seen": 36770896, "step": 60365 }, { "epoch": 18.73099596649085, "grad_norm": 22.382896423339844, "learning_rate": 1.22195762941682e-07, "loss": 0.0766, "num_input_tokens_seen": 36773616, "step": 60370 }, { "epoch": 18.732547316165064, "grad_norm": 8.129752159118652, "learning_rate": 1.218984702566145e-07, "loss": 0.1701, "num_input_tokens_seen": 36776528, "step": 60375 }, { "epoch": 18.73409866583928, "grad_norm": 24.50712013244629, "learning_rate": 1.2160153519415407e-07, "loss": 0.1402, "num_input_tokens_seen": 36779632, "step": 60380 }, { "epoch": 18.735650015513496, "grad_norm": 47.38348388671875, "learning_rate": 1.2130495777607e-07, "loss": 0.226, "num_input_tokens_seen": 36782352, "step": 60385 }, { "epoch": 18.737201365187712, "grad_norm": 23.54138946533203, "learning_rate": 1.2100873802410483e-07, "loss": 0.2566, "num_input_tokens_seen": 36786288, "step": 60390 }, { "epoch": 18.73875271486193, "grad_norm": 10.196619987487793, "learning_rate": 1.207128759599757e-07, "loss": 0.1507, "num_input_tokens_seen": 36789264, "step": 60395 }, { "epoch": 18.740304064536147, "grad_norm": 22.213274002075195, "learning_rate": 1.2041737160537136e-07, "loss": 0.1116, "num_input_tokens_seen": 36791408, "step": 60400 }, { "epoch": 18.741855414210363, "grad_norm": 42.39249038696289, "learning_rate": 1.2012222498195724e-07, "loss": 0.1535, "num_input_tokens_seen": 36795088, "step": 60405 }, { "epoch": 18.74340676388458, "grad_norm": 23.239093780517578, "learning_rate": 1.1982743611136993e-07, "loss": 0.2241, "num_input_tokens_seen": 36798224, "step": 60410 }, { "epoch": 18.744958113558795, "grad_norm": 9.577774047851562, "learning_rate": 1.1953300501522159e-07, "loss": 0.1363, "num_input_tokens_seen": 36801840, "step": 60415 }, { "epoch": 18.746509463233014, "grad_norm": 29.430469512939453, "learning_rate": 1.192389317150977e-07, "loss": 0.174, "num_input_tokens_seen": 36805840, "step": 60420 }, { "epoch": 18.74806081290723, "grad_norm": 13.489723205566406, "learning_rate": 1.1894521623255661e-07, "loss": 0.1163, "num_input_tokens_seen": 36808144, "step": 60425 }, { "epoch": 18.749612162581446, "grad_norm": 19.041349411010742, "learning_rate": 1.1865185858913163e-07, "loss": 0.1874, "num_input_tokens_seen": 36811504, "step": 60430 }, { "epoch": 18.751163512255662, "grad_norm": 13.385699272155762, "learning_rate": 1.1835885880632891e-07, "loss": 0.2466, "num_input_tokens_seen": 36814000, "step": 60435 }, { "epoch": 18.752714861929878, "grad_norm": 34.06647872924805, "learning_rate": 1.1806621690563014e-07, "loss": 0.1295, "num_input_tokens_seen": 36816400, "step": 60440 }, { "epoch": 18.754266211604097, "grad_norm": 12.35622787475586, "learning_rate": 1.1777393290848704e-07, "loss": 0.1426, "num_input_tokens_seen": 36820400, "step": 60445 }, { "epoch": 18.755817561278313, "grad_norm": 13.491350173950195, "learning_rate": 1.1748200683632971e-07, "loss": 0.243, "num_input_tokens_seen": 36823216, "step": 60450 }, { "epoch": 18.75736891095253, "grad_norm": 6.003174781799316, "learning_rate": 1.171904387105588e-07, "loss": 0.0912, "num_input_tokens_seen": 36825424, "step": 60455 }, { "epoch": 18.758920260626745, "grad_norm": 21.3531551361084, "learning_rate": 1.1689922855254998e-07, "loss": 0.2029, "num_input_tokens_seen": 36827792, "step": 60460 }, { "epoch": 18.76047161030096, "grad_norm": 14.437112808227539, "learning_rate": 1.1660837638365175e-07, "loss": 0.1359, "num_input_tokens_seen": 36830768, "step": 60465 }, { "epoch": 18.762022959975177, "grad_norm": 13.70374584197998, "learning_rate": 1.1631788222518758e-07, "loss": 0.1661, "num_input_tokens_seen": 36833008, "step": 60470 }, { "epoch": 18.763574309649396, "grad_norm": 14.599526405334473, "learning_rate": 1.160277460984549e-07, "loss": 0.1408, "num_input_tokens_seen": 36835984, "step": 60475 }, { "epoch": 18.765125659323612, "grad_norm": 6.465781211853027, "learning_rate": 1.1573796802472282e-07, "loss": 0.0848, "num_input_tokens_seen": 36838064, "step": 60480 }, { "epoch": 18.766677008997828, "grad_norm": 23.63055992126465, "learning_rate": 1.1544854802523598e-07, "loss": 0.156, "num_input_tokens_seen": 36840528, "step": 60485 }, { "epoch": 18.768228358672044, "grad_norm": 22.56550407409668, "learning_rate": 1.1515948612121187e-07, "loss": 0.1792, "num_input_tokens_seen": 36843248, "step": 60490 }, { "epoch": 18.76977970834626, "grad_norm": 3.5618157386779785, "learning_rate": 1.1487078233384352e-07, "loss": 0.1227, "num_input_tokens_seen": 36845936, "step": 60495 }, { "epoch": 18.77133105802048, "grad_norm": 13.103590965270996, "learning_rate": 1.1458243668429458e-07, "loss": 0.1824, "num_input_tokens_seen": 36848368, "step": 60500 }, { "epoch": 18.772882407694695, "grad_norm": 4.485764980316162, "learning_rate": 1.1429444919370592e-07, "loss": 0.1798, "num_input_tokens_seen": 36853136, "step": 60505 }, { "epoch": 18.77443375736891, "grad_norm": 22.008739471435547, "learning_rate": 1.140068198831884e-07, "loss": 0.1918, "num_input_tokens_seen": 36856432, "step": 60510 }, { "epoch": 18.775985107043127, "grad_norm": 37.40543746948242, "learning_rate": 1.1371954877383074e-07, "loss": 0.1241, "num_input_tokens_seen": 36860048, "step": 60515 }, { "epoch": 18.777536456717343, "grad_norm": 22.825143814086914, "learning_rate": 1.1343263588669162e-07, "loss": 0.1401, "num_input_tokens_seen": 36862512, "step": 60520 }, { "epoch": 18.779087806391562, "grad_norm": 42.628326416015625, "learning_rate": 1.1314608124280646e-07, "loss": 0.2314, "num_input_tokens_seen": 36865136, "step": 60525 }, { "epoch": 18.780639156065778, "grad_norm": 76.38199615478516, "learning_rate": 1.1285988486318234e-07, "loss": 0.1731, "num_input_tokens_seen": 36868720, "step": 60530 }, { "epoch": 18.782190505739994, "grad_norm": 17.322200775146484, "learning_rate": 1.1257404676880135e-07, "loss": 0.1485, "num_input_tokens_seen": 36873168, "step": 60535 }, { "epoch": 18.78374185541421, "grad_norm": 27.872690200805664, "learning_rate": 1.122885669806184e-07, "loss": 0.1413, "num_input_tokens_seen": 36875920, "step": 60540 }, { "epoch": 18.785293205088426, "grad_norm": 14.885655403137207, "learning_rate": 1.1200344551956232e-07, "loss": 0.1523, "num_input_tokens_seen": 36879312, "step": 60545 }, { "epoch": 18.786844554762645, "grad_norm": 11.394164085388184, "learning_rate": 1.1171868240653638e-07, "loss": 0.1259, "num_input_tokens_seen": 36882032, "step": 60550 }, { "epoch": 18.78839590443686, "grad_norm": 19.017122268676758, "learning_rate": 1.1143427766241665e-07, "loss": 0.1463, "num_input_tokens_seen": 36884976, "step": 60555 }, { "epoch": 18.789947254111077, "grad_norm": 9.046875953674316, "learning_rate": 1.1115023130805424e-07, "loss": 0.1434, "num_input_tokens_seen": 36888016, "step": 60560 }, { "epoch": 18.791498603785293, "grad_norm": 62.35529708862305, "learning_rate": 1.108665433642725e-07, "loss": 0.2402, "num_input_tokens_seen": 36892112, "step": 60565 }, { "epoch": 18.79304995345951, "grad_norm": 22.90172576904297, "learning_rate": 1.1058321385186921e-07, "loss": 0.1824, "num_input_tokens_seen": 36894736, "step": 60570 }, { "epoch": 18.794601303133728, "grad_norm": 42.95650863647461, "learning_rate": 1.10300242791615e-07, "loss": 0.1224, "num_input_tokens_seen": 36897424, "step": 60575 }, { "epoch": 18.796152652807944, "grad_norm": 3.6345808506011963, "learning_rate": 1.1001763020425605e-07, "loss": 0.0773, "num_input_tokens_seen": 36900336, "step": 60580 }, { "epoch": 18.79770400248216, "grad_norm": 64.38296508789062, "learning_rate": 1.0973537611051077e-07, "loss": 0.2187, "num_input_tokens_seen": 36903184, "step": 60585 }, { "epoch": 18.799255352156376, "grad_norm": 34.17167282104492, "learning_rate": 1.0945348053107207e-07, "loss": 0.2484, "num_input_tokens_seen": 36906576, "step": 60590 }, { "epoch": 18.80080670183059, "grad_norm": 8.66157054901123, "learning_rate": 1.0917194348660564e-07, "loss": 0.2479, "num_input_tokens_seen": 36909264, "step": 60595 }, { "epoch": 18.80235805150481, "grad_norm": 13.454816818237305, "learning_rate": 1.088907649977522e-07, "loss": 0.1913, "num_input_tokens_seen": 36912400, "step": 60600 }, { "epoch": 18.803909401179027, "grad_norm": 30.83446502685547, "learning_rate": 1.0860994508512524e-07, "loss": 0.1838, "num_input_tokens_seen": 36915952, "step": 60605 }, { "epoch": 18.805460750853243, "grad_norm": 4.704097270965576, "learning_rate": 1.0832948376931108e-07, "loss": 0.061, "num_input_tokens_seen": 36919248, "step": 60610 }, { "epoch": 18.80701210052746, "grad_norm": 12.272104263305664, "learning_rate": 1.0804938107087271e-07, "loss": 0.1351, "num_input_tokens_seen": 36922960, "step": 60615 }, { "epoch": 18.808563450201675, "grad_norm": 20.522857666015625, "learning_rate": 1.0776963701034371e-07, "loss": 0.1663, "num_input_tokens_seen": 36926032, "step": 60620 }, { "epoch": 18.81011479987589, "grad_norm": 19.600688934326172, "learning_rate": 1.0749025160823323e-07, "loss": 0.1245, "num_input_tokens_seen": 36928976, "step": 60625 }, { "epoch": 18.81166614955011, "grad_norm": 16.753450393676758, "learning_rate": 1.0721122488502322e-07, "loss": 0.2707, "num_input_tokens_seen": 36932144, "step": 60630 }, { "epoch": 18.813217499224326, "grad_norm": 12.342175483703613, "learning_rate": 1.0693255686116954e-07, "loss": 0.1852, "num_input_tokens_seen": 36935152, "step": 60635 }, { "epoch": 18.81476884889854, "grad_norm": 5.706519603729248, "learning_rate": 1.0665424755710197e-07, "loss": 0.1189, "num_input_tokens_seen": 36938288, "step": 60640 }, { "epoch": 18.816320198572758, "grad_norm": 11.654692649841309, "learning_rate": 1.0637629699322416e-07, "loss": 0.1539, "num_input_tokens_seen": 36940944, "step": 60645 }, { "epoch": 18.817871548246973, "grad_norm": 33.060848236083984, "learning_rate": 1.0609870518991317e-07, "loss": 0.1271, "num_input_tokens_seen": 36944720, "step": 60650 }, { "epoch": 18.819422897921193, "grad_norm": 59.3599967956543, "learning_rate": 1.0582147216751881e-07, "loss": 0.1534, "num_input_tokens_seen": 36947408, "step": 60655 }, { "epoch": 18.82097424759541, "grad_norm": 12.881376266479492, "learning_rate": 1.0554459794636707e-07, "loss": 0.1135, "num_input_tokens_seen": 36950320, "step": 60660 }, { "epoch": 18.822525597269625, "grad_norm": 17.230876922607422, "learning_rate": 1.0526808254675447e-07, "loss": 0.1635, "num_input_tokens_seen": 36952720, "step": 60665 }, { "epoch": 18.82407694694384, "grad_norm": 3.9115424156188965, "learning_rate": 1.049919259889548e-07, "loss": 0.1237, "num_input_tokens_seen": 36956240, "step": 60670 }, { "epoch": 18.825628296618056, "grad_norm": 11.18254280090332, "learning_rate": 1.0471612829321187e-07, "loss": 0.1182, "num_input_tokens_seen": 36958640, "step": 60675 }, { "epoch": 18.827179646292276, "grad_norm": 16.138076782226562, "learning_rate": 1.0444068947974562e-07, "loss": 0.1822, "num_input_tokens_seen": 36960656, "step": 60680 }, { "epoch": 18.82873099596649, "grad_norm": 7.515376091003418, "learning_rate": 1.0416560956874877e-07, "loss": 0.1961, "num_input_tokens_seen": 36963696, "step": 60685 }, { "epoch": 18.830282345640708, "grad_norm": 9.055154800415039, "learning_rate": 1.0389088858038854e-07, "loss": 0.1709, "num_input_tokens_seen": 36966160, "step": 60690 }, { "epoch": 18.831833695314923, "grad_norm": 9.121341705322266, "learning_rate": 1.0361652653480437e-07, "loss": 0.2167, "num_input_tokens_seen": 36968816, "step": 60695 }, { "epoch": 18.83338504498914, "grad_norm": 27.93711280822754, "learning_rate": 1.0334252345211126e-07, "loss": 0.0754, "num_input_tokens_seen": 36971888, "step": 60700 }, { "epoch": 18.83493639466336, "grad_norm": 42.69377517700195, "learning_rate": 1.030688793523954e-07, "loss": 0.2475, "num_input_tokens_seen": 36975120, "step": 60705 }, { "epoch": 18.836487744337575, "grad_norm": 8.818878173828125, "learning_rate": 1.0279559425571906e-07, "loss": 0.1141, "num_input_tokens_seen": 36978224, "step": 60710 }, { "epoch": 18.83803909401179, "grad_norm": 15.14949893951416, "learning_rate": 1.0252266818211786e-07, "loss": 0.126, "num_input_tokens_seen": 36981264, "step": 60715 }, { "epoch": 18.839590443686006, "grad_norm": 3.5803868770599365, "learning_rate": 1.0225010115159972e-07, "loss": 0.0743, "num_input_tokens_seen": 36984080, "step": 60720 }, { "epoch": 18.841141793360222, "grad_norm": 17.23064422607422, "learning_rate": 1.0197789318414753e-07, "loss": 0.0548, "num_input_tokens_seen": 36986768, "step": 60725 }, { "epoch": 18.842693143034438, "grad_norm": 20.501867294311523, "learning_rate": 1.0170604429971643e-07, "loss": 0.1571, "num_input_tokens_seen": 36989264, "step": 60730 }, { "epoch": 18.844244492708658, "grad_norm": 10.902369499206543, "learning_rate": 1.0143455451823714e-07, "loss": 0.1993, "num_input_tokens_seen": 36992560, "step": 60735 }, { "epoch": 18.845795842382874, "grad_norm": 34.81241226196289, "learning_rate": 1.0116342385961209e-07, "loss": 0.1877, "num_input_tokens_seen": 36995280, "step": 60740 }, { "epoch": 18.84734719205709, "grad_norm": 9.19135856628418, "learning_rate": 1.0089265234371981e-07, "loss": 0.1492, "num_input_tokens_seen": 36997968, "step": 60745 }, { "epoch": 18.848898541731305, "grad_norm": 16.941102981567383, "learning_rate": 1.0062223999040943e-07, "loss": 0.1386, "num_input_tokens_seen": 37000976, "step": 60750 }, { "epoch": 18.85044989140552, "grad_norm": 18.034032821655273, "learning_rate": 1.0035218681950676e-07, "loss": 0.2061, "num_input_tokens_seen": 37003088, "step": 60755 }, { "epoch": 18.85200124107974, "grad_norm": 16.45485496520996, "learning_rate": 1.0008249285080929e-07, "loss": 0.1093, "num_input_tokens_seen": 37005328, "step": 60760 }, { "epoch": 18.853552590753957, "grad_norm": 6.013659477233887, "learning_rate": 9.981315810408842e-08, "loss": 0.1721, "num_input_tokens_seen": 37008400, "step": 60765 }, { "epoch": 18.855103940428172, "grad_norm": 18.220077514648438, "learning_rate": 9.954418259909004e-08, "loss": 0.1522, "num_input_tokens_seen": 37011504, "step": 60770 }, { "epoch": 18.85665529010239, "grad_norm": 23.006254196166992, "learning_rate": 9.927556635553337e-08, "loss": 0.087, "num_input_tokens_seen": 37014672, "step": 60775 }, { "epoch": 18.858206639776604, "grad_norm": 21.216638565063477, "learning_rate": 9.900730939311099e-08, "loss": 0.0993, "num_input_tokens_seen": 37019312, "step": 60780 }, { "epoch": 18.859757989450824, "grad_norm": 30.141220092773438, "learning_rate": 9.873941173148882e-08, "loss": 0.1721, "num_input_tokens_seen": 37022544, "step": 60785 }, { "epoch": 18.86130933912504, "grad_norm": 31.20854949951172, "learning_rate": 9.847187339030783e-08, "loss": 0.1272, "num_input_tokens_seen": 37026544, "step": 60790 }, { "epoch": 18.862860688799255, "grad_norm": 12.43883228302002, "learning_rate": 9.820469438918124e-08, "loss": 0.0956, "num_input_tokens_seen": 37029136, "step": 60795 }, { "epoch": 18.86441203847347, "grad_norm": 20.56310272216797, "learning_rate": 9.793787474769667e-08, "loss": 0.1184, "num_input_tokens_seen": 37031888, "step": 60800 }, { "epoch": 18.865963388147687, "grad_norm": 7.186445713043213, "learning_rate": 9.767141448541462e-08, "loss": 0.0774, "num_input_tokens_seen": 37036208, "step": 60805 }, { "epoch": 18.867514737821907, "grad_norm": 4.579694747924805, "learning_rate": 9.740531362187056e-08, "loss": 0.187, "num_input_tokens_seen": 37039152, "step": 60810 }, { "epoch": 18.869066087496122, "grad_norm": 17.273244857788086, "learning_rate": 9.713957217657166e-08, "loss": 0.1065, "num_input_tokens_seen": 37041488, "step": 60815 }, { "epoch": 18.87061743717034, "grad_norm": 21.325023651123047, "learning_rate": 9.687419016900123e-08, "loss": 0.1781, "num_input_tokens_seen": 37043984, "step": 60820 }, { "epoch": 18.872168786844554, "grad_norm": 23.860328674316406, "learning_rate": 9.660916761861427e-08, "loss": 0.1235, "num_input_tokens_seen": 37046992, "step": 60825 }, { "epoch": 18.87372013651877, "grad_norm": 14.007147789001465, "learning_rate": 9.634450454483968e-08, "loss": 0.1509, "num_input_tokens_seen": 37049264, "step": 60830 }, { "epoch": 18.87527148619299, "grad_norm": 15.982467651367188, "learning_rate": 9.608020096708193e-08, "loss": 0.2113, "num_input_tokens_seen": 37052592, "step": 60835 }, { "epoch": 18.876822835867205, "grad_norm": 6.191410541534424, "learning_rate": 9.581625690471553e-08, "loss": 0.1051, "num_input_tokens_seen": 37055856, "step": 60840 }, { "epoch": 18.87837418554142, "grad_norm": 16.48252296447754, "learning_rate": 9.555267237709221e-08, "loss": 0.2524, "num_input_tokens_seen": 37059696, "step": 60845 }, { "epoch": 18.879925535215637, "grad_norm": 25.731233596801758, "learning_rate": 9.528944740353541e-08, "loss": 0.0992, "num_input_tokens_seen": 37062352, "step": 60850 }, { "epoch": 18.881476884889853, "grad_norm": 14.81946086883545, "learning_rate": 9.502658200334247e-08, "loss": 0.0792, "num_input_tokens_seen": 37066032, "step": 60855 }, { "epoch": 18.883028234564073, "grad_norm": 24.413301467895508, "learning_rate": 9.47640761957841e-08, "loss": 0.1771, "num_input_tokens_seen": 37069232, "step": 60860 }, { "epoch": 18.88457958423829, "grad_norm": 23.51736831665039, "learning_rate": 9.450193000010655e-08, "loss": 0.1461, "num_input_tokens_seen": 37073648, "step": 60865 }, { "epoch": 18.886130933912504, "grad_norm": 17.84644317626953, "learning_rate": 9.424014343552668e-08, "loss": 0.1624, "num_input_tokens_seen": 37077296, "step": 60870 }, { "epoch": 18.88768228358672, "grad_norm": 15.495542526245117, "learning_rate": 9.397871652123803e-08, "loss": 0.1699, "num_input_tokens_seen": 37079728, "step": 60875 }, { "epoch": 18.889233633260936, "grad_norm": 10.172834396362305, "learning_rate": 9.371764927640414e-08, "loss": 0.179, "num_input_tokens_seen": 37082128, "step": 60880 }, { "epoch": 18.890784982935152, "grad_norm": 3.974351644515991, "learning_rate": 9.345694172016639e-08, "loss": 0.0697, "num_input_tokens_seen": 37084432, "step": 60885 }, { "epoch": 18.89233633260937, "grad_norm": 46.97205352783203, "learning_rate": 9.319659387163726e-08, "loss": 0.1633, "num_input_tokens_seen": 37087632, "step": 60890 }, { "epoch": 18.893887682283587, "grad_norm": 13.852235794067383, "learning_rate": 9.29366057499026e-08, "loss": 0.1991, "num_input_tokens_seen": 37090800, "step": 60895 }, { "epoch": 18.895439031957803, "grad_norm": 14.13479232788086, "learning_rate": 9.267697737402325e-08, "loss": 0.1011, "num_input_tokens_seen": 37094032, "step": 60900 }, { "epoch": 18.89699038163202, "grad_norm": 12.68289852142334, "learning_rate": 9.241770876303236e-08, "loss": 0.212, "num_input_tokens_seen": 37096272, "step": 60905 }, { "epoch": 18.898541731306235, "grad_norm": 27.807106018066406, "learning_rate": 9.215879993593857e-08, "loss": 0.1874, "num_input_tokens_seen": 37099440, "step": 60910 }, { "epoch": 18.900093080980454, "grad_norm": 3.7798280715942383, "learning_rate": 9.190025091172172e-08, "loss": 0.1539, "num_input_tokens_seen": 37103376, "step": 60915 }, { "epoch": 18.90164443065467, "grad_norm": 4.000105381011963, "learning_rate": 9.164206170933775e-08, "loss": 0.1072, "num_input_tokens_seen": 37106672, "step": 60920 }, { "epoch": 18.903195780328886, "grad_norm": 6.316807270050049, "learning_rate": 9.138423234771376e-08, "loss": 0.1657, "num_input_tokens_seen": 37110224, "step": 60925 }, { "epoch": 18.904747130003102, "grad_norm": 3.113447666168213, "learning_rate": 9.11267628457524e-08, "loss": 0.1786, "num_input_tokens_seen": 37113072, "step": 60930 }, { "epoch": 18.906298479677318, "grad_norm": 14.22241497039795, "learning_rate": 9.08696532223291e-08, "loss": 0.191, "num_input_tokens_seen": 37116176, "step": 60935 }, { "epoch": 18.907849829351537, "grad_norm": 27.545137405395508, "learning_rate": 9.061290349629271e-08, "loss": 0.1317, "num_input_tokens_seen": 37118704, "step": 60940 }, { "epoch": 18.909401179025753, "grad_norm": 52.36204528808594, "learning_rate": 9.035651368646647e-08, "loss": 0.202, "num_input_tokens_seen": 37122192, "step": 60945 }, { "epoch": 18.91095252869997, "grad_norm": 31.457487106323242, "learning_rate": 9.010048381164705e-08, "loss": 0.2401, "num_input_tokens_seen": 37124656, "step": 60950 }, { "epoch": 18.912503878374185, "grad_norm": 6.237401008605957, "learning_rate": 8.98448138906033e-08, "loss": 0.1478, "num_input_tokens_seen": 37129456, "step": 60955 }, { "epoch": 18.9140552280484, "grad_norm": 14.985627174377441, "learning_rate": 8.958950394208077e-08, "loss": 0.142, "num_input_tokens_seen": 37132400, "step": 60960 }, { "epoch": 18.91560657772262, "grad_norm": 20.18297576904297, "learning_rate": 8.933455398479451e-08, "loss": 0.1572, "num_input_tokens_seen": 37135760, "step": 60965 }, { "epoch": 18.917157927396836, "grad_norm": 7.742227554321289, "learning_rate": 8.90799640374368e-08, "loss": 0.1479, "num_input_tokens_seen": 37139408, "step": 60970 }, { "epoch": 18.918709277071052, "grad_norm": 23.11253547668457, "learning_rate": 8.882573411867213e-08, "loss": 0.0931, "num_input_tokens_seen": 37142608, "step": 60975 }, { "epoch": 18.920260626745268, "grad_norm": 9.105108261108398, "learning_rate": 8.857186424713726e-08, "loss": 0.0901, "num_input_tokens_seen": 37146064, "step": 60980 }, { "epoch": 18.921811976419484, "grad_norm": 17.136520385742188, "learning_rate": 8.831835444144566e-08, "loss": 0.1078, "num_input_tokens_seen": 37148816, "step": 60985 }, { "epoch": 18.9233633260937, "grad_norm": 27.716672897338867, "learning_rate": 8.806520472018132e-08, "loss": 0.1506, "num_input_tokens_seen": 37152528, "step": 60990 }, { "epoch": 18.92491467576792, "grad_norm": 6.06736421585083, "learning_rate": 8.781241510190331e-08, "loss": 0.1322, "num_input_tokens_seen": 37154832, "step": 60995 }, { "epoch": 18.926466025442135, "grad_norm": 11.350676536560059, "learning_rate": 8.755998560514455e-08, "loss": 0.1615, "num_input_tokens_seen": 37157200, "step": 61000 }, { "epoch": 18.92801737511635, "grad_norm": 8.731818199157715, "learning_rate": 8.730791624841084e-08, "loss": 0.2146, "num_input_tokens_seen": 37160432, "step": 61005 }, { "epoch": 18.929568724790567, "grad_norm": 14.53795337677002, "learning_rate": 8.705620705018181e-08, "loss": 0.1715, "num_input_tokens_seen": 37162704, "step": 61010 }, { "epoch": 18.931120074464783, "grad_norm": 8.486037254333496, "learning_rate": 8.680485802891048e-08, "loss": 0.1742, "num_input_tokens_seen": 37166352, "step": 61015 }, { "epoch": 18.932671424139002, "grad_norm": 25.93951988220215, "learning_rate": 8.655386920302433e-08, "loss": 0.1645, "num_input_tokens_seen": 37169328, "step": 61020 }, { "epoch": 18.934222773813218, "grad_norm": 15.079801559448242, "learning_rate": 8.630324059092365e-08, "loss": 0.1389, "num_input_tokens_seen": 37172816, "step": 61025 }, { "epoch": 18.935774123487434, "grad_norm": 20.80545997619629, "learning_rate": 8.605297221098152e-08, "loss": 0.1526, "num_input_tokens_seen": 37175856, "step": 61030 }, { "epoch": 18.93732547316165, "grad_norm": 8.082941055297852, "learning_rate": 8.58030640815466e-08, "loss": 0.0808, "num_input_tokens_seen": 37178224, "step": 61035 }, { "epoch": 18.938876822835866, "grad_norm": 42.09851837158203, "learning_rate": 8.555351622094033e-08, "loss": 0.1999, "num_input_tokens_seen": 37181040, "step": 61040 }, { "epoch": 18.940428172510085, "grad_norm": 9.645989418029785, "learning_rate": 8.530432864745697e-08, "loss": 0.1227, "num_input_tokens_seen": 37183120, "step": 61045 }, { "epoch": 18.9419795221843, "grad_norm": 18.51184844970703, "learning_rate": 8.505550137936525e-08, "loss": 0.1287, "num_input_tokens_seen": 37186864, "step": 61050 }, { "epoch": 18.943530871858517, "grad_norm": 17.1446475982666, "learning_rate": 8.480703443490612e-08, "loss": 0.2109, "num_input_tokens_seen": 37190480, "step": 61055 }, { "epoch": 18.945082221532733, "grad_norm": 56.527008056640625, "learning_rate": 8.455892783229669e-08, "loss": 0.1199, "num_input_tokens_seen": 37193488, "step": 61060 }, { "epoch": 18.94663357120695, "grad_norm": 10.428285598754883, "learning_rate": 8.431118158972517e-08, "loss": 0.1973, "num_input_tokens_seen": 37196816, "step": 61065 }, { "epoch": 18.948184920881168, "grad_norm": 18.60211753845215, "learning_rate": 8.406379572535484e-08, "loss": 0.083, "num_input_tokens_seen": 37200528, "step": 61070 }, { "epoch": 18.949736270555384, "grad_norm": 14.718425750732422, "learning_rate": 8.381677025732116e-08, "loss": 0.173, "num_input_tokens_seen": 37203376, "step": 61075 }, { "epoch": 18.9512876202296, "grad_norm": 11.573196411132812, "learning_rate": 8.357010520373465e-08, "loss": 0.1984, "num_input_tokens_seen": 37206448, "step": 61080 }, { "epoch": 18.952838969903816, "grad_norm": 23.149127960205078, "learning_rate": 8.332380058267864e-08, "loss": 0.2674, "num_input_tokens_seen": 37209328, "step": 61085 }, { "epoch": 18.95439031957803, "grad_norm": 9.452993392944336, "learning_rate": 8.307785641220978e-08, "loss": 0.1512, "num_input_tokens_seen": 37213200, "step": 61090 }, { "epoch": 18.95594166925225, "grad_norm": 8.623558044433594, "learning_rate": 8.283227271035976e-08, "loss": 0.1101, "num_input_tokens_seen": 37217072, "step": 61095 }, { "epoch": 18.957493018926467, "grad_norm": 23.91109848022461, "learning_rate": 8.258704949513196e-08, "loss": 0.1359, "num_input_tokens_seen": 37220528, "step": 61100 }, { "epoch": 18.959044368600683, "grad_norm": 29.863067626953125, "learning_rate": 8.234218678450479e-08, "loss": 0.1964, "num_input_tokens_seen": 37224400, "step": 61105 }, { "epoch": 18.9605957182749, "grad_norm": 0.732715904712677, "learning_rate": 8.20976845964283e-08, "loss": 0.1403, "num_input_tokens_seen": 37227888, "step": 61110 }, { "epoch": 18.962147067949115, "grad_norm": 31.568933486938477, "learning_rate": 8.185354294882875e-08, "loss": 0.2007, "num_input_tokens_seen": 37231792, "step": 61115 }, { "epoch": 18.963698417623334, "grad_norm": 11.564252853393555, "learning_rate": 8.160976185960401e-08, "loss": 0.1895, "num_input_tokens_seen": 37234256, "step": 61120 }, { "epoch": 18.96524976729755, "grad_norm": 28.84364891052246, "learning_rate": 8.136634134662591e-08, "loss": 0.1504, "num_input_tokens_seen": 37237328, "step": 61125 }, { "epoch": 18.966801116971766, "grad_norm": 11.944677352905273, "learning_rate": 8.112328142774017e-08, "loss": 0.2731, "num_input_tokens_seen": 37239728, "step": 61130 }, { "epoch": 18.96835246664598, "grad_norm": 9.058732986450195, "learning_rate": 8.0880582120767e-08, "loss": 0.1568, "num_input_tokens_seen": 37243440, "step": 61135 }, { "epoch": 18.969903816320198, "grad_norm": 25.030662536621094, "learning_rate": 8.063824344349714e-08, "loss": 0.1478, "num_input_tokens_seen": 37246192, "step": 61140 }, { "epoch": 18.971455165994414, "grad_norm": 12.029155731201172, "learning_rate": 8.03962654136986e-08, "loss": 0.1594, "num_input_tokens_seen": 37248784, "step": 61145 }, { "epoch": 18.973006515668633, "grad_norm": 23.704015731811523, "learning_rate": 8.015464804911055e-08, "loss": 0.1243, "num_input_tokens_seen": 37252400, "step": 61150 }, { "epoch": 18.97455786534285, "grad_norm": 3.1692612171173096, "learning_rate": 7.991339136744658e-08, "loss": 0.1184, "num_input_tokens_seen": 37254704, "step": 61155 }, { "epoch": 18.976109215017065, "grad_norm": 9.05062198638916, "learning_rate": 7.967249538639365e-08, "loss": 0.2188, "num_input_tokens_seen": 37257552, "step": 61160 }, { "epoch": 18.97766056469128, "grad_norm": 11.831995964050293, "learning_rate": 7.943196012361153e-08, "loss": 0.07, "num_input_tokens_seen": 37260272, "step": 61165 }, { "epoch": 18.979211914365496, "grad_norm": 6.715829849243164, "learning_rate": 7.919178559673557e-08, "loss": 0.0988, "num_input_tokens_seen": 37263728, "step": 61170 }, { "epoch": 18.980763264039716, "grad_norm": 19.010704040527344, "learning_rate": 7.895197182337278e-08, "loss": 0.1459, "num_input_tokens_seen": 37266704, "step": 61175 }, { "epoch": 18.982314613713932, "grad_norm": 5.119402885437012, "learning_rate": 7.871251882110464e-08, "loss": 0.0919, "num_input_tokens_seen": 37269328, "step": 61180 }, { "epoch": 18.983865963388148, "grad_norm": 11.9425687789917, "learning_rate": 7.847342660748491e-08, "loss": 0.1562, "num_input_tokens_seen": 37272752, "step": 61185 }, { "epoch": 18.985417313062364, "grad_norm": 6.545210838317871, "learning_rate": 7.82346952000429e-08, "loss": 0.1647, "num_input_tokens_seen": 37277136, "step": 61190 }, { "epoch": 18.98696866273658, "grad_norm": 26.469064712524414, "learning_rate": 7.799632461628015e-08, "loss": 0.1376, "num_input_tokens_seen": 37280528, "step": 61195 }, { "epoch": 18.9885200124108, "grad_norm": 23.23377227783203, "learning_rate": 7.775831487367269e-08, "loss": 0.0979, "num_input_tokens_seen": 37283984, "step": 61200 }, { "epoch": 18.990071362085015, "grad_norm": 17.47810173034668, "learning_rate": 7.752066598966767e-08, "loss": 0.1686, "num_input_tokens_seen": 37286416, "step": 61205 }, { "epoch": 18.99162271175923, "grad_norm": 16.641319274902344, "learning_rate": 7.72833779816895e-08, "loss": 0.1612, "num_input_tokens_seen": 37289424, "step": 61210 }, { "epoch": 18.993174061433447, "grad_norm": 57.14063262939453, "learning_rate": 7.704645086713314e-08, "loss": 0.1091, "num_input_tokens_seen": 37294000, "step": 61215 }, { "epoch": 18.994725411107662, "grad_norm": 25.875076293945312, "learning_rate": 7.680988466336858e-08, "loss": 0.1508, "num_input_tokens_seen": 37297168, "step": 61220 }, { "epoch": 18.996276760781882, "grad_norm": 22.332794189453125, "learning_rate": 7.657367938773863e-08, "loss": 0.1329, "num_input_tokens_seen": 37299984, "step": 61225 }, { "epoch": 18.997828110456098, "grad_norm": 13.505841255187988, "learning_rate": 7.633783505755998e-08, "loss": 0.1615, "num_input_tokens_seen": 37303184, "step": 61230 }, { "epoch": 18.999379460130314, "grad_norm": 14.389381408691406, "learning_rate": 7.610235169012325e-08, "loss": 0.1478, "num_input_tokens_seen": 37306032, "step": 61235 }, { "epoch": 19.00093080980453, "grad_norm": 5.195194244384766, "learning_rate": 7.58672293026913e-08, "loss": 0.1122, "num_input_tokens_seen": 37308032, "step": 61240 }, { "epoch": 19.002482159478745, "grad_norm": 14.017075538635254, "learning_rate": 7.563246791250256e-08, "loss": 0.1014, "num_input_tokens_seen": 37311200, "step": 61245 }, { "epoch": 19.004033509152965, "grad_norm": 17.74408531188965, "learning_rate": 7.53980675367666e-08, "loss": 0.1323, "num_input_tokens_seen": 37313472, "step": 61250 }, { "epoch": 19.00558485882718, "grad_norm": 13.092767715454102, "learning_rate": 7.516402819266854e-08, "loss": 0.1059, "num_input_tokens_seen": 37316736, "step": 61255 }, { "epoch": 19.007136208501397, "grad_norm": 23.220947265625, "learning_rate": 7.493034989736581e-08, "loss": 0.1698, "num_input_tokens_seen": 37319616, "step": 61260 }, { "epoch": 19.008687558175613, "grad_norm": 14.079442977905273, "learning_rate": 7.469703266799078e-08, "loss": 0.1658, "num_input_tokens_seen": 37322784, "step": 61265 }, { "epoch": 19.01023890784983, "grad_norm": 16.66638946533203, "learning_rate": 7.446407652164644e-08, "loss": 0.1341, "num_input_tokens_seen": 37327200, "step": 61270 }, { "epoch": 19.011790257524044, "grad_norm": 13.82752513885498, "learning_rate": 7.423148147541248e-08, "loss": 0.1225, "num_input_tokens_seen": 37330304, "step": 61275 }, { "epoch": 19.013341607198264, "grad_norm": 8.141075134277344, "learning_rate": 7.399924754634136e-08, "loss": 0.1544, "num_input_tokens_seen": 37332992, "step": 61280 }, { "epoch": 19.01489295687248, "grad_norm": 17.056964874267578, "learning_rate": 7.376737475145778e-08, "loss": 0.1097, "num_input_tokens_seen": 37335648, "step": 61285 }, { "epoch": 19.016444306546695, "grad_norm": 20.819595336914062, "learning_rate": 7.353586310776096e-08, "loss": 0.1247, "num_input_tokens_seen": 37338176, "step": 61290 }, { "epoch": 19.01799565622091, "grad_norm": 17.785449981689453, "learning_rate": 7.330471263222339e-08, "loss": 0.0591, "num_input_tokens_seen": 37341152, "step": 61295 }, { "epoch": 19.019547005895127, "grad_norm": 9.71673583984375, "learning_rate": 7.307392334179097e-08, "loss": 0.1206, "num_input_tokens_seen": 37343488, "step": 61300 }, { "epoch": 19.021098355569347, "grad_norm": 24.689599990844727, "learning_rate": 7.284349525338352e-08, "loss": 0.189, "num_input_tokens_seen": 37345696, "step": 61305 }, { "epoch": 19.022649705243563, "grad_norm": 6.466856956481934, "learning_rate": 7.261342838389474e-08, "loss": 0.1628, "num_input_tokens_seen": 37347872, "step": 61310 }, { "epoch": 19.02420105491778, "grad_norm": 36.725799560546875, "learning_rate": 7.238372275019001e-08, "loss": 0.1941, "num_input_tokens_seen": 37350016, "step": 61315 }, { "epoch": 19.025752404591994, "grad_norm": 32.68013000488281, "learning_rate": 7.215437836911032e-08, "loss": 0.3715, "num_input_tokens_seen": 37352608, "step": 61320 }, { "epoch": 19.02730375426621, "grad_norm": 7.679653644561768, "learning_rate": 7.192539525746945e-08, "loss": 0.0663, "num_input_tokens_seen": 37355712, "step": 61325 }, { "epoch": 19.02885510394043, "grad_norm": 7.81386661529541, "learning_rate": 7.16967734320534e-08, "loss": 0.1531, "num_input_tokens_seen": 37358144, "step": 61330 }, { "epoch": 19.030406453614646, "grad_norm": 32.21409606933594, "learning_rate": 7.146851290962431e-08, "loss": 0.1338, "num_input_tokens_seen": 37362048, "step": 61335 }, { "epoch": 19.03195780328886, "grad_norm": 13.146383285522461, "learning_rate": 7.124061370691548e-08, "loss": 0.1728, "num_input_tokens_seen": 37365312, "step": 61340 }, { "epoch": 19.033509152963077, "grad_norm": 52.10145950317383, "learning_rate": 7.101307584063521e-08, "loss": 0.229, "num_input_tokens_seen": 37369152, "step": 61345 }, { "epoch": 19.035060502637293, "grad_norm": 16.06688117980957, "learning_rate": 7.07858993274646e-08, "loss": 0.1311, "num_input_tokens_seen": 37372096, "step": 61350 }, { "epoch": 19.036611852311513, "grad_norm": 13.110123634338379, "learning_rate": 7.05590841840581e-08, "loss": 0.1746, "num_input_tokens_seen": 37374400, "step": 61355 }, { "epoch": 19.03816320198573, "grad_norm": 34.12055969238281, "learning_rate": 7.033263042704353e-08, "loss": 0.2248, "num_input_tokens_seen": 37377280, "step": 61360 }, { "epoch": 19.039714551659944, "grad_norm": 22.728710174560547, "learning_rate": 7.01065380730237e-08, "loss": 0.144, "num_input_tokens_seen": 37380192, "step": 61365 }, { "epoch": 19.04126590133416, "grad_norm": 14.971983909606934, "learning_rate": 6.98808071385726e-08, "loss": 0.107, "num_input_tokens_seen": 37382848, "step": 61370 }, { "epoch": 19.042817251008376, "grad_norm": 11.289201736450195, "learning_rate": 6.96554376402403e-08, "loss": 0.1121, "num_input_tokens_seen": 37385760, "step": 61375 }, { "epoch": 19.044368600682596, "grad_norm": 12.67260456085205, "learning_rate": 6.943042959454804e-08, "loss": 0.1427, "num_input_tokens_seen": 37388448, "step": 61380 }, { "epoch": 19.04591995035681, "grad_norm": 8.098382949829102, "learning_rate": 6.920578301799263e-08, "loss": 0.1053, "num_input_tokens_seen": 37391904, "step": 61385 }, { "epoch": 19.047471300031027, "grad_norm": 13.595748901367188, "learning_rate": 6.8981497927042e-08, "loss": 0.1204, "num_input_tokens_seen": 37394688, "step": 61390 }, { "epoch": 19.049022649705243, "grad_norm": 60.872257232666016, "learning_rate": 6.875757433813967e-08, "loss": 0.168, "num_input_tokens_seen": 37397760, "step": 61395 }, { "epoch": 19.05057399937946, "grad_norm": 20.096471786499023, "learning_rate": 6.853401226770196e-08, "loss": 0.2482, "num_input_tokens_seen": 37400160, "step": 61400 }, { "epoch": 19.052125349053675, "grad_norm": 9.838716506958008, "learning_rate": 6.831081173211851e-08, "loss": 0.1597, "num_input_tokens_seen": 37402784, "step": 61405 }, { "epoch": 19.053676698727894, "grad_norm": 11.382720947265625, "learning_rate": 6.808797274775236e-08, "loss": 0.1487, "num_input_tokens_seen": 37406208, "step": 61410 }, { "epoch": 19.05522804840211, "grad_norm": 9.515995025634766, "learning_rate": 6.786549533094045e-08, "loss": 0.1741, "num_input_tokens_seen": 37409664, "step": 61415 }, { "epoch": 19.056779398076326, "grad_norm": 25.729982376098633, "learning_rate": 6.764337949799304e-08, "loss": 0.1061, "num_input_tokens_seen": 37412704, "step": 61420 }, { "epoch": 19.058330747750542, "grad_norm": 9.1709623336792, "learning_rate": 6.742162526519324e-08, "loss": 0.1592, "num_input_tokens_seen": 37415104, "step": 61425 }, { "epoch": 19.059882097424758, "grad_norm": 28.914508819580078, "learning_rate": 6.720023264879971e-08, "loss": 0.1609, "num_input_tokens_seen": 37419008, "step": 61430 }, { "epoch": 19.061433447098977, "grad_norm": 17.915332794189453, "learning_rate": 6.697920166504112e-08, "loss": 0.2639, "num_input_tokens_seen": 37421088, "step": 61435 }, { "epoch": 19.062984796773193, "grad_norm": 11.019241333007812, "learning_rate": 6.675853233012342e-08, "loss": 0.1651, "num_input_tokens_seen": 37424864, "step": 61440 }, { "epoch": 19.06453614644741, "grad_norm": 2.7049736976623535, "learning_rate": 6.653822466022364e-08, "loss": 0.1454, "num_input_tokens_seen": 37427232, "step": 61445 }, { "epoch": 19.066087496121625, "grad_norm": 11.040356636047363, "learning_rate": 6.631827867149277e-08, "loss": 0.1514, "num_input_tokens_seen": 37429536, "step": 61450 }, { "epoch": 19.06763884579584, "grad_norm": 5.038132667541504, "learning_rate": 6.609869438005567e-08, "loss": 0.0701, "num_input_tokens_seen": 37431744, "step": 61455 }, { "epoch": 19.06919019547006, "grad_norm": 21.409238815307617, "learning_rate": 6.587947180201004e-08, "loss": 0.1991, "num_input_tokens_seen": 37434656, "step": 61460 }, { "epoch": 19.070741545144276, "grad_norm": 10.601658821105957, "learning_rate": 6.566061095342857e-08, "loss": 0.1143, "num_input_tokens_seen": 37437664, "step": 61465 }, { "epoch": 19.072292894818492, "grad_norm": 15.691733360290527, "learning_rate": 6.544211185035454e-08, "loss": 0.1604, "num_input_tokens_seen": 37441600, "step": 61470 }, { "epoch": 19.073844244492708, "grad_norm": 17.341064453125, "learning_rate": 6.522397450880901e-08, "loss": 0.1131, "num_input_tokens_seen": 37443872, "step": 61475 }, { "epoch": 19.075395594166924, "grad_norm": 42.04990768432617, "learning_rate": 6.500619894478145e-08, "loss": 0.1381, "num_input_tokens_seen": 37446880, "step": 61480 }, { "epoch": 19.076946943841143, "grad_norm": 13.26602840423584, "learning_rate": 6.478878517423959e-08, "loss": 0.076, "num_input_tokens_seen": 37449216, "step": 61485 }, { "epoch": 19.07849829351536, "grad_norm": 24.60735321044922, "learning_rate": 6.457173321312072e-08, "loss": 0.1945, "num_input_tokens_seen": 37452096, "step": 61490 }, { "epoch": 19.080049643189575, "grad_norm": 15.716416358947754, "learning_rate": 6.435504307733819e-08, "loss": 0.1411, "num_input_tokens_seen": 37455616, "step": 61495 }, { "epoch": 19.08160099286379, "grad_norm": 52.190731048583984, "learning_rate": 6.413871478277766e-08, "loss": 0.2795, "num_input_tokens_seen": 37458624, "step": 61500 }, { "epoch": 19.083152342538007, "grad_norm": 19.097808837890625, "learning_rate": 6.392274834529865e-08, "loss": 0.1672, "num_input_tokens_seen": 37461568, "step": 61505 }, { "epoch": 19.084703692212226, "grad_norm": 22.399869918823242, "learning_rate": 6.370714378073406e-08, "loss": 0.0909, "num_input_tokens_seen": 37464832, "step": 61510 }, { "epoch": 19.086255041886442, "grad_norm": 18.294832229614258, "learning_rate": 6.349190110489068e-08, "loss": 0.1376, "num_input_tokens_seen": 37467680, "step": 61515 }, { "epoch": 19.087806391560658, "grad_norm": 7.130405426025391, "learning_rate": 6.327702033354754e-08, "loss": 0.1961, "num_input_tokens_seen": 37471744, "step": 61520 }, { "epoch": 19.089357741234874, "grad_norm": 4.1714396476745605, "learning_rate": 6.306250148245873e-08, "loss": 0.1559, "num_input_tokens_seen": 37474272, "step": 61525 }, { "epoch": 19.09090909090909, "grad_norm": 1.8419164419174194, "learning_rate": 6.284834456735001e-08, "loss": 0.2283, "num_input_tokens_seen": 37478240, "step": 61530 }, { "epoch": 19.092460440583306, "grad_norm": 28.727210998535156, "learning_rate": 6.263454960392324e-08, "loss": 0.1241, "num_input_tokens_seen": 37482528, "step": 61535 }, { "epoch": 19.094011790257525, "grad_norm": 10.20872688293457, "learning_rate": 6.242111660785034e-08, "loss": 0.1297, "num_input_tokens_seen": 37484864, "step": 61540 }, { "epoch": 19.09556313993174, "grad_norm": 7.131661415100098, "learning_rate": 6.220804559477934e-08, "loss": 0.1782, "num_input_tokens_seen": 37487648, "step": 61545 }, { "epoch": 19.097114489605957, "grad_norm": 26.67304039001465, "learning_rate": 6.199533658033108e-08, "loss": 0.1663, "num_input_tokens_seen": 37491168, "step": 61550 }, { "epoch": 19.098665839280173, "grad_norm": 14.487332344055176, "learning_rate": 6.17829895800992e-08, "loss": 0.1544, "num_input_tokens_seen": 37495072, "step": 61555 }, { "epoch": 19.10021718895439, "grad_norm": 24.39002227783203, "learning_rate": 6.157100460965182e-08, "loss": 0.1778, "num_input_tokens_seen": 37498496, "step": 61560 }, { "epoch": 19.10176853862861, "grad_norm": 10.593347549438477, "learning_rate": 6.13593816845287e-08, "loss": 0.1376, "num_input_tokens_seen": 37501792, "step": 61565 }, { "epoch": 19.103319888302824, "grad_norm": 27.13666534423828, "learning_rate": 6.114812082024635e-08, "loss": 0.1635, "num_input_tokens_seen": 37504800, "step": 61570 }, { "epoch": 19.10487123797704, "grad_norm": 11.193371772766113, "learning_rate": 6.093722203229069e-08, "loss": 0.1642, "num_input_tokens_seen": 37507360, "step": 61575 }, { "epoch": 19.106422587651256, "grad_norm": 14.078067779541016, "learning_rate": 6.07266853361238e-08, "loss": 0.1778, "num_input_tokens_seen": 37509920, "step": 61580 }, { "epoch": 19.107973937325472, "grad_norm": 16.935224533081055, "learning_rate": 6.051651074718112e-08, "loss": 0.1447, "num_input_tokens_seen": 37512608, "step": 61585 }, { "epoch": 19.10952528699969, "grad_norm": 14.246927261352539, "learning_rate": 6.030669828087033e-08, "loss": 0.1357, "num_input_tokens_seen": 37515552, "step": 61590 }, { "epoch": 19.111076636673907, "grad_norm": 8.095955848693848, "learning_rate": 6.0097247952573e-08, "loss": 0.1365, "num_input_tokens_seen": 37518112, "step": 61595 }, { "epoch": 19.112627986348123, "grad_norm": 18.89000129699707, "learning_rate": 5.988815977764461e-08, "loss": 0.1269, "num_input_tokens_seen": 37521440, "step": 61600 }, { "epoch": 19.11417933602234, "grad_norm": 13.341500282287598, "learning_rate": 5.967943377141405e-08, "loss": 0.1251, "num_input_tokens_seen": 37524224, "step": 61605 }, { "epoch": 19.115730685696555, "grad_norm": 31.42323112487793, "learning_rate": 5.9471069949182946e-08, "loss": 0.2131, "num_input_tokens_seen": 37526592, "step": 61610 }, { "epoch": 19.117282035370774, "grad_norm": 33.29951095581055, "learning_rate": 5.926306832622686e-08, "loss": 0.0945, "num_input_tokens_seen": 37529536, "step": 61615 }, { "epoch": 19.11883338504499, "grad_norm": 19.13981819152832, "learning_rate": 5.905542891779526e-08, "loss": 0.1344, "num_input_tokens_seen": 37534272, "step": 61620 }, { "epoch": 19.120384734719206, "grad_norm": 12.875038146972656, "learning_rate": 5.8848151739110405e-08, "loss": 0.1451, "num_input_tokens_seen": 37536512, "step": 61625 }, { "epoch": 19.121936084393422, "grad_norm": 9.894169807434082, "learning_rate": 5.8641236805367375e-08, "loss": 0.1229, "num_input_tokens_seen": 37539424, "step": 61630 }, { "epoch": 19.123487434067638, "grad_norm": 11.05121898651123, "learning_rate": 5.843468413173681e-08, "loss": 0.1481, "num_input_tokens_seen": 37543040, "step": 61635 }, { "epoch": 19.125038783741857, "grad_norm": 10.441184997558594, "learning_rate": 5.822849373336048e-08, "loss": 0.0936, "num_input_tokens_seen": 37545952, "step": 61640 }, { "epoch": 19.126590133416073, "grad_norm": 4.517758369445801, "learning_rate": 5.802266562535519e-08, "loss": 0.1079, "num_input_tokens_seen": 37548224, "step": 61645 }, { "epoch": 19.12814148309029, "grad_norm": 9.171235084533691, "learning_rate": 5.781719982280998e-08, "loss": 0.1762, "num_input_tokens_seen": 37552640, "step": 61650 }, { "epoch": 19.129692832764505, "grad_norm": 19.27006721496582, "learning_rate": 5.761209634078835e-08, "loss": 0.1571, "num_input_tokens_seen": 37555200, "step": 61655 }, { "epoch": 19.13124418243872, "grad_norm": 6.2773356437683105, "learning_rate": 5.7407355194326606e-08, "loss": 0.2221, "num_input_tokens_seen": 37557792, "step": 61660 }, { "epoch": 19.132795532112937, "grad_norm": 8.807770729064941, "learning_rate": 5.720297639843442e-08, "loss": 0.0638, "num_input_tokens_seen": 37560544, "step": 61665 }, { "epoch": 19.134346881787156, "grad_norm": 23.854646682739258, "learning_rate": 5.69989599680959e-08, "loss": 0.1394, "num_input_tokens_seen": 37563680, "step": 61670 }, { "epoch": 19.135898231461372, "grad_norm": 12.043588638305664, "learning_rate": 5.679530591826743e-08, "loss": 0.1873, "num_input_tokens_seen": 37566880, "step": 61675 }, { "epoch": 19.137449581135588, "grad_norm": 4.886744499206543, "learning_rate": 5.659201426387983e-08, "loss": 0.0911, "num_input_tokens_seen": 37569504, "step": 61680 }, { "epoch": 19.139000930809804, "grad_norm": 31.76253890991211, "learning_rate": 5.6389085019835646e-08, "loss": 0.1973, "num_input_tokens_seen": 37573376, "step": 61685 }, { "epoch": 19.14055228048402, "grad_norm": 48.98105239868164, "learning_rate": 5.618651820101295e-08, "loss": 0.2196, "num_input_tokens_seen": 37576224, "step": 61690 }, { "epoch": 19.14210363015824, "grad_norm": 20.397113800048828, "learning_rate": 5.598431382226155e-08, "loss": 0.1566, "num_input_tokens_seen": 37580960, "step": 61695 }, { "epoch": 19.143654979832455, "grad_norm": 9.377060890197754, "learning_rate": 5.578247189840624e-08, "loss": 0.1333, "num_input_tokens_seen": 37584768, "step": 61700 }, { "epoch": 19.14520632950667, "grad_norm": 21.67319107055664, "learning_rate": 5.558099244424409e-08, "loss": 0.1115, "num_input_tokens_seen": 37587360, "step": 61705 }, { "epoch": 19.146757679180887, "grad_norm": 9.221617698669434, "learning_rate": 5.537987547454604e-08, "loss": 0.1118, "num_input_tokens_seen": 37590368, "step": 61710 }, { "epoch": 19.148309028855103, "grad_norm": 19.528114318847656, "learning_rate": 5.517912100405587e-08, "loss": 0.1267, "num_input_tokens_seen": 37592608, "step": 61715 }, { "epoch": 19.149860378529322, "grad_norm": 11.179693222045898, "learning_rate": 5.497872904749124e-08, "loss": 0.1398, "num_input_tokens_seen": 37595840, "step": 61720 }, { "epoch": 19.151411728203538, "grad_norm": 17.955978393554688, "learning_rate": 5.477869961954429e-08, "loss": 0.1537, "num_input_tokens_seen": 37599200, "step": 61725 }, { "epoch": 19.152963077877754, "grad_norm": 7.794692039489746, "learning_rate": 5.45790327348783e-08, "loss": 0.1108, "num_input_tokens_seen": 37601696, "step": 61730 }, { "epoch": 19.15451442755197, "grad_norm": 20.64109230041504, "learning_rate": 5.4379728408132106e-08, "loss": 0.1392, "num_input_tokens_seen": 37604832, "step": 61735 }, { "epoch": 19.156065777226186, "grad_norm": 29.788599014282227, "learning_rate": 5.4180786653916237e-08, "loss": 0.2222, "num_input_tokens_seen": 37608128, "step": 61740 }, { "epoch": 19.157617126900405, "grad_norm": 14.3130521774292, "learning_rate": 5.398220748681682e-08, "loss": 0.1796, "num_input_tokens_seen": 37610656, "step": 61745 }, { "epoch": 19.15916847657462, "grad_norm": 15.150382041931152, "learning_rate": 5.3783990921390526e-08, "loss": 0.2033, "num_input_tokens_seen": 37613248, "step": 61750 }, { "epoch": 19.160719826248837, "grad_norm": 13.288056373596191, "learning_rate": 5.3586136972169636e-08, "loss": 0.245, "num_input_tokens_seen": 37615552, "step": 61755 }, { "epoch": 19.162271175923053, "grad_norm": 19.671546936035156, "learning_rate": 5.33886456536592e-08, "loss": 0.1538, "num_input_tokens_seen": 37618304, "step": 61760 }, { "epoch": 19.16382252559727, "grad_norm": 13.178750038146973, "learning_rate": 5.31915169803382e-08, "loss": 0.0934, "num_input_tokens_seen": 37622432, "step": 61765 }, { "epoch": 19.165373875271488, "grad_norm": 10.887839317321777, "learning_rate": 5.299475096665785e-08, "loss": 0.1995, "num_input_tokens_seen": 37624800, "step": 61770 }, { "epoch": 19.166925224945704, "grad_norm": 11.39003849029541, "learning_rate": 5.279834762704328e-08, "loss": 0.1605, "num_input_tokens_seen": 37627072, "step": 61775 }, { "epoch": 19.16847657461992, "grad_norm": 22.193986892700195, "learning_rate": 5.260230697589353e-08, "loss": 0.1363, "num_input_tokens_seen": 37630784, "step": 61780 }, { "epoch": 19.170027924294136, "grad_norm": 10.673443794250488, "learning_rate": 5.2406629027580423e-08, "loss": 0.1188, "num_input_tokens_seen": 37633728, "step": 61785 }, { "epoch": 19.17157927396835, "grad_norm": 2.6419687271118164, "learning_rate": 5.221131379645028e-08, "loss": 0.0775, "num_input_tokens_seen": 37637824, "step": 61790 }, { "epoch": 19.173130623642567, "grad_norm": 17.41936492919922, "learning_rate": 5.201636129682053e-08, "loss": 0.1195, "num_input_tokens_seen": 37640448, "step": 61795 }, { "epoch": 19.174681973316787, "grad_norm": 24.839391708374023, "learning_rate": 5.182177154298529e-08, "loss": 0.2551, "num_input_tokens_seen": 37644640, "step": 61800 }, { "epoch": 19.176233322991003, "grad_norm": 15.110451698303223, "learning_rate": 5.162754454920871e-08, "loss": 0.113, "num_input_tokens_seen": 37647168, "step": 61805 }, { "epoch": 19.17778467266522, "grad_norm": 12.570023536682129, "learning_rate": 5.143368032973106e-08, "loss": 0.1135, "num_input_tokens_seen": 37650432, "step": 61810 }, { "epoch": 19.179336022339434, "grad_norm": 9.600135803222656, "learning_rate": 5.124017889876431e-08, "loss": 0.1031, "num_input_tokens_seen": 37653888, "step": 61815 }, { "epoch": 19.18088737201365, "grad_norm": 35.07927322387695, "learning_rate": 5.104704027049434e-08, "loss": 0.1715, "num_input_tokens_seen": 37656352, "step": 61820 }, { "epoch": 19.18243872168787, "grad_norm": 6.397806167602539, "learning_rate": 5.0854264459080923e-08, "loss": 0.1492, "num_input_tokens_seen": 37659040, "step": 61825 }, { "epoch": 19.183990071362086, "grad_norm": 18.854503631591797, "learning_rate": 5.0661851478656654e-08, "loss": 0.1625, "num_input_tokens_seen": 37662336, "step": 61830 }, { "epoch": 19.1855414210363, "grad_norm": 4.520997524261475, "learning_rate": 5.046980134332746e-08, "loss": 0.1106, "num_input_tokens_seen": 37665344, "step": 61835 }, { "epoch": 19.187092770710517, "grad_norm": 12.504422187805176, "learning_rate": 5.027811406717264e-08, "loss": 0.0613, "num_input_tokens_seen": 37668800, "step": 61840 }, { "epoch": 19.188644120384733, "grad_norm": 16.469097137451172, "learning_rate": 5.008678966424652e-08, "loss": 0.1797, "num_input_tokens_seen": 37671840, "step": 61845 }, { "epoch": 19.190195470058953, "grad_norm": 20.397676467895508, "learning_rate": 4.9895828148573414e-08, "loss": 0.2057, "num_input_tokens_seen": 37674656, "step": 61850 }, { "epoch": 19.19174681973317, "grad_norm": 15.32571792602539, "learning_rate": 4.970522953415491e-08, "loss": 0.0753, "num_input_tokens_seen": 37677408, "step": 61855 }, { "epoch": 19.193298169407385, "grad_norm": 9.45359992980957, "learning_rate": 4.9514993834962613e-08, "loss": 0.0402, "num_input_tokens_seen": 37679680, "step": 61860 }, { "epoch": 19.1948495190816, "grad_norm": 9.558865547180176, "learning_rate": 4.932512106494425e-08, "loss": 0.1904, "num_input_tokens_seen": 37682144, "step": 61865 }, { "epoch": 19.196400868755816, "grad_norm": 12.281271934509277, "learning_rate": 4.9135611238019245e-08, "loss": 0.3076, "num_input_tokens_seen": 37684576, "step": 61870 }, { "epoch": 19.197952218430036, "grad_norm": 2.573232650756836, "learning_rate": 4.894646436808092e-08, "loss": 0.1572, "num_input_tokens_seen": 37687552, "step": 61875 }, { "epoch": 19.19950356810425, "grad_norm": 9.738214492797852, "learning_rate": 4.8757680468995985e-08, "loss": 0.1094, "num_input_tokens_seen": 37690464, "step": 61880 }, { "epoch": 19.201054917778468, "grad_norm": 42.52183532714844, "learning_rate": 4.8569259554604453e-08, "loss": 0.148, "num_input_tokens_seen": 37694176, "step": 61885 }, { "epoch": 19.202606267452683, "grad_norm": 35.6743278503418, "learning_rate": 4.8381201638719735e-08, "loss": 0.1613, "num_input_tokens_seen": 37697184, "step": 61890 }, { "epoch": 19.2041576171269, "grad_norm": 2.428983211517334, "learning_rate": 4.819350673512968e-08, "loss": 0.1508, "num_input_tokens_seen": 37700768, "step": 61895 }, { "epoch": 19.20570896680112, "grad_norm": 3.3154520988464355, "learning_rate": 4.800617485759274e-08, "loss": 0.2029, "num_input_tokens_seen": 37703520, "step": 61900 }, { "epoch": 19.207260316475335, "grad_norm": 19.622604370117188, "learning_rate": 4.781920601984402e-08, "loss": 0.1575, "num_input_tokens_seen": 37706496, "step": 61905 }, { "epoch": 19.20881166614955, "grad_norm": 14.328329086303711, "learning_rate": 4.7632600235590354e-08, "loss": 0.1719, "num_input_tokens_seen": 37709248, "step": 61910 }, { "epoch": 19.210363015823766, "grad_norm": 3.965212345123291, "learning_rate": 4.744635751851134e-08, "loss": 0.1065, "num_input_tokens_seen": 37712384, "step": 61915 }, { "epoch": 19.211914365497982, "grad_norm": 17.50191307067871, "learning_rate": 4.726047788226162e-08, "loss": 0.1241, "num_input_tokens_seen": 37716000, "step": 61920 }, { "epoch": 19.213465715172198, "grad_norm": 22.409542083740234, "learning_rate": 4.707496134046807e-08, "loss": 0.1846, "num_input_tokens_seen": 37719872, "step": 61925 }, { "epoch": 19.215017064846418, "grad_norm": 21.36586570739746, "learning_rate": 4.688980790673148e-08, "loss": 0.1732, "num_input_tokens_seen": 37722720, "step": 61930 }, { "epoch": 19.216568414520633, "grad_norm": 40.990966796875, "learning_rate": 4.670501759462542e-08, "loss": 0.2022, "num_input_tokens_seen": 37726240, "step": 61935 }, { "epoch": 19.21811976419485, "grad_norm": 5.889349460601807, "learning_rate": 4.652059041769741e-08, "loss": 0.1233, "num_input_tokens_seen": 37728960, "step": 61940 }, { "epoch": 19.219671113869065, "grad_norm": 9.211104393005371, "learning_rate": 4.633652638946773e-08, "loss": 0.1664, "num_input_tokens_seen": 37731552, "step": 61945 }, { "epoch": 19.22122246354328, "grad_norm": 9.38626480102539, "learning_rate": 4.615282552343115e-08, "loss": 0.156, "num_input_tokens_seen": 37733920, "step": 61950 }, { "epoch": 19.2227738132175, "grad_norm": 25.170103073120117, "learning_rate": 4.596948783305466e-08, "loss": 0.1831, "num_input_tokens_seen": 37737664, "step": 61955 }, { "epoch": 19.224325162891716, "grad_norm": 28.038330078125, "learning_rate": 4.578651333177919e-08, "loss": 0.1467, "num_input_tokens_seen": 37740832, "step": 61960 }, { "epoch": 19.225876512565932, "grad_norm": 9.503865242004395, "learning_rate": 4.560390203301901e-08, "loss": 0.1717, "num_input_tokens_seen": 37744864, "step": 61965 }, { "epoch": 19.227427862240148, "grad_norm": 41.975830078125, "learning_rate": 4.5421653950161736e-08, "loss": 0.1946, "num_input_tokens_seen": 37747552, "step": 61970 }, { "epoch": 19.228979211914364, "grad_norm": 7.760577201843262, "learning_rate": 4.52397690965678e-08, "loss": 0.1669, "num_input_tokens_seen": 37750656, "step": 61975 }, { "epoch": 19.230530561588584, "grad_norm": 15.369287490844727, "learning_rate": 4.505824748557208e-08, "loss": 0.1399, "num_input_tokens_seen": 37753472, "step": 61980 }, { "epoch": 19.2320819112628, "grad_norm": 30.405290603637695, "learning_rate": 4.487708913048283e-08, "loss": 0.1837, "num_input_tokens_seen": 37757216, "step": 61985 }, { "epoch": 19.233633260937015, "grad_norm": 41.01327896118164, "learning_rate": 4.469629404457942e-08, "loss": 0.1569, "num_input_tokens_seen": 37761920, "step": 61990 }, { "epoch": 19.23518461061123, "grad_norm": 26.688692092895508, "learning_rate": 4.451586224111792e-08, "loss": 0.2018, "num_input_tokens_seen": 37765536, "step": 61995 }, { "epoch": 19.236735960285447, "grad_norm": 10.430118560791016, "learning_rate": 4.433579373332497e-08, "loss": 0.1989, "num_input_tokens_seen": 37767968, "step": 62000 }, { "epoch": 19.238287309959667, "grad_norm": 28.666948318481445, "learning_rate": 4.4156088534402784e-08, "loss": 0.2251, "num_input_tokens_seen": 37771296, "step": 62005 }, { "epoch": 19.239838659633882, "grad_norm": 22.019643783569336, "learning_rate": 4.397674665752471e-08, "loss": 0.2497, "num_input_tokens_seen": 37774112, "step": 62010 }, { "epoch": 19.2413900093081, "grad_norm": 6.944113731384277, "learning_rate": 4.379776811583913e-08, "loss": 0.1479, "num_input_tokens_seen": 37777120, "step": 62015 }, { "epoch": 19.242941358982314, "grad_norm": 6.813181400299072, "learning_rate": 4.3619152922467766e-08, "loss": 0.0929, "num_input_tokens_seen": 37779744, "step": 62020 }, { "epoch": 19.24449270865653, "grad_norm": 5.076883792877197, "learning_rate": 4.344090109050403e-08, "loss": 0.0846, "num_input_tokens_seen": 37782272, "step": 62025 }, { "epoch": 19.24604405833075, "grad_norm": 19.43952178955078, "learning_rate": 4.3263012633017466e-08, "loss": 0.1171, "num_input_tokens_seen": 37785632, "step": 62030 }, { "epoch": 19.247595408004965, "grad_norm": 3.536426305770874, "learning_rate": 4.308548756304876e-08, "loss": 0.0943, "num_input_tokens_seen": 37788800, "step": 62035 }, { "epoch": 19.24914675767918, "grad_norm": 13.89714527130127, "learning_rate": 4.2908325893611935e-08, "loss": 0.1894, "num_input_tokens_seen": 37791552, "step": 62040 }, { "epoch": 19.250698107353397, "grad_norm": 12.054272651672363, "learning_rate": 4.273152763769606e-08, "loss": 0.089, "num_input_tokens_seen": 37794496, "step": 62045 }, { "epoch": 19.252249457027613, "grad_norm": 8.245746612548828, "learning_rate": 4.255509280826187e-08, "loss": 0.1837, "num_input_tokens_seen": 37798080, "step": 62050 }, { "epoch": 19.25380080670183, "grad_norm": 5.347755432128906, "learning_rate": 4.237902141824401e-08, "loss": 0.1083, "num_input_tokens_seen": 37800736, "step": 62055 }, { "epoch": 19.25535215637605, "grad_norm": 29.4079532623291, "learning_rate": 4.220331348055107e-08, "loss": 0.1667, "num_input_tokens_seen": 37803456, "step": 62060 }, { "epoch": 19.256903506050264, "grad_norm": 22.021230697631836, "learning_rate": 4.2027969008064387e-08, "loss": 0.1181, "num_input_tokens_seen": 37806560, "step": 62065 }, { "epoch": 19.25845485572448, "grad_norm": 22.540403366088867, "learning_rate": 4.185298801363869e-08, "loss": 0.1806, "num_input_tokens_seen": 37810112, "step": 62070 }, { "epoch": 19.260006205398696, "grad_norm": 11.630549430847168, "learning_rate": 4.167837051010204e-08, "loss": 0.1234, "num_input_tokens_seen": 37813248, "step": 62075 }, { "epoch": 19.261557555072912, "grad_norm": 69.22660064697266, "learning_rate": 4.150411651025643e-08, "loss": 0.1752, "num_input_tokens_seen": 37816864, "step": 62080 }, { "epoch": 19.26310890474713, "grad_norm": 33.004722595214844, "learning_rate": 4.133022602687664e-08, "loss": 0.2126, "num_input_tokens_seen": 37821600, "step": 62085 }, { "epoch": 19.264660254421347, "grad_norm": 17.820518493652344, "learning_rate": 4.115669907271025e-08, "loss": 0.1655, "num_input_tokens_seen": 37824224, "step": 62090 }, { "epoch": 19.266211604095563, "grad_norm": 19.370349884033203, "learning_rate": 4.09835356604793e-08, "loss": 0.1316, "num_input_tokens_seen": 37828448, "step": 62095 }, { "epoch": 19.26776295376978, "grad_norm": 13.337684631347656, "learning_rate": 4.0810735802879196e-08, "loss": 0.0771, "num_input_tokens_seen": 37831264, "step": 62100 }, { "epoch": 19.269314303443995, "grad_norm": 14.194713592529297, "learning_rate": 4.063829951257758e-08, "loss": 0.1648, "num_input_tokens_seen": 37833376, "step": 62105 }, { "epoch": 19.270865653118214, "grad_norm": 9.33037281036377, "learning_rate": 4.0466226802216016e-08, "loss": 0.0897, "num_input_tokens_seen": 37838784, "step": 62110 }, { "epoch": 19.27241700279243, "grad_norm": 56.19499206542969, "learning_rate": 4.029451768440995e-08, "loss": 0.1579, "num_input_tokens_seen": 37840928, "step": 62115 }, { "epoch": 19.273968352466646, "grad_norm": 21.50677490234375, "learning_rate": 4.0123172171747104e-08, "loss": 0.2596, "num_input_tokens_seen": 37844032, "step": 62120 }, { "epoch": 19.275519702140862, "grad_norm": 17.64939308166504, "learning_rate": 3.995219027678965e-08, "loss": 0.1228, "num_input_tokens_seen": 37847392, "step": 62125 }, { "epoch": 19.277071051815078, "grad_norm": 9.91758918762207, "learning_rate": 3.9781572012072556e-08, "loss": 0.1181, "num_input_tokens_seen": 37849952, "step": 62130 }, { "epoch": 19.278622401489297, "grad_norm": 3.707068920135498, "learning_rate": 3.9611317390104156e-08, "loss": 0.0697, "num_input_tokens_seen": 37853440, "step": 62135 }, { "epoch": 19.280173751163513, "grad_norm": 5.742338180541992, "learning_rate": 3.944142642336557e-08, "loss": 0.147, "num_input_tokens_seen": 37856512, "step": 62140 }, { "epoch": 19.28172510083773, "grad_norm": 29.480762481689453, "learning_rate": 3.9271899124312395e-08, "loss": 0.3209, "num_input_tokens_seen": 37860512, "step": 62145 }, { "epoch": 19.283276450511945, "grad_norm": 7.3562397956848145, "learning_rate": 3.910273550537247e-08, "loss": 0.2613, "num_input_tokens_seen": 37863552, "step": 62150 }, { "epoch": 19.28482780018616, "grad_norm": 15.861886978149414, "learning_rate": 3.8933935578948646e-08, "loss": 0.171, "num_input_tokens_seen": 37866144, "step": 62155 }, { "epoch": 19.28637914986038, "grad_norm": 4.467919826507568, "learning_rate": 3.876549935741436e-08, "loss": 0.0826, "num_input_tokens_seen": 37869280, "step": 62160 }, { "epoch": 19.287930499534596, "grad_norm": 10.41250991821289, "learning_rate": 3.859742685311918e-08, "loss": 0.1037, "num_input_tokens_seen": 37871968, "step": 62165 }, { "epoch": 19.289481849208812, "grad_norm": 9.663408279418945, "learning_rate": 3.842971807838436e-08, "loss": 0.1911, "num_input_tokens_seen": 37875200, "step": 62170 }, { "epoch": 19.291033198883028, "grad_norm": 18.259336471557617, "learning_rate": 3.826237304550451e-08, "loss": 0.1288, "num_input_tokens_seen": 37878592, "step": 62175 }, { "epoch": 19.292584548557244, "grad_norm": 82.04476928710938, "learning_rate": 3.809539176674926e-08, "loss": 0.1758, "num_input_tokens_seen": 37881088, "step": 62180 }, { "epoch": 19.29413589823146, "grad_norm": 6.491295337677002, "learning_rate": 3.792877425435937e-08, "loss": 0.1524, "num_input_tokens_seen": 37883616, "step": 62185 }, { "epoch": 19.29568724790568, "grad_norm": 10.025333404541016, "learning_rate": 3.776252052055007e-08, "loss": 0.0974, "num_input_tokens_seen": 37885856, "step": 62190 }, { "epoch": 19.297238597579895, "grad_norm": 8.30742073059082, "learning_rate": 3.7596630577509396e-08, "loss": 0.182, "num_input_tokens_seen": 37888256, "step": 62195 }, { "epoch": 19.29878994725411, "grad_norm": 15.681602478027344, "learning_rate": 3.743110443739928e-08, "loss": 0.1816, "num_input_tokens_seen": 37891072, "step": 62200 }, { "epoch": 19.300341296928327, "grad_norm": 16.42292022705078, "learning_rate": 3.7265942112355015e-08, "loss": 0.1055, "num_input_tokens_seen": 37894848, "step": 62205 }, { "epoch": 19.301892646602543, "grad_norm": 4.939263820648193, "learning_rate": 3.710114361448469e-08, "loss": 0.0451, "num_input_tokens_seen": 37899648, "step": 62210 }, { "epoch": 19.303443996276762, "grad_norm": 10.175755500793457, "learning_rate": 3.6936708955870316e-08, "loss": 0.175, "num_input_tokens_seen": 37902336, "step": 62215 }, { "epoch": 19.304995345950978, "grad_norm": 24.490367889404297, "learning_rate": 3.677263814856613e-08, "loss": 0.1749, "num_input_tokens_seen": 37905056, "step": 62220 }, { "epoch": 19.306546695625194, "grad_norm": 22.114295959472656, "learning_rate": 3.6608931204601406e-08, "loss": 0.189, "num_input_tokens_seen": 37907648, "step": 62225 }, { "epoch": 19.30809804529941, "grad_norm": 8.24698543548584, "learning_rate": 3.644558813597709e-08, "loss": 0.1092, "num_input_tokens_seen": 37910048, "step": 62230 }, { "epoch": 19.309649394973626, "grad_norm": 7.059864521026611, "learning_rate": 3.6282608954668616e-08, "loss": 0.0767, "num_input_tokens_seen": 37912896, "step": 62235 }, { "epoch": 19.311200744647845, "grad_norm": 6.416016578674316, "learning_rate": 3.61199936726242e-08, "loss": 0.1395, "num_input_tokens_seen": 37916928, "step": 62240 }, { "epoch": 19.31275209432206, "grad_norm": 39.95848083496094, "learning_rate": 3.595774230176541e-08, "loss": 0.1353, "num_input_tokens_seen": 37919840, "step": 62245 }, { "epoch": 19.314303443996277, "grad_norm": 17.034555435180664, "learning_rate": 3.579585485398662e-08, "loss": 0.1593, "num_input_tokens_seen": 37922592, "step": 62250 }, { "epoch": 19.315854793670493, "grad_norm": 23.927635192871094, "learning_rate": 3.563433134115723e-08, "loss": 0.1754, "num_input_tokens_seen": 37925056, "step": 62255 }, { "epoch": 19.31740614334471, "grad_norm": 5.537937641143799, "learning_rate": 3.547317177511777e-08, "loss": 0.1455, "num_input_tokens_seen": 37928544, "step": 62260 }, { "epoch": 19.318957493018928, "grad_norm": 22.506877899169922, "learning_rate": 3.5312376167684346e-08, "loss": 0.2873, "num_input_tokens_seen": 37931744, "step": 62265 }, { "epoch": 19.320508842693144, "grad_norm": 2.9391324520111084, "learning_rate": 3.5151944530643634e-08, "loss": 0.1283, "num_input_tokens_seen": 37934208, "step": 62270 }, { "epoch": 19.32206019236736, "grad_norm": 14.438403129577637, "learning_rate": 3.499187687575845e-08, "loss": 0.1793, "num_input_tokens_seen": 37936544, "step": 62275 }, { "epoch": 19.323611542041576, "grad_norm": 15.619783401489258, "learning_rate": 3.483217321476273e-08, "loss": 0.1315, "num_input_tokens_seen": 37938592, "step": 62280 }, { "epoch": 19.32516289171579, "grad_norm": 15.306361198425293, "learning_rate": 3.467283355936546e-08, "loss": 0.0848, "num_input_tokens_seen": 37941248, "step": 62285 }, { "epoch": 19.32671424139001, "grad_norm": 22.191173553466797, "learning_rate": 3.451385792124784e-08, "loss": 0.1304, "num_input_tokens_seen": 37945152, "step": 62290 }, { "epoch": 19.328265591064227, "grad_norm": 51.506492614746094, "learning_rate": 3.435524631206444e-08, "loss": 0.1747, "num_input_tokens_seen": 37948768, "step": 62295 }, { "epoch": 19.329816940738443, "grad_norm": 5.601879596710205, "learning_rate": 3.4196998743443734e-08, "loss": 0.2111, "num_input_tokens_seen": 37951904, "step": 62300 }, { "epoch": 19.33136829041266, "grad_norm": 8.7645263671875, "learning_rate": 3.403911522698644e-08, "loss": 0.1549, "num_input_tokens_seen": 37954080, "step": 62305 }, { "epoch": 19.332919640086875, "grad_norm": 11.975602149963379, "learning_rate": 3.38815957742683e-08, "loss": 0.1675, "num_input_tokens_seen": 37957312, "step": 62310 }, { "epoch": 19.33447098976109, "grad_norm": 21.511415481567383, "learning_rate": 3.3724440396836755e-08, "loss": 0.1501, "num_input_tokens_seen": 37960960, "step": 62315 }, { "epoch": 19.33602233943531, "grad_norm": 37.52518844604492, "learning_rate": 3.356764910621313e-08, "loss": 0.1442, "num_input_tokens_seen": 37963712, "step": 62320 }, { "epoch": 19.337573689109526, "grad_norm": 34.192237854003906, "learning_rate": 3.341122191389157e-08, "loss": 0.2911, "num_input_tokens_seen": 37966688, "step": 62325 }, { "epoch": 19.33912503878374, "grad_norm": 83.42493438720703, "learning_rate": 3.325515883134178e-08, "loss": 0.1655, "num_input_tokens_seen": 37969440, "step": 62330 }, { "epoch": 19.340676388457958, "grad_norm": 25.549388885498047, "learning_rate": 3.309945987000296e-08, "loss": 0.1155, "num_input_tokens_seen": 37972320, "step": 62335 }, { "epoch": 19.342227738132173, "grad_norm": 12.031611442565918, "learning_rate": 3.2944125041291517e-08, "loss": 0.1239, "num_input_tokens_seen": 37975200, "step": 62340 }, { "epoch": 19.343779087806393, "grad_norm": 6.186887264251709, "learning_rate": 3.278915435659335e-08, "loss": 0.1718, "num_input_tokens_seen": 37978336, "step": 62345 }, { "epoch": 19.34533043748061, "grad_norm": 7.360095500946045, "learning_rate": 3.26345478272716e-08, "loss": 0.2239, "num_input_tokens_seen": 37981440, "step": 62350 }, { "epoch": 19.346881787154825, "grad_norm": 11.862881660461426, "learning_rate": 3.248030546465941e-08, "loss": 0.1048, "num_input_tokens_seen": 37984288, "step": 62355 }, { "epoch": 19.34843313682904, "grad_norm": 22.788745880126953, "learning_rate": 3.232642728006552e-08, "loss": 0.2377, "num_input_tokens_seen": 37987840, "step": 62360 }, { "epoch": 19.349984486503256, "grad_norm": 1.0536776781082153, "learning_rate": 3.217291328477035e-08, "loss": 0.1544, "num_input_tokens_seen": 37990592, "step": 62365 }, { "epoch": 19.351535836177476, "grad_norm": 18.81829261779785, "learning_rate": 3.201976349002822e-08, "loss": 0.1582, "num_input_tokens_seen": 37994528, "step": 62370 }, { "epoch": 19.35308718585169, "grad_norm": 17.974241256713867, "learning_rate": 3.186697790706794e-08, "loss": 0.1762, "num_input_tokens_seen": 37996768, "step": 62375 }, { "epoch": 19.354638535525908, "grad_norm": 13.144364356994629, "learning_rate": 3.171455654708888e-08, "loss": 0.1277, "num_input_tokens_seen": 38000032, "step": 62380 }, { "epoch": 19.356189885200124, "grad_norm": 9.922980308532715, "learning_rate": 3.156249942126655e-08, "loss": 0.1441, "num_input_tokens_seen": 38003040, "step": 62385 }, { "epoch": 19.35774123487434, "grad_norm": 2.9388115406036377, "learning_rate": 3.1410806540747574e-08, "loss": 0.0883, "num_input_tokens_seen": 38005952, "step": 62390 }, { "epoch": 19.35929258454856, "grad_norm": 39.4295539855957, "learning_rate": 3.125947791665418e-08, "loss": 0.1139, "num_input_tokens_seen": 38008864, "step": 62395 }, { "epoch": 19.360843934222775, "grad_norm": 20.157432556152344, "learning_rate": 3.110851356007916e-08, "loss": 0.102, "num_input_tokens_seen": 38011328, "step": 62400 }, { "epoch": 19.36239528389699, "grad_norm": 8.462163925170898, "learning_rate": 3.0957913482090874e-08, "loss": 0.1175, "num_input_tokens_seen": 38014272, "step": 62405 }, { "epoch": 19.363946633571206, "grad_norm": 15.193703651428223, "learning_rate": 3.080767769372939e-08, "loss": 0.1373, "num_input_tokens_seen": 38018624, "step": 62410 }, { "epoch": 19.365497983245422, "grad_norm": 17.752220153808594, "learning_rate": 3.065780620600922e-08, "loss": 0.1352, "num_input_tokens_seen": 38021120, "step": 62415 }, { "epoch": 19.367049332919642, "grad_norm": 10.71054744720459, "learning_rate": 3.050829902991825e-08, "loss": 0.3008, "num_input_tokens_seen": 38023904, "step": 62420 }, { "epoch": 19.368600682593858, "grad_norm": 7.838391304016113, "learning_rate": 3.035915617641605e-08, "loss": 0.1684, "num_input_tokens_seen": 38026912, "step": 62425 }, { "epoch": 19.370152032268074, "grad_norm": 16.729345321655273, "learning_rate": 3.0210377656437195e-08, "loss": 0.1385, "num_input_tokens_seen": 38029920, "step": 62430 }, { "epoch": 19.37170338194229, "grad_norm": 11.043478012084961, "learning_rate": 3.006196348088852e-08, "loss": 0.2843, "num_input_tokens_seen": 38032416, "step": 62435 }, { "epoch": 19.373254731616505, "grad_norm": 14.568035125732422, "learning_rate": 2.991391366065133e-08, "loss": 0.2047, "num_input_tokens_seen": 38035296, "step": 62440 }, { "epoch": 19.37480608129072, "grad_norm": 12.290229797363281, "learning_rate": 2.9766228206578597e-08, "loss": 0.1075, "num_input_tokens_seen": 38039360, "step": 62445 }, { "epoch": 19.37635743096494, "grad_norm": 37.163326263427734, "learning_rate": 2.9618907129497777e-08, "loss": 0.2286, "num_input_tokens_seen": 38044288, "step": 62450 }, { "epoch": 19.377908780639157, "grad_norm": 10.14895248413086, "learning_rate": 2.9471950440208563e-08, "loss": 0.1222, "num_input_tokens_seen": 38047904, "step": 62455 }, { "epoch": 19.379460130313372, "grad_norm": 7.308892250061035, "learning_rate": 2.932535814948623e-08, "loss": 0.1331, "num_input_tokens_seen": 38050432, "step": 62460 }, { "epoch": 19.38101147998759, "grad_norm": 6.074790954589844, "learning_rate": 2.9179130268076062e-08, "loss": 0.0978, "num_input_tokens_seen": 38053216, "step": 62465 }, { "epoch": 19.382562829661804, "grad_norm": 36.7608528137207, "learning_rate": 2.9033266806698934e-08, "loss": 0.1624, "num_input_tokens_seen": 38055520, "step": 62470 }, { "epoch": 19.384114179336024, "grad_norm": 25.168039321899414, "learning_rate": 2.888776777604907e-08, "loss": 0.1581, "num_input_tokens_seen": 38058368, "step": 62475 }, { "epoch": 19.38566552901024, "grad_norm": 25.02409553527832, "learning_rate": 2.8742633186791825e-08, "loss": 0.1658, "num_input_tokens_seen": 38061312, "step": 62480 }, { "epoch": 19.387216878684455, "grad_norm": 5.5370707511901855, "learning_rate": 2.8597863049568687e-08, "loss": 0.1684, "num_input_tokens_seen": 38065184, "step": 62485 }, { "epoch": 19.38876822835867, "grad_norm": 12.528175354003906, "learning_rate": 2.8453457374992276e-08, "loss": 0.1256, "num_input_tokens_seen": 38068032, "step": 62490 }, { "epoch": 19.390319578032887, "grad_norm": 23.198213577270508, "learning_rate": 2.830941617364913e-08, "loss": 0.1805, "num_input_tokens_seen": 38071808, "step": 62495 }, { "epoch": 19.391870927707107, "grad_norm": 4.4802422523498535, "learning_rate": 2.8165739456099682e-08, "loss": 0.0947, "num_input_tokens_seen": 38074496, "step": 62500 }, { "epoch": 19.393422277381323, "grad_norm": 13.340499877929688, "learning_rate": 2.8022427232876626e-08, "loss": 0.1257, "num_input_tokens_seen": 38077088, "step": 62505 }, { "epoch": 19.39497362705554, "grad_norm": 18.550628662109375, "learning_rate": 2.7879479514486553e-08, "loss": 0.1677, "num_input_tokens_seen": 38080096, "step": 62510 }, { "epoch": 19.396524976729754, "grad_norm": 15.499680519104004, "learning_rate": 2.7736896311409412e-08, "loss": 0.1244, "num_input_tokens_seen": 38082816, "step": 62515 }, { "epoch": 19.39807632640397, "grad_norm": 11.906783103942871, "learning_rate": 2.759467763409851e-08, "loss": 0.0617, "num_input_tokens_seen": 38086304, "step": 62520 }, { "epoch": 19.39962767607819, "grad_norm": 22.944129943847656, "learning_rate": 2.7452823492979397e-08, "loss": 0.1459, "num_input_tokens_seen": 38089120, "step": 62525 }, { "epoch": 19.401179025752405, "grad_norm": 10.957345008850098, "learning_rate": 2.731133389845264e-08, "loss": 0.1, "num_input_tokens_seen": 38093056, "step": 62530 }, { "epoch": 19.40273037542662, "grad_norm": 15.209964752197266, "learning_rate": 2.7170208860889948e-08, "loss": 0.1736, "num_input_tokens_seen": 38095968, "step": 62535 }, { "epoch": 19.404281725100837, "grad_norm": 7.807741165161133, "learning_rate": 2.702944839063859e-08, "loss": 0.2257, "num_input_tokens_seen": 38098944, "step": 62540 }, { "epoch": 19.405833074775053, "grad_norm": 22.474428176879883, "learning_rate": 2.688905249801699e-08, "loss": 0.2152, "num_input_tokens_seen": 38102656, "step": 62545 }, { "epoch": 19.407384424449273, "grad_norm": 12.122953414916992, "learning_rate": 2.6749021193318568e-08, "loss": 0.1261, "num_input_tokens_seen": 38104896, "step": 62550 }, { "epoch": 19.40893577412349, "grad_norm": 14.18278980255127, "learning_rate": 2.660935448680846e-08, "loss": 0.1094, "num_input_tokens_seen": 38107296, "step": 62555 }, { "epoch": 19.410487123797704, "grad_norm": 24.23357391357422, "learning_rate": 2.6470052388727353e-08, "loss": 0.1262, "num_input_tokens_seen": 38112032, "step": 62560 }, { "epoch": 19.41203847347192, "grad_norm": 13.259190559387207, "learning_rate": 2.633111490928597e-08, "loss": 0.1818, "num_input_tokens_seen": 38116352, "step": 62565 }, { "epoch": 19.413589823146136, "grad_norm": 10.234314918518066, "learning_rate": 2.619254205867172e-08, "loss": 0.1494, "num_input_tokens_seen": 38119200, "step": 62570 }, { "epoch": 19.415141172820352, "grad_norm": 9.363926887512207, "learning_rate": 2.6054333847042036e-08, "loss": 0.1077, "num_input_tokens_seen": 38122528, "step": 62575 }, { "epoch": 19.41669252249457, "grad_norm": 10.532500267028809, "learning_rate": 2.591649028453047e-08, "loss": 0.1258, "num_input_tokens_seen": 38125280, "step": 62580 }, { "epoch": 19.418243872168787, "grad_norm": 19.481216430664062, "learning_rate": 2.5779011381241727e-08, "loss": 0.2485, "num_input_tokens_seen": 38128096, "step": 62585 }, { "epoch": 19.419795221843003, "grad_norm": 7.933269023895264, "learning_rate": 2.5641897147255512e-08, "loss": 0.1636, "num_input_tokens_seen": 38131424, "step": 62590 }, { "epoch": 19.42134657151722, "grad_norm": 54.447547912597656, "learning_rate": 2.5505147592623236e-08, "loss": 0.1947, "num_input_tokens_seen": 38133952, "step": 62595 }, { "epoch": 19.422897921191435, "grad_norm": 7.887055397033691, "learning_rate": 2.536876272737021e-08, "loss": 0.0841, "num_input_tokens_seen": 38137024, "step": 62600 }, { "epoch": 19.424449270865654, "grad_norm": 26.094863891601562, "learning_rate": 2.523274256149566e-08, "loss": 0.072, "num_input_tokens_seen": 38140352, "step": 62605 }, { "epoch": 19.42600062053987, "grad_norm": 22.01892852783203, "learning_rate": 2.509708710497105e-08, "loss": 0.1149, "num_input_tokens_seen": 38142592, "step": 62610 }, { "epoch": 19.427551970214086, "grad_norm": 22.656539916992188, "learning_rate": 2.4961796367741763e-08, "loss": 0.1063, "num_input_tokens_seen": 38145024, "step": 62615 }, { "epoch": 19.429103319888302, "grad_norm": 12.979320526123047, "learning_rate": 2.4826870359725975e-08, "loss": 0.1664, "num_input_tokens_seen": 38147392, "step": 62620 }, { "epoch": 19.430654669562518, "grad_norm": 5.028360366821289, "learning_rate": 2.469230909081577e-08, "loss": 0.0956, "num_input_tokens_seen": 38149600, "step": 62625 }, { "epoch": 19.432206019236737, "grad_norm": 15.587719917297363, "learning_rate": 2.4558112570874924e-08, "loss": 0.1224, "num_input_tokens_seen": 38151968, "step": 62630 }, { "epoch": 19.433757368910953, "grad_norm": 29.49652862548828, "learning_rate": 2.442428080974335e-08, "loss": 0.1642, "num_input_tokens_seen": 38155136, "step": 62635 }, { "epoch": 19.43530871858517, "grad_norm": 13.435587882995605, "learning_rate": 2.4290813817230975e-08, "loss": 0.1491, "num_input_tokens_seen": 38158592, "step": 62640 }, { "epoch": 19.436860068259385, "grad_norm": 35.620967864990234, "learning_rate": 2.4157711603123855e-08, "loss": 0.2379, "num_input_tokens_seen": 38160704, "step": 62645 }, { "epoch": 19.4384114179336, "grad_norm": 28.35677146911621, "learning_rate": 2.402497417717864e-08, "loss": 0.2128, "num_input_tokens_seen": 38164640, "step": 62650 }, { "epoch": 19.43996276760782, "grad_norm": 25.874284744262695, "learning_rate": 2.3892601549126982e-08, "loss": 0.0631, "num_input_tokens_seen": 38167648, "step": 62655 }, { "epoch": 19.441514117282036, "grad_norm": 10.172389030456543, "learning_rate": 2.3760593728674452e-08, "loss": 0.162, "num_input_tokens_seen": 38169568, "step": 62660 }, { "epoch": 19.443065466956252, "grad_norm": 9.843036651611328, "learning_rate": 2.3628950725497202e-08, "loss": 0.1868, "num_input_tokens_seen": 38173664, "step": 62665 }, { "epoch": 19.444616816630468, "grad_norm": 15.881622314453125, "learning_rate": 2.3497672549246952e-08, "loss": 0.1576, "num_input_tokens_seen": 38176192, "step": 62670 }, { "epoch": 19.446168166304684, "grad_norm": 24.390836715698242, "learning_rate": 2.3366759209548228e-08, "loss": 0.1486, "num_input_tokens_seen": 38181760, "step": 62675 }, { "epoch": 19.447719515978903, "grad_norm": 14.37235164642334, "learning_rate": 2.32362107159978e-08, "loss": 0.1462, "num_input_tokens_seen": 38184576, "step": 62680 }, { "epoch": 19.44927086565312, "grad_norm": 15.67043685913086, "learning_rate": 2.3106027078166894e-08, "loss": 0.1577, "num_input_tokens_seen": 38188128, "step": 62685 }, { "epoch": 19.450822215327335, "grad_norm": 37.301517486572266, "learning_rate": 2.29762083056001e-08, "loss": 0.3123, "num_input_tokens_seen": 38190752, "step": 62690 }, { "epoch": 19.45237356500155, "grad_norm": 41.169525146484375, "learning_rate": 2.284675440781314e-08, "loss": 0.1177, "num_input_tokens_seen": 38193568, "step": 62695 }, { "epoch": 19.453924914675767, "grad_norm": 20.834964752197266, "learning_rate": 2.271766539429787e-08, "loss": 0.1677, "num_input_tokens_seen": 38196832, "step": 62700 }, { "epoch": 19.455476264349983, "grad_norm": 39.291175842285156, "learning_rate": 2.2588941274517273e-08, "loss": 0.1197, "num_input_tokens_seen": 38200480, "step": 62705 }, { "epoch": 19.457027614024202, "grad_norm": 14.780811309814453, "learning_rate": 2.2460582057909354e-08, "loss": 0.2041, "num_input_tokens_seen": 38203776, "step": 62710 }, { "epoch": 19.458578963698418, "grad_norm": 14.22529125213623, "learning_rate": 2.2332587753882695e-08, "loss": 0.1964, "num_input_tokens_seen": 38207104, "step": 62715 }, { "epoch": 19.460130313372634, "grad_norm": 6.068915367126465, "learning_rate": 2.220495837182257e-08, "loss": 0.1047, "num_input_tokens_seen": 38209664, "step": 62720 }, { "epoch": 19.46168166304685, "grad_norm": 5.837528705596924, "learning_rate": 2.2077693921084276e-08, "loss": 0.189, "num_input_tokens_seen": 38212576, "step": 62725 }, { "epoch": 19.463233012721066, "grad_norm": 10.69123363494873, "learning_rate": 2.1950794410999232e-08, "loss": 0.1566, "num_input_tokens_seen": 38215648, "step": 62730 }, { "epoch": 19.464784362395285, "grad_norm": 16.36315155029297, "learning_rate": 2.182425985086889e-08, "loss": 0.1084, "num_input_tokens_seen": 38219456, "step": 62735 }, { "epoch": 19.4663357120695, "grad_norm": 18.885303497314453, "learning_rate": 2.1698090249971383e-08, "loss": 0.1602, "num_input_tokens_seen": 38222720, "step": 62740 }, { "epoch": 19.467887061743717, "grad_norm": 32.87982177734375, "learning_rate": 2.157228561755542e-08, "loss": 0.1344, "num_input_tokens_seen": 38225664, "step": 62745 }, { "epoch": 19.469438411417933, "grad_norm": 10.799825668334961, "learning_rate": 2.144684596284474e-08, "loss": 0.1451, "num_input_tokens_seen": 38228608, "step": 62750 }, { "epoch": 19.47098976109215, "grad_norm": 21.554487228393555, "learning_rate": 2.1321771295034764e-08, "loss": 0.1797, "num_input_tokens_seen": 38232576, "step": 62755 }, { "epoch": 19.472541110766368, "grad_norm": 8.170634269714355, "learning_rate": 2.119706162329538e-08, "loss": 0.1976, "num_input_tokens_seen": 38235872, "step": 62760 }, { "epoch": 19.474092460440584, "grad_norm": 42.889930725097656, "learning_rate": 2.1072716956769267e-08, "loss": 0.1776, "num_input_tokens_seen": 38239008, "step": 62765 }, { "epoch": 19.4756438101148, "grad_norm": 19.836610794067383, "learning_rate": 2.0948737304572474e-08, "loss": 0.1409, "num_input_tokens_seen": 38241984, "step": 62770 }, { "epoch": 19.477195159789016, "grad_norm": 7.5771355628967285, "learning_rate": 2.082512267579384e-08, "loss": 0.1229, "num_input_tokens_seen": 38244864, "step": 62775 }, { "epoch": 19.47874650946323, "grad_norm": 10.205904006958008, "learning_rate": 2.0701873079496115e-08, "loss": 0.1509, "num_input_tokens_seen": 38247136, "step": 62780 }, { "epoch": 19.48029785913745, "grad_norm": 22.646041870117188, "learning_rate": 2.057898852471485e-08, "loss": 0.243, "num_input_tokens_seen": 38253440, "step": 62785 }, { "epoch": 19.481849208811667, "grad_norm": 7.33267068862915, "learning_rate": 2.0456469020458392e-08, "loss": 0.1351, "num_input_tokens_seen": 38256032, "step": 62790 }, { "epoch": 19.483400558485883, "grad_norm": 36.55461120605469, "learning_rate": 2.0334314575710112e-08, "loss": 0.1632, "num_input_tokens_seen": 38260512, "step": 62795 }, { "epoch": 19.4849519081601, "grad_norm": 12.20197868347168, "learning_rate": 2.021252519942396e-08, "loss": 0.2695, "num_input_tokens_seen": 38263648, "step": 62800 }, { "epoch": 19.486503257834315, "grad_norm": 22.46698760986328, "learning_rate": 2.009110090052946e-08, "loss": 0.1182, "num_input_tokens_seen": 38266656, "step": 62805 }, { "epoch": 19.488054607508534, "grad_norm": 16.84530258178711, "learning_rate": 1.9970041687928375e-08, "loss": 0.1274, "num_input_tokens_seen": 38270816, "step": 62810 }, { "epoch": 19.48960595718275, "grad_norm": 29.295866012573242, "learning_rate": 1.9849347570495837e-08, "loss": 0.2113, "num_input_tokens_seen": 38273216, "step": 62815 }, { "epoch": 19.491157306856966, "grad_norm": 17.262182235717773, "learning_rate": 1.9729018557079206e-08, "loss": 0.2305, "num_input_tokens_seen": 38275552, "step": 62820 }, { "epoch": 19.492708656531182, "grad_norm": 9.843279838562012, "learning_rate": 1.9609054656501427e-08, "loss": 0.1291, "num_input_tokens_seen": 38278912, "step": 62825 }, { "epoch": 19.494260006205398, "grad_norm": 25.879070281982422, "learning_rate": 1.9489455877556017e-08, "loss": 0.1971, "num_input_tokens_seen": 38282912, "step": 62830 }, { "epoch": 19.495811355879614, "grad_norm": 23.75532341003418, "learning_rate": 1.937022222901208e-08, "loss": 0.1362, "num_input_tokens_seen": 38285856, "step": 62835 }, { "epoch": 19.497362705553833, "grad_norm": 27.068296432495117, "learning_rate": 1.92513537196104e-08, "loss": 0.1014, "num_input_tokens_seen": 38288864, "step": 62840 }, { "epoch": 19.49891405522805, "grad_norm": 20.663959503173828, "learning_rate": 1.913285035806456e-08, "loss": 0.1487, "num_input_tokens_seen": 38291872, "step": 62845 }, { "epoch": 19.500465404902265, "grad_norm": 4.445250988006592, "learning_rate": 1.901471215306372e-08, "loss": 0.1394, "num_input_tokens_seen": 38294240, "step": 62850 }, { "epoch": 19.50201675457648, "grad_norm": 14.732492446899414, "learning_rate": 1.8896939113268175e-08, "loss": 0.0846, "num_input_tokens_seen": 38297152, "step": 62855 }, { "epoch": 19.503568104250697, "grad_norm": 6.696051597595215, "learning_rate": 1.8779531247311577e-08, "loss": 0.1209, "num_input_tokens_seen": 38300160, "step": 62860 }, { "epoch": 19.505119453924916, "grad_norm": 31.285085678100586, "learning_rate": 1.8662488563802595e-08, "loss": 0.1908, "num_input_tokens_seen": 38302528, "step": 62865 }, { "epoch": 19.506670803599132, "grad_norm": 35.98265838623047, "learning_rate": 1.8545811071320474e-08, "loss": 0.1712, "num_input_tokens_seen": 38307328, "step": 62870 }, { "epoch": 19.508222153273348, "grad_norm": 15.871818542480469, "learning_rate": 1.842949877842004e-08, "loss": 0.246, "num_input_tokens_seen": 38310560, "step": 62875 }, { "epoch": 19.509773502947564, "grad_norm": 13.564443588256836, "learning_rate": 1.8313551693627806e-08, "loss": 0.1489, "num_input_tokens_seen": 38314144, "step": 62880 }, { "epoch": 19.51132485262178, "grad_norm": 8.145672798156738, "learning_rate": 1.8197969825444194e-08, "loss": 0.1572, "num_input_tokens_seen": 38318048, "step": 62885 }, { "epoch": 19.512876202296, "grad_norm": 8.437705993652344, "learning_rate": 1.8082753182342428e-08, "loss": 0.1424, "num_input_tokens_seen": 38320224, "step": 62890 }, { "epoch": 19.514427551970215, "grad_norm": 9.743196487426758, "learning_rate": 1.796790177277019e-08, "loss": 0.2435, "num_input_tokens_seen": 38322880, "step": 62895 }, { "epoch": 19.51597890164443, "grad_norm": 42.8026237487793, "learning_rate": 1.7853415605146305e-08, "loss": 0.1437, "num_input_tokens_seen": 38326976, "step": 62900 }, { "epoch": 19.517530251318647, "grad_norm": 15.184965133666992, "learning_rate": 1.773929468786517e-08, "loss": 0.1809, "num_input_tokens_seen": 38329632, "step": 62905 }, { "epoch": 19.519081600992862, "grad_norm": 9.00752067565918, "learning_rate": 1.762553902929176e-08, "loss": 0.0829, "num_input_tokens_seen": 38332320, "step": 62910 }, { "epoch": 19.520632950667082, "grad_norm": 9.058680534362793, "learning_rate": 1.7512148637766624e-08, "loss": 0.1717, "num_input_tokens_seen": 38335072, "step": 62915 }, { "epoch": 19.522184300341298, "grad_norm": 20.69024658203125, "learning_rate": 1.7399123521602557e-08, "loss": 0.1576, "num_input_tokens_seen": 38337792, "step": 62920 }, { "epoch": 19.523735650015514, "grad_norm": 5.836842060089111, "learning_rate": 1.728646368908571e-08, "loss": 0.1763, "num_input_tokens_seen": 38342048, "step": 62925 }, { "epoch": 19.52528699968973, "grad_norm": 27.685531616210938, "learning_rate": 1.7174169148475584e-08, "loss": 0.2474, "num_input_tokens_seen": 38346912, "step": 62930 }, { "epoch": 19.526838349363945, "grad_norm": 52.95367431640625, "learning_rate": 1.706223990800393e-08, "loss": 0.1848, "num_input_tokens_seen": 38350464, "step": 62935 }, { "epoch": 19.528389699038165, "grad_norm": 12.604837417602539, "learning_rate": 1.695067597587696e-08, "loss": 0.1066, "num_input_tokens_seen": 38353248, "step": 62940 }, { "epoch": 19.52994104871238, "grad_norm": 9.015121459960938, "learning_rate": 1.683947736027314e-08, "loss": 0.1725, "num_input_tokens_seen": 38355776, "step": 62945 }, { "epoch": 19.531492398386597, "grad_norm": 11.61491584777832, "learning_rate": 1.6728644069345935e-08, "loss": 0.1027, "num_input_tokens_seen": 38359232, "step": 62950 }, { "epoch": 19.533043748060813, "grad_norm": 15.420698165893555, "learning_rate": 1.6618176111218854e-08, "loss": 0.0943, "num_input_tokens_seen": 38362304, "step": 62955 }, { "epoch": 19.53459509773503, "grad_norm": 18.384071350097656, "learning_rate": 1.6508073493992637e-08, "loss": 0.1622, "num_input_tokens_seen": 38364800, "step": 62960 }, { "epoch": 19.536146447409244, "grad_norm": 23.785505294799805, "learning_rate": 1.6398336225736944e-08, "loss": 0.1162, "num_input_tokens_seen": 38371232, "step": 62965 }, { "epoch": 19.537697797083464, "grad_norm": 6.040240287780762, "learning_rate": 1.6288964314498668e-08, "loss": 0.1234, "num_input_tokens_seen": 38374208, "step": 62970 }, { "epoch": 19.53924914675768, "grad_norm": 5.208530902862549, "learning_rate": 1.617995776829473e-08, "loss": 0.1764, "num_input_tokens_seen": 38376800, "step": 62975 }, { "epoch": 19.540800496431896, "grad_norm": 6.013843536376953, "learning_rate": 1.6071316595117626e-08, "loss": 0.1168, "num_input_tokens_seen": 38379232, "step": 62980 }, { "epoch": 19.54235184610611, "grad_norm": 7.407049655914307, "learning_rate": 1.596304080293154e-08, "loss": 0.2174, "num_input_tokens_seen": 38381280, "step": 62985 }, { "epoch": 19.543903195780327, "grad_norm": 32.25069808959961, "learning_rate": 1.5855130399674012e-08, "loss": 0.1877, "num_input_tokens_seen": 38384096, "step": 62990 }, { "epoch": 19.545454545454547, "grad_norm": 14.181605339050293, "learning_rate": 1.5747585393256493e-08, "loss": 0.1235, "num_input_tokens_seen": 38387648, "step": 62995 }, { "epoch": 19.547005895128763, "grad_norm": 13.8047456741333, "learning_rate": 1.5640405791563784e-08, "loss": 0.1025, "num_input_tokens_seen": 38390272, "step": 63000 }, { "epoch": 19.54855724480298, "grad_norm": 20.523290634155273, "learning_rate": 1.5533591602452935e-08, "loss": 0.1459, "num_input_tokens_seen": 38393504, "step": 63005 }, { "epoch": 19.550108594477194, "grad_norm": 6.787449359893799, "learning_rate": 1.5427142833754348e-08, "loss": 0.0977, "num_input_tokens_seen": 38396480, "step": 63010 }, { "epoch": 19.55165994415141, "grad_norm": 12.941306114196777, "learning_rate": 1.5321059493272894e-08, "loss": 0.1238, "num_input_tokens_seen": 38399424, "step": 63015 }, { "epoch": 19.55321129382563, "grad_norm": 14.741945266723633, "learning_rate": 1.521534158878457e-08, "loss": 0.171, "num_input_tokens_seen": 38404480, "step": 63020 }, { "epoch": 19.554762643499846, "grad_norm": 22.50583267211914, "learning_rate": 1.510998912804096e-08, "loss": 0.1387, "num_input_tokens_seen": 38407424, "step": 63025 }, { "epoch": 19.55631399317406, "grad_norm": 16.852418899536133, "learning_rate": 1.5005002118764768e-08, "loss": 0.1523, "num_input_tokens_seen": 38409824, "step": 63030 }, { "epoch": 19.557865342848277, "grad_norm": 35.181243896484375, "learning_rate": 1.4900380568653172e-08, "loss": 0.121, "num_input_tokens_seen": 38412480, "step": 63035 }, { "epoch": 19.559416692522493, "grad_norm": 7.340695381164551, "learning_rate": 1.479612448537615e-08, "loss": 0.113, "num_input_tokens_seen": 38415680, "step": 63040 }, { "epoch": 19.560968042196713, "grad_norm": 12.019074440002441, "learning_rate": 1.4692233876576479e-08, "loss": 0.1065, "num_input_tokens_seen": 38418496, "step": 63045 }, { "epoch": 19.56251939187093, "grad_norm": 18.256423950195312, "learning_rate": 1.4588708749871395e-08, "loss": 0.1137, "num_input_tokens_seen": 38422720, "step": 63050 }, { "epoch": 19.564070741545144, "grad_norm": 7.197295188903809, "learning_rate": 1.4485549112849829e-08, "loss": 0.1456, "num_input_tokens_seen": 38425120, "step": 63055 }, { "epoch": 19.56562209121936, "grad_norm": 13.975810050964355, "learning_rate": 1.4382754973075175e-08, "loss": 0.1914, "num_input_tokens_seen": 38427904, "step": 63060 }, { "epoch": 19.567173440893576, "grad_norm": 15.53636646270752, "learning_rate": 1.4280326338082518e-08, "loss": 0.1121, "num_input_tokens_seen": 38430592, "step": 63065 }, { "epoch": 19.568724790567796, "grad_norm": 18.991104125976562, "learning_rate": 1.417826321538196e-08, "loss": 0.1756, "num_input_tokens_seen": 38433824, "step": 63070 }, { "epoch": 19.57027614024201, "grad_norm": 10.08946418762207, "learning_rate": 1.4076565612455851e-08, "loss": 0.1209, "num_input_tokens_seen": 38437248, "step": 63075 }, { "epoch": 19.571827489916227, "grad_norm": 29.742002487182617, "learning_rate": 1.3975233536759337e-08, "loss": 0.1775, "num_input_tokens_seen": 38439936, "step": 63080 }, { "epoch": 19.573378839590443, "grad_norm": 5.97721529006958, "learning_rate": 1.387426699572203e-08, "loss": 0.1834, "num_input_tokens_seen": 38442176, "step": 63085 }, { "epoch": 19.57493018926466, "grad_norm": 17.45610809326172, "learning_rate": 1.3773665996745234e-08, "loss": 0.2135, "num_input_tokens_seen": 38445440, "step": 63090 }, { "epoch": 19.576481538938875, "grad_norm": 9.930353164672852, "learning_rate": 1.3673430547204714e-08, "loss": 0.1937, "num_input_tokens_seen": 38448672, "step": 63095 }, { "epoch": 19.578032888613095, "grad_norm": 10.955795288085938, "learning_rate": 1.3573560654447927e-08, "loss": 0.139, "num_input_tokens_seen": 38450816, "step": 63100 }, { "epoch": 19.57958423828731, "grad_norm": 9.954306602478027, "learning_rate": 1.3474056325797902e-08, "loss": 0.1496, "num_input_tokens_seen": 38453536, "step": 63105 }, { "epoch": 19.581135587961526, "grad_norm": 2.571467399597168, "learning_rate": 1.3374917568548806e-08, "loss": 0.1362, "num_input_tokens_seen": 38455744, "step": 63110 }, { "epoch": 19.582686937635742, "grad_norm": 15.177206993103027, "learning_rate": 1.3276144389968715e-08, "loss": 0.1175, "num_input_tokens_seen": 38458496, "step": 63115 }, { "epoch": 19.584238287309958, "grad_norm": 35.176246643066406, "learning_rate": 1.3177736797299056e-08, "loss": 0.1441, "num_input_tokens_seen": 38461216, "step": 63120 }, { "epoch": 19.585789636984178, "grad_norm": 13.874871253967285, "learning_rate": 1.3079694797754061e-08, "loss": 0.138, "num_input_tokens_seen": 38465184, "step": 63125 }, { "epoch": 19.587340986658393, "grad_norm": 2.3651554584503174, "learning_rate": 1.2982018398520757e-08, "loss": 0.0843, "num_input_tokens_seen": 38467680, "step": 63130 }, { "epoch": 19.58889233633261, "grad_norm": 10.214201927185059, "learning_rate": 1.2884707606761193e-08, "loss": 0.1263, "num_input_tokens_seen": 38469952, "step": 63135 }, { "epoch": 19.590443686006825, "grad_norm": 7.325713157653809, "learning_rate": 1.2787762429608552e-08, "loss": 0.072, "num_input_tokens_seen": 38473184, "step": 63140 }, { "epoch": 19.59199503568104, "grad_norm": 20.115882873535156, "learning_rate": 1.2691182874171037e-08, "loss": 0.1475, "num_input_tokens_seen": 38475936, "step": 63145 }, { "epoch": 19.59354638535526, "grad_norm": 21.920019149780273, "learning_rate": 1.259496894752743e-08, "loss": 0.1605, "num_input_tokens_seen": 38478784, "step": 63150 }, { "epoch": 19.595097735029476, "grad_norm": 18.751190185546875, "learning_rate": 1.2499120656733198e-08, "loss": 0.091, "num_input_tokens_seen": 38481824, "step": 63155 }, { "epoch": 19.596649084703692, "grad_norm": 13.786813735961914, "learning_rate": 1.2403638008813835e-08, "loss": 0.0901, "num_input_tokens_seen": 38485440, "step": 63160 }, { "epoch": 19.598200434377908, "grad_norm": 40.60865783691406, "learning_rate": 1.2308521010769848e-08, "loss": 0.1232, "num_input_tokens_seen": 38488128, "step": 63165 }, { "epoch": 19.599751784052124, "grad_norm": 30.459611892700195, "learning_rate": 1.2213769669573993e-08, "loss": 0.1476, "num_input_tokens_seen": 38490528, "step": 63170 }, { "epoch": 19.601303133726343, "grad_norm": 14.227173805236816, "learning_rate": 1.2119383992173494e-08, "loss": 0.1398, "num_input_tokens_seen": 38494624, "step": 63175 }, { "epoch": 19.60285448340056, "grad_norm": 17.98543357849121, "learning_rate": 1.2025363985487259e-08, "loss": 0.1625, "num_input_tokens_seen": 38497728, "step": 63180 }, { "epoch": 19.604405833074775, "grad_norm": 5.511672496795654, "learning_rate": 1.1931709656408663e-08, "loss": 0.1034, "num_input_tokens_seen": 38501280, "step": 63185 }, { "epoch": 19.60595718274899, "grad_norm": 15.062966346740723, "learning_rate": 1.1838421011803325e-08, "loss": 0.2434, "num_input_tokens_seen": 38503616, "step": 63190 }, { "epoch": 19.607508532423207, "grad_norm": 24.41498565673828, "learning_rate": 1.1745498058509663e-08, "loss": 0.2459, "num_input_tokens_seen": 38506400, "step": 63195 }, { "epoch": 19.609059882097426, "grad_norm": 64.92376708984375, "learning_rate": 1.1652940803341672e-08, "loss": 0.2512, "num_input_tokens_seen": 38509184, "step": 63200 }, { "epoch": 19.610611231771642, "grad_norm": 22.8200626373291, "learning_rate": 1.1560749253083369e-08, "loss": 0.1615, "num_input_tokens_seen": 38513312, "step": 63205 }, { "epoch": 19.612162581445858, "grad_norm": 8.563783645629883, "learning_rate": 1.1468923414494349e-08, "loss": 0.1009, "num_input_tokens_seen": 38517184, "step": 63210 }, { "epoch": 19.613713931120074, "grad_norm": 15.046920776367188, "learning_rate": 1.1377463294305891e-08, "loss": 0.1187, "num_input_tokens_seen": 38523168, "step": 63215 }, { "epoch": 19.61526528079429, "grad_norm": 10.91447639465332, "learning_rate": 1.1286368899224298e-08, "loss": 0.1436, "num_input_tokens_seen": 38525984, "step": 63220 }, { "epoch": 19.616816630468506, "grad_norm": 11.461745262145996, "learning_rate": 1.1195640235926453e-08, "loss": 0.1193, "num_input_tokens_seen": 38528672, "step": 63225 }, { "epoch": 19.618367980142725, "grad_norm": 13.889083862304688, "learning_rate": 1.1105277311064811e-08, "loss": 0.1045, "num_input_tokens_seen": 38532000, "step": 63230 }, { "epoch": 19.61991932981694, "grad_norm": 26.99094581604004, "learning_rate": 1.1015280131263518e-08, "loss": 0.1711, "num_input_tokens_seen": 38535040, "step": 63235 }, { "epoch": 19.621470679491157, "grad_norm": 28.097185134887695, "learning_rate": 1.0925648703120628e-08, "loss": 0.1283, "num_input_tokens_seen": 38538336, "step": 63240 }, { "epoch": 19.623022029165373, "grad_norm": 22.02426528930664, "learning_rate": 1.0836383033206998e-08, "loss": 0.1468, "num_input_tokens_seen": 38540416, "step": 63245 }, { "epoch": 19.62457337883959, "grad_norm": 20.59306526184082, "learning_rate": 1.0747483128066837e-08, "loss": 0.0701, "num_input_tokens_seen": 38543552, "step": 63250 }, { "epoch": 19.62612472851381, "grad_norm": 9.517667770385742, "learning_rate": 1.0658948994218266e-08, "loss": 0.2638, "num_input_tokens_seen": 38545888, "step": 63255 }, { "epoch": 19.627676078188024, "grad_norm": 19.111190795898438, "learning_rate": 1.057078063815109e-08, "loss": 0.158, "num_input_tokens_seen": 38548768, "step": 63260 }, { "epoch": 19.62922742786224, "grad_norm": 8.292064666748047, "learning_rate": 1.0482978066329585e-08, "loss": 0.0921, "num_input_tokens_seen": 38552832, "step": 63265 }, { "epoch": 19.630778777536456, "grad_norm": 8.020459175109863, "learning_rate": 1.039554128519027e-08, "loss": 0.1782, "num_input_tokens_seen": 38555328, "step": 63270 }, { "epoch": 19.632330127210672, "grad_norm": 15.904206275939941, "learning_rate": 1.0308470301143569e-08, "loss": 0.1226, "num_input_tokens_seen": 38558880, "step": 63275 }, { "epoch": 19.63388147688489, "grad_norm": 21.100788116455078, "learning_rate": 1.0221765120573269e-08, "loss": 0.2945, "num_input_tokens_seen": 38561248, "step": 63280 }, { "epoch": 19.635432826559107, "grad_norm": 17.650087356567383, "learning_rate": 1.0135425749834837e-08, "loss": 0.2384, "num_input_tokens_seen": 38563872, "step": 63285 }, { "epoch": 19.636984176233323, "grad_norm": 20.256311416625977, "learning_rate": 1.004945219525877e-08, "loss": 0.2721, "num_input_tokens_seen": 38567040, "step": 63290 }, { "epoch": 19.63853552590754, "grad_norm": 18.140975952148438, "learning_rate": 9.963844463147798e-09, "loss": 0.1695, "num_input_tokens_seen": 38569536, "step": 63295 }, { "epoch": 19.640086875581755, "grad_norm": 10.81296157836914, "learning_rate": 9.878602559777462e-09, "loss": 0.1142, "num_input_tokens_seen": 38573760, "step": 63300 }, { "epoch": 19.641638225255974, "grad_norm": 25.59754180908203, "learning_rate": 9.793726491397759e-09, "loss": 0.1606, "num_input_tokens_seen": 38576672, "step": 63305 }, { "epoch": 19.64318957493019, "grad_norm": 16.527250289916992, "learning_rate": 9.709216264230936e-09, "loss": 0.1643, "num_input_tokens_seen": 38580000, "step": 63310 }, { "epoch": 19.644740924604406, "grad_norm": 10.133432388305664, "learning_rate": 9.625071884472036e-09, "loss": 0.1069, "num_input_tokens_seen": 38582816, "step": 63315 }, { "epoch": 19.646292274278622, "grad_norm": 8.343416213989258, "learning_rate": 9.54129335829057e-09, "loss": 0.0866, "num_input_tokens_seen": 38584992, "step": 63320 }, { "epoch": 19.647843623952838, "grad_norm": 15.36648178100586, "learning_rate": 9.457880691827736e-09, "loss": 0.2215, "num_input_tokens_seen": 38587488, "step": 63325 }, { "epoch": 19.649394973627057, "grad_norm": 9.605609893798828, "learning_rate": 9.374833891199752e-09, "loss": 0.2123, "num_input_tokens_seen": 38590752, "step": 63330 }, { "epoch": 19.650946323301273, "grad_norm": 15.51107406616211, "learning_rate": 9.292152962493972e-09, "loss": 0.0715, "num_input_tokens_seen": 38593952, "step": 63335 }, { "epoch": 19.65249767297549, "grad_norm": 9.830934524536133, "learning_rate": 9.209837911772212e-09, "loss": 0.2848, "num_input_tokens_seen": 38596992, "step": 63340 }, { "epoch": 19.654049022649705, "grad_norm": 9.022246360778809, "learning_rate": 9.127888745069091e-09, "loss": 0.1409, "num_input_tokens_seen": 38599456, "step": 63345 }, { "epoch": 19.65560037232392, "grad_norm": 23.579835891723633, "learning_rate": 9.04630546839258e-09, "loss": 0.2118, "num_input_tokens_seen": 38602528, "step": 63350 }, { "epoch": 19.65715172199814, "grad_norm": 49.25178146362305, "learning_rate": 8.96508808772345e-09, "loss": 0.1549, "num_input_tokens_seen": 38605280, "step": 63355 }, { "epoch": 19.658703071672356, "grad_norm": 21.84252166748047, "learning_rate": 8.884236609016384e-09, "loss": 0.1306, "num_input_tokens_seen": 38607872, "step": 63360 }, { "epoch": 19.660254421346572, "grad_norm": 9.562346458435059, "learning_rate": 8.803751038198305e-09, "loss": 0.0746, "num_input_tokens_seen": 38611840, "step": 63365 }, { "epoch": 19.661805771020788, "grad_norm": 25.779273986816406, "learning_rate": 8.723631381169496e-09, "loss": 0.1032, "num_input_tokens_seen": 38614080, "step": 63370 }, { "epoch": 19.663357120695004, "grad_norm": 46.33363342285156, "learning_rate": 8.6438776438047e-09, "loss": 0.1451, "num_input_tokens_seen": 38616224, "step": 63375 }, { "epoch": 19.66490847036922, "grad_norm": 10.46182632446289, "learning_rate": 8.564489831949241e-09, "loss": 0.1458, "num_input_tokens_seen": 38618816, "step": 63380 }, { "epoch": 19.66645982004344, "grad_norm": 11.857465744018555, "learning_rate": 8.48546795142513e-09, "loss": 0.1612, "num_input_tokens_seen": 38622240, "step": 63385 }, { "epoch": 19.668011169717655, "grad_norm": 19.000608444213867, "learning_rate": 8.406812008023846e-09, "loss": 0.1026, "num_input_tokens_seen": 38626912, "step": 63390 }, { "epoch": 19.66956251939187, "grad_norm": 13.046573638916016, "learning_rate": 8.328522007512996e-09, "loss": 0.1129, "num_input_tokens_seen": 38630272, "step": 63395 }, { "epoch": 19.671113869066087, "grad_norm": 9.535579681396484, "learning_rate": 8.250597955631878e-09, "loss": 0.1948, "num_input_tokens_seen": 38633408, "step": 63400 }, { "epoch": 19.672665218740303, "grad_norm": 14.994658470153809, "learning_rate": 8.173039858092591e-09, "loss": 0.1633, "num_input_tokens_seen": 38636640, "step": 63405 }, { "epoch": 19.674216568414522, "grad_norm": 6.389655113220215, "learning_rate": 8.095847720581696e-09, "loss": 0.1127, "num_input_tokens_seen": 38640096, "step": 63410 }, { "epoch": 19.675767918088738, "grad_norm": 60.75969696044922, "learning_rate": 8.019021548758554e-09, "loss": 0.1598, "num_input_tokens_seen": 38642720, "step": 63415 }, { "epoch": 19.677319267762954, "grad_norm": 33.0290641784668, "learning_rate": 7.942561348254219e-09, "loss": 0.1737, "num_input_tokens_seen": 38645280, "step": 63420 }, { "epoch": 19.67887061743717, "grad_norm": 34.43321228027344, "learning_rate": 7.866467124675315e-09, "loss": 0.1477, "num_input_tokens_seen": 38648064, "step": 63425 }, { "epoch": 19.680421967111386, "grad_norm": 14.44470500946045, "learning_rate": 7.790738883600712e-09, "loss": 0.1294, "num_input_tokens_seen": 38650528, "step": 63430 }, { "epoch": 19.681973316785605, "grad_norm": 12.944003105163574, "learning_rate": 7.715376630580972e-09, "loss": 0.2157, "num_input_tokens_seen": 38653728, "step": 63435 }, { "epoch": 19.68352466645982, "grad_norm": 6.973062038421631, "learning_rate": 7.640380371141675e-09, "loss": 0.131, "num_input_tokens_seen": 38657248, "step": 63440 }, { "epoch": 19.685076016134037, "grad_norm": 11.278273582458496, "learning_rate": 7.565750110780646e-09, "loss": 0.2278, "num_input_tokens_seen": 38660256, "step": 63445 }, { "epoch": 19.686627365808253, "grad_norm": 9.403609275817871, "learning_rate": 7.491485854969616e-09, "loss": 0.1699, "num_input_tokens_seen": 38662656, "step": 63450 }, { "epoch": 19.68817871548247, "grad_norm": 27.824344635009766, "learning_rate": 7.417587609152566e-09, "loss": 0.1152, "num_input_tokens_seen": 38664928, "step": 63455 }, { "epoch": 19.689730065156688, "grad_norm": 9.309927940368652, "learning_rate": 7.344055378747938e-09, "loss": 0.1101, "num_input_tokens_seen": 38668256, "step": 63460 }, { "epoch": 19.691281414830904, "grad_norm": 18.647960662841797, "learning_rate": 7.2708891691453125e-09, "loss": 0.1698, "num_input_tokens_seen": 38670784, "step": 63465 }, { "epoch": 19.69283276450512, "grad_norm": 11.099785804748535, "learning_rate": 7.198088985709839e-09, "loss": 0.0891, "num_input_tokens_seen": 38673728, "step": 63470 }, { "epoch": 19.694384114179336, "grad_norm": 14.536027908325195, "learning_rate": 7.1256548337778066e-09, "loss": 0.1528, "num_input_tokens_seen": 38677088, "step": 63475 }, { "epoch": 19.69593546385355, "grad_norm": 8.702223777770996, "learning_rate": 7.053586718659966e-09, "loss": 0.1977, "num_input_tokens_seen": 38679776, "step": 63480 }, { "epoch": 19.697486813527767, "grad_norm": 8.861294746398926, "learning_rate": 6.9818846456393145e-09, "loss": 0.1104, "num_input_tokens_seen": 38684064, "step": 63485 }, { "epoch": 19.699038163201987, "grad_norm": 3.2792205810546875, "learning_rate": 6.910548619972201e-09, "loss": 0.1785, "num_input_tokens_seen": 38687648, "step": 63490 }, { "epoch": 19.700589512876203, "grad_norm": 14.507240295410156, "learning_rate": 6.8395786468899995e-09, "loss": 0.1222, "num_input_tokens_seen": 38691072, "step": 63495 }, { "epoch": 19.70214086255042, "grad_norm": 6.48541259765625, "learning_rate": 6.7689747315935476e-09, "loss": 0.138, "num_input_tokens_seen": 38693792, "step": 63500 }, { "epoch": 19.703692212224635, "grad_norm": 25.773712158203125, "learning_rate": 6.69873687926037e-09, "loss": 0.1047, "num_input_tokens_seen": 38697536, "step": 63505 }, { "epoch": 19.70524356189885, "grad_norm": 14.530208587646484, "learning_rate": 6.628865095039683e-09, "loss": 0.1596, "num_input_tokens_seen": 38700224, "step": 63510 }, { "epoch": 19.70679491157307, "grad_norm": 140.6964111328125, "learning_rate": 6.5593593840529436e-09, "loss": 0.162, "num_input_tokens_seen": 38704288, "step": 63515 }, { "epoch": 19.708346261247286, "grad_norm": 7.704582691192627, "learning_rate": 6.490219751396631e-09, "loss": 0.2062, "num_input_tokens_seen": 38708736, "step": 63520 }, { "epoch": 19.7098976109215, "grad_norm": 23.967119216918945, "learning_rate": 6.4214462021389145e-09, "loss": 0.1409, "num_input_tokens_seen": 38713632, "step": 63525 }, { "epoch": 19.711448960595717, "grad_norm": 8.131725311279297, "learning_rate": 6.353038741322426e-09, "loss": 0.2419, "num_input_tokens_seen": 38716352, "step": 63530 }, { "epoch": 19.713000310269933, "grad_norm": 17.16257095336914, "learning_rate": 6.284997373961488e-09, "loss": 0.1496, "num_input_tokens_seen": 38719904, "step": 63535 }, { "epoch": 19.714551659944153, "grad_norm": 23.10154914855957, "learning_rate": 6.217322105044887e-09, "loss": 0.0787, "num_input_tokens_seen": 38722272, "step": 63540 }, { "epoch": 19.71610300961837, "grad_norm": 44.474910736083984, "learning_rate": 6.150012939533656e-09, "loss": 0.2217, "num_input_tokens_seen": 38725376, "step": 63545 }, { "epoch": 19.717654359292585, "grad_norm": 23.732513427734375, "learning_rate": 6.083069882362736e-09, "loss": 0.2058, "num_input_tokens_seen": 38729664, "step": 63550 }, { "epoch": 19.7192057089668, "grad_norm": 11.284896850585938, "learning_rate": 6.016492938439311e-09, "loss": 0.1639, "num_input_tokens_seen": 38732672, "step": 63555 }, { "epoch": 19.720757058641016, "grad_norm": 5.354005336761475, "learning_rate": 5.950282112645034e-09, "loss": 0.1558, "num_input_tokens_seen": 38735424, "step": 63560 }, { "epoch": 19.722308408315236, "grad_norm": 8.890296936035156, "learning_rate": 5.884437409833243e-09, "loss": 0.11, "num_input_tokens_seen": 38738304, "step": 63565 }, { "epoch": 19.72385975798945, "grad_norm": 6.538729190826416, "learning_rate": 5.81895883483119e-09, "loss": 0.1518, "num_input_tokens_seen": 38741600, "step": 63570 }, { "epoch": 19.725411107663668, "grad_norm": 18.316810607910156, "learning_rate": 5.753846392439477e-09, "loss": 0.1598, "num_input_tokens_seen": 38744416, "step": 63575 }, { "epoch": 19.726962457337883, "grad_norm": 13.232527732849121, "learning_rate": 5.689100087431509e-09, "loss": 0.1683, "num_input_tokens_seen": 38747808, "step": 63580 }, { "epoch": 19.7285138070121, "grad_norm": 6.135117053985596, "learning_rate": 5.624719924554045e-09, "loss": 0.0847, "num_input_tokens_seen": 38752704, "step": 63585 }, { "epoch": 19.73006515668632, "grad_norm": 4.357882022857666, "learning_rate": 5.560705908527197e-09, "loss": 0.2074, "num_input_tokens_seen": 38756448, "step": 63590 }, { "epoch": 19.731616506360535, "grad_norm": 8.067927360534668, "learning_rate": 5.497058044043324e-09, "loss": 0.1444, "num_input_tokens_seen": 38758912, "step": 63595 }, { "epoch": 19.73316785603475, "grad_norm": 8.65921401977539, "learning_rate": 5.4337763357686925e-09, "loss": 0.0815, "num_input_tokens_seen": 38762336, "step": 63600 }, { "epoch": 19.734719205708966, "grad_norm": 12.47061824798584, "learning_rate": 5.370860788342924e-09, "loss": 0.1754, "num_input_tokens_seen": 38764960, "step": 63605 }, { "epoch": 19.736270555383182, "grad_norm": 29.098783493041992, "learning_rate": 5.308311406378441e-09, "loss": 0.1095, "num_input_tokens_seen": 38768480, "step": 63610 }, { "epoch": 19.7378219050574, "grad_norm": 15.24634075164795, "learning_rate": 5.246128194460465e-09, "loss": 0.1385, "num_input_tokens_seen": 38771456, "step": 63615 }, { "epoch": 19.739373254731618, "grad_norm": 24.823467254638672, "learning_rate": 5.184311157148125e-09, "loss": 0.2026, "num_input_tokens_seen": 38774560, "step": 63620 }, { "epoch": 19.740924604405834, "grad_norm": 33.20120620727539, "learning_rate": 5.122860298973353e-09, "loss": 0.1506, "num_input_tokens_seen": 38778400, "step": 63625 }, { "epoch": 19.74247595408005, "grad_norm": 51.7458610534668, "learning_rate": 5.061775624440879e-09, "loss": 0.2147, "num_input_tokens_seen": 38781248, "step": 63630 }, { "epoch": 19.744027303754265, "grad_norm": 13.222552299499512, "learning_rate": 5.001057138029341e-09, "loss": 0.0987, "num_input_tokens_seen": 38783936, "step": 63635 }, { "epoch": 19.74557865342848, "grad_norm": 9.476093292236328, "learning_rate": 4.940704844190181e-09, "loss": 0.0992, "num_input_tokens_seen": 38787264, "step": 63640 }, { "epoch": 19.7471300031027, "grad_norm": 7.234942436218262, "learning_rate": 4.880718747347635e-09, "loss": 0.1808, "num_input_tokens_seen": 38790240, "step": 63645 }, { "epoch": 19.748681352776916, "grad_norm": 8.183713912963867, "learning_rate": 4.8210988518992975e-09, "loss": 0.151, "num_input_tokens_seen": 38794080, "step": 63650 }, { "epoch": 19.750232702451132, "grad_norm": 8.749189376831055, "learning_rate": 4.761845162216117e-09, "loss": 0.0752, "num_input_tokens_seen": 38797056, "step": 63655 }, { "epoch": 19.75178405212535, "grad_norm": 18.04534339904785, "learning_rate": 4.7029576826423954e-09, "loss": 0.0967, "num_input_tokens_seen": 38800672, "step": 63660 }, { "epoch": 19.753335401799564, "grad_norm": 22.53333854675293, "learning_rate": 4.64443641749468e-09, "loss": 0.1559, "num_input_tokens_seen": 38803168, "step": 63665 }, { "epoch": 19.754886751473784, "grad_norm": 23.324522018432617, "learning_rate": 4.586281371063983e-09, "loss": 0.1438, "num_input_tokens_seen": 38807104, "step": 63670 }, { "epoch": 19.756438101148, "grad_norm": 14.610710144042969, "learning_rate": 4.528492547613006e-09, "loss": 0.1238, "num_input_tokens_seen": 38811520, "step": 63675 }, { "epoch": 19.757989450822215, "grad_norm": 20.774354934692383, "learning_rate": 4.471069951378915e-09, "loss": 0.1192, "num_input_tokens_seen": 38814048, "step": 63680 }, { "epoch": 19.75954080049643, "grad_norm": 12.028848648071289, "learning_rate": 4.414013586571675e-09, "loss": 0.0872, "num_input_tokens_seen": 38816992, "step": 63685 }, { "epoch": 19.761092150170647, "grad_norm": 6.126081943511963, "learning_rate": 4.3573234573734965e-09, "loss": 0.0772, "num_input_tokens_seen": 38821024, "step": 63690 }, { "epoch": 19.762643499844867, "grad_norm": 9.43464469909668, "learning_rate": 4.3009995679405e-09, "loss": 0.0951, "num_input_tokens_seen": 38823904, "step": 63695 }, { "epoch": 19.764194849519082, "grad_norm": 26.928503036499023, "learning_rate": 4.245041922402715e-09, "loss": 0.1224, "num_input_tokens_seen": 38827584, "step": 63700 }, { "epoch": 19.7657461991933, "grad_norm": 44.71131134033203, "learning_rate": 4.18945052486186e-09, "loss": 0.1296, "num_input_tokens_seen": 38830720, "step": 63705 }, { "epoch": 19.767297548867514, "grad_norm": 20.10582160949707, "learning_rate": 4.1342253793935635e-09, "loss": 0.164, "num_input_tokens_seen": 38833824, "step": 63710 }, { "epoch": 19.76884889854173, "grad_norm": 2.9292895793914795, "learning_rate": 4.0793664900457e-09, "loss": 0.0811, "num_input_tokens_seen": 38836800, "step": 63715 }, { "epoch": 19.77040024821595, "grad_norm": 4.86503267288208, "learning_rate": 4.024873860841716e-09, "loss": 0.1439, "num_input_tokens_seen": 38839104, "step": 63720 }, { "epoch": 19.771951597890165, "grad_norm": 19.699365615844727, "learning_rate": 3.970747495775085e-09, "loss": 0.1737, "num_input_tokens_seen": 38842112, "step": 63725 }, { "epoch": 19.77350294756438, "grad_norm": 21.04538345336914, "learning_rate": 3.916987398814853e-09, "loss": 0.107, "num_input_tokens_seen": 38844768, "step": 63730 }, { "epoch": 19.775054297238597, "grad_norm": 56.83053207397461, "learning_rate": 3.863593573901758e-09, "loss": 0.1959, "num_input_tokens_seen": 38847776, "step": 63735 }, { "epoch": 19.776605646912813, "grad_norm": 20.57059669494629, "learning_rate": 3.8105660249504465e-09, "loss": 0.0744, "num_input_tokens_seen": 38850656, "step": 63740 }, { "epoch": 19.77815699658703, "grad_norm": 22.067415237426758, "learning_rate": 3.757904755848363e-09, "loss": 0.1179, "num_input_tokens_seen": 38852992, "step": 63745 }, { "epoch": 19.77970834626125, "grad_norm": 28.351030349731445, "learning_rate": 3.705609770456309e-09, "loss": 0.102, "num_input_tokens_seen": 38857760, "step": 63750 }, { "epoch": 19.781259695935464, "grad_norm": 6.959039688110352, "learning_rate": 3.6536810726078843e-09, "loss": 0.1154, "num_input_tokens_seen": 38860416, "step": 63755 }, { "epoch": 19.78281104560968, "grad_norm": 11.991707801818848, "learning_rate": 3.602118666110599e-09, "loss": 0.1016, "num_input_tokens_seen": 38864416, "step": 63760 }, { "epoch": 19.784362395283896, "grad_norm": 7.906902313232422, "learning_rate": 3.550922554743652e-09, "loss": 0.0773, "num_input_tokens_seen": 38867488, "step": 63765 }, { "epoch": 19.785913744958112, "grad_norm": 33.51113510131836, "learning_rate": 3.500092742261263e-09, "loss": 0.1105, "num_input_tokens_seen": 38870048, "step": 63770 }, { "epoch": 19.78746509463233, "grad_norm": 20.20564842224121, "learning_rate": 3.44962923238934e-09, "loss": 0.1336, "num_input_tokens_seen": 38873504, "step": 63775 }, { "epoch": 19.789016444306547, "grad_norm": 22.724925994873047, "learning_rate": 3.3995320288277013e-09, "loss": 0.0758, "num_input_tokens_seen": 38875584, "step": 63780 }, { "epoch": 19.790567793980763, "grad_norm": 5.262551307678223, "learning_rate": 3.3498011352489643e-09, "loss": 0.1134, "num_input_tokens_seen": 38878592, "step": 63785 }, { "epoch": 19.79211914365498, "grad_norm": 7.215928554534912, "learning_rate": 3.3004365552991026e-09, "loss": 0.1005, "num_input_tokens_seen": 38881632, "step": 63790 }, { "epoch": 19.793670493329195, "grad_norm": 10.017873764038086, "learning_rate": 3.251438292596887e-09, "loss": 0.2271, "num_input_tokens_seen": 38884288, "step": 63795 }, { "epoch": 19.795221843003414, "grad_norm": 5.537810802459717, "learning_rate": 3.2028063507344443e-09, "loss": 0.1706, "num_input_tokens_seen": 38888192, "step": 63800 }, { "epoch": 19.79677319267763, "grad_norm": 6.079377174377441, "learning_rate": 3.154540733277256e-09, "loss": 0.1301, "num_input_tokens_seen": 38890752, "step": 63805 }, { "epoch": 19.798324542351846, "grad_norm": 10.893369674682617, "learning_rate": 3.106641443763603e-09, "loss": 0.1036, "num_input_tokens_seen": 38893376, "step": 63810 }, { "epoch": 19.799875892026062, "grad_norm": 5.817821502685547, "learning_rate": 3.059108485705675e-09, "loss": 0.1402, "num_input_tokens_seen": 38896000, "step": 63815 }, { "epoch": 19.801427241700278, "grad_norm": 13.722127914428711, "learning_rate": 3.011941862587353e-09, "loss": 0.135, "num_input_tokens_seen": 38899360, "step": 63820 }, { "epoch": 19.802978591374497, "grad_norm": 55.530601501464844, "learning_rate": 2.965141577866981e-09, "loss": 0.1932, "num_input_tokens_seen": 38902080, "step": 63825 }, { "epoch": 19.804529941048713, "grad_norm": 9.84390640258789, "learning_rate": 2.9187076349757037e-09, "loss": 0.1163, "num_input_tokens_seen": 38904992, "step": 63830 }, { "epoch": 19.80608129072293, "grad_norm": 17.78641700744629, "learning_rate": 2.8726400373169093e-09, "loss": 0.1456, "num_input_tokens_seen": 38907328, "step": 63835 }, { "epoch": 19.807632640397145, "grad_norm": 11.237420082092285, "learning_rate": 2.826938788269007e-09, "loss": 0.1641, "num_input_tokens_seen": 38910528, "step": 63840 }, { "epoch": 19.80918399007136, "grad_norm": 13.047126770019531, "learning_rate": 2.781603891181539e-09, "loss": 0.0999, "num_input_tokens_seen": 38913344, "step": 63845 }, { "epoch": 19.81073533974558, "grad_norm": 8.024365425109863, "learning_rate": 2.7366353493785135e-09, "loss": 0.1513, "num_input_tokens_seen": 38915872, "step": 63850 }, { "epoch": 19.812286689419796, "grad_norm": 21.69647789001465, "learning_rate": 2.692033166156738e-09, "loss": 0.0992, "num_input_tokens_seen": 38919136, "step": 63855 }, { "epoch": 19.813838039094012, "grad_norm": 23.322874069213867, "learning_rate": 2.6477973447858185e-09, "loss": 0.1687, "num_input_tokens_seen": 38921568, "step": 63860 }, { "epoch": 19.815389388768228, "grad_norm": 15.714282989501953, "learning_rate": 2.603927888508717e-09, "loss": 0.2274, "num_input_tokens_seen": 38924032, "step": 63865 }, { "epoch": 19.816940738442444, "grad_norm": 10.953691482543945, "learning_rate": 2.56042480054175e-09, "loss": 0.1099, "num_input_tokens_seen": 38927072, "step": 63870 }, { "epoch": 19.818492088116663, "grad_norm": 17.463489532470703, "learning_rate": 2.5172880840745873e-09, "loss": 0.2458, "num_input_tokens_seen": 38930752, "step": 63875 }, { "epoch": 19.82004343779088, "grad_norm": 3.608119487762451, "learning_rate": 2.4745177422685897e-09, "loss": 0.1575, "num_input_tokens_seen": 38933792, "step": 63880 }, { "epoch": 19.821594787465095, "grad_norm": 14.994690895080566, "learning_rate": 2.432113778260137e-09, "loss": 0.1264, "num_input_tokens_seen": 38937056, "step": 63885 }, { "epoch": 19.82314613713931, "grad_norm": 14.887258529663086, "learning_rate": 2.3900761951584086e-09, "loss": 0.193, "num_input_tokens_seen": 38939680, "step": 63890 }, { "epoch": 19.824697486813527, "grad_norm": 8.808479309082031, "learning_rate": 2.3484049960442732e-09, "loss": 0.2734, "num_input_tokens_seen": 38942272, "step": 63895 }, { "epoch": 19.826248836487743, "grad_norm": 28.96225929260254, "learning_rate": 2.3071001839730655e-09, "loss": 0.124, "num_input_tokens_seen": 38944800, "step": 63900 }, { "epoch": 19.827800186161962, "grad_norm": 10.989197731018066, "learning_rate": 2.2661617619729183e-09, "loss": 0.1242, "num_input_tokens_seen": 38948064, "step": 63905 }, { "epoch": 19.829351535836178, "grad_norm": 16.557859420776367, "learning_rate": 2.225589733044764e-09, "loss": 0.1071, "num_input_tokens_seen": 38951008, "step": 63910 }, { "epoch": 19.830902885510394, "grad_norm": 25.282333374023438, "learning_rate": 2.1853841001640008e-09, "loss": 0.1705, "num_input_tokens_seen": 38954176, "step": 63915 }, { "epoch": 19.83245423518461, "grad_norm": 8.525136947631836, "learning_rate": 2.1455448662771603e-09, "loss": 0.1447, "num_input_tokens_seen": 38956672, "step": 63920 }, { "epoch": 19.834005584858826, "grad_norm": 11.129646301269531, "learning_rate": 2.106072034305795e-09, "loss": 0.1379, "num_input_tokens_seen": 38960000, "step": 63925 }, { "epoch": 19.835556934533045, "grad_norm": 4.352672576904297, "learning_rate": 2.0669656071425902e-09, "loss": 0.0897, "num_input_tokens_seen": 38962656, "step": 63930 }, { "epoch": 19.83710828420726, "grad_norm": 14.66429328918457, "learning_rate": 2.0282255876558075e-09, "loss": 0.1116, "num_input_tokens_seen": 38965888, "step": 63935 }, { "epoch": 19.838659633881477, "grad_norm": 41.49201583862305, "learning_rate": 1.9898519786848426e-09, "loss": 0.1415, "num_input_tokens_seen": 38968896, "step": 63940 }, { "epoch": 19.840210983555693, "grad_norm": 27.88214111328125, "learning_rate": 1.9518447830429997e-09, "loss": 0.1008, "num_input_tokens_seen": 38971840, "step": 63945 }, { "epoch": 19.84176233322991, "grad_norm": 23.980182647705078, "learning_rate": 1.9142040035163844e-09, "loss": 0.1468, "num_input_tokens_seen": 38974400, "step": 63950 }, { "epoch": 19.843313682904128, "grad_norm": 11.051642417907715, "learning_rate": 1.876929642865011e-09, "loss": 0.1958, "num_input_tokens_seen": 38976896, "step": 63955 }, { "epoch": 19.844865032578344, "grad_norm": 10.336390495300293, "learning_rate": 1.8400217038211376e-09, "loss": 0.0684, "num_input_tokens_seen": 38979936, "step": 63960 }, { "epoch": 19.84641638225256, "grad_norm": 24.04496955871582, "learning_rate": 1.8034801890909338e-09, "loss": 0.1303, "num_input_tokens_seen": 38982272, "step": 63965 }, { "epoch": 19.847967731926776, "grad_norm": 52.211124420166016, "learning_rate": 1.767305101353367e-09, "loss": 0.1747, "num_input_tokens_seen": 38985824, "step": 63970 }, { "epoch": 19.84951908160099, "grad_norm": 20.142396926879883, "learning_rate": 1.7314964432596503e-09, "loss": 0.1624, "num_input_tokens_seen": 38988864, "step": 63975 }, { "epoch": 19.85107043127521, "grad_norm": 13.20773983001709, "learning_rate": 1.696054217436016e-09, "loss": 0.1964, "num_input_tokens_seen": 38991776, "step": 63980 }, { "epoch": 19.852621780949427, "grad_norm": 17.378644943237305, "learning_rate": 1.6609784264803862e-09, "loss": 0.1749, "num_input_tokens_seen": 38995104, "step": 63985 }, { "epoch": 19.854173130623643, "grad_norm": 45.3731689453125, "learning_rate": 1.6262690729640373e-09, "loss": 0.1881, "num_input_tokens_seen": 38997760, "step": 63990 }, { "epoch": 19.85572448029786, "grad_norm": 32.16219711303711, "learning_rate": 1.5919261594321556e-09, "loss": 0.116, "num_input_tokens_seen": 39001152, "step": 63995 }, { "epoch": 19.857275829972075, "grad_norm": 12.506841659545898, "learning_rate": 1.5579496884016166e-09, "loss": 0.1132, "num_input_tokens_seen": 39003872, "step": 64000 }, { "epoch": 19.85882717964629, "grad_norm": 16.61274528503418, "learning_rate": 1.524339662364316e-09, "loss": 0.1524, "num_input_tokens_seen": 39007392, "step": 64005 }, { "epoch": 19.86037852932051, "grad_norm": 19.75832176208496, "learning_rate": 1.4910960837832833e-09, "loss": 0.1797, "num_input_tokens_seen": 39011264, "step": 64010 }, { "epoch": 19.861929878994726, "grad_norm": 27.008174896240234, "learning_rate": 1.4582189550960135e-09, "loss": 0.1622, "num_input_tokens_seen": 39014432, "step": 64015 }, { "epoch": 19.86348122866894, "grad_norm": 12.618395805358887, "learning_rate": 1.4257082787133557e-09, "loss": 0.1761, "num_input_tokens_seen": 39016832, "step": 64020 }, { "epoch": 19.865032578343158, "grad_norm": 14.03407096862793, "learning_rate": 1.3935640570178489e-09, "loss": 0.1067, "num_input_tokens_seen": 39019680, "step": 64025 }, { "epoch": 19.866583928017373, "grad_norm": 37.258018493652344, "learning_rate": 1.361786292366496e-09, "loss": 0.1824, "num_input_tokens_seen": 39022304, "step": 64030 }, { "epoch": 19.868135277691593, "grad_norm": 8.61027717590332, "learning_rate": 1.3303749870891003e-09, "loss": 0.1369, "num_input_tokens_seen": 39025568, "step": 64035 }, { "epoch": 19.86968662736581, "grad_norm": 10.827016830444336, "learning_rate": 1.2993301434882644e-09, "loss": 0.1097, "num_input_tokens_seen": 39028512, "step": 64040 }, { "epoch": 19.871237977040025, "grad_norm": 23.04403305053711, "learning_rate": 1.2686517638399454e-09, "loss": 0.1078, "num_input_tokens_seen": 39031520, "step": 64045 }, { "epoch": 19.87278932671424, "grad_norm": 22.784212112426758, "learning_rate": 1.2383398503934551e-09, "loss": 0.2328, "num_input_tokens_seen": 39034592, "step": 64050 }, { "epoch": 19.874340676388456, "grad_norm": 36.422203063964844, "learning_rate": 1.2083944053709052e-09, "loss": 0.2218, "num_input_tokens_seen": 39037024, "step": 64055 }, { "epoch": 19.875892026062676, "grad_norm": 9.320409774780273, "learning_rate": 1.1788154309672061e-09, "loss": 0.1268, "num_input_tokens_seen": 39039616, "step": 64060 }, { "epoch": 19.877443375736892, "grad_norm": 21.237668991088867, "learning_rate": 1.149602929351179e-09, "loss": 0.2547, "num_input_tokens_seen": 39042752, "step": 64065 }, { "epoch": 19.878994725411108, "grad_norm": 9.805961608886719, "learning_rate": 1.120756902664999e-09, "loss": 0.0781, "num_input_tokens_seen": 39045536, "step": 64070 }, { "epoch": 19.880546075085324, "grad_norm": 26.035066604614258, "learning_rate": 1.0922773530225306e-09, "loss": 0.177, "num_input_tokens_seen": 39048640, "step": 64075 }, { "epoch": 19.88209742475954, "grad_norm": 6.859683513641357, "learning_rate": 1.0641642825126585e-09, "loss": 0.1425, "num_input_tokens_seen": 39051008, "step": 64080 }, { "epoch": 19.88364877443376, "grad_norm": 16.884281158447266, "learning_rate": 1.0364176931948466e-09, "loss": 0.171, "num_input_tokens_seen": 39053280, "step": 64085 }, { "epoch": 19.885200124107975, "grad_norm": 10.154942512512207, "learning_rate": 1.0090375871052439e-09, "loss": 0.1161, "num_input_tokens_seen": 39055840, "step": 64090 }, { "epoch": 19.88675147378219, "grad_norm": 44.422523498535156, "learning_rate": 9.82023966249468e-10, "loss": 0.1324, "num_input_tokens_seen": 39058560, "step": 64095 }, { "epoch": 19.888302823456407, "grad_norm": 4.153958797454834, "learning_rate": 9.55376832608712e-10, "loss": 0.1753, "num_input_tokens_seen": 39060736, "step": 64100 }, { "epoch": 19.889854173130622, "grad_norm": 19.069734573364258, "learning_rate": 9.290961881358584e-10, "loss": 0.1009, "num_input_tokens_seen": 39063744, "step": 64105 }, { "epoch": 19.891405522804842, "grad_norm": 25.572282791137695, "learning_rate": 9.031820347588094e-10, "loss": 0.1269, "num_input_tokens_seen": 39066656, "step": 64110 }, { "epoch": 19.892956872479058, "grad_norm": 1.8001834154129028, "learning_rate": 8.776343743766014e-10, "loss": 0.1091, "num_input_tokens_seen": 39069568, "step": 64115 }, { "epoch": 19.894508222153274, "grad_norm": 39.05839538574219, "learning_rate": 8.524532088621806e-10, "loss": 0.2651, "num_input_tokens_seen": 39072800, "step": 64120 }, { "epoch": 19.89605957182749, "grad_norm": 9.255887031555176, "learning_rate": 8.27638540061293e-10, "loss": 0.1437, "num_input_tokens_seen": 39074912, "step": 64125 }, { "epoch": 19.897610921501705, "grad_norm": 11.512782096862793, "learning_rate": 8.03190369794149e-10, "loss": 0.1186, "num_input_tokens_seen": 39078080, "step": 64130 }, { "epoch": 19.899162271175925, "grad_norm": 10.031991958618164, "learning_rate": 7.791086998520936e-10, "loss": 0.1061, "num_input_tokens_seen": 39081088, "step": 64135 }, { "epoch": 19.90071362085014, "grad_norm": 9.822898864746094, "learning_rate": 7.553935320009365e-10, "loss": 0.1399, "num_input_tokens_seen": 39085056, "step": 64140 }, { "epoch": 19.902264970524357, "grad_norm": 32.084136962890625, "learning_rate": 7.32044867979842e-10, "loss": 0.1802, "num_input_tokens_seen": 39087680, "step": 64145 }, { "epoch": 19.903816320198572, "grad_norm": 28.571964263916016, "learning_rate": 7.090627094996638e-10, "loss": 0.149, "num_input_tokens_seen": 39090080, "step": 64150 }, { "epoch": 19.90536766987279, "grad_norm": 49.353084564208984, "learning_rate": 6.864470582457206e-10, "loss": 0.1536, "num_input_tokens_seen": 39093856, "step": 64155 }, { "epoch": 19.906919019547004, "grad_norm": 9.126469612121582, "learning_rate": 6.641979158761302e-10, "loss": 0.1433, "num_input_tokens_seen": 39097440, "step": 64160 }, { "epoch": 19.908470369221224, "grad_norm": 11.766694068908691, "learning_rate": 6.423152840218105e-10, "loss": 0.1999, "num_input_tokens_seen": 39100544, "step": 64165 }, { "epoch": 19.91002171889544, "grad_norm": 46.746490478515625, "learning_rate": 6.207991642864785e-10, "loss": 0.1966, "num_input_tokens_seen": 39104608, "step": 64170 }, { "epoch": 19.911573068569655, "grad_norm": 34.1960334777832, "learning_rate": 5.996495582488715e-10, "loss": 0.3279, "num_input_tokens_seen": 39107584, "step": 64175 }, { "epoch": 19.91312441824387, "grad_norm": 111.43074798583984, "learning_rate": 5.788664674583056e-10, "loss": 0.1623, "num_input_tokens_seen": 39110784, "step": 64180 }, { "epoch": 19.914675767918087, "grad_norm": 14.361133575439453, "learning_rate": 5.584498934385618e-10, "loss": 0.1851, "num_input_tokens_seen": 39114336, "step": 64185 }, { "epoch": 19.916227117592307, "grad_norm": 30.628738403320312, "learning_rate": 5.383998376873312e-10, "loss": 0.1923, "num_input_tokens_seen": 39117216, "step": 64190 }, { "epoch": 19.917778467266523, "grad_norm": 10.523444175720215, "learning_rate": 5.187163016734387e-10, "loss": 0.2639, "num_input_tokens_seen": 39121056, "step": 64195 }, { "epoch": 19.91932981694074, "grad_norm": 5.050473213195801, "learning_rate": 4.993992868401743e-10, "loss": 0.0769, "num_input_tokens_seen": 39123456, "step": 64200 }, { "epoch": 19.920881166614954, "grad_norm": 19.94877052307129, "learning_rate": 4.804487946041824e-10, "loss": 0.2085, "num_input_tokens_seen": 39126560, "step": 64205 }, { "epoch": 19.92243251628917, "grad_norm": 13.065183639526367, "learning_rate": 4.6186482635435236e-10, "loss": 0.1569, "num_input_tokens_seen": 39128736, "step": 64210 }, { "epoch": 19.92398386596339, "grad_norm": 24.26850700378418, "learning_rate": 4.436473834534827e-10, "loss": 0.1312, "num_input_tokens_seen": 39131200, "step": 64215 }, { "epoch": 19.925535215637606, "grad_norm": 25.48826026916504, "learning_rate": 4.257964672366166e-10, "loss": 0.1379, "num_input_tokens_seen": 39133920, "step": 64220 }, { "epoch": 19.92708656531182, "grad_norm": 8.413203239440918, "learning_rate": 4.0831207901270707e-10, "loss": 0.1063, "num_input_tokens_seen": 39136640, "step": 64225 }, { "epoch": 19.928637914986037, "grad_norm": 33.205039978027344, "learning_rate": 3.911942200635066e-10, "loss": 0.1721, "num_input_tokens_seen": 39139424, "step": 64230 }, { "epoch": 19.930189264660253, "grad_norm": 16.959033966064453, "learning_rate": 3.744428916441223e-10, "loss": 0.1407, "num_input_tokens_seen": 39141984, "step": 64235 }, { "epoch": 19.931740614334473, "grad_norm": 2.7137980461120605, "learning_rate": 3.580580949824608e-10, "loss": 0.2043, "num_input_tokens_seen": 39145760, "step": 64240 }, { "epoch": 19.93329196400869, "grad_norm": 21.46315574645996, "learning_rate": 3.4203983127978345e-10, "loss": 0.1391, "num_input_tokens_seen": 39148096, "step": 64245 }, { "epoch": 19.934843313682904, "grad_norm": 13.762958526611328, "learning_rate": 3.2638810171070625e-10, "loss": 0.0983, "num_input_tokens_seen": 39150752, "step": 64250 }, { "epoch": 19.93639466335712, "grad_norm": 5.333983898162842, "learning_rate": 3.111029074215344e-10, "loss": 0.1324, "num_input_tokens_seen": 39153600, "step": 64255 }, { "epoch": 19.937946013031336, "grad_norm": 9.414813995361328, "learning_rate": 2.9618424953470337e-10, "loss": 0.1658, "num_input_tokens_seen": 39156960, "step": 64260 }, { "epoch": 19.939497362705552, "grad_norm": 28.869216918945312, "learning_rate": 2.8163212914211756e-10, "loss": 0.1496, "num_input_tokens_seen": 39159712, "step": 64265 }, { "epoch": 19.94104871237977, "grad_norm": 11.165894508361816, "learning_rate": 2.674465473118115e-10, "loss": 0.0969, "num_input_tokens_seen": 39163648, "step": 64270 }, { "epoch": 19.942600062053987, "grad_norm": 15.851625442504883, "learning_rate": 2.53627505082954e-10, "loss": 0.2048, "num_input_tokens_seen": 39166144, "step": 64275 }, { "epoch": 19.944151411728203, "grad_norm": 37.96058654785156, "learning_rate": 2.4017500346973367e-10, "loss": 0.1967, "num_input_tokens_seen": 39168736, "step": 64280 }, { "epoch": 19.94570276140242, "grad_norm": 15.216017723083496, "learning_rate": 2.270890434569184e-10, "loss": 0.1533, "num_input_tokens_seen": 39171232, "step": 64285 }, { "epoch": 19.947254111076635, "grad_norm": 1.7218194007873535, "learning_rate": 2.1436962600540623e-10, "loss": 0.0741, "num_input_tokens_seen": 39174688, "step": 64290 }, { "epoch": 19.948805460750854, "grad_norm": 16.514232635498047, "learning_rate": 2.0201675204667426e-10, "loss": 0.1304, "num_input_tokens_seen": 39177056, "step": 64295 }, { "epoch": 19.95035681042507, "grad_norm": 15.43829345703125, "learning_rate": 1.9003042248610937e-10, "loss": 0.0994, "num_input_tokens_seen": 39179712, "step": 64300 }, { "epoch": 19.951908160099286, "grad_norm": 7.118022441864014, "learning_rate": 1.784106382035633e-10, "loss": 0.1884, "num_input_tokens_seen": 39182560, "step": 64305 }, { "epoch": 19.953459509773502, "grad_norm": 36.85072708129883, "learning_rate": 1.67157400050022e-10, "loss": 0.2314, "num_input_tokens_seen": 39184736, "step": 64310 }, { "epoch": 19.955010859447718, "grad_norm": 27.390016555786133, "learning_rate": 1.562707088503812e-10, "loss": 0.2638, "num_input_tokens_seen": 39187168, "step": 64315 }, { "epoch": 19.956562209121937, "grad_norm": 22.891826629638672, "learning_rate": 1.4575056540344636e-10, "loss": 0.0827, "num_input_tokens_seen": 39190016, "step": 64320 }, { "epoch": 19.958113558796153, "grad_norm": 24.306705474853516, "learning_rate": 1.3559697048026732e-10, "loss": 0.1542, "num_input_tokens_seen": 39192608, "step": 64325 }, { "epoch": 19.95966490847037, "grad_norm": 50.429290771484375, "learning_rate": 1.2580992482469358e-10, "loss": 0.1941, "num_input_tokens_seen": 39196160, "step": 64330 }, { "epoch": 19.961216258144585, "grad_norm": 6.182278156280518, "learning_rate": 1.1638942915503937e-10, "loss": 0.0864, "num_input_tokens_seen": 39198336, "step": 64335 }, { "epoch": 19.9627676078188, "grad_norm": 13.48733901977539, "learning_rate": 1.0733548416130834e-10, "loss": 0.1453, "num_input_tokens_seen": 39202240, "step": 64340 }, { "epoch": 19.96431895749302, "grad_norm": 39.89687728881836, "learning_rate": 9.864809050741386e-11, "loss": 0.1896, "num_input_tokens_seen": 39204800, "step": 64345 }, { "epoch": 19.965870307167236, "grad_norm": 20.59058952331543, "learning_rate": 9.032724883006883e-11, "loss": 0.1536, "num_input_tokens_seen": 39207840, "step": 64350 }, { "epoch": 19.967421656841452, "grad_norm": 7.497242450714111, "learning_rate": 8.23729597398959e-11, "loss": 0.1677, "num_input_tokens_seen": 39210272, "step": 64355 }, { "epoch": 19.968973006515668, "grad_norm": 19.233572006225586, "learning_rate": 7.478522381976217e-11, "loss": 0.1139, "num_input_tokens_seen": 39212704, "step": 64360 }, { "epoch": 19.970524356189884, "grad_norm": 4.4312825202941895, "learning_rate": 6.756404162588936e-11, "loss": 0.1149, "num_input_tokens_seen": 39218272, "step": 64365 }, { "epoch": 19.972075705864103, "grad_norm": 14.327715873718262, "learning_rate": 6.070941368729877e-11, "loss": 0.1223, "num_input_tokens_seen": 39221088, "step": 64370 }, { "epoch": 19.97362705553832, "grad_norm": 28.833072662353516, "learning_rate": 5.4221340506921404e-11, "loss": 0.1978, "num_input_tokens_seen": 39224512, "step": 64375 }, { "epoch": 19.975178405212535, "grad_norm": 7.647266864776611, "learning_rate": 4.809982256048784e-11, "loss": 0.168, "num_input_tokens_seen": 39227808, "step": 64380 }, { "epoch": 19.97672975488675, "grad_norm": 12.937889099121094, "learning_rate": 4.234486029652818e-11, "loss": 0.1477, "num_input_tokens_seen": 39230592, "step": 64385 }, { "epoch": 19.978281104560967, "grad_norm": 38.53170394897461, "learning_rate": 3.6956454136927164e-11, "loss": 0.114, "num_input_tokens_seen": 39233920, "step": 64390 }, { "epoch": 19.979832454235186, "grad_norm": 42.958106994628906, "learning_rate": 3.1934604476924206e-11, "loss": 0.1451, "num_input_tokens_seen": 39236864, "step": 64395 }, { "epoch": 19.981383803909402, "grad_norm": 43.98862838745117, "learning_rate": 2.727931168455822e-11, "loss": 0.2533, "num_input_tokens_seen": 39239264, "step": 64400 }, { "epoch": 19.982935153583618, "grad_norm": 40.46613311767578, "learning_rate": 2.2990576101222795e-11, "loss": 0.1113, "num_input_tokens_seen": 39242848, "step": 64405 }, { "epoch": 19.984486503257834, "grad_norm": 4.320159435272217, "learning_rate": 1.9068398041111046e-11, "loss": 0.1975, "num_input_tokens_seen": 39245792, "step": 64410 }, { "epoch": 19.98603785293205, "grad_norm": 11.33991527557373, "learning_rate": 1.5512777792325852e-11, "loss": 0.1313, "num_input_tokens_seen": 39248672, "step": 64415 }, { "epoch": 19.987589202606266, "grad_norm": 18.92547607421875, "learning_rate": 1.2323715614659394e-11, "loss": 0.114, "num_input_tokens_seen": 39251104, "step": 64420 }, { "epoch": 19.989140552280485, "grad_norm": 23.563318252563477, "learning_rate": 9.501211742368732e-12, "loss": 0.1476, "num_input_tokens_seen": 39253696, "step": 64425 }, { "epoch": 19.9906919019547, "grad_norm": 38.640480041503906, "learning_rate": 7.0452663825104626e-12, "loss": 0.2119, "num_input_tokens_seen": 39256416, "step": 64430 }, { "epoch": 19.992243251628917, "grad_norm": 15.79842758178711, "learning_rate": 4.955879715495826e-12, "loss": 0.1702, "num_input_tokens_seen": 39259296, "step": 64435 }, { "epoch": 19.993794601303133, "grad_norm": 62.67918014526367, "learning_rate": 3.2330518934253764e-12, "loss": 0.1929, "num_input_tokens_seen": 39262560, "step": 64440 }, { "epoch": 19.99534595097735, "grad_norm": 20.3526611328125, "learning_rate": 1.87678304341965e-12, "loss": 0.1209, "num_input_tokens_seen": 39265536, "step": 64445 }, { "epoch": 19.99689730065157, "grad_norm": 9.48779582977295, "learning_rate": 8.870732648436076e-13, "loss": 0.0997, "num_input_tokens_seen": 39268608, "step": 64450 }, { "epoch": 19.998448650325784, "grad_norm": 21.22992706298828, "learning_rate": 2.6392262986174587e-13, "loss": 0.3129, "num_input_tokens_seen": 39271680, "step": 64455 }, { "epoch": 20.0, "grad_norm": 27.336463928222656, "learning_rate": 7.331183993208869e-15, "loss": 0.1512, "num_input_tokens_seen": 39274528, "step": 64460 }, { "epoch": 20.0, "eval_loss": 0.40142932534217834, "eval_runtime": 34.4005, "eval_samples_per_second": 93.69, "eval_steps_per_second": 23.43, "num_input_tokens_seen": 39274528, "step": 64460 }, { "epoch": 20.0, "num_input_tokens_seen": 39274528, "step": 64460, "total_flos": 1.768514245010129e+18, "train_loss": 0.23372886603732232, "train_runtime": 7192.438, "train_samples_per_second": 35.843, "train_steps_per_second": 8.962 } ], "logging_steps": 5, "max_steps": 64460, "num_input_tokens_seen": 39274528, "num_train_epochs": 20, "save_steps": 6446, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.768514245010129e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }