{ "best_global_step": 668, "best_metric": 0.3488299548625946, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_42_1760609151/checkpoint-668", "epoch": 30.0, "eval_steps": 167, "global_step": 3330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04504504504504504, "grad_norm": 198.7502899169922, "learning_rate": 6.006006006006006e-06, "loss": 10.1271, "num_input_tokens_seen": 1920, "step": 5 }, { "epoch": 0.09009009009009009, "grad_norm": 227.89190673828125, "learning_rate": 1.3513513513513515e-05, "loss": 5.278, "num_input_tokens_seen": 4448, "step": 10 }, { "epoch": 0.13513513513513514, "grad_norm": 42.42586135864258, "learning_rate": 2.102102102102102e-05, "loss": 1.8516, "num_input_tokens_seen": 6560, "step": 15 }, { "epoch": 0.18018018018018017, "grad_norm": 43.10729217529297, "learning_rate": 2.8528528528528528e-05, "loss": 0.5796, "num_input_tokens_seen": 8352, "step": 20 }, { "epoch": 0.22522522522522523, "grad_norm": 13.360367774963379, "learning_rate": 3.603603603603604e-05, "loss": 0.4379, "num_input_tokens_seen": 10432, "step": 25 }, { "epoch": 0.2702702702702703, "grad_norm": 5.690464973449707, "learning_rate": 4.354354354354354e-05, "loss": 0.4127, "num_input_tokens_seen": 12256, "step": 30 }, { "epoch": 0.3153153153153153, "grad_norm": 7.120150566101074, "learning_rate": 5.105105105105105e-05, "loss": 0.4075, "num_input_tokens_seen": 14336, "step": 35 }, { "epoch": 0.36036036036036034, "grad_norm": 4.940525054931641, "learning_rate": 5.855855855855856e-05, "loss": 0.4046, "num_input_tokens_seen": 16384, "step": 40 }, { "epoch": 0.40540540540540543, "grad_norm": 14.779731750488281, "learning_rate": 6.606606606606606e-05, "loss": 0.4308, "num_input_tokens_seen": 18368, "step": 45 }, { "epoch": 0.45045045045045046, "grad_norm": 9.910587310791016, "learning_rate": 7.357357357357357e-05, "loss": 0.548, "num_input_tokens_seen": 20128, "step": 50 }, { "epoch": 0.4954954954954955, "grad_norm": 3.5048186779022217, "learning_rate": 8.108108108108109e-05, "loss": 0.5391, "num_input_tokens_seen": 21984, "step": 55 }, { "epoch": 0.5405405405405406, "grad_norm": 3.228012800216675, "learning_rate": 8.85885885885886e-05, "loss": 0.337, "num_input_tokens_seen": 23840, "step": 60 }, { "epoch": 0.5855855855855856, "grad_norm": 2.3968305587768555, "learning_rate": 9.60960960960961e-05, "loss": 0.4438, "num_input_tokens_seen": 25984, "step": 65 }, { "epoch": 0.6306306306306306, "grad_norm": 2.806647300720215, "learning_rate": 0.0001036036036036036, "loss": 0.4852, "num_input_tokens_seen": 27872, "step": 70 }, { "epoch": 0.6756756756756757, "grad_norm": 0.7306264042854309, "learning_rate": 0.0001111111111111111, "loss": 0.4096, "num_input_tokens_seen": 29824, "step": 75 }, { "epoch": 0.7207207207207207, "grad_norm": 1.4658559560775757, "learning_rate": 0.00011861861861861862, "loss": 0.2499, "num_input_tokens_seen": 31680, "step": 80 }, { "epoch": 0.7657657657657657, "grad_norm": 1.1854609251022339, "learning_rate": 0.00012612612612612612, "loss": 0.5662, "num_input_tokens_seen": 33632, "step": 85 }, { "epoch": 0.8108108108108109, "grad_norm": 0.7023877501487732, "learning_rate": 0.00013363363363363365, "loss": 0.4575, "num_input_tokens_seen": 35520, "step": 90 }, { "epoch": 0.8558558558558559, "grad_norm": 6.529045581817627, "learning_rate": 0.00014114114114114116, "loss": 0.42, "num_input_tokens_seen": 37824, "step": 95 }, { "epoch": 0.9009009009009009, "grad_norm": 1.2374546527862549, "learning_rate": 0.00014864864864864866, "loss": 0.7992, "num_input_tokens_seen": 39616, "step": 100 }, { "epoch": 0.9459459459459459, "grad_norm": 4.099679946899414, "learning_rate": 0.00015615615615615616, "loss": 0.4349, "num_input_tokens_seen": 41440, "step": 105 }, { "epoch": 0.990990990990991, "grad_norm": 4.258035659790039, "learning_rate": 0.00016366366366366367, "loss": 0.4096, "num_input_tokens_seen": 43232, "step": 110 }, { "epoch": 1.0360360360360361, "grad_norm": 11.078781127929688, "learning_rate": 0.00017117117117117117, "loss": 1.7942, "num_input_tokens_seen": 45440, "step": 115 }, { "epoch": 1.0810810810810811, "grad_norm": 1.426566481590271, "learning_rate": 0.00017867867867867867, "loss": 0.3969, "num_input_tokens_seen": 47456, "step": 120 }, { "epoch": 1.1261261261261262, "grad_norm": 0.9096628427505493, "learning_rate": 0.00018618618618618617, "loss": 0.3697, "num_input_tokens_seen": 49440, "step": 125 }, { "epoch": 1.1711711711711712, "grad_norm": 0.8774626851081848, "learning_rate": 0.00019369369369369368, "loss": 0.3804, "num_input_tokens_seen": 51264, "step": 130 }, { "epoch": 1.2162162162162162, "grad_norm": 0.6812538504600525, "learning_rate": 0.0002012012012012012, "loss": 0.3599, "num_input_tokens_seen": 53408, "step": 135 }, { "epoch": 1.2612612612612613, "grad_norm": 1.0902281999588013, "learning_rate": 0.0002087087087087087, "loss": 0.3894, "num_input_tokens_seen": 55104, "step": 140 }, { "epoch": 1.3063063063063063, "grad_norm": 2.2054333686828613, "learning_rate": 0.00021621621621621624, "loss": 0.5272, "num_input_tokens_seen": 57280, "step": 145 }, { "epoch": 1.3513513513513513, "grad_norm": 3.4195408821105957, "learning_rate": 0.00022372372372372374, "loss": 0.5666, "num_input_tokens_seen": 59264, "step": 150 }, { "epoch": 1.3963963963963963, "grad_norm": 0.2056901454925537, "learning_rate": 0.00023123123123123125, "loss": 0.3785, "num_input_tokens_seen": 60992, "step": 155 }, { "epoch": 1.4414414414414414, "grad_norm": 1.0198805332183838, "learning_rate": 0.00023873873873873875, "loss": 0.36, "num_input_tokens_seen": 62944, "step": 160 }, { "epoch": 1.4864864864864864, "grad_norm": 1.283266544342041, "learning_rate": 0.00024624624624624625, "loss": 0.414, "num_input_tokens_seen": 65152, "step": 165 }, { "epoch": 1.5045045045045045, "eval_loss": 0.4660390019416809, "eval_runtime": 1.2841, "eval_samples_per_second": 86.442, "eval_steps_per_second": 21.805, "num_input_tokens_seen": 65984, "step": 167 }, { "epoch": 1.5315315315315314, "grad_norm": 0.7147128582000732, "learning_rate": 0.00025375375375375376, "loss": 0.4435, "num_input_tokens_seen": 67104, "step": 170 }, { "epoch": 1.5765765765765765, "grad_norm": 1.0104883909225464, "learning_rate": 0.00026126126126126126, "loss": 0.3524, "num_input_tokens_seen": 68800, "step": 175 }, { "epoch": 1.6216216216216215, "grad_norm": 0.11864089220762253, "learning_rate": 0.00026876876876876876, "loss": 0.3485, "num_input_tokens_seen": 70592, "step": 180 }, { "epoch": 1.6666666666666665, "grad_norm": 60.13191223144531, "learning_rate": 0.00027627627627627627, "loss": 1.411, "num_input_tokens_seen": 72736, "step": 185 }, { "epoch": 1.7117117117117115, "grad_norm": 30.945253372192383, "learning_rate": 0.00028378378378378377, "loss": 1.0786, "num_input_tokens_seen": 74400, "step": 190 }, { "epoch": 1.7567567567567568, "grad_norm": 2.6165988445281982, "learning_rate": 0.00029129129129129127, "loss": 0.5167, "num_input_tokens_seen": 76192, "step": 195 }, { "epoch": 1.8018018018018018, "grad_norm": 4.2211012840271, "learning_rate": 0.0002987987987987988, "loss": 0.9988, "num_input_tokens_seen": 78400, "step": 200 }, { "epoch": 1.8468468468468469, "grad_norm": 0.4140159785747528, "learning_rate": 0.0003063063063063063, "loss": 0.3591, "num_input_tokens_seen": 80096, "step": 205 }, { "epoch": 1.8918918918918919, "grad_norm": 0.4666629731655121, "learning_rate": 0.0003138138138138138, "loss": 0.381, "num_input_tokens_seen": 82336, "step": 210 }, { "epoch": 1.936936936936937, "grad_norm": 0.7295236587524414, "learning_rate": 0.0003213213213213213, "loss": 0.4551, "num_input_tokens_seen": 84352, "step": 215 }, { "epoch": 1.981981981981982, "grad_norm": 0.2503405809402466, "learning_rate": 0.00032882882882882884, "loss": 0.3549, "num_input_tokens_seen": 86496, "step": 220 }, { "epoch": 2.027027027027027, "grad_norm": 0.12554490566253662, "learning_rate": 0.00033633633633633635, "loss": 0.2953, "num_input_tokens_seen": 88088, "step": 225 }, { "epoch": 2.0720720720720722, "grad_norm": 1.331797480583191, "learning_rate": 0.00034384384384384385, "loss": 1.573, "num_input_tokens_seen": 90232, "step": 230 }, { "epoch": 2.1171171171171173, "grad_norm": 0.26872557401657104, "learning_rate": 0.00035135135135135135, "loss": 0.3753, "num_input_tokens_seen": 91864, "step": 235 }, { "epoch": 2.1621621621621623, "grad_norm": 0.0352901928126812, "learning_rate": 0.0003588588588588589, "loss": 0.3136, "num_input_tokens_seen": 94232, "step": 240 }, { "epoch": 2.2072072072072073, "grad_norm": 0.1265193670988083, "learning_rate": 0.0003663663663663664, "loss": 0.4186, "num_input_tokens_seen": 96536, "step": 245 }, { "epoch": 2.2522522522522523, "grad_norm": 0.2617993652820587, "learning_rate": 0.0003738738738738739, "loss": 0.3911, "num_input_tokens_seen": 98360, "step": 250 }, { "epoch": 2.2972972972972974, "grad_norm": 0.23085494339466095, "learning_rate": 0.0003813813813813814, "loss": 0.3668, "num_input_tokens_seen": 100152, "step": 255 }, { "epoch": 2.3423423423423424, "grad_norm": 0.13152875006198883, "learning_rate": 0.0003888888888888889, "loss": 0.3606, "num_input_tokens_seen": 102296, "step": 260 }, { "epoch": 2.3873873873873874, "grad_norm": 0.21601299941539764, "learning_rate": 0.0003963963963963964, "loss": 0.3488, "num_input_tokens_seen": 104024, "step": 265 }, { "epoch": 2.4324324324324325, "grad_norm": 0.1982952058315277, "learning_rate": 0.00040390390390390393, "loss": 0.3477, "num_input_tokens_seen": 105944, "step": 270 }, { "epoch": 2.4774774774774775, "grad_norm": 0.3404879868030548, "learning_rate": 0.00041141141141141143, "loss": 0.3467, "num_input_tokens_seen": 107928, "step": 275 }, { "epoch": 2.5225225225225225, "grad_norm": 0.19420060515403748, "learning_rate": 0.00041891891891891893, "loss": 0.3638, "num_input_tokens_seen": 109976, "step": 280 }, { "epoch": 2.5675675675675675, "grad_norm": 0.10250438004732132, "learning_rate": 0.00042642642642642644, "loss": 0.3496, "num_input_tokens_seen": 112024, "step": 285 }, { "epoch": 2.6126126126126126, "grad_norm": 2.241009473800659, "learning_rate": 0.00043393393393393394, "loss": 0.3593, "num_input_tokens_seen": 114584, "step": 290 }, { "epoch": 2.6576576576576576, "grad_norm": 0.19610685110092163, "learning_rate": 0.00044144144144144144, "loss": 0.3504, "num_input_tokens_seen": 116472, "step": 295 }, { "epoch": 2.7027027027027026, "grad_norm": 0.5257853269577026, "learning_rate": 0.00044894894894894895, "loss": 0.4647, "num_input_tokens_seen": 118136, "step": 300 }, { "epoch": 2.7477477477477477, "grad_norm": 3.2750346660614014, "learning_rate": 0.00045645645645645645, "loss": 3.7793, "num_input_tokens_seen": 120120, "step": 305 }, { "epoch": 2.7927927927927927, "grad_norm": 3.562680959701538, "learning_rate": 0.00046396396396396395, "loss": 6.114, "num_input_tokens_seen": 121880, "step": 310 }, { "epoch": 2.8378378378378377, "grad_norm": 7.478321552276611, "learning_rate": 0.00047147147147147146, "loss": 3.3028, "num_input_tokens_seen": 123800, "step": 315 }, { "epoch": 2.8828828828828827, "grad_norm": 0.7892034649848938, "learning_rate": 0.00047897897897897896, "loss": 0.8823, "num_input_tokens_seen": 125752, "step": 320 }, { "epoch": 2.9279279279279278, "grad_norm": 0.6121970415115356, "learning_rate": 0.0004864864864864865, "loss": 0.3914, "num_input_tokens_seen": 127928, "step": 325 }, { "epoch": 2.972972972972973, "grad_norm": 0.1384078562259674, "learning_rate": 0.0004939939939939941, "loss": 0.4193, "num_input_tokens_seen": 129816, "step": 330 }, { "epoch": 3.009009009009009, "eval_loss": 0.49270448088645935, "eval_runtime": 1.285, "eval_samples_per_second": 86.383, "eval_steps_per_second": 21.79, "num_input_tokens_seen": 131096, "step": 334 }, { "epoch": 3.018018018018018, "grad_norm": 0.16554592549800873, "learning_rate": 0.0004999998626476062, "loss": 0.328, "num_input_tokens_seen": 131416, "step": 335 }, { "epoch": 3.063063063063063, "grad_norm": 0.8997936248779297, "learning_rate": 0.0004999950553296708, "loss": 0.4295, "num_input_tokens_seen": 133336, "step": 340 }, { "epoch": 3.108108108108108, "grad_norm": 0.1003471091389656, "learning_rate": 0.0004999833805429708, "loss": 0.3431, "num_input_tokens_seen": 135288, "step": 345 }, { "epoch": 3.153153153153153, "grad_norm": 0.12911377847194672, "learning_rate": 0.0004999648386082173, "loss": 0.3772, "num_input_tokens_seen": 137272, "step": 350 }, { "epoch": 3.1981981981981984, "grad_norm": 0.07771427184343338, "learning_rate": 0.0004999394300347652, "loss": 0.3617, "num_input_tokens_seen": 139096, "step": 355 }, { "epoch": 3.2432432432432434, "grad_norm": 0.23330911993980408, "learning_rate": 0.0004999071555205985, "loss": 0.2842, "num_input_tokens_seen": 141304, "step": 360 }, { "epoch": 3.2882882882882885, "grad_norm": 0.11049104481935501, "learning_rate": 0.0004998680159523117, "loss": 0.4347, "num_input_tokens_seen": 143160, "step": 365 }, { "epoch": 3.3333333333333335, "grad_norm": 0.16447076201438904, "learning_rate": 0.000499822012405085, "loss": 0.3724, "num_input_tokens_seen": 145144, "step": 370 }, { "epoch": 3.3783783783783785, "grad_norm": 0.25307098031044006, "learning_rate": 0.0004997691461426552, "loss": 0.3634, "num_input_tokens_seen": 147320, "step": 375 }, { "epoch": 3.4234234234234235, "grad_norm": 0.2143518030643463, "learning_rate": 0.0004997094186172807, "loss": 0.3547, "num_input_tokens_seen": 149208, "step": 380 }, { "epoch": 3.4684684684684686, "grad_norm": 0.15004123747348785, "learning_rate": 0.0004996428314697015, "loss": 0.3272, "num_input_tokens_seen": 151256, "step": 385 }, { "epoch": 3.5135135135135136, "grad_norm": 0.1515875607728958, "learning_rate": 0.0004995693865290945, "loss": 0.3502, "num_input_tokens_seen": 152920, "step": 390 }, { "epoch": 3.5585585585585586, "grad_norm": 0.09838002175092697, "learning_rate": 0.0004994890858130227, "loss": 0.3687, "num_input_tokens_seen": 154968, "step": 395 }, { "epoch": 3.6036036036036037, "grad_norm": 0.13579407334327698, "learning_rate": 0.0004994019315273806, "loss": 0.3518, "num_input_tokens_seen": 156760, "step": 400 }, { "epoch": 3.6486486486486487, "grad_norm": 0.09335580468177795, "learning_rate": 0.0004993079260663329, "loss": 0.3441, "num_input_tokens_seen": 158712, "step": 405 }, { "epoch": 3.6936936936936937, "grad_norm": 0.06506289541721344, "learning_rate": 0.0004992070720122489, "loss": 0.3488, "num_input_tokens_seen": 160792, "step": 410 }, { "epoch": 3.7387387387387387, "grad_norm": 0.07059651613235474, "learning_rate": 0.0004990993721356316, "loss": 0.3604, "num_input_tokens_seen": 162968, "step": 415 }, { "epoch": 3.7837837837837838, "grad_norm": 0.11248281598091125, "learning_rate": 0.0004989848293950417, "loss": 0.3691, "num_input_tokens_seen": 164792, "step": 420 }, { "epoch": 3.828828828828829, "grad_norm": 0.08889925479888916, "learning_rate": 0.0004988634469370164, "loss": 0.3632, "num_input_tokens_seen": 167000, "step": 425 }, { "epoch": 3.873873873873874, "grad_norm": 0.235771045088768, "learning_rate": 0.0004987352280959822, "loss": 0.3381, "num_input_tokens_seen": 168824, "step": 430 }, { "epoch": 3.918918918918919, "grad_norm": 0.06070815026760101, "learning_rate": 0.0004986001763941647, "loss": 0.3518, "num_input_tokens_seen": 170776, "step": 435 }, { "epoch": 3.963963963963964, "grad_norm": 0.16858269274234772, "learning_rate": 0.0004984582955414904, "loss": 0.3556, "num_input_tokens_seen": 172536, "step": 440 }, { "epoch": 4.009009009009009, "grad_norm": 0.6117138266563416, "learning_rate": 0.0004983095894354857, "loss": 0.3534, "num_input_tokens_seen": 174288, "step": 445 }, { "epoch": 4.054054054054054, "grad_norm": 0.16566413640975952, "learning_rate": 0.0004981540621611698, "loss": 0.3765, "num_input_tokens_seen": 176368, "step": 450 }, { "epoch": 4.099099099099099, "grad_norm": 0.15239045023918152, "learning_rate": 0.0004979917179909417, "loss": 0.3463, "num_input_tokens_seen": 178064, "step": 455 }, { "epoch": 4.1441441441441444, "grad_norm": 0.49086788296699524, "learning_rate": 0.0004978225613844638, "loss": 0.3851, "num_input_tokens_seen": 179984, "step": 460 }, { "epoch": 4.1891891891891895, "grad_norm": 0.487692266702652, "learning_rate": 0.000497646596988539, "loss": 0.3643, "num_input_tokens_seen": 182352, "step": 465 }, { "epoch": 4.2342342342342345, "grad_norm": 0.08044200390577316, "learning_rate": 0.0004974638296369826, "loss": 0.3766, "num_input_tokens_seen": 184016, "step": 470 }, { "epoch": 4.2792792792792795, "grad_norm": 0.051064759492874146, "learning_rate": 0.0004972742643504904, "loss": 0.3598, "num_input_tokens_seen": 186096, "step": 475 }, { "epoch": 4.324324324324325, "grad_norm": 0.05002656951546669, "learning_rate": 0.0004970779063365, "loss": 0.368, "num_input_tokens_seen": 187920, "step": 480 }, { "epoch": 4.36936936936937, "grad_norm": 0.028187120333313942, "learning_rate": 0.0004968747609890484, "loss": 0.3598, "num_input_tokens_seen": 189744, "step": 485 }, { "epoch": 4.414414414414415, "grad_norm": 0.13390058279037476, "learning_rate": 0.000496664833888623, "loss": 0.3559, "num_input_tokens_seen": 191760, "step": 490 }, { "epoch": 4.45945945945946, "grad_norm": 0.055107761174440384, "learning_rate": 0.0004964481308020093, "loss": 0.3447, "num_input_tokens_seen": 193872, "step": 495 }, { "epoch": 4.504504504504505, "grad_norm": 0.17059732973575592, "learning_rate": 0.0004962246576821318, "loss": 0.3295, "num_input_tokens_seen": 196048, "step": 500 }, { "epoch": 4.513513513513513, "eval_loss": 0.3681543171405792, "eval_runtime": 1.1637, "eval_samples_per_second": 95.383, "eval_steps_per_second": 24.061, "num_input_tokens_seen": 196400, "step": 501 }, { "epoch": 4.54954954954955, "grad_norm": 0.058844875544309616, "learning_rate": 0.0004959944206678903, "loss": 0.3872, "num_input_tokens_seen": 198128, "step": 505 }, { "epoch": 4.594594594594595, "grad_norm": 0.03136448562145233, "learning_rate": 0.0004957574260839923, "loss": 0.335, "num_input_tokens_seen": 199984, "step": 510 }, { "epoch": 4.63963963963964, "grad_norm": 0.0501897931098938, "learning_rate": 0.0004955136804407779, "loss": 0.325, "num_input_tokens_seen": 201968, "step": 515 }, { "epoch": 4.684684684684685, "grad_norm": 0.2560962438583374, "learning_rate": 0.000495263190434042, "loss": 0.6477, "num_input_tokens_seen": 203984, "step": 520 }, { "epoch": 4.72972972972973, "grad_norm": 45.544105529785156, "learning_rate": 0.0004950059629448499, "loss": 1.822, "num_input_tokens_seen": 205808, "step": 525 }, { "epoch": 4.774774774774775, "grad_norm": 14.118766784667969, "learning_rate": 0.0004947420050393483, "loss": 2.169, "num_input_tokens_seen": 208016, "step": 530 }, { "epoch": 4.81981981981982, "grad_norm": 6.093238353729248, "learning_rate": 0.0004944713239685713, "loss": 2.7455, "num_input_tokens_seen": 210320, "step": 535 }, { "epoch": 4.864864864864865, "grad_norm": 4.631172180175781, "learning_rate": 0.0004941939271682411, "loss": 0.9305, "num_input_tokens_seen": 212048, "step": 540 }, { "epoch": 4.90990990990991, "grad_norm": 18.648813247680664, "learning_rate": 0.000493909822258564, "loss": 1.1274, "num_input_tokens_seen": 214032, "step": 545 }, { "epoch": 4.954954954954955, "grad_norm": 0.10367891937494278, "learning_rate": 0.0004936190170440208, "loss": 0.5231, "num_input_tokens_seen": 215856, "step": 550 }, { "epoch": 5.0, "grad_norm": 0.37693262100219727, "learning_rate": 0.0004933215195131521, "loss": 0.3789, "num_input_tokens_seen": 217408, "step": 555 }, { "epoch": 5.045045045045045, "grad_norm": 0.2514440417289734, "learning_rate": 0.00049301733783834, "loss": 0.5117, "num_input_tokens_seen": 219232, "step": 560 }, { "epoch": 5.09009009009009, "grad_norm": 1.351763367652893, "learning_rate": 0.000492706480375582, "loss": 0.4079, "num_input_tokens_seen": 221664, "step": 565 }, { "epoch": 5.135135135135135, "grad_norm": 0.1483132541179657, "learning_rate": 0.0004923889556642626, "loss": 0.3651, "num_input_tokens_seen": 223232, "step": 570 }, { "epoch": 5.18018018018018, "grad_norm": 0.18108053505420685, "learning_rate": 0.0004920647724269188, "loss": 0.535, "num_input_tokens_seen": 225024, "step": 575 }, { "epoch": 5.225225225225225, "grad_norm": 0.8862496614456177, "learning_rate": 0.0004917339395689996, "loss": 0.6379, "num_input_tokens_seen": 227040, "step": 580 }, { "epoch": 5.27027027027027, "grad_norm": 1.8228765726089478, "learning_rate": 0.000491396466178622, "loss": 0.649, "num_input_tokens_seen": 229184, "step": 585 }, { "epoch": 5.315315315315315, "grad_norm": 3.0169267654418945, "learning_rate": 0.0004910523615263213, "loss": 2.0758, "num_input_tokens_seen": 231104, "step": 590 }, { "epoch": 5.36036036036036, "grad_norm": 0.7156198024749756, "learning_rate": 0.0004907016350647961, "loss": 0.5733, "num_input_tokens_seen": 233088, "step": 595 }, { "epoch": 5.405405405405405, "grad_norm": 0.3205277621746063, "learning_rate": 0.0004903442964286492, "loss": 0.4, "num_input_tokens_seen": 235008, "step": 600 }, { "epoch": 5.45045045045045, "grad_norm": 1.1223965883255005, "learning_rate": 0.0004899803554341225, "loss": 0.395, "num_input_tokens_seen": 237120, "step": 605 }, { "epoch": 5.495495495495495, "grad_norm": 0.7036691308021545, "learning_rate": 0.0004896098220788272, "loss": 0.6124, "num_input_tokens_seen": 238752, "step": 610 }, { "epoch": 5.54054054054054, "grad_norm": 1.1847765445709229, "learning_rate": 0.0004892327065414697, "loss": 1.2727, "num_input_tokens_seen": 240864, "step": 615 }, { "epoch": 5.585585585585585, "grad_norm": 1.7145177125930786, "learning_rate": 0.0004888490191815716, "loss": 0.7417, "num_input_tokens_seen": 242720, "step": 620 }, { "epoch": 5.63063063063063, "grad_norm": 0.39997339248657227, "learning_rate": 0.0004884587705391851, "loss": 0.59, "num_input_tokens_seen": 245184, "step": 625 }, { "epoch": 5.675675675675675, "grad_norm": 0.3795667886734009, "learning_rate": 0.0004880619713346038, "loss": 0.8984, "num_input_tokens_seen": 247008, "step": 630 }, { "epoch": 5.7207207207207205, "grad_norm": 0.144282728433609, "learning_rate": 0.0004876586324680679, "loss": 0.6586, "num_input_tokens_seen": 248768, "step": 635 }, { "epoch": 5.7657657657657655, "grad_norm": 0.70822674036026, "learning_rate": 0.0004872487650194647, "loss": 0.4359, "num_input_tokens_seen": 250464, "step": 640 }, { "epoch": 5.8108108108108105, "grad_norm": 0.053767815232276917, "learning_rate": 0.00048683238024802456, "loss": 0.4257, "num_input_tokens_seen": 252672, "step": 645 }, { "epoch": 5.8558558558558556, "grad_norm": 0.7051423788070679, "learning_rate": 0.0004864094895920113, "loss": 0.3523, "num_input_tokens_seen": 254400, "step": 650 }, { "epoch": 5.900900900900901, "grad_norm": 0.4927254021167755, "learning_rate": 0.0004859801046684083, "loss": 0.3772, "num_input_tokens_seen": 256576, "step": 655 }, { "epoch": 5.945945945945946, "grad_norm": 0.1783227026462555, "learning_rate": 0.0004855442372725989, "loss": 0.3582, "num_input_tokens_seen": 258304, "step": 660 }, { "epoch": 5.990990990990991, "grad_norm": 0.4709087610244751, "learning_rate": 0.0004851018993780429, "loss": 0.4134, "num_input_tokens_seen": 260448, "step": 665 }, { "epoch": 6.018018018018018, "eval_loss": 0.3488299548625946, "eval_runtime": 1.316, "eval_samples_per_second": 84.35, "eval_steps_per_second": 21.277, "num_input_tokens_seen": 261392, "step": 668 }, { "epoch": 6.036036036036036, "grad_norm": 0.133230522274971, "learning_rate": 0.00048465310313594736, "loss": 0.4853, "num_input_tokens_seen": 262096, "step": 670 }, { "epoch": 6.081081081081081, "grad_norm": 0.24544201791286469, "learning_rate": 0.00048419786087493277, "loss": 0.4484, "num_input_tokens_seen": 264080, "step": 675 }, { "epoch": 6.126126126126126, "grad_norm": 0.03883830085396767, "learning_rate": 0.0004837361851006945, "loss": 0.3794, "num_input_tokens_seen": 265840, "step": 680 }, { "epoch": 6.171171171171171, "grad_norm": 0.0673433169722557, "learning_rate": 0.00048326808849565934, "loss": 0.3794, "num_input_tokens_seen": 267792, "step": 685 }, { "epoch": 6.216216216216216, "grad_norm": 0.556755542755127, "learning_rate": 0.0004827935839186368, "loss": 0.3546, "num_input_tokens_seen": 269776, "step": 690 }, { "epoch": 6.261261261261261, "grad_norm": 0.22620883584022522, "learning_rate": 0.0004823126844044661, "loss": 0.3455, "num_input_tokens_seen": 272176, "step": 695 }, { "epoch": 6.306306306306306, "grad_norm": 0.35793691873550415, "learning_rate": 0.0004818254031636581, "loss": 0.3537, "num_input_tokens_seen": 274480, "step": 700 }, { "epoch": 6.351351351351352, "grad_norm": 0.4595194458961487, "learning_rate": 0.00048133175358203243, "loss": 0.3623, "num_input_tokens_seen": 276176, "step": 705 }, { "epoch": 6.396396396396397, "grad_norm": 0.3390141725540161, "learning_rate": 0.0004808317492203496, "loss": 0.3725, "num_input_tokens_seen": 277968, "step": 710 }, { "epoch": 6.441441441441442, "grad_norm": 0.35919827222824097, "learning_rate": 0.00048032540381393854, "loss": 0.3177, "num_input_tokens_seen": 279824, "step": 715 }, { "epoch": 6.486486486486487, "grad_norm": 0.13053083419799805, "learning_rate": 0.0004798127312723195, "loss": 0.428, "num_input_tokens_seen": 281584, "step": 720 }, { "epoch": 6.531531531531532, "grad_norm": 0.2772209644317627, "learning_rate": 0.00047929374567882174, "loss": 0.3647, "num_input_tokens_seen": 283632, "step": 725 }, { "epoch": 6.576576576576577, "grad_norm": 0.07429492473602295, "learning_rate": 0.00047876846129019655, "loss": 0.482, "num_input_tokens_seen": 285808, "step": 730 }, { "epoch": 6.621621621621622, "grad_norm": 0.3712543249130249, "learning_rate": 0.00047823689253622586, "loss": 0.3713, "num_input_tokens_seen": 287792, "step": 735 }, { "epoch": 6.666666666666667, "grad_norm": 0.24175819754600525, "learning_rate": 0.00047769905401932585, "loss": 0.3927, "num_input_tokens_seen": 289680, "step": 740 }, { "epoch": 6.711711711711712, "grad_norm": 0.09707850962877274, "learning_rate": 0.0004771549605141455, "loss": 0.3208, "num_input_tokens_seen": 291792, "step": 745 }, { "epoch": 6.756756756756757, "grad_norm": 0.1386759877204895, "learning_rate": 0.00047660462696716107, "loss": 0.3977, "num_input_tokens_seen": 293584, "step": 750 }, { "epoch": 6.801801801801802, "grad_norm": 0.21642529964447021, "learning_rate": 0.0004760480684962654, "loss": 0.3499, "num_input_tokens_seen": 295568, "step": 755 }, { "epoch": 6.846846846846847, "grad_norm": 0.18956072628498077, "learning_rate": 0.00047548530039035245, "loss": 0.3588, "num_input_tokens_seen": 297424, "step": 760 }, { "epoch": 6.891891891891892, "grad_norm": 0.27483439445495605, "learning_rate": 0.0004749163381088976, "loss": 0.373, "num_input_tokens_seen": 299056, "step": 765 }, { "epoch": 6.936936936936937, "grad_norm": 0.4253614842891693, "learning_rate": 0.00047434119728153267, "loss": 0.3411, "num_input_tokens_seen": 301200, "step": 770 }, { "epoch": 6.981981981981982, "grad_norm": 0.04035143554210663, "learning_rate": 0.00047375989370761695, "loss": 0.3939, "num_input_tokens_seen": 303216, "step": 775 }, { "epoch": 7.027027027027027, "grad_norm": 0.45627114176750183, "learning_rate": 0.00047317244335580286, "loss": 0.3639, "num_input_tokens_seen": 305328, "step": 780 }, { "epoch": 7.072072072072072, "grad_norm": 0.10808518528938293, "learning_rate": 0.0004725788623635972, "loss": 0.3619, "num_input_tokens_seen": 307056, "step": 785 }, { "epoch": 7.117117117117117, "grad_norm": 0.1564093381166458, "learning_rate": 0.00047197916703691823, "loss": 0.3291, "num_input_tokens_seen": 309232, "step": 790 }, { "epoch": 7.162162162162162, "grad_norm": 0.40506285429000854, "learning_rate": 0.0004713733738496475, "loss": 0.3975, "num_input_tokens_seen": 311056, "step": 795 }, { "epoch": 7.207207207207207, "grad_norm": 0.2510952353477478, "learning_rate": 0.00047076149944317734, "loss": 0.3461, "num_input_tokens_seen": 312944, "step": 800 }, { "epoch": 7.252252252252252, "grad_norm": 0.39134106040000916, "learning_rate": 0.00047014356062595366, "loss": 0.3391, "num_input_tokens_seen": 314992, "step": 805 }, { "epoch": 7.297297297297297, "grad_norm": 0.14467374980449677, "learning_rate": 0.00046951957437301427, "loss": 0.466, "num_input_tokens_seen": 317200, "step": 810 }, { "epoch": 7.342342342342342, "grad_norm": 0.07726424932479858, "learning_rate": 0.0004688895578255227, "loss": 0.3583, "num_input_tokens_seen": 319152, "step": 815 }, { "epoch": 7.387387387387387, "grad_norm": 0.32542741298675537, "learning_rate": 0.00046825352829029705, "loss": 0.4826, "num_input_tokens_seen": 321136, "step": 820 }, { "epoch": 7.4324324324324325, "grad_norm": 0.2392854541540146, "learning_rate": 0.00046761150323933474, "loss": 0.3208, "num_input_tokens_seen": 322960, "step": 825 }, { "epoch": 7.4774774774774775, "grad_norm": 0.34363648295402527, "learning_rate": 0.0004669635003093325, "loss": 0.3802, "num_input_tokens_seen": 324880, "step": 830 }, { "epoch": 7.5225225225225225, "grad_norm": 0.22759604454040527, "learning_rate": 0.00046630953730120205, "loss": 0.3588, "num_input_tokens_seen": 326864, "step": 835 }, { "epoch": 7.5225225225225225, "eval_loss": 0.35348400473594666, "eval_runtime": 1.2113, "eval_samples_per_second": 91.635, "eval_steps_per_second": 23.115, "num_input_tokens_seen": 326864, "step": 835 }, { "epoch": 7.5675675675675675, "grad_norm": 0.10177426040172577, "learning_rate": 0.00046564963217958077, "loss": 0.3296, "num_input_tokens_seen": 328688, "step": 840 }, { "epoch": 7.612612612612613, "grad_norm": 0.1499328464269638, "learning_rate": 0.0004649838030723385, "loss": 0.4332, "num_input_tokens_seen": 330928, "step": 845 }, { "epoch": 7.657657657657658, "grad_norm": 0.4361546039581299, "learning_rate": 0.0004643120682700792, "loss": 0.3653, "num_input_tokens_seen": 332816, "step": 850 }, { "epoch": 7.702702702702703, "grad_norm": 0.2804911732673645, "learning_rate": 0.00046363444622563916, "loss": 0.3581, "num_input_tokens_seen": 334832, "step": 855 }, { "epoch": 7.747747747747748, "grad_norm": 0.3171520233154297, "learning_rate": 0.00046295095555357936, "loss": 0.3797, "num_input_tokens_seen": 336592, "step": 860 }, { "epoch": 7.792792792792793, "grad_norm": 0.2583809792995453, "learning_rate": 0.00046226161502967443, "loss": 0.3576, "num_input_tokens_seen": 338928, "step": 865 }, { "epoch": 7.837837837837838, "grad_norm": 0.11164341866970062, "learning_rate": 0.00046156644359039717, "loss": 0.3579, "num_input_tokens_seen": 340944, "step": 870 }, { "epoch": 7.882882882882883, "grad_norm": 0.3103424310684204, "learning_rate": 0.0004608654603323977, "loss": 0.4455, "num_input_tokens_seen": 342704, "step": 875 }, { "epoch": 7.927927927927928, "grad_norm": 0.020305242389440536, "learning_rate": 0.0004601586845119795, "loss": 0.3447, "num_input_tokens_seen": 344976, "step": 880 }, { "epoch": 7.972972972972973, "grad_norm": 0.40425506234169006, "learning_rate": 0.00045944613554457005, "loss": 0.3682, "num_input_tokens_seen": 346640, "step": 885 }, { "epoch": 8.018018018018019, "grad_norm": 0.1170736700296402, "learning_rate": 0.0004587278330041876, "loss": 0.3688, "num_input_tokens_seen": 348200, "step": 890 }, { "epoch": 8.063063063063064, "grad_norm": 0.2430335283279419, "learning_rate": 0.00045800379662290334, "loss": 0.3515, "num_input_tokens_seen": 350184, "step": 895 }, { "epoch": 8.108108108108109, "grad_norm": 0.03294355794787407, "learning_rate": 0.00045727404629029985, "loss": 0.3535, "num_input_tokens_seen": 351912, "step": 900 }, { "epoch": 8.153153153153154, "grad_norm": 0.0508589930832386, "learning_rate": 0.00045653860205292383, "loss": 0.3547, "num_input_tokens_seen": 353992, "step": 905 }, { "epoch": 8.198198198198199, "grad_norm": 0.16204330325126648, "learning_rate": 0.0004557974841137363, "loss": 0.344, "num_input_tokens_seen": 356136, "step": 910 }, { "epoch": 8.243243243243244, "grad_norm": 0.10633068531751633, "learning_rate": 0.0004550507128315572, "loss": 0.3385, "num_input_tokens_seen": 358056, "step": 915 }, { "epoch": 8.288288288288289, "grad_norm": 0.21723313629627228, "learning_rate": 0.00045429830872050587, "loss": 0.3668, "num_input_tokens_seen": 359848, "step": 920 }, { "epoch": 8.333333333333334, "grad_norm": 0.14599087834358215, "learning_rate": 0.0004535402924494382, "loss": 0.3702, "num_input_tokens_seen": 361992, "step": 925 }, { "epoch": 8.378378378378379, "grad_norm": 0.30659475922584534, "learning_rate": 0.00045277668484137827, "loss": 0.3888, "num_input_tokens_seen": 363624, "step": 930 }, { "epoch": 8.423423423423424, "grad_norm": 0.37063145637512207, "learning_rate": 0.00045200750687294654, "loss": 0.3698, "num_input_tokens_seen": 365352, "step": 935 }, { "epoch": 8.468468468468469, "grad_norm": 0.31823858618736267, "learning_rate": 0.00045123277967378374, "loss": 0.3522, "num_input_tokens_seen": 367400, "step": 940 }, { "epoch": 8.513513513513514, "grad_norm": 0.15336915850639343, "learning_rate": 0.00045045252452596997, "loss": 0.3336, "num_input_tokens_seen": 369384, "step": 945 }, { "epoch": 8.558558558558559, "grad_norm": 0.06012330204248428, "learning_rate": 0.00044966676286344064, "loss": 0.3534, "num_input_tokens_seen": 371336, "step": 950 }, { "epoch": 8.603603603603604, "grad_norm": 0.41091388463974, "learning_rate": 0.0004488755162713975, "loss": 0.3527, "num_input_tokens_seen": 373320, "step": 955 }, { "epoch": 8.64864864864865, "grad_norm": 0.2880156636238098, "learning_rate": 0.0004480788064857153, "loss": 0.3905, "num_input_tokens_seen": 375048, "step": 960 }, { "epoch": 8.693693693693694, "grad_norm": 0.18088005483150482, "learning_rate": 0.0004472766553923454, "loss": 0.3999, "num_input_tokens_seen": 377672, "step": 965 }, { "epoch": 8.73873873873874, "grad_norm": 0.1736196130514145, "learning_rate": 0.00044646908502671376, "loss": 0.3372, "num_input_tokens_seen": 379560, "step": 970 }, { "epoch": 8.783783783783784, "grad_norm": 0.022430969402194023, "learning_rate": 0.0004456561175731162, "loss": 0.4068, "num_input_tokens_seen": 381128, "step": 975 }, { "epoch": 8.82882882882883, "grad_norm": 0.18760623037815094, "learning_rate": 0.000444837775364109, "loss": 0.3719, "num_input_tokens_seen": 383400, "step": 980 }, { "epoch": 8.873873873873874, "grad_norm": 0.15508021414279938, "learning_rate": 0.00044401408087989475, "loss": 0.356, "num_input_tokens_seen": 385448, "step": 985 }, { "epoch": 8.91891891891892, "grad_norm": 0.10870053619146347, "learning_rate": 0.0004431850567477058, "loss": 0.4219, "num_input_tokens_seen": 388040, "step": 990 }, { "epoch": 8.963963963963964, "grad_norm": 0.2384214997291565, "learning_rate": 0.0004423507257411817, "loss": 0.453, "num_input_tokens_seen": 389768, "step": 995 }, { "epoch": 9.00900900900901, "grad_norm": 0.3409654200077057, "learning_rate": 0.0004415111107797445, "loss": 0.3683, "num_input_tokens_seen": 391224, "step": 1000 }, { "epoch": 9.027027027027026, "eval_loss": 0.35251328349113464, "eval_runtime": 1.2788, "eval_samples_per_second": 86.801, "eval_steps_per_second": 21.896, "num_input_tokens_seen": 391800, "step": 1002 }, { "epoch": 9.054054054054054, "grad_norm": 0.4011771082878113, "learning_rate": 0.0004406662349279683, "loss": 0.3787, "num_input_tokens_seen": 392952, "step": 1005 }, { "epoch": 9.0990990990991, "grad_norm": 0.13834860920906067, "learning_rate": 0.0004398161213949464, "loss": 0.3692, "num_input_tokens_seen": 394840, "step": 1010 }, { "epoch": 9.144144144144144, "grad_norm": 0.15099625289440155, "learning_rate": 0.000438960793533653, "loss": 0.39, "num_input_tokens_seen": 396856, "step": 1015 }, { "epoch": 9.18918918918919, "grad_norm": 0.0967622771859169, "learning_rate": 0.00043810027484030245, "loss": 0.3373, "num_input_tokens_seen": 398680, "step": 1020 }, { "epoch": 9.234234234234235, "grad_norm": 0.2498539686203003, "learning_rate": 0.0004372345889537034, "loss": 0.3693, "num_input_tokens_seen": 400696, "step": 1025 }, { "epoch": 9.27927927927928, "grad_norm": 0.09526379406452179, "learning_rate": 0.00043636375965460907, "loss": 0.3758, "num_input_tokens_seen": 402520, "step": 1030 }, { "epoch": 9.324324324324325, "grad_norm": 0.021048899739980698, "learning_rate": 0.00043548781086506484, "loss": 0.3887, "num_input_tokens_seen": 404728, "step": 1035 }, { "epoch": 9.36936936936937, "grad_norm": 0.12369292974472046, "learning_rate": 0.00043460676664775036, "loss": 0.3376, "num_input_tokens_seen": 406776, "step": 1040 }, { "epoch": 9.414414414414415, "grad_norm": 0.148820698261261, "learning_rate": 0.000433720651205319, "loss": 0.3739, "num_input_tokens_seen": 408568, "step": 1045 }, { "epoch": 9.45945945945946, "grad_norm": 0.41787710785865784, "learning_rate": 0.0004328294888797326, "loss": 0.3643, "num_input_tokens_seen": 410392, "step": 1050 }, { "epoch": 9.504504504504505, "grad_norm": 0.03184948116540909, "learning_rate": 0.00043193330415159314, "loss": 0.3606, "num_input_tokens_seen": 412344, "step": 1055 }, { "epoch": 9.54954954954955, "grad_norm": 0.34247833490371704, "learning_rate": 0.00043103212163947, "loss": 0.3674, "num_input_tokens_seen": 414584, "step": 1060 }, { "epoch": 9.594594594594595, "grad_norm": 0.29731881618499756, "learning_rate": 0.000430125966099224, "loss": 0.3834, "num_input_tokens_seen": 416536, "step": 1065 }, { "epoch": 9.63963963963964, "grad_norm": 0.20951907336711884, "learning_rate": 0.0004292148624233268, "loss": 0.35, "num_input_tokens_seen": 418904, "step": 1070 }, { "epoch": 9.684684684684685, "grad_norm": 0.12838751077651978, "learning_rate": 0.0004282988356401776, "loss": 0.3552, "num_input_tokens_seen": 420952, "step": 1075 }, { "epoch": 9.72972972972973, "grad_norm": 0.20652344822883606, "learning_rate": 0.00042737791091341533, "loss": 0.3631, "num_input_tokens_seen": 422616, "step": 1080 }, { "epoch": 9.774774774774775, "grad_norm": 0.12488292902708054, "learning_rate": 0.0004264521135412276, "loss": 0.3431, "num_input_tokens_seen": 424696, "step": 1085 }, { "epoch": 9.81981981981982, "grad_norm": 0.08600403368473053, "learning_rate": 0.0004255214689556557, "loss": 0.3644, "num_input_tokens_seen": 426680, "step": 1090 }, { "epoch": 9.864864864864865, "grad_norm": 0.08096712827682495, "learning_rate": 0.00042458600272189553, "loss": 0.3517, "num_input_tokens_seen": 428536, "step": 1095 }, { "epoch": 9.90990990990991, "grad_norm": 0.046571407467126846, "learning_rate": 0.000423645740537596, "loss": 0.3707, "num_input_tokens_seen": 431032, "step": 1100 }, { "epoch": 9.954954954954955, "grad_norm": 0.11912014335393906, "learning_rate": 0.0004227007082321528, "loss": 0.346, "num_input_tokens_seen": 433048, "step": 1105 }, { "epoch": 10.0, "grad_norm": 0.22320574522018433, "learning_rate": 0.00042175093176599854, "loss": 0.3361, "num_input_tokens_seen": 434728, "step": 1110 }, { "epoch": 10.045045045045045, "grad_norm": 0.25626805424690247, "learning_rate": 0.0004207964372298904, "loss": 0.3894, "num_input_tokens_seen": 436968, "step": 1115 }, { "epoch": 10.09009009009009, "grad_norm": 0.2743680775165558, "learning_rate": 0.0004198372508441924, "loss": 0.3593, "num_input_tokens_seen": 439016, "step": 1120 }, { "epoch": 10.135135135135135, "grad_norm": 0.06560913473367691, "learning_rate": 0.00041887339895815606, "loss": 0.3895, "num_input_tokens_seen": 441224, "step": 1125 }, { "epoch": 10.18018018018018, "grad_norm": 0.40651583671569824, "learning_rate": 0.00041790490804919576, "loss": 0.3566, "num_input_tokens_seen": 443208, "step": 1130 }, { "epoch": 10.225225225225225, "grad_norm": 0.22788061201572418, "learning_rate": 0.00041693180472216206, "loss": 0.3513, "num_input_tokens_seen": 445000, "step": 1135 }, { "epoch": 10.27027027027027, "grad_norm": 0.22868895530700684, "learning_rate": 0.0004159541157086106, "loss": 0.3488, "num_input_tokens_seen": 447240, "step": 1140 }, { "epoch": 10.315315315315315, "grad_norm": 0.12963739037513733, "learning_rate": 0.00041497186786606746, "loss": 0.3306, "num_input_tokens_seen": 449032, "step": 1145 }, { "epoch": 10.36036036036036, "grad_norm": 0.10702938586473465, "learning_rate": 0.00041398508817729187, "loss": 0.3395, "num_input_tokens_seen": 451176, "step": 1150 }, { "epoch": 10.405405405405405, "grad_norm": 0.20335602760314941, "learning_rate": 0.00041299380374953487, "loss": 0.4138, "num_input_tokens_seen": 453096, "step": 1155 }, { "epoch": 10.45045045045045, "grad_norm": 0.07722572982311249, "learning_rate": 0.00041199804181379424, "loss": 0.4009, "num_input_tokens_seen": 454856, "step": 1160 }, { "epoch": 10.495495495495495, "grad_norm": 0.23282402753829956, "learning_rate": 0.0004109978297240671, "loss": 0.3652, "num_input_tokens_seen": 457096, "step": 1165 }, { "epoch": 10.531531531531531, "eval_loss": 0.34910741448402405, "eval_runtime": 1.1956, "eval_samples_per_second": 92.837, "eval_steps_per_second": 23.418, "num_input_tokens_seen": 458568, "step": 1169 }, { "epoch": 10.54054054054054, "grad_norm": 0.22194916009902954, "learning_rate": 0.000409993194956598, "loss": 0.3655, "num_input_tokens_seen": 459080, "step": 1170 }, { "epoch": 10.585585585585585, "grad_norm": 0.030269604176282883, "learning_rate": 0.0004089841651091243, "loss": 0.3446, "num_input_tokens_seen": 461192, "step": 1175 }, { "epoch": 10.63063063063063, "grad_norm": 0.020014692097902298, "learning_rate": 0.000407970767900118, "loss": 0.3531, "num_input_tokens_seen": 463240, "step": 1180 }, { "epoch": 10.675675675675675, "grad_norm": 0.029829155653715134, "learning_rate": 0.0004069530311680247, "loss": 0.357, "num_input_tokens_seen": 465416, "step": 1185 }, { "epoch": 10.72072072072072, "grad_norm": 0.3611600399017334, "learning_rate": 0.0004059309828704979, "loss": 0.3487, "num_input_tokens_seen": 467304, "step": 1190 }, { "epoch": 10.765765765765765, "grad_norm": 0.02416239120066166, "learning_rate": 0.00040490465108363214, "loss": 0.3571, "num_input_tokens_seen": 469288, "step": 1195 }, { "epoch": 10.81081081081081, "grad_norm": 0.2901204228401184, "learning_rate": 0.0004038740640011911, "loss": 0.3383, "num_input_tokens_seen": 470920, "step": 1200 }, { "epoch": 10.855855855855856, "grad_norm": 0.058503858745098114, "learning_rate": 0.0004028392499338328, "loss": 0.3565, "num_input_tokens_seen": 473160, "step": 1205 }, { "epoch": 10.9009009009009, "grad_norm": 0.09709370136260986, "learning_rate": 0.00040180023730833293, "loss": 0.3829, "num_input_tokens_seen": 475048, "step": 1210 }, { "epoch": 10.945945945945946, "grad_norm": 0.12688879668712616, "learning_rate": 0.0004007570546668029, "loss": 0.3605, "num_input_tokens_seen": 476712, "step": 1215 }, { "epoch": 10.99099099099099, "grad_norm": 0.2851768434047699, "learning_rate": 0.00039970973066590635, "loss": 0.4403, "num_input_tokens_seen": 478696, "step": 1220 }, { "epoch": 11.036036036036036, "grad_norm": 0.02032957412302494, "learning_rate": 0.0003986582940760717, "loss": 0.3381, "num_input_tokens_seen": 480512, "step": 1225 }, { "epoch": 11.08108108108108, "grad_norm": 0.1939368098974228, "learning_rate": 0.00039760277378070205, "loss": 0.3521, "num_input_tokens_seen": 482368, "step": 1230 }, { "epoch": 11.126126126126126, "grad_norm": 0.2213449329137802, "learning_rate": 0.0003965431987753815, "loss": 0.372, "num_input_tokens_seen": 484320, "step": 1235 }, { "epoch": 11.17117117117117, "grad_norm": 0.034773580729961395, "learning_rate": 0.0003954795981670788, "loss": 0.3468, "num_input_tokens_seen": 486208, "step": 1240 }, { "epoch": 11.216216216216216, "grad_norm": 0.08730168640613556, "learning_rate": 0.0003944120011733476, "loss": 0.3548, "num_input_tokens_seen": 487872, "step": 1245 }, { "epoch": 11.26126126126126, "grad_norm": 0.10386672616004944, "learning_rate": 0.0003933404371215241, "loss": 0.3167, "num_input_tokens_seen": 489952, "step": 1250 }, { "epoch": 11.306306306306306, "grad_norm": 0.08380131423473358, "learning_rate": 0.00039226493544792097, "loss": 0.3697, "num_input_tokens_seen": 492032, "step": 1255 }, { "epoch": 11.35135135135135, "grad_norm": 0.018763568252325058, "learning_rate": 0.0003911855256970193, "loss": 0.355, "num_input_tokens_seen": 493888, "step": 1260 }, { "epoch": 11.396396396396396, "grad_norm": 0.19116578996181488, "learning_rate": 0.00039010223752065644, "loss": 0.3531, "num_input_tokens_seen": 495648, "step": 1265 }, { "epoch": 11.441441441441441, "grad_norm": 0.08821618556976318, "learning_rate": 0.0003890151006772119, "loss": 0.3595, "num_input_tokens_seen": 497600, "step": 1270 }, { "epoch": 11.486486486486486, "grad_norm": 0.2903653681278229, "learning_rate": 0.00038792414503078967, "loss": 0.3331, "num_input_tokens_seen": 499552, "step": 1275 }, { "epoch": 11.531531531531531, "grad_norm": 0.2660365700721741, "learning_rate": 0.00038682940055039764, "loss": 0.352, "num_input_tokens_seen": 501312, "step": 1280 }, { "epoch": 11.576576576576576, "grad_norm": 0.14043103158473969, "learning_rate": 0.0003857308973091249, "loss": 0.3477, "num_input_tokens_seen": 503232, "step": 1285 }, { "epoch": 11.621621621621621, "grad_norm": 0.18052133917808533, "learning_rate": 0.00038462866548331486, "loss": 0.356, "num_input_tokens_seen": 505024, "step": 1290 }, { "epoch": 11.666666666666666, "grad_norm": 0.26612210273742676, "learning_rate": 0.0003835227353517372, "loss": 0.3746, "num_input_tokens_seen": 507680, "step": 1295 }, { "epoch": 11.711711711711711, "grad_norm": 0.026459388434886932, "learning_rate": 0.0003824131372947551, "loss": 0.3634, "num_input_tokens_seen": 509632, "step": 1300 }, { "epoch": 11.756756756756756, "grad_norm": 0.02496427483856678, "learning_rate": 0.0003812999017934916, "loss": 0.3565, "num_input_tokens_seen": 511648, "step": 1305 }, { "epoch": 11.801801801801801, "grad_norm": 0.2184697389602661, "learning_rate": 0.00038018305942899165, "loss": 0.3554, "num_input_tokens_seen": 513728, "step": 1310 }, { "epoch": 11.846846846846846, "grad_norm": 0.2856709063053131, "learning_rate": 0.0003790626408813822, "loss": 0.3206, "num_input_tokens_seen": 515904, "step": 1315 }, { "epoch": 11.891891891891891, "grad_norm": 0.29436999559402466, "learning_rate": 0.0003779386769290296, "loss": 0.3995, "num_input_tokens_seen": 517600, "step": 1320 }, { "epoch": 11.936936936936936, "grad_norm": 0.16210943460464478, "learning_rate": 0.0003768111984476938, "loss": 0.3536, "num_input_tokens_seen": 519520, "step": 1325 }, { "epoch": 11.981981981981981, "grad_norm": 0.09292060136795044, "learning_rate": 0.00037568023640968044, "loss": 0.346, "num_input_tokens_seen": 521280, "step": 1330 }, { "epoch": 12.027027027027026, "grad_norm": 0.16323931515216827, "learning_rate": 0.0003745458218829899, "loss": 0.3456, "num_input_tokens_seen": 522960, "step": 1335 }, { "epoch": 12.036036036036036, "eval_loss": 0.3908711075782776, "eval_runtime": 1.2501, "eval_samples_per_second": 88.792, "eval_steps_per_second": 22.398, "num_input_tokens_seen": 523312, "step": 1336 }, { "epoch": 12.072072072072071, "grad_norm": 0.175073504447937, "learning_rate": 0.0003734079860304639, "loss": 0.363, "num_input_tokens_seen": 524912, "step": 1340 }, { "epoch": 12.117117117117116, "grad_norm": 0.30753299593925476, "learning_rate": 0.00037226676010892925, "loss": 0.3042, "num_input_tokens_seen": 526928, "step": 1345 }, { "epoch": 12.162162162162161, "grad_norm": 0.04058603569865227, "learning_rate": 0.00037112217546833955, "loss": 0.3613, "num_input_tokens_seen": 528944, "step": 1350 }, { "epoch": 12.207207207207206, "grad_norm": 0.23132722079753876, "learning_rate": 0.00036997426355091375, "loss": 0.3547, "num_input_tokens_seen": 531376, "step": 1355 }, { "epoch": 12.252252252252251, "grad_norm": 0.04770840331912041, "learning_rate": 0.0003688230558902725, "loss": 0.3653, "num_input_tokens_seen": 533264, "step": 1360 }, { "epoch": 12.297297297297296, "grad_norm": 0.2295585423707962, "learning_rate": 0.0003676685841105719, "loss": 0.3665, "num_input_tokens_seen": 535120, "step": 1365 }, { "epoch": 12.342342342342342, "grad_norm": 0.2151583582162857, "learning_rate": 0.00036651087992563476, "loss": 0.2919, "num_input_tokens_seen": 536912, "step": 1370 }, { "epoch": 12.387387387387387, "grad_norm": 0.31578120589256287, "learning_rate": 0.00036534997513807934, "loss": 0.4497, "num_input_tokens_seen": 538736, "step": 1375 }, { "epoch": 12.432432432432432, "grad_norm": 0.03904511407017708, "learning_rate": 0.00036418590163844587, "loss": 0.3984, "num_input_tokens_seen": 540720, "step": 1380 }, { "epoch": 12.477477477477478, "grad_norm": 0.04692419245839119, "learning_rate": 0.00036301869140432057, "loss": 0.3396, "num_input_tokens_seen": 542832, "step": 1385 }, { "epoch": 12.522522522522522, "grad_norm": 0.033866655081510544, "learning_rate": 0.00036184837649945673, "loss": 0.3497, "num_input_tokens_seen": 544592, "step": 1390 }, { "epoch": 12.567567567567568, "grad_norm": 0.2458135038614273, "learning_rate": 0.00036067498907289456, "loss": 0.4109, "num_input_tokens_seen": 546416, "step": 1395 }, { "epoch": 12.612612612612612, "grad_norm": 0.06222885102033615, "learning_rate": 0.0003594985613580775, "loss": 0.3466, "num_input_tokens_seen": 548336, "step": 1400 }, { "epoch": 12.657657657657658, "grad_norm": 0.3508203625679016, "learning_rate": 0.00035831912567196715, "loss": 0.4403, "num_input_tokens_seen": 550384, "step": 1405 }, { "epoch": 12.702702702702704, "grad_norm": 0.2739538848400116, "learning_rate": 0.0003571367144141552, "loss": 0.379, "num_input_tokens_seen": 552464, "step": 1410 }, { "epoch": 12.747747747747749, "grad_norm": 0.34273868799209595, "learning_rate": 0.00035595136006597375, "loss": 0.3625, "num_input_tokens_seen": 554768, "step": 1415 }, { "epoch": 12.792792792792794, "grad_norm": 0.08933409303426743, "learning_rate": 0.0003547630951896025, "loss": 0.3392, "num_input_tokens_seen": 556752, "step": 1420 }, { "epoch": 12.837837837837839, "grad_norm": 0.13190878927707672, "learning_rate": 0.0003535719524271749, "loss": 0.3869, "num_input_tokens_seen": 558832, "step": 1425 }, { "epoch": 12.882882882882884, "grad_norm": 0.22936788201332092, "learning_rate": 0.00035237796449988086, "loss": 0.3764, "num_input_tokens_seen": 560880, "step": 1430 }, { "epoch": 12.927927927927929, "grad_norm": 0.04973359405994415, "learning_rate": 0.0003511811642070684, "loss": 0.3478, "num_input_tokens_seen": 562416, "step": 1435 }, { "epoch": 12.972972972972974, "grad_norm": 0.08619926124811172, "learning_rate": 0.0003499815844253423, "loss": 0.3449, "num_input_tokens_seen": 564144, "step": 1440 }, { "epoch": 13.018018018018019, "grad_norm": 0.055636610835790634, "learning_rate": 0.00034877925810766086, "loss": 0.4018, "num_input_tokens_seen": 566336, "step": 1445 }, { "epoch": 13.063063063063064, "grad_norm": 0.22953113913536072, "learning_rate": 0.0003475742182824314, "loss": 0.3527, "num_input_tokens_seen": 568608, "step": 1450 }, { "epoch": 13.108108108108109, "grad_norm": 0.10283882915973663, "learning_rate": 0.0003463664980526018, "loss": 0.3576, "num_input_tokens_seen": 570944, "step": 1455 }, { "epoch": 13.153153153153154, "grad_norm": 0.24032360315322876, "learning_rate": 0.0003451561305947522, "loss": 0.3338, "num_input_tokens_seen": 573056, "step": 1460 }, { "epoch": 13.198198198198199, "grad_norm": 0.3658190667629242, "learning_rate": 0.000343943149158183, "loss": 0.3806, "num_input_tokens_seen": 575360, "step": 1465 }, { "epoch": 13.243243243243244, "grad_norm": 0.08185989409685135, "learning_rate": 0.00034272758706400193, "loss": 0.3276, "num_input_tokens_seen": 577504, "step": 1470 }, { "epoch": 13.288288288288289, "grad_norm": 0.13225214183330536, "learning_rate": 0.0003415094777042081, "loss": 0.3685, "num_input_tokens_seen": 579456, "step": 1475 }, { "epoch": 13.333333333333334, "grad_norm": 0.10822223871946335, "learning_rate": 0.0003402888545407753, "loss": 0.4039, "num_input_tokens_seen": 581184, "step": 1480 }, { "epoch": 13.378378378378379, "grad_norm": 0.15501396358013153, "learning_rate": 0.0003390657511047326, "loss": 0.3558, "num_input_tokens_seen": 583136, "step": 1485 }, { "epoch": 13.423423423423424, "grad_norm": 0.14301234483718872, "learning_rate": 0.00033784020099524297, "loss": 0.351, "num_input_tokens_seen": 585056, "step": 1490 }, { "epoch": 13.468468468468469, "grad_norm": 0.3943434953689575, "learning_rate": 0.00033661223787868097, "loss": 0.3586, "num_input_tokens_seen": 586784, "step": 1495 }, { "epoch": 13.513513513513514, "grad_norm": 0.20777928829193115, "learning_rate": 0.00033538189548770677, "loss": 0.3703, "num_input_tokens_seen": 588768, "step": 1500 }, { "epoch": 13.54054054054054, "eval_loss": 0.35342463850975037, "eval_runtime": 1.2665, "eval_samples_per_second": 87.645, "eval_steps_per_second": 22.109, "num_input_tokens_seen": 589824, "step": 1503 }, { "epoch": 13.558558558558559, "grad_norm": 0.19171638786792755, "learning_rate": 0.00033414920762034095, "loss": 0.3472, "num_input_tokens_seen": 590560, "step": 1505 }, { "epoch": 13.603603603603604, "grad_norm": 0.0835702195763588, "learning_rate": 0.0003329142081390348, "loss": 0.3228, "num_input_tokens_seen": 592416, "step": 1510 }, { "epoch": 13.64864864864865, "grad_norm": 0.09458846598863602, "learning_rate": 0.00033167693096974085, "loss": 0.3979, "num_input_tokens_seen": 594336, "step": 1515 }, { "epoch": 13.693693693693694, "grad_norm": 0.03758715093135834, "learning_rate": 0.00033043741010098046, "loss": 0.3445, "num_input_tokens_seen": 596320, "step": 1520 }, { "epoch": 13.73873873873874, "grad_norm": 0.18656352162361145, "learning_rate": 0.00032919567958291073, "loss": 0.3552, "num_input_tokens_seen": 598240, "step": 1525 }, { "epoch": 13.783783783783784, "grad_norm": 0.3501572608947754, "learning_rate": 0.0003279517735263883, "loss": 0.3583, "num_input_tokens_seen": 599872, "step": 1530 }, { "epoch": 13.82882882882883, "grad_norm": 0.1193755716085434, "learning_rate": 0.0003267057261020331, "loss": 0.3281, "num_input_tokens_seen": 601856, "step": 1535 }, { "epoch": 13.873873873873874, "grad_norm": 0.10996165126562119, "learning_rate": 0.00032545757153928923, "loss": 0.3492, "num_input_tokens_seen": 603648, "step": 1540 }, { "epoch": 13.91891891891892, "grad_norm": 0.2525525689125061, "learning_rate": 0.0003242073441254846, "loss": 0.389, "num_input_tokens_seen": 606144, "step": 1545 }, { "epoch": 13.963963963963964, "grad_norm": 0.13008233904838562, "learning_rate": 0.00032295507820488944, "loss": 0.3362, "num_input_tokens_seen": 607904, "step": 1550 }, { "epoch": 14.00900900900901, "grad_norm": 0.024191677570343018, "learning_rate": 0.0003217008081777726, "loss": 0.3516, "num_input_tokens_seen": 609688, "step": 1555 }, { "epoch": 14.054054054054054, "grad_norm": 0.08284207433462143, "learning_rate": 0.00032044456849945636, "loss": 0.3243, "num_input_tokens_seen": 611608, "step": 1560 }, { "epoch": 14.0990990990991, "grad_norm": 0.41647791862487793, "learning_rate": 0.00031918639367937025, "loss": 0.4164, "num_input_tokens_seen": 613848, "step": 1565 }, { "epoch": 14.144144144144144, "grad_norm": 0.016869306564331055, "learning_rate": 0.00031792631828010323, "loss": 0.3623, "num_input_tokens_seen": 615704, "step": 1570 }, { "epoch": 14.18918918918919, "grad_norm": 0.050070323050022125, "learning_rate": 0.0003166643769164533, "loss": 0.3426, "num_input_tokens_seen": 617784, "step": 1575 }, { "epoch": 14.234234234234235, "grad_norm": 0.09605066478252411, "learning_rate": 0.00031540060425447813, "loss": 0.3457, "num_input_tokens_seen": 619768, "step": 1580 }, { "epoch": 14.27927927927928, "grad_norm": 0.2416335940361023, "learning_rate": 0.0003141350350105413, "loss": 0.3775, "num_input_tokens_seen": 621816, "step": 1585 }, { "epoch": 14.324324324324325, "grad_norm": 0.1901450753211975, "learning_rate": 0.0003128677039503594, "loss": 0.3601, "num_input_tokens_seen": 623864, "step": 1590 }, { "epoch": 14.36936936936937, "grad_norm": 0.06296460330486298, "learning_rate": 0.00031159864588804694, "loss": 0.3431, "num_input_tokens_seen": 625752, "step": 1595 }, { "epoch": 14.414414414414415, "grad_norm": 0.029421856626868248, "learning_rate": 0.0003103278956851598, "loss": 0.303, "num_input_tokens_seen": 627736, "step": 1600 }, { "epoch": 14.45945945945946, "grad_norm": 0.17490741610527039, "learning_rate": 0.0003090554882497378, "loss": 0.3428, "num_input_tokens_seen": 629688, "step": 1605 }, { "epoch": 14.504504504504505, "grad_norm": 0.17685425281524658, "learning_rate": 0.00030778145853534557, "loss": 0.3394, "num_input_tokens_seen": 631576, "step": 1610 }, { "epoch": 14.54954954954955, "grad_norm": 0.3024262487888336, "learning_rate": 0.0003065058415401123, "loss": 0.3293, "num_input_tokens_seen": 633816, "step": 1615 }, { "epoch": 14.594594594594595, "grad_norm": 0.050083838403224945, "learning_rate": 0.00030522867230577057, "loss": 0.379, "num_input_tokens_seen": 635608, "step": 1620 }, { "epoch": 14.63963963963964, "grad_norm": 0.14594219624996185, "learning_rate": 0.00030394998591669364, "loss": 0.3417, "num_input_tokens_seen": 637400, "step": 1625 }, { "epoch": 14.684684684684685, "grad_norm": 0.10283079743385315, "learning_rate": 0.0003026698174989316, "loss": 0.3849, "num_input_tokens_seen": 639352, "step": 1630 }, { "epoch": 14.72972972972973, "grad_norm": 0.021166425198316574, "learning_rate": 0.0003013882022192465, "loss": 0.362, "num_input_tokens_seen": 641560, "step": 1635 }, { "epoch": 14.774774774774775, "grad_norm": 0.20087668299674988, "learning_rate": 0.0003001051752841462, "loss": 0.3574, "num_input_tokens_seen": 643480, "step": 1640 }, { "epoch": 14.81981981981982, "grad_norm": 0.044715140014886856, "learning_rate": 0.0002988207719389175, "loss": 0.3483, "num_input_tokens_seen": 645464, "step": 1645 }, { "epoch": 14.864864864864865, "grad_norm": 0.3323145806789398, "learning_rate": 0.0002975350274666577, "loss": 0.3562, "num_input_tokens_seen": 647416, "step": 1650 }, { "epoch": 14.90990990990991, "grad_norm": 0.2399042695760727, "learning_rate": 0.0002962479771873053, "loss": 0.3806, "num_input_tokens_seen": 649240, "step": 1655 }, { "epoch": 14.954954954954955, "grad_norm": 0.10982757061719894, "learning_rate": 0.00029495965645667, "loss": 0.3684, "num_input_tokens_seen": 651256, "step": 1660 }, { "epoch": 15.0, "grad_norm": 0.057245321571826935, "learning_rate": 0.0002936701006654613, "loss": 0.333, "num_input_tokens_seen": 653216, "step": 1665 }, { "epoch": 15.045045045045045, "grad_norm": 0.13068826496601105, "learning_rate": 0.0002923793452383163, "loss": 0.3643, "num_input_tokens_seen": 655200, "step": 1670 }, { "epoch": 15.045045045045045, "eval_loss": 0.3588047921657562, "eval_runtime": 1.2817, "eval_samples_per_second": 86.6, "eval_steps_per_second": 21.845, "num_input_tokens_seen": 655200, "step": 1670 }, { "epoch": 15.09009009009009, "grad_norm": 0.19212093949317932, "learning_rate": 0.00029108742563282655, "loss": 0.353, "num_input_tokens_seen": 657056, "step": 1675 }, { "epoch": 15.135135135135135, "grad_norm": 0.022731155157089233, "learning_rate": 0.00028979437733856427, "loss": 0.3474, "num_input_tokens_seen": 659232, "step": 1680 }, { "epoch": 15.18018018018018, "grad_norm": 0.14064331352710724, "learning_rate": 0.000288500235876107, "loss": 0.3456, "num_input_tokens_seen": 661184, "step": 1685 }, { "epoch": 15.225225225225225, "grad_norm": 0.12251386791467667, "learning_rate": 0.00028720503679606225, "loss": 0.3307, "num_input_tokens_seen": 663040, "step": 1690 }, { "epoch": 15.27027027027027, "grad_norm": 0.2509421110153198, "learning_rate": 0.0002859088156780906, "loss": 0.3643, "num_input_tokens_seen": 665088, "step": 1695 }, { "epoch": 15.315315315315315, "grad_norm": 0.3646177649497986, "learning_rate": 0.00028461160812992846, "loss": 0.3735, "num_input_tokens_seen": 666880, "step": 1700 }, { "epoch": 15.36036036036036, "grad_norm": 0.045939963310956955, "learning_rate": 0.00028331344978640993, "loss": 0.3643, "num_input_tokens_seen": 669152, "step": 1705 }, { "epoch": 15.405405405405405, "grad_norm": 0.09639077633619308, "learning_rate": 0.00028201437630848787, "loss": 0.359, "num_input_tokens_seen": 671168, "step": 1710 }, { "epoch": 15.45045045045045, "grad_norm": 0.11868836730718613, "learning_rate": 0.0002807144233822542, "loss": 0.3206, "num_input_tokens_seen": 673312, "step": 1715 }, { "epoch": 15.495495495495495, "grad_norm": 0.10870905965566635, "learning_rate": 0.0002794136267179596, "loss": 0.367, "num_input_tokens_seen": 675200, "step": 1720 }, { "epoch": 15.54054054054054, "grad_norm": 0.14366932213306427, "learning_rate": 0.00027811202204903297, "loss": 0.364, "num_input_tokens_seen": 677088, "step": 1725 }, { "epoch": 15.585585585585585, "grad_norm": 0.171949103474617, "learning_rate": 0.00027680964513109876, "loss": 0.3549, "num_input_tokens_seen": 679360, "step": 1730 }, { "epoch": 15.63063063063063, "grad_norm": 0.15831437706947327, "learning_rate": 0.00027550653174099603, "loss": 0.3467, "num_input_tokens_seen": 681120, "step": 1735 }, { "epoch": 15.675675675675675, "grad_norm": 0.1595962792634964, "learning_rate": 0.0002742027176757948, "loss": 0.3562, "num_input_tokens_seen": 683008, "step": 1740 }, { "epoch": 15.72072072072072, "grad_norm": 0.18163646757602692, "learning_rate": 0.0002728982387518129, "loss": 0.3613, "num_input_tokens_seen": 684704, "step": 1745 }, { "epoch": 15.765765765765765, "grad_norm": 0.14146992564201355, "learning_rate": 0.0002715931308036321, "loss": 0.3453, "num_input_tokens_seen": 686528, "step": 1750 }, { "epoch": 15.81081081081081, "grad_norm": 0.09521026164293289, "learning_rate": 0.0002702874296831139, "loss": 0.3251, "num_input_tokens_seen": 688256, "step": 1755 }, { "epoch": 15.855855855855856, "grad_norm": 0.08400721102952957, "learning_rate": 0.0002689811712584143, "loss": 0.4018, "num_input_tokens_seen": 690400, "step": 1760 }, { "epoch": 15.9009009009009, "grad_norm": 0.023672491312026978, "learning_rate": 0.00026767439141299866, "loss": 0.3742, "num_input_tokens_seen": 692256, "step": 1765 }, { "epoch": 15.945945945945946, "grad_norm": 0.029393251985311508, "learning_rate": 0.00026636712604465626, "loss": 0.3503, "num_input_tokens_seen": 694560, "step": 1770 }, { "epoch": 15.99099099099099, "grad_norm": 0.033729467540979385, "learning_rate": 0.0002650594110645136, "loss": 0.3626, "num_input_tokens_seen": 696608, "step": 1775 }, { "epoch": 16.036036036036037, "grad_norm": 0.26835304498672485, "learning_rate": 0.0002637512823960483, "loss": 0.3368, "num_input_tokens_seen": 698424, "step": 1780 }, { "epoch": 16.08108108108108, "grad_norm": 0.1397535502910614, "learning_rate": 0.00026244277597410213, "loss": 0.3531, "num_input_tokens_seen": 700568, "step": 1785 }, { "epoch": 16.126126126126128, "grad_norm": 0.05984152853488922, "learning_rate": 0.0002611339277438943, "loss": 0.37, "num_input_tokens_seen": 702712, "step": 1790 }, { "epoch": 16.17117117117117, "grad_norm": 0.11292751133441925, "learning_rate": 0.0002598247736600328, "loss": 0.3759, "num_input_tokens_seen": 704568, "step": 1795 }, { "epoch": 16.216216216216218, "grad_norm": 0.18683435022830963, "learning_rate": 0.00025851534968552843, "loss": 0.3573, "num_input_tokens_seen": 706488, "step": 1800 }, { "epoch": 16.26126126126126, "grad_norm": 0.19400519132614136, "learning_rate": 0.0002572056917908055, "loss": 0.3595, "num_input_tokens_seen": 708920, "step": 1805 }, { "epoch": 16.306306306306308, "grad_norm": 0.0322173647582531, "learning_rate": 0.00025589583595271424, "loss": 0.341, "num_input_tokens_seen": 711352, "step": 1810 }, { "epoch": 16.35135135135135, "grad_norm": 0.04361787065863609, "learning_rate": 0.0002545858181535426, "loss": 0.3457, "num_input_tokens_seen": 713304, "step": 1815 }, { "epoch": 16.396396396396398, "grad_norm": 0.02331223338842392, "learning_rate": 0.00025327567438002775, "loss": 0.3579, "num_input_tokens_seen": 715096, "step": 1820 }, { "epoch": 16.44144144144144, "grad_norm": 0.19047728180885315, "learning_rate": 0.0002519654406223671, "loss": 0.3405, "num_input_tokens_seen": 716760, "step": 1825 }, { "epoch": 16.486486486486488, "grad_norm": 0.24891367554664612, "learning_rate": 0.0002506551528732302, "loss": 0.3609, "num_input_tokens_seen": 718456, "step": 1830 }, { "epoch": 16.53153153153153, "grad_norm": 0.2271847128868103, "learning_rate": 0.0002493448471267698, "loss": 0.3647, "num_input_tokens_seen": 720280, "step": 1835 }, { "epoch": 16.54954954954955, "eval_loss": 0.35697561502456665, "eval_runtime": 1.2629, "eval_samples_per_second": 87.89, "eval_steps_per_second": 22.171, "num_input_tokens_seen": 721016, "step": 1837 }, { "epoch": 16.576576576576578, "grad_norm": 0.20656314492225647, "learning_rate": 0.0002480345593776329, "loss": 0.3843, "num_input_tokens_seen": 721976, "step": 1840 }, { "epoch": 16.62162162162162, "grad_norm": 0.050191402435302734, "learning_rate": 0.00024672432561997237, "loss": 0.367, "num_input_tokens_seen": 723608, "step": 1845 }, { "epoch": 16.666666666666668, "grad_norm": 0.0699080228805542, "learning_rate": 0.0002454141818464574, "loss": 0.3813, "num_input_tokens_seen": 725560, "step": 1850 }, { "epoch": 16.71171171171171, "grad_norm": 0.19403664767742157, "learning_rate": 0.0002441041640472858, "loss": 0.3493, "num_input_tokens_seen": 727320, "step": 1855 }, { "epoch": 16.756756756756758, "grad_norm": 0.016935104504227638, "learning_rate": 0.00024279430820919458, "loss": 0.3563, "num_input_tokens_seen": 729304, "step": 1860 }, { "epoch": 16.8018018018018, "grad_norm": 0.031578533351421356, "learning_rate": 0.00024148465031447155, "loss": 0.3437, "num_input_tokens_seen": 731544, "step": 1865 }, { "epoch": 16.846846846846848, "grad_norm": 0.11004813760519028, "learning_rate": 0.00024017522633996722, "loss": 0.3361, "num_input_tokens_seen": 733624, "step": 1870 }, { "epoch": 16.89189189189189, "grad_norm": 0.2583062946796417, "learning_rate": 0.00023886607225610582, "loss": 0.3625, "num_input_tokens_seen": 736024, "step": 1875 }, { "epoch": 16.936936936936938, "grad_norm": 0.4021206498146057, "learning_rate": 0.0002375572240258978, "loss": 0.3521, "num_input_tokens_seen": 738136, "step": 1880 }, { "epoch": 16.98198198198198, "grad_norm": 0.05159064382314682, "learning_rate": 0.00023624871760395176, "loss": 0.3756, "num_input_tokens_seen": 740024, "step": 1885 }, { "epoch": 17.027027027027028, "grad_norm": 0.14895497262477875, "learning_rate": 0.00023494058893548653, "loss": 0.3346, "num_input_tokens_seen": 741904, "step": 1890 }, { "epoch": 17.07207207207207, "grad_norm": 0.10104119032621384, "learning_rate": 0.00023363287395534375, "loss": 0.363, "num_input_tokens_seen": 743984, "step": 1895 }, { "epoch": 17.117117117117118, "grad_norm": 0.054903823882341385, "learning_rate": 0.00023232560858700135, "loss": 0.3736, "num_input_tokens_seen": 745648, "step": 1900 }, { "epoch": 17.16216216216216, "grad_norm": 0.0345749631524086, "learning_rate": 0.00023101882874158582, "loss": 0.3406, "num_input_tokens_seen": 747376, "step": 1905 }, { "epoch": 17.207207207207208, "grad_norm": 0.015553313307464123, "learning_rate": 0.00022971257031688614, "loss": 0.358, "num_input_tokens_seen": 749232, "step": 1910 }, { "epoch": 17.25225225225225, "grad_norm": 0.11265865713357925, "learning_rate": 0.0002284068691963679, "loss": 0.3406, "num_input_tokens_seen": 751184, "step": 1915 }, { "epoch": 17.2972972972973, "grad_norm": 0.08328580856323242, "learning_rate": 0.00022710176124818721, "loss": 0.3266, "num_input_tokens_seen": 753072, "step": 1920 }, { "epoch": 17.34234234234234, "grad_norm": 0.08238591253757477, "learning_rate": 0.00022579728232420524, "loss": 0.358, "num_input_tokens_seen": 755280, "step": 1925 }, { "epoch": 17.38738738738739, "grad_norm": 0.07033450156450272, "learning_rate": 0.00022449346825900398, "loss": 0.3415, "num_input_tokens_seen": 757584, "step": 1930 }, { "epoch": 17.43243243243243, "grad_norm": 0.2403155118227005, "learning_rate": 0.0002231903548689013, "loss": 0.3345, "num_input_tokens_seen": 760080, "step": 1935 }, { "epoch": 17.47747747747748, "grad_norm": 0.05678914114832878, "learning_rate": 0.0002218879779509671, "loss": 0.3513, "num_input_tokens_seen": 761904, "step": 1940 }, { "epoch": 17.52252252252252, "grad_norm": 0.1207728311419487, "learning_rate": 0.0002205863732820404, "loss": 0.3454, "num_input_tokens_seen": 763728, "step": 1945 }, { "epoch": 17.56756756756757, "grad_norm": 0.019787251949310303, "learning_rate": 0.0002192855766177459, "loss": 0.3534, "num_input_tokens_seen": 765552, "step": 1950 }, { "epoch": 17.61261261261261, "grad_norm": 0.1292268931865692, "learning_rate": 0.00021798562369151214, "loss": 0.3411, "num_input_tokens_seen": 767472, "step": 1955 }, { "epoch": 17.65765765765766, "grad_norm": 0.056675706058740616, "learning_rate": 0.00021668655021359008, "loss": 0.3653, "num_input_tokens_seen": 769328, "step": 1960 }, { "epoch": 17.7027027027027, "grad_norm": 0.20839175581932068, "learning_rate": 0.0002153883918700716, "loss": 0.3634, "num_input_tokens_seen": 771344, "step": 1965 }, { "epoch": 17.74774774774775, "grad_norm": 0.160031259059906, "learning_rate": 0.00021409118432190943, "loss": 0.3583, "num_input_tokens_seen": 773072, "step": 1970 }, { "epoch": 17.792792792792792, "grad_norm": 0.04150155559182167, "learning_rate": 0.0002127949632039378, "loss": 0.3642, "num_input_tokens_seen": 775024, "step": 1975 }, { "epoch": 17.83783783783784, "grad_norm": 0.22855611145496368, "learning_rate": 0.00021149976412389301, "loss": 0.3709, "num_input_tokens_seen": 777392, "step": 1980 }, { "epoch": 17.882882882882882, "grad_norm": 0.16699308156967163, "learning_rate": 0.00021020562266143571, "loss": 0.3605, "num_input_tokens_seen": 779632, "step": 1985 }, { "epoch": 17.92792792792793, "grad_norm": 0.029601508751511574, "learning_rate": 0.00020891257436717354, "loss": 0.3565, "num_input_tokens_seen": 781424, "step": 1990 }, { "epoch": 17.972972972972972, "grad_norm": 0.34150949120521545, "learning_rate": 0.00020762065476168375, "loss": 0.351, "num_input_tokens_seen": 783504, "step": 1995 }, { "epoch": 18.01801801801802, "grad_norm": 0.17843817174434662, "learning_rate": 0.00020632989933453877, "loss": 0.3484, "num_input_tokens_seen": 785256, "step": 2000 }, { "epoch": 18.054054054054053, "eval_loss": 0.35014501214027405, "eval_runtime": 1.318, "eval_samples_per_second": 84.221, "eval_steps_per_second": 21.245, "num_input_tokens_seen": 787016, "step": 2004 }, { "epoch": 18.063063063063062, "grad_norm": 0.3064093291759491, "learning_rate": 0.00020504034354333007, "loss": 0.3465, "num_input_tokens_seen": 787656, "step": 2005 }, { "epoch": 18.10810810810811, "grad_norm": 0.32400768995285034, "learning_rate": 0.00020375202281269474, "loss": 0.3516, "num_input_tokens_seen": 789352, "step": 2010 }, { "epoch": 18.153153153153152, "grad_norm": 0.023612281307578087, "learning_rate": 0.00020246497253334232, "loss": 0.3434, "num_input_tokens_seen": 791400, "step": 2015 }, { "epoch": 18.1981981981982, "grad_norm": 0.022443737834692, "learning_rate": 0.00020117922806108256, "loss": 0.3454, "num_input_tokens_seen": 793384, "step": 2020 }, { "epoch": 18.243243243243242, "grad_norm": 0.1469493955373764, "learning_rate": 0.0001998948247158538, "loss": 0.3578, "num_input_tokens_seen": 795208, "step": 2025 }, { "epoch": 18.28828828828829, "grad_norm": 0.13506555557250977, "learning_rate": 0.00019861179778075355, "loss": 0.3378, "num_input_tokens_seen": 796808, "step": 2030 }, { "epoch": 18.333333333333332, "grad_norm": 0.2293097972869873, "learning_rate": 0.0001973301825010685, "loss": 0.3566, "num_input_tokens_seen": 798984, "step": 2035 }, { "epoch": 18.37837837837838, "grad_norm": 0.04228850081562996, "learning_rate": 0.00019605001408330632, "loss": 0.3535, "num_input_tokens_seen": 800840, "step": 2040 }, { "epoch": 18.423423423423422, "grad_norm": 0.0424201637506485, "learning_rate": 0.00019477132769422947, "loss": 0.3403, "num_input_tokens_seen": 802664, "step": 2045 }, { "epoch": 18.46846846846847, "grad_norm": 0.03492378816008568, "learning_rate": 0.00019349415845988776, "loss": 0.3461, "num_input_tokens_seen": 804520, "step": 2050 }, { "epoch": 18.513513513513512, "grad_norm": 0.17761828005313873, "learning_rate": 0.00019221854146465444, "loss": 0.3545, "num_input_tokens_seen": 806632, "step": 2055 }, { "epoch": 18.55855855855856, "grad_norm": 0.1371828019618988, "learning_rate": 0.00019094451175026217, "loss": 0.3518, "num_input_tokens_seen": 808616, "step": 2060 }, { "epoch": 18.603603603603602, "grad_norm": 0.2381972372531891, "learning_rate": 0.0001896721043148402, "loss": 0.3558, "num_input_tokens_seen": 810536, "step": 2065 }, { "epoch": 18.64864864864865, "grad_norm": 0.09525643289089203, "learning_rate": 0.00018840135411195307, "loss": 0.3543, "num_input_tokens_seen": 812424, "step": 2070 }, { "epoch": 18.693693693693692, "grad_norm": 0.3644741475582123, "learning_rate": 0.00018713229604964065, "loss": 0.3484, "num_input_tokens_seen": 814632, "step": 2075 }, { "epoch": 18.73873873873874, "grad_norm": 0.15766045451164246, "learning_rate": 0.00018586496498945875, "loss": 0.3513, "num_input_tokens_seen": 816360, "step": 2080 }, { "epoch": 18.783783783783782, "grad_norm": 0.09054473042488098, "learning_rate": 0.00018459939574552186, "loss": 0.3141, "num_input_tokens_seen": 818632, "step": 2085 }, { "epoch": 18.82882882882883, "grad_norm": 0.04026452451944351, "learning_rate": 0.00018333562308354666, "loss": 0.4119, "num_input_tokens_seen": 820712, "step": 2090 }, { "epoch": 18.873873873873872, "grad_norm": 0.08696538954973221, "learning_rate": 0.0001820736817198969, "loss": 0.3738, "num_input_tokens_seen": 823112, "step": 2095 }, { "epoch": 18.91891891891892, "grad_norm": 0.20292513072490692, "learning_rate": 0.0001808136063206297, "loss": 0.3442, "num_input_tokens_seen": 824840, "step": 2100 }, { "epoch": 18.963963963963963, "grad_norm": 0.2855810225009918, "learning_rate": 0.0001795554315005437, "loss": 0.3343, "num_input_tokens_seen": 826792, "step": 2105 }, { "epoch": 19.00900900900901, "grad_norm": 0.1864543855190277, "learning_rate": 0.00017829919182222752, "loss": 0.3641, "num_input_tokens_seen": 828464, "step": 2110 }, { "epoch": 19.054054054054053, "grad_norm": 0.18023419380187988, "learning_rate": 0.0001770449217951105, "loss": 0.3437, "num_input_tokens_seen": 830768, "step": 2115 }, { "epoch": 19.0990990990991, "grad_norm": 0.0292358435690403, "learning_rate": 0.00017579265587451542, "loss": 0.3369, "num_input_tokens_seen": 832752, "step": 2120 }, { "epoch": 19.144144144144143, "grad_norm": 0.018932675942778587, "learning_rate": 0.00017454242846071084, "loss": 0.3546, "num_input_tokens_seen": 835184, "step": 2125 }, { "epoch": 19.18918918918919, "grad_norm": 0.028284583240747452, "learning_rate": 0.00017329427389796686, "loss": 0.3485, "num_input_tokens_seen": 837296, "step": 2130 }, { "epoch": 19.234234234234233, "grad_norm": 0.1788344383239746, "learning_rate": 0.00017204822647361173, "loss": 0.3563, "num_input_tokens_seen": 839312, "step": 2135 }, { "epoch": 19.27927927927928, "grad_norm": 0.16228212416172028, "learning_rate": 0.00017080432041708939, "loss": 0.3526, "num_input_tokens_seen": 841264, "step": 2140 }, { "epoch": 19.324324324324323, "grad_norm": 0.022678622975945473, "learning_rate": 0.00016956258989901955, "loss": 0.3502, "num_input_tokens_seen": 843216, "step": 2145 }, { "epoch": 19.36936936936937, "grad_norm": 0.04490073397755623, "learning_rate": 0.00016832306903025925, "loss": 0.3429, "num_input_tokens_seen": 845104, "step": 2150 }, { "epoch": 19.414414414414413, "grad_norm": 0.27067822217941284, "learning_rate": 0.0001670857918609653, "loss": 0.347, "num_input_tokens_seen": 846832, "step": 2155 }, { "epoch": 19.45945945945946, "grad_norm": 0.0553789883852005, "learning_rate": 0.00016585079237965906, "loss": 0.3285, "num_input_tokens_seen": 849072, "step": 2160 }, { "epoch": 19.504504504504503, "grad_norm": 0.10404996573925018, "learning_rate": 0.00016461810451229324, "loss": 0.3499, "num_input_tokens_seen": 851184, "step": 2165 }, { "epoch": 19.54954954954955, "grad_norm": 0.11453372985124588, "learning_rate": 0.00016338776212131918, "loss": 0.3594, "num_input_tokens_seen": 853328, "step": 2170 }, { "epoch": 19.55855855855856, "eval_loss": 0.3500675559043884, "eval_runtime": 1.22, "eval_samples_per_second": 90.981, "eval_steps_per_second": 22.95, "num_input_tokens_seen": 853744, "step": 2171 }, { "epoch": 19.594594594594593, "grad_norm": 0.11898820102214813, "learning_rate": 0.000162159799004757, "loss": 0.3259, "num_input_tokens_seen": 855120, "step": 2175 }, { "epoch": 19.63963963963964, "grad_norm": 0.03547373786568642, "learning_rate": 0.00016093424889526746, "loss": 0.3588, "num_input_tokens_seen": 856976, "step": 2180 }, { "epoch": 19.684684684684683, "grad_norm": 0.1620701253414154, "learning_rate": 0.00015971114545922476, "loss": 0.3517, "num_input_tokens_seen": 858928, "step": 2185 }, { "epoch": 19.72972972972973, "grad_norm": 0.024420609697699547, "learning_rate": 0.00015849052229579194, "loss": 0.3468, "num_input_tokens_seen": 860912, "step": 2190 }, { "epoch": 19.774774774774773, "grad_norm": 0.1992795765399933, "learning_rate": 0.0001572724129359981, "loss": 0.3379, "num_input_tokens_seen": 862544, "step": 2195 }, { "epoch": 19.81981981981982, "grad_norm": 0.03652095049619675, "learning_rate": 0.000156056850841817, "loss": 0.3682, "num_input_tokens_seen": 864688, "step": 2200 }, { "epoch": 19.864864864864863, "grad_norm": 0.13393381237983704, "learning_rate": 0.00015484386940524777, "loss": 0.3406, "num_input_tokens_seen": 866608, "step": 2205 }, { "epoch": 19.90990990990991, "grad_norm": 0.07378467917442322, "learning_rate": 0.00015363350194739822, "loss": 0.3479, "num_input_tokens_seen": 868656, "step": 2210 }, { "epoch": 19.954954954954957, "grad_norm": 0.2451988160610199, "learning_rate": 0.00015242578171756867, "loss": 0.3691, "num_input_tokens_seen": 870608, "step": 2215 }, { "epoch": 20.0, "grad_norm": 0.27152061462402344, "learning_rate": 0.0001512207418923391, "loss": 0.3807, "num_input_tokens_seen": 872376, "step": 2220 }, { "epoch": 20.045045045045047, "grad_norm": 0.1962389051914215, "learning_rate": 0.00015001841557465777, "loss": 0.3353, "num_input_tokens_seen": 874424, "step": 2225 }, { "epoch": 20.09009009009009, "grad_norm": 0.18191595375537872, "learning_rate": 0.00014881883579293171, "loss": 0.3518, "num_input_tokens_seen": 876568, "step": 2230 }, { "epoch": 20.135135135135137, "grad_norm": 0.011241542175412178, "learning_rate": 0.00014762203550011918, "loss": 0.3388, "num_input_tokens_seen": 878360, "step": 2235 }, { "epoch": 20.18018018018018, "grad_norm": 0.17374536395072937, "learning_rate": 0.0001464280475728252, "loss": 0.3563, "num_input_tokens_seen": 880216, "step": 2240 }, { "epoch": 20.225225225225227, "grad_norm": 0.062224432826042175, "learning_rate": 0.00014523690481039762, "loss": 0.3295, "num_input_tokens_seen": 882392, "step": 2245 }, { "epoch": 20.27027027027027, "grad_norm": 0.10997764021158218, "learning_rate": 0.00014404863993402634, "loss": 0.3598, "num_input_tokens_seen": 884312, "step": 2250 }, { "epoch": 20.315315315315317, "grad_norm": 0.033173635601997375, "learning_rate": 0.00014286328558584476, "loss": 0.3408, "num_input_tokens_seen": 886552, "step": 2255 }, { "epoch": 20.36036036036036, "grad_norm": 0.03685940057039261, "learning_rate": 0.00014168087432803292, "loss": 0.339, "num_input_tokens_seen": 888408, "step": 2260 }, { "epoch": 20.405405405405407, "grad_norm": 0.05566856637597084, "learning_rate": 0.00014050143864192252, "loss": 0.3481, "num_input_tokens_seen": 890424, "step": 2265 }, { "epoch": 20.45045045045045, "grad_norm": 0.0560782290995121, "learning_rate": 0.00013932501092710553, "loss": 0.3534, "num_input_tokens_seen": 892312, "step": 2270 }, { "epoch": 20.495495495495497, "grad_norm": 0.35051149129867554, "learning_rate": 0.0001381516235005433, "loss": 0.3622, "num_input_tokens_seen": 894456, "step": 2275 }, { "epoch": 20.54054054054054, "grad_norm": 0.28898414969444275, "learning_rate": 0.00013698130859567944, "loss": 0.3318, "num_input_tokens_seen": 896504, "step": 2280 }, { "epoch": 20.585585585585587, "grad_norm": 0.08215141296386719, "learning_rate": 0.00013581409836155414, "loss": 0.3132, "num_input_tokens_seen": 898808, "step": 2285 }, { "epoch": 20.63063063063063, "grad_norm": 0.04971693828701973, "learning_rate": 0.00013465002486192078, "loss": 0.3523, "num_input_tokens_seen": 900824, "step": 2290 }, { "epoch": 20.675675675675677, "grad_norm": 0.049674246460199356, "learning_rate": 0.00013348912007436536, "loss": 0.3698, "num_input_tokens_seen": 902744, "step": 2295 }, { "epoch": 20.72072072072072, "grad_norm": 0.21757592260837555, "learning_rate": 0.00013233141588942817, "loss": 0.3084, "num_input_tokens_seen": 904376, "step": 2300 }, { "epoch": 20.765765765765767, "grad_norm": 0.24013109505176544, "learning_rate": 0.00013117694410972748, "loss": 0.4093, "num_input_tokens_seen": 906264, "step": 2305 }, { "epoch": 20.81081081081081, "grad_norm": 0.2128821760416031, "learning_rate": 0.0001300257364490863, "loss": 0.3543, "num_input_tokens_seen": 908280, "step": 2310 }, { "epoch": 20.855855855855857, "grad_norm": 0.1378423571586609, "learning_rate": 0.00012887782453166057, "loss": 0.3565, "num_input_tokens_seen": 910008, "step": 2315 }, { "epoch": 20.9009009009009, "grad_norm": 0.3053135573863983, "learning_rate": 0.00012773323989107073, "loss": 0.3421, "num_input_tokens_seen": 911896, "step": 2320 }, { "epoch": 20.945945945945947, "grad_norm": 0.040070418268442154, "learning_rate": 0.00012659201396953614, "loss": 0.3413, "num_input_tokens_seen": 914072, "step": 2325 }, { "epoch": 20.99099099099099, "grad_norm": 0.16136138141155243, "learning_rate": 0.00012545417811701015, "loss": 0.3586, "num_input_tokens_seen": 915928, "step": 2330 }, { "epoch": 21.036036036036037, "grad_norm": 0.02132781594991684, "learning_rate": 0.00012431976359031957, "loss": 0.3425, "num_input_tokens_seen": 917696, "step": 2335 }, { "epoch": 21.063063063063062, "eval_loss": 0.3508879244327545, "eval_runtime": 1.3636, "eval_samples_per_second": 81.404, "eval_steps_per_second": 20.534, "num_input_tokens_seen": 918752, "step": 2338 }, { "epoch": 21.08108108108108, "grad_norm": 0.3399727940559387, "learning_rate": 0.00012318880155230618, "loss": 0.3551, "num_input_tokens_seen": 919392, "step": 2340 }, { "epoch": 21.126126126126128, "grad_norm": 0.122002974152565, "learning_rate": 0.00012206132307097046, "loss": 0.3355, "num_input_tokens_seen": 921184, "step": 2345 }, { "epoch": 21.17117117117117, "grad_norm": 0.30550843477249146, "learning_rate": 0.00012093735911861778, "loss": 0.3491, "num_input_tokens_seen": 923232, "step": 2350 }, { "epoch": 21.216216216216218, "grad_norm": 0.1953243911266327, "learning_rate": 0.00011981694057100839, "loss": 0.3439, "num_input_tokens_seen": 925184, "step": 2355 }, { "epoch": 21.26126126126126, "grad_norm": 0.1134195476770401, "learning_rate": 0.00011870009820650837, "loss": 0.3361, "num_input_tokens_seen": 927040, "step": 2360 }, { "epoch": 21.306306306306308, "grad_norm": 0.11591604351997375, "learning_rate": 0.00011758686270524483, "loss": 0.3225, "num_input_tokens_seen": 929120, "step": 2365 }, { "epoch": 21.35135135135135, "grad_norm": 0.07578551769256592, "learning_rate": 0.00011647726464826283, "loss": 0.3348, "num_input_tokens_seen": 930912, "step": 2370 }, { "epoch": 21.396396396396398, "grad_norm": 0.06989861279726028, "learning_rate": 0.00011537133451668519, "loss": 0.3551, "num_input_tokens_seen": 933152, "step": 2375 }, { "epoch": 21.44144144144144, "grad_norm": 0.0739617720246315, "learning_rate": 0.00011426910269087517, "loss": 0.3522, "num_input_tokens_seen": 935264, "step": 2380 }, { "epoch": 21.486486486486488, "grad_norm": 0.06335064023733139, "learning_rate": 0.00011317059944960234, "loss": 0.3481, "num_input_tokens_seen": 937376, "step": 2385 }, { "epoch": 21.53153153153153, "grad_norm": 0.35028526186943054, "learning_rate": 0.0001120758549692104, "loss": 0.3708, "num_input_tokens_seen": 939648, "step": 2390 }, { "epoch": 21.576576576576578, "grad_norm": 0.16894294321537018, "learning_rate": 0.00011098489932278811, "loss": 0.3384, "num_input_tokens_seen": 941376, "step": 2395 }, { "epoch": 21.62162162162162, "grad_norm": 0.035739749670028687, "learning_rate": 0.00010989776247934363, "loss": 0.3501, "num_input_tokens_seen": 943456, "step": 2400 }, { "epoch": 21.666666666666668, "grad_norm": 0.03201930969953537, "learning_rate": 0.00010881447430298075, "loss": 0.3407, "num_input_tokens_seen": 945376, "step": 2405 }, { "epoch": 21.71171171171171, "grad_norm": 0.1527249962091446, "learning_rate": 0.00010773506455207901, "loss": 0.3434, "num_input_tokens_seen": 947168, "step": 2410 }, { "epoch": 21.756756756756758, "grad_norm": 0.029136650264263153, "learning_rate": 0.00010665956287847598, "loss": 0.3436, "num_input_tokens_seen": 949056, "step": 2415 }, { "epoch": 21.8018018018018, "grad_norm": 0.02643633261322975, "learning_rate": 0.00010558799882665245, "loss": 0.348, "num_input_tokens_seen": 951040, "step": 2420 }, { "epoch": 21.846846846846848, "grad_norm": 0.03000778518617153, "learning_rate": 0.00010452040183292125, "loss": 0.3484, "num_input_tokens_seen": 952864, "step": 2425 }, { "epoch": 21.89189189189189, "grad_norm": 0.13491778075695038, "learning_rate": 0.0001034568012246185, "loss": 0.3411, "num_input_tokens_seen": 954944, "step": 2430 }, { "epoch": 21.936936936936938, "grad_norm": 0.14409510791301727, "learning_rate": 0.00010239722621929803, "loss": 0.3372, "num_input_tokens_seen": 956864, "step": 2435 }, { "epoch": 21.98198198198198, "grad_norm": 0.16194793581962585, "learning_rate": 0.00010134170592392835, "loss": 0.3583, "num_input_tokens_seen": 959072, "step": 2440 }, { "epoch": 22.027027027027028, "grad_norm": 0.04894425719976425, "learning_rate": 0.00010029026933409377, "loss": 0.337, "num_input_tokens_seen": 960856, "step": 2445 }, { "epoch": 22.07207207207207, "grad_norm": 0.1956278383731842, "learning_rate": 9.924294533319713e-05, "loss": 0.3712, "num_input_tokens_seen": 963096, "step": 2450 }, { "epoch": 22.117117117117118, "grad_norm": 0.33298972249031067, "learning_rate": 9.819976269166705e-05, "loss": 0.3404, "num_input_tokens_seen": 965144, "step": 2455 }, { "epoch": 22.16216216216216, "grad_norm": 0.17386727035045624, "learning_rate": 9.71607500661672e-05, "loss": 0.3386, "num_input_tokens_seen": 967000, "step": 2460 }, { "epoch": 22.207207207207208, "grad_norm": 0.1798383742570877, "learning_rate": 9.612593599880904e-05, "loss": 0.3393, "num_input_tokens_seen": 968920, "step": 2465 }, { "epoch": 22.25225225225225, "grad_norm": 0.12274763733148575, "learning_rate": 9.509534891636787e-05, "loss": 0.3312, "num_input_tokens_seen": 970744, "step": 2470 }, { "epoch": 22.2972972972973, "grad_norm": 0.056941594928503036, "learning_rate": 9.406901712950208e-05, "loss": 0.3672, "num_input_tokens_seen": 972408, "step": 2475 }, { "epoch": 22.34234234234234, "grad_norm": 0.1194363608956337, "learning_rate": 9.304696883197541e-05, "loss": 0.3621, "num_input_tokens_seen": 974520, "step": 2480 }, { "epoch": 22.38738738738739, "grad_norm": 0.12449562549591064, "learning_rate": 9.202923209988198e-05, "loss": 0.3466, "num_input_tokens_seen": 976504, "step": 2485 }, { "epoch": 22.43243243243243, "grad_norm": 0.3021010160446167, "learning_rate": 9.10158348908758e-05, "loss": 0.3495, "num_input_tokens_seen": 978296, "step": 2490 }, { "epoch": 22.47747747747748, "grad_norm": 0.1482069343328476, "learning_rate": 9.000680504340205e-05, "loss": 0.3467, "num_input_tokens_seen": 980216, "step": 2495 }, { "epoch": 22.52252252252252, "grad_norm": 0.1399919092655182, "learning_rate": 8.90021702759329e-05, "loss": 0.3378, "num_input_tokens_seen": 982136, "step": 2500 }, { "epoch": 22.56756756756757, "grad_norm": 0.33866798877716064, "learning_rate": 8.80019581862058e-05, "loss": 0.3605, "num_input_tokens_seen": 984472, "step": 2505 }, { "epoch": 22.56756756756757, "eval_loss": 0.3515712320804596, "eval_runtime": 1.3031, "eval_samples_per_second": 85.183, "eval_steps_per_second": 21.488, "num_input_tokens_seen": 984472, "step": 2505 }, { "epoch": 22.61261261261261, "grad_norm": 0.18033058941364288, "learning_rate": 8.700619625046525e-05, "loss": 0.3731, "num_input_tokens_seen": 986488, "step": 2510 }, { "epoch": 22.65765765765766, "grad_norm": 0.16288302838802338, "learning_rate": 8.601491182270812e-05, "loss": 0.3469, "num_input_tokens_seen": 988568, "step": 2515 }, { "epoch": 22.7027027027027, "grad_norm": 0.13185729086399078, "learning_rate": 8.502813213393254e-05, "loss": 0.3538, "num_input_tokens_seen": 990744, "step": 2520 }, { "epoch": 22.74774774774775, "grad_norm": 0.18060943484306335, "learning_rate": 8.404588429138946e-05, "loss": 0.3533, "num_input_tokens_seen": 992728, "step": 2525 }, { "epoch": 22.792792792792792, "grad_norm": 0.16276629269123077, "learning_rate": 8.306819527783791e-05, "loss": 0.3533, "num_input_tokens_seen": 994584, "step": 2530 }, { "epoch": 22.83783783783784, "grad_norm": 0.19442443549633026, "learning_rate": 8.209509195080428e-05, "loss": 0.3601, "num_input_tokens_seen": 996216, "step": 2535 }, { "epoch": 22.882882882882882, "grad_norm": 0.11972759664058685, "learning_rate": 8.112660104184399e-05, "loss": 0.3454, "num_input_tokens_seen": 998328, "step": 2540 }, { "epoch": 22.92792792792793, "grad_norm": 0.21031542122364044, "learning_rate": 8.016274915580753e-05, "loss": 0.3758, "num_input_tokens_seen": 1000312, "step": 2545 }, { "epoch": 22.972972972972972, "grad_norm": 0.057995717972517014, "learning_rate": 7.920356277010965e-05, "loss": 0.3666, "num_input_tokens_seen": 1002392, "step": 2550 }, { "epoch": 23.01801801801802, "grad_norm": 0.16890956461429596, "learning_rate": 7.824906823400149e-05, "loss": 0.3499, "num_input_tokens_seen": 1004488, "step": 2555 }, { "epoch": 23.063063063063062, "grad_norm": 0.013712511397898197, "learning_rate": 7.729929176784722e-05, "loss": 0.3534, "num_input_tokens_seen": 1006536, "step": 2560 }, { "epoch": 23.10810810810811, "grad_norm": 0.025204606354236603, "learning_rate": 7.635425946240404e-05, "loss": 0.3425, "num_input_tokens_seen": 1008616, "step": 2565 }, { "epoch": 23.153153153153152, "grad_norm": 0.13888491690158844, "learning_rate": 7.541399727810458e-05, "loss": 0.338, "num_input_tokens_seen": 1010152, "step": 2570 }, { "epoch": 23.1981981981982, "grad_norm": 0.20261798799037933, "learning_rate": 7.447853104434438e-05, "loss": 0.3536, "num_input_tokens_seen": 1012008, "step": 2575 }, { "epoch": 23.243243243243242, "grad_norm": 0.10792737454175949, "learning_rate": 7.354788645877244e-05, "loss": 0.3244, "num_input_tokens_seen": 1013832, "step": 2580 }, { "epoch": 23.28828828828829, "grad_norm": 0.2280653864145279, "learning_rate": 7.262208908658472e-05, "loss": 0.3689, "num_input_tokens_seen": 1015624, "step": 2585 }, { "epoch": 23.333333333333332, "grad_norm": 0.35492804646492004, "learning_rate": 7.170116435982246e-05, "loss": 0.3519, "num_input_tokens_seen": 1017416, "step": 2590 }, { "epoch": 23.37837837837838, "grad_norm": 0.33450374007225037, "learning_rate": 7.078513757667329e-05, "loss": 0.3527, "num_input_tokens_seen": 1019176, "step": 2595 }, { "epoch": 23.423423423423422, "grad_norm": 0.016679968684911728, "learning_rate": 6.98740339007761e-05, "loss": 0.3393, "num_input_tokens_seen": 1021096, "step": 2600 }, { "epoch": 23.46846846846847, "grad_norm": 0.3214060664176941, "learning_rate": 6.896787836052992e-05, "loss": 0.3487, "num_input_tokens_seen": 1023016, "step": 2605 }, { "epoch": 23.513513513513512, "grad_norm": 0.15012472867965698, "learning_rate": 6.806669584840689e-05, "loss": 0.3362, "num_input_tokens_seen": 1025064, "step": 2610 }, { "epoch": 23.55855855855856, "grad_norm": 0.0568564310669899, "learning_rate": 6.71705111202674e-05, "loss": 0.3288, "num_input_tokens_seen": 1027048, "step": 2615 }, { "epoch": 23.603603603603602, "grad_norm": 0.09365725517272949, "learning_rate": 6.627934879468107e-05, "loss": 0.3732, "num_input_tokens_seen": 1029256, "step": 2620 }, { "epoch": 23.64864864864865, "grad_norm": 0.08551164716482162, "learning_rate": 6.539323335224965e-05, "loss": 0.3961, "num_input_tokens_seen": 1031368, "step": 2625 }, { "epoch": 23.693693693693692, "grad_norm": 0.05189188942313194, "learning_rate": 6.451218913493514e-05, "loss": 0.3485, "num_input_tokens_seen": 1033416, "step": 2630 }, { "epoch": 23.73873873873874, "grad_norm": 0.03160202130675316, "learning_rate": 6.363624034539098e-05, "loss": 0.3625, "num_input_tokens_seen": 1035400, "step": 2635 }, { "epoch": 23.783783783783782, "grad_norm": 0.167461559176445, "learning_rate": 6.276541104629672e-05, "loss": 0.344, "num_input_tokens_seen": 1037512, "step": 2640 }, { "epoch": 23.82882882882883, "grad_norm": 0.14676669239997864, "learning_rate": 6.189972515969752e-05, "loss": 0.3489, "num_input_tokens_seen": 1039496, "step": 2645 }, { "epoch": 23.873873873873872, "grad_norm": 0.1289416253566742, "learning_rate": 6.103920646634697e-05, "loss": 0.3334, "num_input_tokens_seen": 1041288, "step": 2650 }, { "epoch": 23.91891891891892, "grad_norm": 0.028281616047024727, "learning_rate": 6.018387860505367e-05, "loss": 0.3516, "num_input_tokens_seen": 1043432, "step": 2655 }, { "epoch": 23.963963963963963, "grad_norm": 0.2675410509109497, "learning_rate": 5.933376507203164e-05, "loss": 0.3296, "num_input_tokens_seen": 1045128, "step": 2660 }, { "epoch": 24.00900900900901, "grad_norm": 0.09726014733314514, "learning_rate": 5.848888922025553e-05, "loss": 0.3435, "num_input_tokens_seen": 1047368, "step": 2665 }, { "epoch": 24.054054054054053, "grad_norm": 0.054646480828523636, "learning_rate": 5.764927425881825e-05, "loss": 0.3433, "num_input_tokens_seen": 1049416, "step": 2670 }, { "epoch": 24.07207207207207, "eval_loss": 0.35339227318763733, "eval_runtime": 1.3675, "eval_samples_per_second": 81.172, "eval_steps_per_second": 20.476, "num_input_tokens_seen": 1050088, "step": 2672 }, { "epoch": 24.0990990990991, "grad_norm": 0.05526398867368698, "learning_rate": 5.681494325229422e-05, "loss": 0.3359, "num_input_tokens_seen": 1051464, "step": 2675 }, { "epoch": 24.144144144144143, "grad_norm": 0.2359054982662201, "learning_rate": 5.5985919120105254e-05, "loss": 0.3553, "num_input_tokens_seen": 1053640, "step": 2680 }, { "epoch": 24.18918918918919, "grad_norm": 0.052131857722997665, "learning_rate": 5.516222463589113e-05, "loss": 0.3536, "num_input_tokens_seen": 1055784, "step": 2685 }, { "epoch": 24.234234234234233, "grad_norm": 0.11343823373317719, "learning_rate": 5.434388242688382e-05, "loss": 0.3682, "num_input_tokens_seen": 1057896, "step": 2690 }, { "epoch": 24.27927927927928, "grad_norm": 0.1722082495689392, "learning_rate": 5.353091497328627e-05, "loss": 0.3623, "num_input_tokens_seen": 1059784, "step": 2695 }, { "epoch": 24.324324324324323, "grad_norm": 0.01874319277703762, "learning_rate": 5.272334460765466e-05, "loss": 0.3425, "num_input_tokens_seen": 1061928, "step": 2700 }, { "epoch": 24.36936936936937, "grad_norm": 0.01603076048195362, "learning_rate": 5.1921193514284674e-05, "loss": 0.3435, "num_input_tokens_seen": 1063784, "step": 2705 }, { "epoch": 24.414414414414413, "grad_norm": 0.31013593077659607, "learning_rate": 5.1124483728602564e-05, "loss": 0.3404, "num_input_tokens_seen": 1065480, "step": 2710 }, { "epoch": 24.45945945945946, "grad_norm": 0.1762527972459793, "learning_rate": 5.033323713655935e-05, "loss": 0.3391, "num_input_tokens_seen": 1067464, "step": 2715 }, { "epoch": 24.504504504504503, "grad_norm": 0.11892964690923691, "learning_rate": 4.954747547403005e-05, "loss": 0.3446, "num_input_tokens_seen": 1069224, "step": 2720 }, { "epoch": 24.54954954954955, "grad_norm": 0.1800389140844345, "learning_rate": 4.8767220326216306e-05, "loss": 0.3468, "num_input_tokens_seen": 1071176, "step": 2725 }, { "epoch": 24.594594594594593, "grad_norm": 0.13655240833759308, "learning_rate": 4.799249312705348e-05, "loss": 0.3272, "num_input_tokens_seen": 1073544, "step": 2730 }, { "epoch": 24.63963963963964, "grad_norm": 0.19059185683727264, "learning_rate": 4.7223315158621746e-05, "loss": 0.3546, "num_input_tokens_seen": 1075336, "step": 2735 }, { "epoch": 24.684684684684683, "grad_norm": 0.1412012279033661, "learning_rate": 4.645970755056181e-05, "loss": 0.3562, "num_input_tokens_seen": 1077096, "step": 2740 }, { "epoch": 24.72972972972973, "grad_norm": 0.016317564994096756, "learning_rate": 4.5701691279494166e-05, "loss": 0.3614, "num_input_tokens_seen": 1078696, "step": 2745 }, { "epoch": 24.774774774774773, "grad_norm": 0.2748192548751831, "learning_rate": 4.4949287168442874e-05, "loss": 0.3348, "num_input_tokens_seen": 1080744, "step": 2750 }, { "epoch": 24.81981981981982, "grad_norm": 0.17296737432479858, "learning_rate": 4.420251588626373e-05, "loss": 0.3581, "num_input_tokens_seen": 1082856, "step": 2755 }, { "epoch": 24.864864864864863, "grad_norm": 0.3145996034145355, "learning_rate": 4.346139794707618e-05, "loss": 0.3407, "num_input_tokens_seen": 1084648, "step": 2760 }, { "epoch": 24.90990990990991, "grad_norm": 0.16550059616565704, "learning_rate": 4.272595370970017e-05, "loss": 0.3373, "num_input_tokens_seen": 1086376, "step": 2765 }, { "epoch": 24.954954954954957, "grad_norm": 0.15880973637104034, "learning_rate": 4.199620337709661e-05, "loss": 0.3501, "num_input_tokens_seen": 1088360, "step": 2770 }, { "epoch": 25.0, "grad_norm": 0.09613199532032013, "learning_rate": 4.127216699581246e-05, "loss": 0.354, "num_input_tokens_seen": 1089936, "step": 2775 }, { "epoch": 25.045045045045047, "grad_norm": 0.01928727701306343, "learning_rate": 4.0553864455429964e-05, "loss": 0.3422, "num_input_tokens_seen": 1091632, "step": 2780 }, { "epoch": 25.09009009009009, "grad_norm": 0.15807317197322845, "learning_rate": 3.9841315488020474e-05, "loss": 0.343, "num_input_tokens_seen": 1094064, "step": 2785 }, { "epoch": 25.135135135135137, "grad_norm": 0.023204617202281952, "learning_rate": 3.91345396676023e-05, "loss": 0.3423, "num_input_tokens_seen": 1096144, "step": 2790 }, { "epoch": 25.18018018018018, "grad_norm": 0.025937512516975403, "learning_rate": 3.843355640960283e-05, "loss": 0.3399, "num_input_tokens_seen": 1098352, "step": 2795 }, { "epoch": 25.225225225225227, "grad_norm": 0.16027478873729706, "learning_rate": 3.7738384970325586e-05, "loss": 0.3447, "num_input_tokens_seen": 1100528, "step": 2800 }, { "epoch": 25.27027027027027, "grad_norm": 0.13475149869918823, "learning_rate": 3.704904444642071e-05, "loss": 0.3525, "num_input_tokens_seen": 1102448, "step": 2805 }, { "epoch": 25.315315315315317, "grad_norm": 0.3058353662490845, "learning_rate": 3.636555377436085e-05, "loss": 0.3531, "num_input_tokens_seen": 1104368, "step": 2810 }, { "epoch": 25.36036036036036, "grad_norm": 0.14236396551132202, "learning_rate": 3.568793172992082e-05, "loss": 0.3342, "num_input_tokens_seen": 1106096, "step": 2815 }, { "epoch": 25.405405405405407, "grad_norm": 0.02931872382760048, "learning_rate": 3.5016196927661615e-05, "loss": 0.3375, "num_input_tokens_seen": 1108560, "step": 2820 }, { "epoch": 25.45045045045045, "grad_norm": 0.16306793689727783, "learning_rate": 3.43503678204192e-05, "loss": 0.3407, "num_input_tokens_seen": 1110608, "step": 2825 }, { "epoch": 25.495495495495497, "grad_norm": 0.15694257616996765, "learning_rate": 3.369046269879794e-05, "loss": 0.3599, "num_input_tokens_seen": 1112336, "step": 2830 }, { "epoch": 25.54054054054054, "grad_norm": 0.19316652417182922, "learning_rate": 3.303649969066749e-05, "loss": 0.3693, "num_input_tokens_seen": 1114128, "step": 2835 }, { "epoch": 25.576576576576578, "eval_loss": 0.3555394113063812, "eval_runtime": 1.2466, "eval_samples_per_second": 89.04, "eval_steps_per_second": 22.461, "num_input_tokens_seen": 1115632, "step": 2839 }, { "epoch": 25.585585585585587, "grad_norm": 0.02843327261507511, "learning_rate": 3.23884967606653e-05, "loss": 0.3348, "num_input_tokens_seen": 1116048, "step": 2840 }, { "epoch": 25.63063063063063, "grad_norm": 0.28372621536254883, "learning_rate": 3.174647170970296e-05, "loss": 0.3512, "num_input_tokens_seen": 1117744, "step": 2845 }, { "epoch": 25.675675675675677, "grad_norm": 0.15773625671863556, "learning_rate": 3.111044217447731e-05, "loss": 0.3379, "num_input_tokens_seen": 1119696, "step": 2850 }, { "epoch": 25.72072072072072, "grad_norm": 0.15243035554885864, "learning_rate": 3.0480425626985692e-05, "loss": 0.3517, "num_input_tokens_seen": 1121584, "step": 2855 }, { "epoch": 25.765765765765767, "grad_norm": 0.31931164860725403, "learning_rate": 2.9856439374046362e-05, "loss": 0.3631, "num_input_tokens_seen": 1123984, "step": 2860 }, { "epoch": 25.81081081081081, "grad_norm": 0.15827924013137817, "learning_rate": 2.9238500556822646e-05, "loss": 0.347, "num_input_tokens_seen": 1125968, "step": 2865 }, { "epoch": 25.855855855855857, "grad_norm": 0.1534067988395691, "learning_rate": 2.862662615035244e-05, "loss": 0.3472, "num_input_tokens_seen": 1127792, "step": 2870 }, { "epoch": 25.9009009009009, "grad_norm": 0.127284973859787, "learning_rate": 2.8020832963081776e-05, "loss": 0.3416, "num_input_tokens_seen": 1129680, "step": 2875 }, { "epoch": 25.945945945945947, "grad_norm": 0.17194759845733643, "learning_rate": 2.742113763640286e-05, "loss": 0.3372, "num_input_tokens_seen": 1131728, "step": 2880 }, { "epoch": 25.99099099099099, "grad_norm": 0.11222639679908752, "learning_rate": 2.682755664419717e-05, "loss": 0.346, "num_input_tokens_seen": 1133456, "step": 2885 }, { "epoch": 26.036036036036037, "grad_norm": 0.36060425639152527, "learning_rate": 2.624010629238302e-05, "loss": 0.3535, "num_input_tokens_seen": 1135240, "step": 2890 }, { "epoch": 26.08108108108108, "grad_norm": 0.09998419135808945, "learning_rate": 2.565880271846735e-05, "loss": 0.3258, "num_input_tokens_seen": 1137320, "step": 2895 }, { "epoch": 26.126126126126128, "grad_norm": 0.09765253216028214, "learning_rate": 2.5083661891102477e-05, "loss": 0.3612, "num_input_tokens_seen": 1139912, "step": 2900 }, { "epoch": 26.17117117117117, "grad_norm": 0.10090460628271103, "learning_rate": 2.451469960964764e-05, "loss": 0.3308, "num_input_tokens_seen": 1142248, "step": 2905 }, { "epoch": 26.216216216216218, "grad_norm": 0.21640998125076294, "learning_rate": 2.3951931503734676e-05, "loss": 0.3539, "num_input_tokens_seen": 1144168, "step": 2910 }, { "epoch": 26.26126126126126, "grad_norm": 0.35022076964378357, "learning_rate": 2.3395373032838924e-05, "loss": 0.3372, "num_input_tokens_seen": 1146376, "step": 2915 }, { "epoch": 26.306306306306308, "grad_norm": 0.18222667276859283, "learning_rate": 2.2845039485854537e-05, "loss": 0.3526, "num_input_tokens_seen": 1148136, "step": 2920 }, { "epoch": 26.35135135135135, "grad_norm": 0.024071596562862396, "learning_rate": 2.2300945980674226e-05, "loss": 0.3441, "num_input_tokens_seen": 1149736, "step": 2925 }, { "epoch": 26.396396396396398, "grad_norm": 0.13232316076755524, "learning_rate": 2.176310746377416e-05, "loss": 0.3332, "num_input_tokens_seen": 1151496, "step": 2930 }, { "epoch": 26.44144144144144, "grad_norm": 0.05208688601851463, "learning_rate": 2.1231538709803488e-05, "loss": 0.3493, "num_input_tokens_seen": 1153320, "step": 2935 }, { "epoch": 26.486486486486488, "grad_norm": 0.050381213426589966, "learning_rate": 2.0706254321178288e-05, "loss": 0.3377, "num_input_tokens_seen": 1155240, "step": 2940 }, { "epoch": 26.53153153153153, "grad_norm": 0.19044794142246246, "learning_rate": 2.0187268727680508e-05, "loss": 0.3597, "num_input_tokens_seen": 1157032, "step": 2945 }, { "epoch": 26.576576576576578, "grad_norm": 0.05769636854529381, "learning_rate": 1.9674596186061516e-05, "loss": 0.3572, "num_input_tokens_seen": 1158920, "step": 2950 }, { "epoch": 26.62162162162162, "grad_norm": 0.13890323042869568, "learning_rate": 1.916825077965048e-05, "loss": 0.3529, "num_input_tokens_seen": 1161320, "step": 2955 }, { "epoch": 26.666666666666668, "grad_norm": 0.29236072301864624, "learning_rate": 1.8668246417967606e-05, "loss": 0.3343, "num_input_tokens_seen": 1163112, "step": 2960 }, { "epoch": 26.71171171171171, "grad_norm": 0.27966803312301636, "learning_rate": 1.8174596836341927e-05, "loss": 0.3403, "num_input_tokens_seen": 1165320, "step": 2965 }, { "epoch": 26.756756756756758, "grad_norm": 0.35097646713256836, "learning_rate": 1.7687315595533937e-05, "loss": 0.3671, "num_input_tokens_seen": 1167304, "step": 2970 }, { "epoch": 26.8018018018018, "grad_norm": 0.028320608660578728, "learning_rate": 1.7206416081363253e-05, "loss": 0.3338, "num_input_tokens_seen": 1169320, "step": 2975 }, { "epoch": 26.846846846846848, "grad_norm": 0.14809466898441315, "learning_rate": 1.6731911504340667e-05, "loss": 0.3442, "num_input_tokens_seen": 1171208, "step": 2980 }, { "epoch": 26.89189189189189, "grad_norm": 0.023352719843387604, "learning_rate": 1.626381489930545e-05, "loss": 0.3428, "num_input_tokens_seen": 1173096, "step": 2985 }, { "epoch": 26.936936936936938, "grad_norm": 0.1471579521894455, "learning_rate": 1.5802139125067256e-05, "loss": 0.3512, "num_input_tokens_seen": 1174792, "step": 2990 }, { "epoch": 26.98198198198198, "grad_norm": 0.30509668588638306, "learning_rate": 1.534689686405272e-05, "loss": 0.3498, "num_input_tokens_seen": 1176808, "step": 2995 }, { "epoch": 27.027027027027028, "grad_norm": 0.17820541560649872, "learning_rate": 1.489810062195715e-05, "loss": 0.3536, "num_input_tokens_seen": 1179040, "step": 3000 }, { "epoch": 27.07207207207207, "grad_norm": 0.1620960533618927, "learning_rate": 1.445576272740115e-05, "loss": 0.3353, "num_input_tokens_seen": 1180928, "step": 3005 }, { "epoch": 27.08108108108108, "eval_loss": 0.35029834508895874, "eval_runtime": 1.3145, "eval_samples_per_second": 84.445, "eval_steps_per_second": 21.302, "num_input_tokens_seen": 1181344, "step": 3006 }, { "epoch": 27.117117117117118, "grad_norm": 0.13568507134914398, "learning_rate": 1.4019895331591787e-05, "loss": 0.3471, "num_input_tokens_seen": 1183072, "step": 3010 }, { "epoch": 27.16216216216216, "grad_norm": 0.2911302149295807, "learning_rate": 1.3590510407988698e-05, "loss": 0.3242, "num_input_tokens_seen": 1185024, "step": 3015 }, { "epoch": 27.207207207207208, "grad_norm": 0.17543275654315948, "learning_rate": 1.3167619751975501e-05, "loss": 0.3498, "num_input_tokens_seen": 1186912, "step": 3020 }, { "epoch": 27.25225225225225, "grad_norm": 0.13161693513393402, "learning_rate": 1.275123498053532e-05, "loss": 0.3498, "num_input_tokens_seen": 1188768, "step": 3025 }, { "epoch": 27.2972972972973, "grad_norm": 0.19500212371349335, "learning_rate": 1.2341367531932101e-05, "loss": 0.3709, "num_input_tokens_seen": 1190784, "step": 3030 }, { "epoch": 27.34234234234234, "grad_norm": 0.11825984716415405, "learning_rate": 1.1938028665396173e-05, "loss": 0.3335, "num_input_tokens_seen": 1192896, "step": 3035 }, { "epoch": 27.38738738738739, "grad_norm": 0.11076091229915619, "learning_rate": 1.1541229460814929e-05, "loss": 0.3332, "num_input_tokens_seen": 1194560, "step": 3040 }, { "epoch": 27.43243243243243, "grad_norm": 0.17395968735218048, "learning_rate": 1.115098081842844e-05, "loss": 0.3552, "num_input_tokens_seen": 1196320, "step": 3045 }, { "epoch": 27.47747747747748, "grad_norm": 0.13586698472499847, "learning_rate": 1.0767293458530336e-05, "loss": 0.3316, "num_input_tokens_seen": 1198368, "step": 3050 }, { "epoch": 27.52252252252252, "grad_norm": 0.14617836475372314, "learning_rate": 1.0390177921172862e-05, "loss": 0.3363, "num_input_tokens_seen": 1200224, "step": 3055 }, { "epoch": 27.56756756756757, "grad_norm": 0.03922060504555702, "learning_rate": 1.0019644565877562e-05, "loss": 0.3267, "num_input_tokens_seen": 1202496, "step": 3060 }, { "epoch": 27.61261261261261, "grad_norm": 0.32467979192733765, "learning_rate": 9.655703571350789e-06, "loss": 0.3418, "num_input_tokens_seen": 1204416, "step": 3065 }, { "epoch": 27.65765765765766, "grad_norm": 0.021317536011338234, "learning_rate": 9.298364935203917e-06, "loss": 0.3501, "num_input_tokens_seen": 1206176, "step": 3070 }, { "epoch": 27.7027027027027, "grad_norm": 0.05702093616127968, "learning_rate": 8.94763847367877e-06, "loss": 0.3446, "num_input_tokens_seen": 1208512, "step": 3075 }, { "epoch": 27.74774774774775, "grad_norm": 0.3301272988319397, "learning_rate": 8.603533821378046e-06, "loss": 0.3553, "num_input_tokens_seen": 1210464, "step": 3080 }, { "epoch": 27.792792792792792, "grad_norm": 0.14331293106079102, "learning_rate": 8.26606043100045e-06, "loss": 0.3434, "num_input_tokens_seen": 1212608, "step": 3085 }, { "epoch": 27.83783783783784, "grad_norm": 0.12238102406263351, "learning_rate": 7.935227573081183e-06, "loss": 0.3321, "num_input_tokens_seen": 1214624, "step": 3090 }, { "epoch": 27.882882882882882, "grad_norm": 0.1720043122768402, "learning_rate": 7.611044335737366e-06, "loss": 0.3448, "num_input_tokens_seen": 1216672, "step": 3095 }, { "epoch": 27.92792792792793, "grad_norm": 0.02903144247829914, "learning_rate": 7.293519624418099e-06, "loss": 0.3406, "num_input_tokens_seen": 1218624, "step": 3100 }, { "epoch": 27.972972972972972, "grad_norm": 0.36137476563453674, "learning_rate": 6.982662161660047e-06, "loss": 0.3818, "num_input_tokens_seen": 1220672, "step": 3105 }, { "epoch": 28.01801801801802, "grad_norm": 0.16625815629959106, "learning_rate": 6.678480486847771e-06, "loss": 0.3454, "num_input_tokens_seen": 1222360, "step": 3110 }, { "epoch": 28.063063063063062, "grad_norm": 0.031620051711797714, "learning_rate": 6.380982955979192e-06, "loss": 0.3346, "num_input_tokens_seen": 1224440, "step": 3115 }, { "epoch": 28.10810810810811, "grad_norm": 0.17372798919677734, "learning_rate": 6.090177741435915e-06, "loss": 0.3644, "num_input_tokens_seen": 1226136, "step": 3120 }, { "epoch": 28.153153153153152, "grad_norm": 0.32577383518218994, "learning_rate": 5.806072831758852e-06, "loss": 0.3603, "num_input_tokens_seen": 1228600, "step": 3125 }, { "epoch": 28.1981981981982, "grad_norm": 0.14194470643997192, "learning_rate": 5.528676031428731e-06, "loss": 0.3471, "num_input_tokens_seen": 1230520, "step": 3130 }, { "epoch": 28.243243243243242, "grad_norm": 0.1708410531282425, "learning_rate": 5.257994960651713e-06, "loss": 0.3421, "num_input_tokens_seen": 1232536, "step": 3135 }, { "epoch": 28.28828828828829, "grad_norm": 0.135078564286232, "learning_rate": 4.994037055150114e-06, "loss": 0.3431, "num_input_tokens_seen": 1234584, "step": 3140 }, { "epoch": 28.333333333333332, "grad_norm": 0.02283155731856823, "learning_rate": 4.736809565958011e-06, "loss": 0.3514, "num_input_tokens_seen": 1236248, "step": 3145 }, { "epoch": 28.37837837837838, "grad_norm": 0.0380808524787426, "learning_rate": 4.486319559222101e-06, "loss": 0.3397, "num_input_tokens_seen": 1238008, "step": 3150 }, { "epoch": 28.423423423423422, "grad_norm": 0.1378352791070938, "learning_rate": 4.242573916007686e-06, "loss": 0.333, "num_input_tokens_seen": 1239608, "step": 3155 }, { "epoch": 28.46846846846847, "grad_norm": 0.14813214540481567, "learning_rate": 4.005579332109627e-06, "loss": 0.3506, "num_input_tokens_seen": 1241240, "step": 3160 }, { "epoch": 28.513513513513512, "grad_norm": 0.04156951606273651, "learning_rate": 3.7753423178682466e-06, "loss": 0.3549, "num_input_tokens_seen": 1243160, "step": 3165 }, { "epoch": 28.55855855855856, "grad_norm": 0.052327558398246765, "learning_rate": 3.5518691979906925e-06, "loss": 0.3507, "num_input_tokens_seen": 1245496, "step": 3170 }, { "epoch": 28.585585585585587, "eval_loss": 0.35120949149131775, "eval_runtime": 1.2816, "eval_samples_per_second": 86.608, "eval_steps_per_second": 21.847, "num_input_tokens_seen": 1246872, "step": 3173 }, { "epoch": 28.603603603603602, "grad_norm": 0.03956664353609085, "learning_rate": 3.3351661113769918e-06, "loss": 0.3448, "num_input_tokens_seen": 1247640, "step": 3175 }, { "epoch": 28.64864864864865, "grad_norm": 0.17305274307727814, "learning_rate": 3.125239010951686e-06, "loss": 0.3538, "num_input_tokens_seen": 1249848, "step": 3180 }, { "epoch": 28.693693693693692, "grad_norm": 0.3353995084762573, "learning_rate": 2.9220936635000196e-06, "loss": 0.3518, "num_input_tokens_seen": 1251608, "step": 3185 }, { "epoch": 28.73873873873874, "grad_norm": 0.13647249341011047, "learning_rate": 2.7257356495096754e-06, "loss": 0.3451, "num_input_tokens_seen": 1253272, "step": 3190 }, { "epoch": 28.783783783783782, "grad_norm": 0.2923048138618469, "learning_rate": 2.536170363017426e-06, "loss": 0.341, "num_input_tokens_seen": 1255256, "step": 3195 }, { "epoch": 28.82882882882883, "grad_norm": 0.13797777891159058, "learning_rate": 2.3534030114610585e-06, "loss": 0.3323, "num_input_tokens_seen": 1257176, "step": 3200 }, { "epoch": 28.873873873873872, "grad_norm": 0.14143967628479004, "learning_rate": 2.1774386155361538e-06, "loss": 0.3504, "num_input_tokens_seen": 1259320, "step": 3205 }, { "epoch": 28.91891891891892, "grad_norm": 0.034025486558675766, "learning_rate": 2.008282009058282e-06, "loss": 0.3303, "num_input_tokens_seen": 1261208, "step": 3210 }, { "epoch": 28.963963963963963, "grad_norm": 0.05370686575770378, "learning_rate": 1.8459378388302473e-06, "loss": 0.3429, "num_input_tokens_seen": 1262872, "step": 3215 }, { "epoch": 29.00900900900901, "grad_norm": 0.12289952486753464, "learning_rate": 1.6904105645142442e-06, "loss": 0.3352, "num_input_tokens_seen": 1265088, "step": 3220 }, { "epoch": 29.054054054054053, "grad_norm": 0.035038284957408905, "learning_rate": 1.5417044585096517e-06, "loss": 0.3588, "num_input_tokens_seen": 1267008, "step": 3225 }, { "epoch": 29.0990990990991, "grad_norm": 0.040971461683511734, "learning_rate": 1.3998236058353764e-06, "loss": 0.3505, "num_input_tokens_seen": 1268992, "step": 3230 }, { "epoch": 29.144144144144143, "grad_norm": 0.035222794860601425, "learning_rate": 1.264771904017803e-06, "loss": 0.3424, "num_input_tokens_seen": 1270752, "step": 3235 }, { "epoch": 29.18918918918919, "grad_norm": 0.3184782862663269, "learning_rate": 1.1365530629836863e-06, "loss": 0.3463, "num_input_tokens_seen": 1272512, "step": 3240 }, { "epoch": 29.234234234234233, "grad_norm": 0.028321480378508568, "learning_rate": 1.0151706049582322e-06, "loss": 0.3462, "num_input_tokens_seen": 1274240, "step": 3245 }, { "epoch": 29.27927927927928, "grad_norm": 0.11761516332626343, "learning_rate": 9.006278643683696e-07, "loss": 0.3445, "num_input_tokens_seen": 1276320, "step": 3250 }, { "epoch": 29.324324324324323, "grad_norm": 0.12780724465847015, "learning_rate": 7.92927987751102e-07, "loss": 0.3417, "num_input_tokens_seen": 1278400, "step": 3255 }, { "epoch": 29.36936936936937, "grad_norm": 0.02126743271946907, "learning_rate": 6.920739336670756e-07, "loss": 0.3496, "num_input_tokens_seen": 1280352, "step": 3260 }, { "epoch": 29.414414414414413, "grad_norm": 0.12913310527801514, "learning_rate": 5.980684726193397e-07, "loss": 0.3384, "num_input_tokens_seen": 1282752, "step": 3265 }, { "epoch": 29.45945945945946, "grad_norm": 0.14425279200077057, "learning_rate": 5.10914186977296e-07, "loss": 0.3539, "num_input_tokens_seen": 1284704, "step": 3270 }, { "epoch": 29.504504504504503, "grad_norm": 0.14810092747211456, "learning_rate": 4.3061347090558866e-07, "loss": 0.3227, "num_input_tokens_seen": 1286720, "step": 3275 }, { "epoch": 29.54954954954955, "grad_norm": 0.027806531637907028, "learning_rate": 3.5716853029851837e-07, "loss": 0.353, "num_input_tokens_seen": 1289120, "step": 3280 }, { "epoch": 29.594594594594593, "grad_norm": 0.16533631086349487, "learning_rate": 2.905813827193127e-07, "loss": 0.3473, "num_input_tokens_seen": 1290944, "step": 3285 }, { "epoch": 29.63963963963964, "grad_norm": 0.3269529640674591, "learning_rate": 2.3085385734475384e-07, "loss": 0.3605, "num_input_tokens_seen": 1292832, "step": 3290 }, { "epoch": 29.684684684684683, "grad_norm": 0.03714149072766304, "learning_rate": 1.7798759491499673e-07, "loss": 0.3441, "num_input_tokens_seen": 1294560, "step": 3295 }, { "epoch": 29.72972972972973, "grad_norm": 0.3366108238697052, "learning_rate": 1.3198404768835491e-07, "loss": 0.3563, "num_input_tokens_seen": 1296352, "step": 3300 }, { "epoch": 29.774774774774773, "grad_norm": 0.1254447102546692, "learning_rate": 9.284447940152707e-08, "loss": 0.339, "num_input_tokens_seen": 1298208, "step": 3305 }, { "epoch": 29.81981981981982, "grad_norm": 0.1845274418592453, "learning_rate": 6.056996523484682e-08, "loss": 0.357, "num_input_tokens_seen": 1300288, "step": 3310 }, { "epoch": 29.864864864864863, "grad_norm": 0.133527472615242, "learning_rate": 3.516139178272315e-08, "loss": 0.3343, "num_input_tokens_seen": 1302656, "step": 3315 }, { "epoch": 29.90990990990991, "grad_norm": 0.16501377522945404, "learning_rate": 1.6619457029243278e-08, "loss": 0.353, "num_input_tokens_seen": 1304320, "step": 3320 }, { "epoch": 29.954954954954957, "grad_norm": 0.1411769539117813, "learning_rate": 4.944670329187772e-09, "loss": 0.3341, "num_input_tokens_seen": 1306432, "step": 3325 }, { "epoch": 30.0, "grad_norm": 0.10993507504463196, "learning_rate": 1.373523937919785e-10, "loss": 0.3352, "num_input_tokens_seen": 1308280, "step": 3330 }, { "epoch": 30.0, "num_input_tokens_seen": 1308280, "step": 3330, "total_flos": 5.891125709930496e+16, "train_loss": 0.431116147871848, "train_runtime": 416.8355, "train_samples_per_second": 31.883, "train_steps_per_second": 7.989 } ], "logging_steps": 5, "max_steps": 3330, "num_input_tokens_seen": 1308280, "num_train_epochs": 30, "save_steps": 167, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.891125709930496e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }