diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31008 @@ +{ + "best_global_step": 16354, + "best_metric": 0.24837632477283478, + "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_cola_1754652147/checkpoint-16354", + "epoch": 10.0, + "eval_steps": 962, + "global_step": 19240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002598752598752599, + "grad_norm": 3.0059990882873535, + "learning_rate": 1.0395010395010396e-07, + "loss": 13.9075, + "num_input_tokens_seen": 896, + "step": 5 + }, + { + "epoch": 0.005197505197505198, + "grad_norm": 3.020296335220337, + "learning_rate": 2.338877338877339e-07, + "loss": 13.7206, + "num_input_tokens_seen": 1824, + "step": 10 + }, + { + "epoch": 0.007796257796257797, + "grad_norm": 3.3200490474700928, + "learning_rate": 3.6382536382536384e-07, + "loss": 14.0019, + "num_input_tokens_seen": 2720, + "step": 15 + }, + { + "epoch": 0.010395010395010396, + "grad_norm": 3.6123461723327637, + "learning_rate": 4.937629937629938e-07, + "loss": 13.8605, + "num_input_tokens_seen": 3680, + "step": 20 + }, + { + "epoch": 0.012993762993762994, + "grad_norm": 3.518925905227661, + "learning_rate": 6.237006237006237e-07, + "loss": 14.0398, + "num_input_tokens_seen": 4640, + "step": 25 + }, + { + "epoch": 0.015592515592515593, + "grad_norm": 3.2209320068359375, + "learning_rate": 7.536382536382538e-07, + "loss": 13.5903, + "num_input_tokens_seen": 5632, + "step": 30 + }, + { + "epoch": 0.018191268191268192, + "grad_norm": 3.2745141983032227, + "learning_rate": 8.835758835758837e-07, + "loss": 13.669, + "num_input_tokens_seen": 6592, + "step": 35 + }, + { + "epoch": 0.02079002079002079, + "grad_norm": 3.187134265899658, + "learning_rate": 1.0135135135135136e-06, + "loss": 13.7673, + "num_input_tokens_seen": 7584, + "step": 40 + }, + { + "epoch": 0.02338877338877339, + "grad_norm": 3.045395612716675, + "learning_rate": 1.1434511434511436e-06, + "loss": 13.8371, + "num_input_tokens_seen": 8512, + "step": 45 + }, + { + "epoch": 0.02598752598752599, + "grad_norm": 3.2816221714019775, + "learning_rate": 1.2733887733887735e-06, + "loss": 13.9043, + "num_input_tokens_seen": 9472, + "step": 50 + }, + { + "epoch": 0.028586278586278588, + "grad_norm": 3.1714930534362793, + "learning_rate": 1.4033264033264034e-06, + "loss": 13.9473, + "num_input_tokens_seen": 10400, + "step": 55 + }, + { + "epoch": 0.031185031185031187, + "grad_norm": 3.3869543075561523, + "learning_rate": 1.5332640332640334e-06, + "loss": 13.852, + "num_input_tokens_seen": 11264, + "step": 60 + }, + { + "epoch": 0.033783783783783786, + "grad_norm": 2.8117010593414307, + "learning_rate": 1.6632016632016633e-06, + "loss": 13.8219, + "num_input_tokens_seen": 12224, + "step": 65 + }, + { + "epoch": 0.036382536382536385, + "grad_norm": 2.859619379043579, + "learning_rate": 1.7931392931392932e-06, + "loss": 13.5102, + "num_input_tokens_seen": 13248, + "step": 70 + }, + { + "epoch": 0.03898128898128898, + "grad_norm": 2.9433155059814453, + "learning_rate": 1.9230769230769234e-06, + "loss": 13.7374, + "num_input_tokens_seen": 14304, + "step": 75 + }, + { + "epoch": 0.04158004158004158, + "grad_norm": 2.6253137588500977, + "learning_rate": 2.053014553014553e-06, + "loss": 13.4974, + "num_input_tokens_seen": 15232, + "step": 80 + }, + { + "epoch": 0.04417879417879418, + "grad_norm": 2.7899160385131836, + "learning_rate": 2.1829521829521833e-06, + "loss": 13.7278, + "num_input_tokens_seen": 16128, + "step": 85 + }, + { + "epoch": 0.04677754677754678, + "grad_norm": 3.366900682449341, + "learning_rate": 2.312889812889813e-06, + "loss": 13.8525, + "num_input_tokens_seen": 17088, + "step": 90 + }, + { + "epoch": 0.04937629937629938, + "grad_norm": 3.0370240211486816, + "learning_rate": 2.442827442827443e-06, + "loss": 13.3514, + "num_input_tokens_seen": 18016, + "step": 95 + }, + { + "epoch": 0.05197505197505198, + "grad_norm": 2.9941654205322266, + "learning_rate": 2.572765072765073e-06, + "loss": 13.5308, + "num_input_tokens_seen": 18976, + "step": 100 + }, + { + "epoch": 0.05457380457380458, + "grad_norm": 3.059682607650757, + "learning_rate": 2.702702702702703e-06, + "loss": 13.1873, + "num_input_tokens_seen": 19936, + "step": 105 + }, + { + "epoch": 0.057172557172557176, + "grad_norm": 2.859891891479492, + "learning_rate": 2.8326403326403327e-06, + "loss": 13.4624, + "num_input_tokens_seen": 20896, + "step": 110 + }, + { + "epoch": 0.059771309771309775, + "grad_norm": 2.761906147003174, + "learning_rate": 2.962577962577963e-06, + "loss": 13.7311, + "num_input_tokens_seen": 21856, + "step": 115 + }, + { + "epoch": 0.062370062370062374, + "grad_norm": 2.9455690383911133, + "learning_rate": 3.092515592515593e-06, + "loss": 13.465, + "num_input_tokens_seen": 22784, + "step": 120 + }, + { + "epoch": 0.06496881496881497, + "grad_norm": 2.9861156940460205, + "learning_rate": 3.2224532224532228e-06, + "loss": 13.3296, + "num_input_tokens_seen": 23776, + "step": 125 + }, + { + "epoch": 0.06756756756756757, + "grad_norm": 3.0924017429351807, + "learning_rate": 3.352390852390853e-06, + "loss": 13.2113, + "num_input_tokens_seen": 24736, + "step": 130 + }, + { + "epoch": 0.07016632016632017, + "grad_norm": 3.0665535926818848, + "learning_rate": 3.4823284823284826e-06, + "loss": 13.3609, + "num_input_tokens_seen": 25600, + "step": 135 + }, + { + "epoch": 0.07276507276507277, + "grad_norm": 2.8391406536102295, + "learning_rate": 3.6122661122661128e-06, + "loss": 13.2187, + "num_input_tokens_seen": 26560, + "step": 140 + }, + { + "epoch": 0.07536382536382537, + "grad_norm": 3.220362901687622, + "learning_rate": 3.7422037422037425e-06, + "loss": 13.1309, + "num_input_tokens_seen": 27552, + "step": 145 + }, + { + "epoch": 0.07796257796257797, + "grad_norm": 3.070683717727661, + "learning_rate": 3.872141372141373e-06, + "loss": 12.9245, + "num_input_tokens_seen": 28512, + "step": 150 + }, + { + "epoch": 0.08056133056133057, + "grad_norm": 2.6470870971679688, + "learning_rate": 4.002079002079003e-06, + "loss": 12.9539, + "num_input_tokens_seen": 29536, + "step": 155 + }, + { + "epoch": 0.08316008316008316, + "grad_norm": 3.2435288429260254, + "learning_rate": 4.132016632016632e-06, + "loss": 12.8396, + "num_input_tokens_seen": 30432, + "step": 160 + }, + { + "epoch": 0.08575883575883576, + "grad_norm": 2.6646361351013184, + "learning_rate": 4.261954261954262e-06, + "loss": 12.6582, + "num_input_tokens_seen": 31392, + "step": 165 + }, + { + "epoch": 0.08835758835758836, + "grad_norm": 2.8364510536193848, + "learning_rate": 4.391891891891892e-06, + "loss": 12.7215, + "num_input_tokens_seen": 32352, + "step": 170 + }, + { + "epoch": 0.09095634095634096, + "grad_norm": 3.2493977546691895, + "learning_rate": 4.5218295218295225e-06, + "loss": 12.6902, + "num_input_tokens_seen": 33312, + "step": 175 + }, + { + "epoch": 0.09355509355509356, + "grad_norm": 3.3316895961761475, + "learning_rate": 4.651767151767152e-06, + "loss": 12.7317, + "num_input_tokens_seen": 34208, + "step": 180 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 2.9660866260528564, + "learning_rate": 4.781704781704782e-06, + "loss": 12.4408, + "num_input_tokens_seen": 35104, + "step": 185 + }, + { + "epoch": 0.09875259875259876, + "grad_norm": 2.9528608322143555, + "learning_rate": 4.911642411642412e-06, + "loss": 12.681, + "num_input_tokens_seen": 36032, + "step": 190 + }, + { + "epoch": 0.10135135135135136, + "grad_norm": 2.7190122604370117, + "learning_rate": 5.041580041580042e-06, + "loss": 12.6177, + "num_input_tokens_seen": 36960, + "step": 195 + }, + { + "epoch": 0.10395010395010396, + "grad_norm": 2.9271609783172607, + "learning_rate": 5.1715176715176724e-06, + "loss": 12.3561, + "num_input_tokens_seen": 37856, + "step": 200 + }, + { + "epoch": 0.10654885654885655, + "grad_norm": 3.120872735977173, + "learning_rate": 5.301455301455302e-06, + "loss": 12.2379, + "num_input_tokens_seen": 38848, + "step": 205 + }, + { + "epoch": 0.10914760914760915, + "grad_norm": 2.6395623683929443, + "learning_rate": 5.431392931392932e-06, + "loss": 12.221, + "num_input_tokens_seen": 39776, + "step": 210 + }, + { + "epoch": 0.11174636174636175, + "grad_norm": 2.8992276191711426, + "learning_rate": 5.561330561330562e-06, + "loss": 12.2229, + "num_input_tokens_seen": 40736, + "step": 215 + }, + { + "epoch": 0.11434511434511435, + "grad_norm": 3.075824022293091, + "learning_rate": 5.691268191268192e-06, + "loss": 11.9656, + "num_input_tokens_seen": 41760, + "step": 220 + }, + { + "epoch": 0.11694386694386695, + "grad_norm": 2.9675402641296387, + "learning_rate": 5.8212058212058215e-06, + "loss": 12.1465, + "num_input_tokens_seen": 42720, + "step": 225 + }, + { + "epoch": 0.11954261954261955, + "grad_norm": 2.762045383453369, + "learning_rate": 5.951143451143452e-06, + "loss": 11.8505, + "num_input_tokens_seen": 43680, + "step": 230 + }, + { + "epoch": 0.12214137214137215, + "grad_norm": 3.093644857406616, + "learning_rate": 6.081081081081082e-06, + "loss": 11.8152, + "num_input_tokens_seen": 44640, + "step": 235 + }, + { + "epoch": 0.12474012474012475, + "grad_norm": 2.728724956512451, + "learning_rate": 6.211018711018712e-06, + "loss": 11.8123, + "num_input_tokens_seen": 45568, + "step": 240 + }, + { + "epoch": 0.12733887733887733, + "grad_norm": 2.7196667194366455, + "learning_rate": 6.340956340956341e-06, + "loss": 11.6402, + "num_input_tokens_seen": 46592, + "step": 245 + }, + { + "epoch": 0.12993762993762994, + "grad_norm": 2.647512435913086, + "learning_rate": 6.4708939708939705e-06, + "loss": 11.575, + "num_input_tokens_seen": 47520, + "step": 250 + }, + { + "epoch": 0.13253638253638253, + "grad_norm": 2.9531850814819336, + "learning_rate": 6.6008316008316015e-06, + "loss": 11.6825, + "num_input_tokens_seen": 48416, + "step": 255 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 3.1519265174865723, + "learning_rate": 6.730769230769231e-06, + "loss": 11.5589, + "num_input_tokens_seen": 49376, + "step": 260 + }, + { + "epoch": 0.13773388773388773, + "grad_norm": 2.579700231552124, + "learning_rate": 6.860706860706862e-06, + "loss": 11.5301, + "num_input_tokens_seen": 50304, + "step": 265 + }, + { + "epoch": 0.14033264033264034, + "grad_norm": 3.4171931743621826, + "learning_rate": 6.99064449064449e-06, + "loss": 11.2039, + "num_input_tokens_seen": 51200, + "step": 270 + }, + { + "epoch": 0.14293139293139293, + "grad_norm": 2.903115749359131, + "learning_rate": 7.120582120582121e-06, + "loss": 11.0651, + "num_input_tokens_seen": 52192, + "step": 275 + }, + { + "epoch": 0.14553014553014554, + "grad_norm": 2.6465282440185547, + "learning_rate": 7.250519750519751e-06, + "loss": 10.9984, + "num_input_tokens_seen": 53152, + "step": 280 + }, + { + "epoch": 0.14812889812889812, + "grad_norm": 2.7348976135253906, + "learning_rate": 7.3804573804573816e-06, + "loss": 10.9461, + "num_input_tokens_seen": 54048, + "step": 285 + }, + { + "epoch": 0.15072765072765074, + "grad_norm": 2.7884507179260254, + "learning_rate": 7.510395010395011e-06, + "loss": 11.0144, + "num_input_tokens_seen": 55072, + "step": 290 + }, + { + "epoch": 0.15332640332640332, + "grad_norm": 2.7380189895629883, + "learning_rate": 7.640332640332642e-06, + "loss": 10.6464, + "num_input_tokens_seen": 56000, + "step": 295 + }, + { + "epoch": 0.15592515592515593, + "grad_norm": 2.900484085083008, + "learning_rate": 7.77027027027027e-06, + "loss": 10.8101, + "num_input_tokens_seen": 57024, + "step": 300 + }, + { + "epoch": 0.15852390852390852, + "grad_norm": 2.7604472637176514, + "learning_rate": 7.9002079002079e-06, + "loss": 10.6473, + "num_input_tokens_seen": 58048, + "step": 305 + }, + { + "epoch": 0.16112266112266113, + "grad_norm": 2.9122323989868164, + "learning_rate": 8.03014553014553e-06, + "loss": 10.3487, + "num_input_tokens_seen": 59040, + "step": 310 + }, + { + "epoch": 0.16372141372141372, + "grad_norm": 3.269867181777954, + "learning_rate": 8.16008316008316e-06, + "loss": 10.513, + "num_input_tokens_seen": 59968, + "step": 315 + }, + { + "epoch": 0.16632016632016633, + "grad_norm": 3.1871588230133057, + "learning_rate": 8.290020790020791e-06, + "loss": 10.3878, + "num_input_tokens_seen": 60928, + "step": 320 + }, + { + "epoch": 0.16891891891891891, + "grad_norm": 3.362354278564453, + "learning_rate": 8.419958419958421e-06, + "loss": 10.2486, + "num_input_tokens_seen": 61888, + "step": 325 + }, + { + "epoch": 0.17151767151767153, + "grad_norm": 2.6884210109710693, + "learning_rate": 8.54989604989605e-06, + "loss": 9.7556, + "num_input_tokens_seen": 62816, + "step": 330 + }, + { + "epoch": 0.1741164241164241, + "grad_norm": 2.931689977645874, + "learning_rate": 8.679833679833681e-06, + "loss": 9.9993, + "num_input_tokens_seen": 63680, + "step": 335 + }, + { + "epoch": 0.17671517671517672, + "grad_norm": 2.686624526977539, + "learning_rate": 8.80977130977131e-06, + "loss": 9.9069, + "num_input_tokens_seen": 64672, + "step": 340 + }, + { + "epoch": 0.1793139293139293, + "grad_norm": 3.08392071723938, + "learning_rate": 8.93970893970894e-06, + "loss": 9.6993, + "num_input_tokens_seen": 65568, + "step": 345 + }, + { + "epoch": 0.18191268191268192, + "grad_norm": 2.9747867584228516, + "learning_rate": 9.06964656964657e-06, + "loss": 9.7216, + "num_input_tokens_seen": 66560, + "step": 350 + }, + { + "epoch": 0.1845114345114345, + "grad_norm": 2.819876194000244, + "learning_rate": 9.1995841995842e-06, + "loss": 9.4183, + "num_input_tokens_seen": 67488, + "step": 355 + }, + { + "epoch": 0.18711018711018712, + "grad_norm": 3.075188398361206, + "learning_rate": 9.32952182952183e-06, + "loss": 9.1652, + "num_input_tokens_seen": 68416, + "step": 360 + }, + { + "epoch": 0.1897089397089397, + "grad_norm": 2.6514804363250732, + "learning_rate": 9.45945945945946e-06, + "loss": 9.5381, + "num_input_tokens_seen": 69440, + "step": 365 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 3.059100866317749, + "learning_rate": 9.589397089397089e-06, + "loss": 9.3588, + "num_input_tokens_seen": 70400, + "step": 370 + }, + { + "epoch": 0.1949064449064449, + "grad_norm": 2.6874988079071045, + "learning_rate": 9.719334719334721e-06, + "loss": 9.1104, + "num_input_tokens_seen": 71360, + "step": 375 + }, + { + "epoch": 0.19750519750519752, + "grad_norm": 2.8047313690185547, + "learning_rate": 9.84927234927235e-06, + "loss": 8.8926, + "num_input_tokens_seen": 72288, + "step": 380 + }, + { + "epoch": 0.2001039501039501, + "grad_norm": 2.81318998336792, + "learning_rate": 9.97920997920998e-06, + "loss": 9.0372, + "num_input_tokens_seen": 73280, + "step": 385 + }, + { + "epoch": 0.20270270270270271, + "grad_norm": 2.4623029232025146, + "learning_rate": 1.010914760914761e-05, + "loss": 8.6423, + "num_input_tokens_seen": 74272, + "step": 390 + }, + { + "epoch": 0.2053014553014553, + "grad_norm": 2.61942195892334, + "learning_rate": 1.023908523908524e-05, + "loss": 8.5798, + "num_input_tokens_seen": 75296, + "step": 395 + }, + { + "epoch": 0.2079002079002079, + "grad_norm": 2.613299608230591, + "learning_rate": 1.036902286902287e-05, + "loss": 8.3479, + "num_input_tokens_seen": 76224, + "step": 400 + }, + { + "epoch": 0.2104989604989605, + "grad_norm": 2.651472330093384, + "learning_rate": 1.04989604989605e-05, + "loss": 8.2528, + "num_input_tokens_seen": 77088, + "step": 405 + }, + { + "epoch": 0.2130977130977131, + "grad_norm": 2.5281894207000732, + "learning_rate": 1.0628898128898128e-05, + "loss": 8.1458, + "num_input_tokens_seen": 78048, + "step": 410 + }, + { + "epoch": 0.2156964656964657, + "grad_norm": 2.613604784011841, + "learning_rate": 1.075883575883576e-05, + "loss": 7.9118, + "num_input_tokens_seen": 79040, + "step": 415 + }, + { + "epoch": 0.2182952182952183, + "grad_norm": 2.5554122924804688, + "learning_rate": 1.0888773388773389e-05, + "loss": 7.9518, + "num_input_tokens_seen": 80032, + "step": 420 + }, + { + "epoch": 0.2208939708939709, + "grad_norm": 2.376058578491211, + "learning_rate": 1.101871101871102e-05, + "loss": 7.8278, + "num_input_tokens_seen": 81024, + "step": 425 + }, + { + "epoch": 0.2234927234927235, + "grad_norm": 2.2098548412323, + "learning_rate": 1.1148648648648649e-05, + "loss": 7.718, + "num_input_tokens_seen": 81952, + "step": 430 + }, + { + "epoch": 0.2260914760914761, + "grad_norm": 1.6636327505111694, + "learning_rate": 1.127858627858628e-05, + "loss": 7.593, + "num_input_tokens_seen": 82944, + "step": 435 + }, + { + "epoch": 0.2286902286902287, + "grad_norm": 1.6215975284576416, + "learning_rate": 1.140852390852391e-05, + "loss": 7.5693, + "num_input_tokens_seen": 84000, + "step": 440 + }, + { + "epoch": 0.2312889812889813, + "grad_norm": 1.7421337366104126, + "learning_rate": 1.153846153846154e-05, + "loss": 7.5378, + "num_input_tokens_seen": 85024, + "step": 445 + }, + { + "epoch": 0.2338877338877339, + "grad_norm": 2.3342180252075195, + "learning_rate": 1.166839916839917e-05, + "loss": 7.2039, + "num_input_tokens_seen": 85952, + "step": 450 + }, + { + "epoch": 0.23648648648648649, + "grad_norm": 1.4411379098892212, + "learning_rate": 1.17983367983368e-05, + "loss": 7.2446, + "num_input_tokens_seen": 86944, + "step": 455 + }, + { + "epoch": 0.2390852390852391, + "grad_norm": 1.553244948387146, + "learning_rate": 1.1928274428274428e-05, + "loss": 7.1817, + "num_input_tokens_seen": 87840, + "step": 460 + }, + { + "epoch": 0.24168399168399168, + "grad_norm": 1.8044276237487793, + "learning_rate": 1.205821205821206e-05, + "loss": 6.9667, + "num_input_tokens_seen": 88768, + "step": 465 + }, + { + "epoch": 0.2442827442827443, + "grad_norm": 1.7003382444381714, + "learning_rate": 1.2188149688149689e-05, + "loss": 6.9498, + "num_input_tokens_seen": 89760, + "step": 470 + }, + { + "epoch": 0.24688149688149688, + "grad_norm": 1.6018058061599731, + "learning_rate": 1.2318087318087319e-05, + "loss": 6.8935, + "num_input_tokens_seen": 90720, + "step": 475 + }, + { + "epoch": 0.2494802494802495, + "grad_norm": 1.597390055656433, + "learning_rate": 1.2448024948024949e-05, + "loss": 6.7524, + "num_input_tokens_seen": 91680, + "step": 480 + }, + { + "epoch": 0.2520790020790021, + "grad_norm": 1.8193989992141724, + "learning_rate": 1.2577962577962579e-05, + "loss": 6.6417, + "num_input_tokens_seen": 92608, + "step": 485 + }, + { + "epoch": 0.25467775467775466, + "grad_norm": 1.8698362112045288, + "learning_rate": 1.270790020790021e-05, + "loss": 6.6851, + "num_input_tokens_seen": 93632, + "step": 490 + }, + { + "epoch": 0.25727650727650725, + "grad_norm": 1.375123381614685, + "learning_rate": 1.2837837837837838e-05, + "loss": 6.6383, + "num_input_tokens_seen": 94720, + "step": 495 + }, + { + "epoch": 0.2598752598752599, + "grad_norm": 1.3937016725540161, + "learning_rate": 1.2967775467775468e-05, + "loss": 6.3678, + "num_input_tokens_seen": 95648, + "step": 500 + }, + { + "epoch": 0.2624740124740125, + "grad_norm": 1.6960875988006592, + "learning_rate": 1.30977130977131e-05, + "loss": 6.401, + "num_input_tokens_seen": 96512, + "step": 505 + }, + { + "epoch": 0.26507276507276506, + "grad_norm": 1.723480463027954, + "learning_rate": 1.3227650727650728e-05, + "loss": 6.1163, + "num_input_tokens_seen": 97376, + "step": 510 + }, + { + "epoch": 0.26767151767151764, + "grad_norm": 2.097259759902954, + "learning_rate": 1.3357588357588358e-05, + "loss": 6.0781, + "num_input_tokens_seen": 98304, + "step": 515 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.352899432182312, + "learning_rate": 1.3487525987525987e-05, + "loss": 6.2474, + "num_input_tokens_seen": 99392, + "step": 520 + }, + { + "epoch": 0.27286902286902287, + "grad_norm": 2.1250534057617188, + "learning_rate": 1.3617463617463619e-05, + "loss": 6.1145, + "num_input_tokens_seen": 100480, + "step": 525 + }, + { + "epoch": 0.27546777546777546, + "grad_norm": 1.6027281284332275, + "learning_rate": 1.3747401247401249e-05, + "loss": 5.7999, + "num_input_tokens_seen": 101440, + "step": 530 + }, + { + "epoch": 0.27806652806652804, + "grad_norm": 1.6344823837280273, + "learning_rate": 1.3877338877338877e-05, + "loss": 5.932, + "num_input_tokens_seen": 102400, + "step": 535 + }, + { + "epoch": 0.2806652806652807, + "grad_norm": 1.7080185413360596, + "learning_rate": 1.4007276507276507e-05, + "loss": 5.8225, + "num_input_tokens_seen": 103424, + "step": 540 + }, + { + "epoch": 0.28326403326403327, + "grad_norm": 1.4230607748031616, + "learning_rate": 1.4137214137214139e-05, + "loss": 5.5741, + "num_input_tokens_seen": 104352, + "step": 545 + }, + { + "epoch": 0.28586278586278585, + "grad_norm": 1.6008234024047852, + "learning_rate": 1.4267151767151768e-05, + "loss": 5.4374, + "num_input_tokens_seen": 105280, + "step": 550 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 1.8315085172653198, + "learning_rate": 1.4397089397089398e-05, + "loss": 5.4834, + "num_input_tokens_seen": 106240, + "step": 555 + }, + { + "epoch": 0.2910602910602911, + "grad_norm": 1.6040793657302856, + "learning_rate": 1.4527027027027026e-05, + "loss": 5.238, + "num_input_tokens_seen": 107168, + "step": 560 + }, + { + "epoch": 0.29365904365904366, + "grad_norm": 1.5619388818740845, + "learning_rate": 1.4656964656964658e-05, + "loss": 5.2345, + "num_input_tokens_seen": 108160, + "step": 565 + }, + { + "epoch": 0.29625779625779625, + "grad_norm": 1.8326219320297241, + "learning_rate": 1.4786902286902288e-05, + "loss": 5.1215, + "num_input_tokens_seen": 109152, + "step": 570 + }, + { + "epoch": 0.29885654885654883, + "grad_norm": 1.4255032539367676, + "learning_rate": 1.4916839916839917e-05, + "loss": 4.9004, + "num_input_tokens_seen": 110112, + "step": 575 + }, + { + "epoch": 0.30145530145530147, + "grad_norm": 1.3752734661102295, + "learning_rate": 1.5046777546777547e-05, + "loss": 4.994, + "num_input_tokens_seen": 111104, + "step": 580 + }, + { + "epoch": 0.30405405405405406, + "grad_norm": 1.4917436838150024, + "learning_rate": 1.5176715176715179e-05, + "loss": 5.0067, + "num_input_tokens_seen": 112096, + "step": 585 + }, + { + "epoch": 0.30665280665280664, + "grad_norm": 1.680992841720581, + "learning_rate": 1.530665280665281e-05, + "loss": 4.7502, + "num_input_tokens_seen": 113056, + "step": 590 + }, + { + "epoch": 0.3092515592515592, + "grad_norm": 1.476543664932251, + "learning_rate": 1.5436590436590437e-05, + "loss": 4.547, + "num_input_tokens_seen": 113952, + "step": 595 + }, + { + "epoch": 0.31185031185031187, + "grad_norm": 1.5278189182281494, + "learning_rate": 1.5566528066528066e-05, + "loss": 4.6741, + "num_input_tokens_seen": 114944, + "step": 600 + }, + { + "epoch": 0.31444906444906445, + "grad_norm": 1.7943371534347534, + "learning_rate": 1.5696465696465697e-05, + "loss": 4.4559, + "num_input_tokens_seen": 115904, + "step": 605 + }, + { + "epoch": 0.31704781704781704, + "grad_norm": 1.7497714757919312, + "learning_rate": 1.5826403326403326e-05, + "loss": 4.3795, + "num_input_tokens_seen": 116928, + "step": 610 + }, + { + "epoch": 0.3196465696465696, + "grad_norm": 1.6569536924362183, + "learning_rate": 1.5956340956340958e-05, + "loss": 4.2229, + "num_input_tokens_seen": 117888, + "step": 615 + }, + { + "epoch": 0.32224532224532226, + "grad_norm": 1.373608946800232, + "learning_rate": 1.6086278586278586e-05, + "loss": 4.3903, + "num_input_tokens_seen": 118944, + "step": 620 + }, + { + "epoch": 0.32484407484407485, + "grad_norm": 1.6382619142532349, + "learning_rate": 1.6216216216216218e-05, + "loss": 4.1244, + "num_input_tokens_seen": 119904, + "step": 625 + }, + { + "epoch": 0.32744282744282743, + "grad_norm": 1.553454041481018, + "learning_rate": 1.6346153846153847e-05, + "loss": 4.0224, + "num_input_tokens_seen": 120864, + "step": 630 + }, + { + "epoch": 0.33004158004158, + "grad_norm": 1.9424980878829956, + "learning_rate": 1.6476091476091475e-05, + "loss": 4.1623, + "num_input_tokens_seen": 121920, + "step": 635 + }, + { + "epoch": 0.33264033264033266, + "grad_norm": 1.621649980545044, + "learning_rate": 1.6606029106029107e-05, + "loss": 3.7534, + "num_input_tokens_seen": 122848, + "step": 640 + }, + { + "epoch": 0.33523908523908524, + "grad_norm": 1.7379424571990967, + "learning_rate": 1.673596673596674e-05, + "loss": 3.8468, + "num_input_tokens_seen": 123808, + "step": 645 + }, + { + "epoch": 0.33783783783783783, + "grad_norm": 2.097628355026245, + "learning_rate": 1.6865904365904367e-05, + "loss": 4.0084, + "num_input_tokens_seen": 124832, + "step": 650 + }, + { + "epoch": 0.3404365904365904, + "grad_norm": 1.557512879371643, + "learning_rate": 1.6995841995841996e-05, + "loss": 3.5924, + "num_input_tokens_seen": 125728, + "step": 655 + }, + { + "epoch": 0.34303534303534305, + "grad_norm": 1.4880597591400146, + "learning_rate": 1.7125779625779624e-05, + "loss": 3.5315, + "num_input_tokens_seen": 126656, + "step": 660 + }, + { + "epoch": 0.34563409563409564, + "grad_norm": 1.5294651985168457, + "learning_rate": 1.7255717255717256e-05, + "loss": 3.4593, + "num_input_tokens_seen": 127584, + "step": 665 + }, + { + "epoch": 0.3482328482328482, + "grad_norm": 1.389002799987793, + "learning_rate": 1.7385654885654888e-05, + "loss": 3.3239, + "num_input_tokens_seen": 128480, + "step": 670 + }, + { + "epoch": 0.3508316008316008, + "grad_norm": 1.3972755670547485, + "learning_rate": 1.7515592515592516e-05, + "loss": 3.0791, + "num_input_tokens_seen": 129376, + "step": 675 + }, + { + "epoch": 0.35343035343035345, + "grad_norm": 1.4241199493408203, + "learning_rate": 1.7645530145530145e-05, + "loss": 3.0802, + "num_input_tokens_seen": 130272, + "step": 680 + }, + { + "epoch": 0.35602910602910603, + "grad_norm": 1.4666866064071655, + "learning_rate": 1.7775467775467776e-05, + "loss": 2.9275, + "num_input_tokens_seen": 131168, + "step": 685 + }, + { + "epoch": 0.3586278586278586, + "grad_norm": 1.1709403991699219, + "learning_rate": 1.7905405405405405e-05, + "loss": 2.7997, + "num_input_tokens_seen": 132128, + "step": 690 + }, + { + "epoch": 0.3612266112266112, + "grad_norm": 1.8384019136428833, + "learning_rate": 1.8035343035343037e-05, + "loss": 2.999, + "num_input_tokens_seen": 133120, + "step": 695 + }, + { + "epoch": 0.36382536382536385, + "grad_norm": 1.1167632341384888, + "learning_rate": 1.8165280665280665e-05, + "loss": 2.9052, + "num_input_tokens_seen": 134144, + "step": 700 + }, + { + "epoch": 0.36642411642411643, + "grad_norm": 1.5475212335586548, + "learning_rate": 1.8295218295218297e-05, + "loss": 2.6415, + "num_input_tokens_seen": 135072, + "step": 705 + }, + { + "epoch": 0.369022869022869, + "grad_norm": 1.3473645448684692, + "learning_rate": 1.8425155925155926e-05, + "loss": 2.5233, + "num_input_tokens_seen": 136000, + "step": 710 + }, + { + "epoch": 0.3716216216216216, + "grad_norm": 1.2840481996536255, + "learning_rate": 1.8555093555093554e-05, + "loss": 2.5549, + "num_input_tokens_seen": 136960, + "step": 715 + }, + { + "epoch": 0.37422037422037424, + "grad_norm": 1.3568450212478638, + "learning_rate": 1.8685031185031186e-05, + "loss": 2.7053, + "num_input_tokens_seen": 137984, + "step": 720 + }, + { + "epoch": 0.3768191268191268, + "grad_norm": 1.2902652025222778, + "learning_rate": 1.8814968814968818e-05, + "loss": 2.3047, + "num_input_tokens_seen": 138912, + "step": 725 + }, + { + "epoch": 0.3794178794178794, + "grad_norm": 1.7637903690338135, + "learning_rate": 1.8944906444906446e-05, + "loss": 2.2905, + "num_input_tokens_seen": 139840, + "step": 730 + }, + { + "epoch": 0.382016632016632, + "grad_norm": 1.4083458185195923, + "learning_rate": 1.9074844074844075e-05, + "loss": 2.1721, + "num_input_tokens_seen": 140768, + "step": 735 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 1.264119029045105, + "learning_rate": 1.9204781704781703e-05, + "loss": 2.3949, + "num_input_tokens_seen": 141728, + "step": 740 + }, + { + "epoch": 0.3872141372141372, + "grad_norm": 1.0844061374664307, + "learning_rate": 1.9334719334719338e-05, + "loss": 1.8931, + "num_input_tokens_seen": 142656, + "step": 745 + }, + { + "epoch": 0.3898128898128898, + "grad_norm": 1.2301208972930908, + "learning_rate": 1.9464656964656967e-05, + "loss": 1.82, + "num_input_tokens_seen": 143584, + "step": 750 + }, + { + "epoch": 0.3924116424116424, + "grad_norm": 1.2382252216339111, + "learning_rate": 1.9594594594594595e-05, + "loss": 1.8282, + "num_input_tokens_seen": 144512, + "step": 755 + }, + { + "epoch": 0.39501039501039503, + "grad_norm": 1.6737418174743652, + "learning_rate": 1.9724532224532224e-05, + "loss": 1.9358, + "num_input_tokens_seen": 145504, + "step": 760 + }, + { + "epoch": 0.3976091476091476, + "grad_norm": 1.8771380186080933, + "learning_rate": 1.9854469854469855e-05, + "loss": 1.6112, + "num_input_tokens_seen": 146400, + "step": 765 + }, + { + "epoch": 0.4002079002079002, + "grad_norm": 1.3670105934143066, + "learning_rate": 1.9984407484407487e-05, + "loss": 1.5854, + "num_input_tokens_seen": 147360, + "step": 770 + }, + { + "epoch": 0.4028066528066528, + "grad_norm": 1.1775445938110352, + "learning_rate": 2.0114345114345116e-05, + "loss": 1.5965, + "num_input_tokens_seen": 148320, + "step": 775 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9903310537338257, + "learning_rate": 2.0244282744282744e-05, + "loss": 1.3417, + "num_input_tokens_seen": 149216, + "step": 780 + }, + { + "epoch": 0.408004158004158, + "grad_norm": 1.3262276649475098, + "learning_rate": 2.0374220374220376e-05, + "loss": 1.383, + "num_input_tokens_seen": 150176, + "step": 785 + }, + { + "epoch": 0.4106029106029106, + "grad_norm": 1.5399706363677979, + "learning_rate": 2.0504158004158005e-05, + "loss": 1.6064, + "num_input_tokens_seen": 151168, + "step": 790 + }, + { + "epoch": 0.4132016632016632, + "grad_norm": 0.954311192035675, + "learning_rate": 2.0634095634095636e-05, + "loss": 1.2618, + "num_input_tokens_seen": 152128, + "step": 795 + }, + { + "epoch": 0.4158004158004158, + "grad_norm": 0.9292040467262268, + "learning_rate": 2.0764033264033265e-05, + "loss": 1.2646, + "num_input_tokens_seen": 153120, + "step": 800 + }, + { + "epoch": 0.4183991683991684, + "grad_norm": 1.546484351158142, + "learning_rate": 2.0893970893970897e-05, + "loss": 0.9838, + "num_input_tokens_seen": 154016, + "step": 805 + }, + { + "epoch": 0.420997920997921, + "grad_norm": 0.9746153354644775, + "learning_rate": 2.1023908523908525e-05, + "loss": 1.1384, + "num_input_tokens_seen": 155008, + "step": 810 + }, + { + "epoch": 0.4235966735966736, + "grad_norm": 1.021835207939148, + "learning_rate": 2.1153846153846154e-05, + "loss": 1.0577, + "num_input_tokens_seen": 156000, + "step": 815 + }, + { + "epoch": 0.4261954261954262, + "grad_norm": 1.1322021484375, + "learning_rate": 2.1283783783783785e-05, + "loss": 1.162, + "num_input_tokens_seen": 156992, + "step": 820 + }, + { + "epoch": 0.4287941787941788, + "grad_norm": 1.3043100833892822, + "learning_rate": 2.1413721413721417e-05, + "loss": 0.7379, + "num_input_tokens_seen": 157856, + "step": 825 + }, + { + "epoch": 0.4313929313929314, + "grad_norm": 0.7139701843261719, + "learning_rate": 2.1543659043659046e-05, + "loss": 0.8121, + "num_input_tokens_seen": 158784, + "step": 830 + }, + { + "epoch": 0.433991683991684, + "grad_norm": 0.8252978920936584, + "learning_rate": 2.1673596673596674e-05, + "loss": 0.7483, + "num_input_tokens_seen": 159744, + "step": 835 + }, + { + "epoch": 0.4365904365904366, + "grad_norm": 1.8715603351593018, + "learning_rate": 2.1803534303534303e-05, + "loss": 0.9079, + "num_input_tokens_seen": 160672, + "step": 840 + }, + { + "epoch": 0.4391891891891892, + "grad_norm": 0.9221636056900024, + "learning_rate": 2.1933471933471934e-05, + "loss": 1.1336, + "num_input_tokens_seen": 161664, + "step": 845 + }, + { + "epoch": 0.4417879417879418, + "grad_norm": 1.6894092559814453, + "learning_rate": 2.2063409563409566e-05, + "loss": 0.9648, + "num_input_tokens_seen": 162592, + "step": 850 + }, + { + "epoch": 0.44438669438669437, + "grad_norm": 1.546030879020691, + "learning_rate": 2.2193347193347195e-05, + "loss": 0.6141, + "num_input_tokens_seen": 163488, + "step": 855 + }, + { + "epoch": 0.446985446985447, + "grad_norm": 1.7600934505462646, + "learning_rate": 2.2323284823284823e-05, + "loss": 0.8714, + "num_input_tokens_seen": 164448, + "step": 860 + }, + { + "epoch": 0.4495841995841996, + "grad_norm": 1.8552420139312744, + "learning_rate": 2.2453222453222455e-05, + "loss": 0.8522, + "num_input_tokens_seen": 165440, + "step": 865 + }, + { + "epoch": 0.4521829521829522, + "grad_norm": 0.8162457346916199, + "learning_rate": 2.2583160083160083e-05, + "loss": 0.9229, + "num_input_tokens_seen": 166432, + "step": 870 + }, + { + "epoch": 0.45478170478170477, + "grad_norm": 1.1647212505340576, + "learning_rate": 2.2713097713097715e-05, + "loss": 0.8367, + "num_input_tokens_seen": 167424, + "step": 875 + }, + { + "epoch": 0.4573804573804574, + "grad_norm": 0.8563199043273926, + "learning_rate": 2.2843035343035344e-05, + "loss": 0.7905, + "num_input_tokens_seen": 168416, + "step": 880 + }, + { + "epoch": 0.45997920997921, + "grad_norm": 2.491429090499878, + "learning_rate": 2.2972972972972976e-05, + "loss": 0.9368, + "num_input_tokens_seen": 169376, + "step": 885 + }, + { + "epoch": 0.4625779625779626, + "grad_norm": 0.9556612968444824, + "learning_rate": 2.3102910602910604e-05, + "loss": 0.6212, + "num_input_tokens_seen": 170368, + "step": 890 + }, + { + "epoch": 0.46517671517671516, + "grad_norm": 1.4237765073776245, + "learning_rate": 2.3232848232848233e-05, + "loss": 0.7278, + "num_input_tokens_seen": 171392, + "step": 895 + }, + { + "epoch": 0.4677754677754678, + "grad_norm": 1.1571844816207886, + "learning_rate": 2.3362785862785864e-05, + "loss": 0.8673, + "num_input_tokens_seen": 172448, + "step": 900 + }, + { + "epoch": 0.4703742203742204, + "grad_norm": 0.5877601504325867, + "learning_rate": 2.3492723492723496e-05, + "loss": 0.6578, + "num_input_tokens_seen": 173440, + "step": 905 + }, + { + "epoch": 0.47297297297297297, + "grad_norm": 0.7980511784553528, + "learning_rate": 2.3622661122661125e-05, + "loss": 0.4645, + "num_input_tokens_seen": 174368, + "step": 910 + }, + { + "epoch": 0.47557172557172556, + "grad_norm": 0.868983268737793, + "learning_rate": 2.3752598752598753e-05, + "loss": 0.6218, + "num_input_tokens_seen": 175328, + "step": 915 + }, + { + "epoch": 0.4781704781704782, + "grad_norm": 1.2316257953643799, + "learning_rate": 2.388253638253638e-05, + "loss": 0.4328, + "num_input_tokens_seen": 176256, + "step": 920 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.7463362812995911, + "learning_rate": 2.4012474012474013e-05, + "loss": 0.4689, + "num_input_tokens_seen": 177216, + "step": 925 + }, + { + "epoch": 0.48336798336798337, + "grad_norm": 1.3213144540786743, + "learning_rate": 2.4142411642411645e-05, + "loss": 0.4473, + "num_input_tokens_seen": 178112, + "step": 930 + }, + { + "epoch": 0.48596673596673595, + "grad_norm": 1.6357136964797974, + "learning_rate": 2.4272349272349274e-05, + "loss": 0.5215, + "num_input_tokens_seen": 179072, + "step": 935 + }, + { + "epoch": 0.4885654885654886, + "grad_norm": 1.1133953332901, + "learning_rate": 2.4402286902286902e-05, + "loss": 0.4479, + "num_input_tokens_seen": 179968, + "step": 940 + }, + { + "epoch": 0.4911642411642412, + "grad_norm": 2.0896358489990234, + "learning_rate": 2.4532224532224534e-05, + "loss": 0.5772, + "num_input_tokens_seen": 180928, + "step": 945 + }, + { + "epoch": 0.49376299376299376, + "grad_norm": 0.7475742101669312, + "learning_rate": 2.4662162162162162e-05, + "loss": 0.5137, + "num_input_tokens_seen": 181920, + "step": 950 + }, + { + "epoch": 0.49636174636174635, + "grad_norm": 0.3693449795246124, + "learning_rate": 2.4792099792099794e-05, + "loss": 0.4005, + "num_input_tokens_seen": 182848, + "step": 955 + }, + { + "epoch": 0.498960498960499, + "grad_norm": 0.6574680209159851, + "learning_rate": 2.4922037422037423e-05, + "loss": 0.4911, + "num_input_tokens_seen": 183808, + "step": 960 + }, + { + "epoch": 0.5, + "eval_loss": 0.45229649543762207, + "eval_runtime": 7.9821, + "eval_samples_per_second": 107.24, + "eval_steps_per_second": 26.81, + "num_input_tokens_seen": 184192, + "step": 962 + }, + { + "epoch": 0.5015592515592515, + "grad_norm": 0.5122359395027161, + "learning_rate": 2.505197505197505e-05, + "loss": 0.5887, + "num_input_tokens_seen": 184768, + "step": 965 + }, + { + "epoch": 0.5041580041580042, + "grad_norm": 1.4228028059005737, + "learning_rate": 2.5181912681912683e-05, + "loss": 0.4167, + "num_input_tokens_seen": 185696, + "step": 970 + }, + { + "epoch": 0.5067567567567568, + "grad_norm": 0.9486043453216553, + "learning_rate": 2.531185031185031e-05, + "loss": 0.4979, + "num_input_tokens_seen": 186656, + "step": 975 + }, + { + "epoch": 0.5093555093555093, + "grad_norm": 0.8117039203643799, + "learning_rate": 2.5441787941787943e-05, + "loss": 0.4986, + "num_input_tokens_seen": 187648, + "step": 980 + }, + { + "epoch": 0.511954261954262, + "grad_norm": 1.218529462814331, + "learning_rate": 2.5571725571725575e-05, + "loss": 0.3843, + "num_input_tokens_seen": 188544, + "step": 985 + }, + { + "epoch": 0.5145530145530145, + "grad_norm": 1.277956247329712, + "learning_rate": 2.57016632016632e-05, + "loss": 0.5171, + "num_input_tokens_seen": 189504, + "step": 990 + }, + { + "epoch": 0.5171517671517671, + "grad_norm": 0.660407543182373, + "learning_rate": 2.5831600831600832e-05, + "loss": 0.4293, + "num_input_tokens_seen": 190496, + "step": 995 + }, + { + "epoch": 0.5197505197505198, + "grad_norm": 2.4085230827331543, + "learning_rate": 2.5961538461538464e-05, + "loss": 0.4797, + "num_input_tokens_seen": 191488, + "step": 1000 + }, + { + "epoch": 0.5223492723492723, + "grad_norm": 1.7096463441848755, + "learning_rate": 2.6091476091476092e-05, + "loss": 0.5354, + "num_input_tokens_seen": 192480, + "step": 1005 + }, + { + "epoch": 0.524948024948025, + "grad_norm": 0.635370135307312, + "learning_rate": 2.6221413721413724e-05, + "loss": 0.3515, + "num_input_tokens_seen": 193440, + "step": 1010 + }, + { + "epoch": 0.5275467775467776, + "grad_norm": 0.5600859522819519, + "learning_rate": 2.635135135135135e-05, + "loss": 0.3952, + "num_input_tokens_seen": 194304, + "step": 1015 + }, + { + "epoch": 0.5301455301455301, + "grad_norm": 1.5293742418289185, + "learning_rate": 2.648128898128898e-05, + "loss": 0.4263, + "num_input_tokens_seen": 195328, + "step": 1020 + }, + { + "epoch": 0.5327442827442828, + "grad_norm": 1.239669680595398, + "learning_rate": 2.6611226611226613e-05, + "loss": 0.325, + "num_input_tokens_seen": 196256, + "step": 1025 + }, + { + "epoch": 0.5353430353430353, + "grad_norm": 1.6382585763931274, + "learning_rate": 2.674116424116424e-05, + "loss": 0.385, + "num_input_tokens_seen": 197152, + "step": 1030 + }, + { + "epoch": 0.5379417879417879, + "grad_norm": 2.0463473796844482, + "learning_rate": 2.6871101871101873e-05, + "loss": 0.5108, + "num_input_tokens_seen": 198048, + "step": 1035 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.976319670677185, + "learning_rate": 2.7001039501039505e-05, + "loss": 0.3662, + "num_input_tokens_seen": 199008, + "step": 1040 + }, + { + "epoch": 0.5431392931392931, + "grad_norm": 1.6727354526519775, + "learning_rate": 2.713097713097713e-05, + "loss": 0.4022, + "num_input_tokens_seen": 200032, + "step": 1045 + }, + { + "epoch": 0.5457380457380457, + "grad_norm": 1.6329702138900757, + "learning_rate": 2.7260914760914762e-05, + "loss": 0.3498, + "num_input_tokens_seen": 200928, + "step": 1050 + }, + { + "epoch": 0.5483367983367984, + "grad_norm": 1.5057405233383179, + "learning_rate": 2.739085239085239e-05, + "loss": 0.4628, + "num_input_tokens_seen": 201920, + "step": 1055 + }, + { + "epoch": 0.5509355509355509, + "grad_norm": 1.5815712213516235, + "learning_rate": 2.7520790020790022e-05, + "loss": 0.3098, + "num_input_tokens_seen": 202848, + "step": 1060 + }, + { + "epoch": 0.5535343035343036, + "grad_norm": 0.7389600872993469, + "learning_rate": 2.7650727650727654e-05, + "loss": 0.4419, + "num_input_tokens_seen": 203808, + "step": 1065 + }, + { + "epoch": 0.5561330561330561, + "grad_norm": 1.0222574472427368, + "learning_rate": 2.778066528066528e-05, + "loss": 0.5985, + "num_input_tokens_seen": 204896, + "step": 1070 + }, + { + "epoch": 0.5587318087318087, + "grad_norm": 1.2028870582580566, + "learning_rate": 2.791060291060291e-05, + "loss": 0.3613, + "num_input_tokens_seen": 205856, + "step": 1075 + }, + { + "epoch": 0.5613305613305614, + "grad_norm": 0.7653259038925171, + "learning_rate": 2.8040540540540543e-05, + "loss": 0.4808, + "num_input_tokens_seen": 206816, + "step": 1080 + }, + { + "epoch": 0.5639293139293139, + "grad_norm": 0.9623191356658936, + "learning_rate": 2.817047817047817e-05, + "loss": 0.3391, + "num_input_tokens_seen": 207712, + "step": 1085 + }, + { + "epoch": 0.5665280665280665, + "grad_norm": 0.7559850811958313, + "learning_rate": 2.8300415800415803e-05, + "loss": 0.4033, + "num_input_tokens_seen": 208608, + "step": 1090 + }, + { + "epoch": 0.5691268191268192, + "grad_norm": 1.2239782810211182, + "learning_rate": 2.8430353430353428e-05, + "loss": 0.3972, + "num_input_tokens_seen": 209568, + "step": 1095 + }, + { + "epoch": 0.5717255717255717, + "grad_norm": 0.8853040933609009, + "learning_rate": 2.856029106029106e-05, + "loss": 0.3731, + "num_input_tokens_seen": 210496, + "step": 1100 + }, + { + "epoch": 0.5743243243243243, + "grad_norm": 0.42015790939331055, + "learning_rate": 2.8690228690228692e-05, + "loss": 0.3502, + "num_input_tokens_seen": 211392, + "step": 1105 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 1.4091655015945435, + "learning_rate": 2.882016632016632e-05, + "loss": 0.3422, + "num_input_tokens_seen": 212352, + "step": 1110 + }, + { + "epoch": 0.5795218295218295, + "grad_norm": 0.7290813326835632, + "learning_rate": 2.8950103950103952e-05, + "loss": 0.3799, + "num_input_tokens_seen": 213344, + "step": 1115 + }, + { + "epoch": 0.5821205821205822, + "grad_norm": 1.1378464698791504, + "learning_rate": 2.9080041580041584e-05, + "loss": 0.4968, + "num_input_tokens_seen": 214432, + "step": 1120 + }, + { + "epoch": 0.5847193347193347, + "grad_norm": 1.5980232954025269, + "learning_rate": 2.920997920997921e-05, + "loss": 0.2851, + "num_input_tokens_seen": 215328, + "step": 1125 + }, + { + "epoch": 0.5873180873180873, + "grad_norm": 1.2321456670761108, + "learning_rate": 2.933991683991684e-05, + "loss": 0.4479, + "num_input_tokens_seen": 216320, + "step": 1130 + }, + { + "epoch": 0.58991683991684, + "grad_norm": 0.6581343412399292, + "learning_rate": 2.946985446985447e-05, + "loss": 0.3737, + "num_input_tokens_seen": 217216, + "step": 1135 + }, + { + "epoch": 0.5925155925155925, + "grad_norm": 0.9009098410606384, + "learning_rate": 2.95997920997921e-05, + "loss": 0.3189, + "num_input_tokens_seen": 218144, + "step": 1140 + }, + { + "epoch": 0.5951143451143451, + "grad_norm": 1.2425599098205566, + "learning_rate": 2.9729729729729733e-05, + "loss": 0.2949, + "num_input_tokens_seen": 219040, + "step": 1145 + }, + { + "epoch": 0.5977130977130977, + "grad_norm": 0.9230920672416687, + "learning_rate": 2.9859667359667358e-05, + "loss": 0.319, + "num_input_tokens_seen": 220000, + "step": 1150 + }, + { + "epoch": 0.6003118503118503, + "grad_norm": 1.8534809350967407, + "learning_rate": 2.998960498960499e-05, + "loss": 0.4988, + "num_input_tokens_seen": 220992, + "step": 1155 + }, + { + "epoch": 0.6029106029106029, + "grad_norm": 1.321053147315979, + "learning_rate": 3.0119542619542622e-05, + "loss": 0.3593, + "num_input_tokens_seen": 221920, + "step": 1160 + }, + { + "epoch": 0.6055093555093555, + "grad_norm": 1.2908509969711304, + "learning_rate": 3.024948024948025e-05, + "loss": 0.283, + "num_input_tokens_seen": 222880, + "step": 1165 + }, + { + "epoch": 0.6081081081081081, + "grad_norm": 0.8658168315887451, + "learning_rate": 3.0379417879417882e-05, + "loss": 0.3028, + "num_input_tokens_seen": 223776, + "step": 1170 + }, + { + "epoch": 0.6107068607068608, + "grad_norm": 0.8817125558853149, + "learning_rate": 3.0509355509355507e-05, + "loss": 0.3827, + "num_input_tokens_seen": 224704, + "step": 1175 + }, + { + "epoch": 0.6133056133056133, + "grad_norm": 1.052205204963684, + "learning_rate": 3.063929313929314e-05, + "loss": 0.4129, + "num_input_tokens_seen": 225728, + "step": 1180 + }, + { + "epoch": 0.6159043659043659, + "grad_norm": 1.769040822982788, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.3207, + "num_input_tokens_seen": 226656, + "step": 1185 + }, + { + "epoch": 0.6185031185031185, + "grad_norm": 0.9383282661437988, + "learning_rate": 3.08991683991684e-05, + "loss": 0.3544, + "num_input_tokens_seen": 227552, + "step": 1190 + }, + { + "epoch": 0.6211018711018711, + "grad_norm": 0.8225072622299194, + "learning_rate": 3.102910602910603e-05, + "loss": 0.2983, + "num_input_tokens_seen": 228448, + "step": 1195 + }, + { + "epoch": 0.6237006237006237, + "grad_norm": 2.0568459033966064, + "learning_rate": 3.115904365904366e-05, + "loss": 0.3887, + "num_input_tokens_seen": 229376, + "step": 1200 + }, + { + "epoch": 0.6262993762993763, + "grad_norm": 1.2711564302444458, + "learning_rate": 3.128898128898129e-05, + "loss": 0.3539, + "num_input_tokens_seen": 230400, + "step": 1205 + }, + { + "epoch": 0.6288981288981289, + "grad_norm": 0.7906997799873352, + "learning_rate": 3.141891891891892e-05, + "loss": 0.3492, + "num_input_tokens_seen": 231392, + "step": 1210 + }, + { + "epoch": 0.6314968814968815, + "grad_norm": 0.9827476739883423, + "learning_rate": 3.1548856548856545e-05, + "loss": 0.4408, + "num_input_tokens_seen": 232448, + "step": 1215 + }, + { + "epoch": 0.6340956340956341, + "grad_norm": 1.1999120712280273, + "learning_rate": 3.167879417879418e-05, + "loss": 0.3247, + "num_input_tokens_seen": 233408, + "step": 1220 + }, + { + "epoch": 0.6366943866943867, + "grad_norm": 0.8049551248550415, + "learning_rate": 3.180873180873181e-05, + "loss": 0.3217, + "num_input_tokens_seen": 234368, + "step": 1225 + }, + { + "epoch": 0.6392931392931392, + "grad_norm": 1.4476077556610107, + "learning_rate": 3.193866943866944e-05, + "loss": 0.237, + "num_input_tokens_seen": 235264, + "step": 1230 + }, + { + "epoch": 0.6418918918918919, + "grad_norm": 0.49240344762802124, + "learning_rate": 3.206860706860707e-05, + "loss": 0.1793, + "num_input_tokens_seen": 236192, + "step": 1235 + }, + { + "epoch": 0.6444906444906445, + "grad_norm": 1.4347528219223022, + "learning_rate": 3.2198544698544704e-05, + "loss": 0.4479, + "num_input_tokens_seen": 237152, + "step": 1240 + }, + { + "epoch": 0.6470893970893971, + "grad_norm": 0.5632762908935547, + "learning_rate": 3.232848232848233e-05, + "loss": 0.4227, + "num_input_tokens_seen": 238080, + "step": 1245 + }, + { + "epoch": 0.6496881496881497, + "grad_norm": 1.1640625, + "learning_rate": 3.245841995841996e-05, + "loss": 0.4632, + "num_input_tokens_seen": 239104, + "step": 1250 + }, + { + "epoch": 0.6522869022869023, + "grad_norm": 0.7535299062728882, + "learning_rate": 3.2588357588357586e-05, + "loss": 0.363, + "num_input_tokens_seen": 240064, + "step": 1255 + }, + { + "epoch": 0.6548856548856549, + "grad_norm": 1.8654605150222778, + "learning_rate": 3.271829521829522e-05, + "loss": 0.3458, + "num_input_tokens_seen": 241024, + "step": 1260 + }, + { + "epoch": 0.6574844074844075, + "grad_norm": 1.18294358253479, + "learning_rate": 3.284823284823285e-05, + "loss": 0.3018, + "num_input_tokens_seen": 241952, + "step": 1265 + }, + { + "epoch": 0.66008316008316, + "grad_norm": 1.2507665157318115, + "learning_rate": 3.2978170478170475e-05, + "loss": 0.3253, + "num_input_tokens_seen": 242880, + "step": 1270 + }, + { + "epoch": 0.6626819126819127, + "grad_norm": 0.829384982585907, + "learning_rate": 3.310810810810811e-05, + "loss": 0.3465, + "num_input_tokens_seen": 243776, + "step": 1275 + }, + { + "epoch": 0.6652806652806653, + "grad_norm": 0.8168261051177979, + "learning_rate": 3.3238045738045745e-05, + "loss": 0.3199, + "num_input_tokens_seen": 244768, + "step": 1280 + }, + { + "epoch": 0.6678794178794178, + "grad_norm": 1.5088731050491333, + "learning_rate": 3.336798336798337e-05, + "loss": 0.3603, + "num_input_tokens_seen": 245856, + "step": 1285 + }, + { + "epoch": 0.6704781704781705, + "grad_norm": 1.4827786684036255, + "learning_rate": 3.3497920997921e-05, + "loss": 0.4245, + "num_input_tokens_seen": 246912, + "step": 1290 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.9262805581092834, + "learning_rate": 3.362785862785863e-05, + "loss": 0.4046, + "num_input_tokens_seen": 247872, + "step": 1295 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 1.5846593379974365, + "learning_rate": 3.375779625779626e-05, + "loss": 0.3723, + "num_input_tokens_seen": 248832, + "step": 1300 + }, + { + "epoch": 0.6782744282744283, + "grad_norm": 1.5653736591339111, + "learning_rate": 3.388773388773389e-05, + "loss": 0.3371, + "num_input_tokens_seen": 249792, + "step": 1305 + }, + { + "epoch": 0.6808731808731808, + "grad_norm": 0.5334802865982056, + "learning_rate": 3.4017671517671516e-05, + "loss": 0.3325, + "num_input_tokens_seen": 250816, + "step": 1310 + }, + { + "epoch": 0.6834719334719335, + "grad_norm": 0.635204553604126, + "learning_rate": 3.414760914760915e-05, + "loss": 0.2641, + "num_input_tokens_seen": 251808, + "step": 1315 + }, + { + "epoch": 0.6860706860706861, + "grad_norm": 1.304555892944336, + "learning_rate": 3.427754677754678e-05, + "loss": 0.3076, + "num_input_tokens_seen": 252704, + "step": 1320 + }, + { + "epoch": 0.6886694386694386, + "grad_norm": 1.3702759742736816, + "learning_rate": 3.4407484407484405e-05, + "loss": 0.3027, + "num_input_tokens_seen": 253664, + "step": 1325 + }, + { + "epoch": 0.6912681912681913, + "grad_norm": 0.61958247423172, + "learning_rate": 3.4537422037422044e-05, + "loss": 0.3297, + "num_input_tokens_seen": 254688, + "step": 1330 + }, + { + "epoch": 0.6938669438669439, + "grad_norm": 0.8294961452484131, + "learning_rate": 3.466735966735967e-05, + "loss": 0.2422, + "num_input_tokens_seen": 255584, + "step": 1335 + }, + { + "epoch": 0.6964656964656964, + "grad_norm": 0.6164268851280212, + "learning_rate": 3.47972972972973e-05, + "loss": 0.3408, + "num_input_tokens_seen": 256576, + "step": 1340 + }, + { + "epoch": 0.6990644490644491, + "grad_norm": 0.9613202214241028, + "learning_rate": 3.492723492723493e-05, + "loss": 0.3094, + "num_input_tokens_seen": 257536, + "step": 1345 + }, + { + "epoch": 0.7016632016632016, + "grad_norm": 1.3391263484954834, + "learning_rate": 3.505717255717256e-05, + "loss": 0.2384, + "num_input_tokens_seen": 258528, + "step": 1350 + }, + { + "epoch": 0.7042619542619543, + "grad_norm": 0.700713574886322, + "learning_rate": 3.518711018711019e-05, + "loss": 0.2219, + "num_input_tokens_seen": 259456, + "step": 1355 + }, + { + "epoch": 0.7068607068607069, + "grad_norm": 1.4987305402755737, + "learning_rate": 3.531704781704782e-05, + "loss": 0.3956, + "num_input_tokens_seen": 260384, + "step": 1360 + }, + { + "epoch": 0.7094594594594594, + "grad_norm": 0.8845855593681335, + "learning_rate": 3.5446985446985446e-05, + "loss": 0.2892, + "num_input_tokens_seen": 261312, + "step": 1365 + }, + { + "epoch": 0.7120582120582121, + "grad_norm": 1.4114726781845093, + "learning_rate": 3.557692307692308e-05, + "loss": 0.2731, + "num_input_tokens_seen": 262240, + "step": 1370 + }, + { + "epoch": 0.7146569646569647, + "grad_norm": 0.7728409171104431, + "learning_rate": 3.57068607068607e-05, + "loss": 0.3217, + "num_input_tokens_seen": 263232, + "step": 1375 + }, + { + "epoch": 0.7172557172557172, + "grad_norm": 0.9857121706008911, + "learning_rate": 3.583679833679834e-05, + "loss": 0.3122, + "num_input_tokens_seen": 264224, + "step": 1380 + }, + { + "epoch": 0.7198544698544699, + "grad_norm": 0.8835378885269165, + "learning_rate": 3.5966735966735974e-05, + "loss": 0.3096, + "num_input_tokens_seen": 265248, + "step": 1385 + }, + { + "epoch": 0.7224532224532224, + "grad_norm": 0.9226134419441223, + "learning_rate": 3.60966735966736e-05, + "loss": 0.3505, + "num_input_tokens_seen": 266112, + "step": 1390 + }, + { + "epoch": 0.725051975051975, + "grad_norm": 0.7748931646347046, + "learning_rate": 3.622661122661123e-05, + "loss": 0.3172, + "num_input_tokens_seen": 267104, + "step": 1395 + }, + { + "epoch": 0.7276507276507277, + "grad_norm": 0.6640596389770508, + "learning_rate": 3.635654885654886e-05, + "loss": 0.3268, + "num_input_tokens_seen": 268032, + "step": 1400 + }, + { + "epoch": 0.7302494802494802, + "grad_norm": 0.8789185285568237, + "learning_rate": 3.648648648648649e-05, + "loss": 0.3662, + "num_input_tokens_seen": 268928, + "step": 1405 + }, + { + "epoch": 0.7328482328482329, + "grad_norm": 1.4753905534744263, + "learning_rate": 3.661642411642412e-05, + "loss": 0.2949, + "num_input_tokens_seen": 269824, + "step": 1410 + }, + { + "epoch": 0.7354469854469855, + "grad_norm": 0.6266114711761475, + "learning_rate": 3.6746361746361744e-05, + "loss": 0.323, + "num_input_tokens_seen": 270720, + "step": 1415 + }, + { + "epoch": 0.738045738045738, + "grad_norm": 1.712862491607666, + "learning_rate": 3.6876299376299376e-05, + "loss": 0.293, + "num_input_tokens_seen": 271648, + "step": 1420 + }, + { + "epoch": 0.7406444906444907, + "grad_norm": 1.0343481302261353, + "learning_rate": 3.700623700623701e-05, + "loss": 0.3057, + "num_input_tokens_seen": 272608, + "step": 1425 + }, + { + "epoch": 0.7432432432432432, + "grad_norm": 0.6384943723678589, + "learning_rate": 3.713617463617464e-05, + "loss": 0.2801, + "num_input_tokens_seen": 273568, + "step": 1430 + }, + { + "epoch": 0.7458419958419958, + "grad_norm": 1.4799717664718628, + "learning_rate": 3.726611226611227e-05, + "loss": 0.2617, + "num_input_tokens_seen": 274496, + "step": 1435 + }, + { + "epoch": 0.7484407484407485, + "grad_norm": 2.5760536193847656, + "learning_rate": 3.7396049896049903e-05, + "loss": 0.3665, + "num_input_tokens_seen": 275456, + "step": 1440 + }, + { + "epoch": 0.751039501039501, + "grad_norm": 0.6549526453018188, + "learning_rate": 3.752598752598753e-05, + "loss": 0.255, + "num_input_tokens_seen": 276448, + "step": 1445 + }, + { + "epoch": 0.7536382536382537, + "grad_norm": 1.319715142250061, + "learning_rate": 3.765592515592516e-05, + "loss": 0.3058, + "num_input_tokens_seen": 277408, + "step": 1450 + }, + { + "epoch": 0.7562370062370062, + "grad_norm": 0.4423368275165558, + "learning_rate": 3.7785862785862785e-05, + "loss": 0.4334, + "num_input_tokens_seen": 278400, + "step": 1455 + }, + { + "epoch": 0.7588357588357588, + "grad_norm": 0.795110821723938, + "learning_rate": 3.791580041580042e-05, + "loss": 0.259, + "num_input_tokens_seen": 279232, + "step": 1460 + }, + { + "epoch": 0.7614345114345115, + "grad_norm": 0.7549684643745422, + "learning_rate": 3.804573804573805e-05, + "loss": 0.235, + "num_input_tokens_seen": 280128, + "step": 1465 + }, + { + "epoch": 0.764033264033264, + "grad_norm": 1.6449388265609741, + "learning_rate": 3.8175675675675674e-05, + "loss": 0.3641, + "num_input_tokens_seen": 281056, + "step": 1470 + }, + { + "epoch": 0.7666320166320166, + "grad_norm": 0.15389680862426758, + "learning_rate": 3.8305613305613306e-05, + "loss": 0.3836, + "num_input_tokens_seen": 281984, + "step": 1475 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 1.810915470123291, + "learning_rate": 3.843555093555094e-05, + "loss": 0.338, + "num_input_tokens_seen": 282880, + "step": 1480 + }, + { + "epoch": 0.7718295218295218, + "grad_norm": 1.8858537673950195, + "learning_rate": 3.856548856548857e-05, + "loss": 0.2875, + "num_input_tokens_seen": 283808, + "step": 1485 + }, + { + "epoch": 0.7744282744282744, + "grad_norm": 1.7483348846435547, + "learning_rate": 3.86954261954262e-05, + "loss": 0.3067, + "num_input_tokens_seen": 284768, + "step": 1490 + }, + { + "epoch": 0.777027027027027, + "grad_norm": 1.6311383247375488, + "learning_rate": 3.8825363825363827e-05, + "loss": 0.2663, + "num_input_tokens_seen": 285632, + "step": 1495 + }, + { + "epoch": 0.7796257796257796, + "grad_norm": 0.21776871383190155, + "learning_rate": 3.895530145530146e-05, + "loss": 0.2937, + "num_input_tokens_seen": 286560, + "step": 1500 + }, + { + "epoch": 0.7822245322245323, + "grad_norm": 0.6382114887237549, + "learning_rate": 3.908523908523909e-05, + "loss": 0.2561, + "num_input_tokens_seen": 287520, + "step": 1505 + }, + { + "epoch": 0.7848232848232848, + "grad_norm": 0.4605923295021057, + "learning_rate": 3.9215176715176715e-05, + "loss": 0.398, + "num_input_tokens_seen": 288480, + "step": 1510 + }, + { + "epoch": 0.7874220374220374, + "grad_norm": 0.4095039963722229, + "learning_rate": 3.934511434511435e-05, + "loss": 0.283, + "num_input_tokens_seen": 289440, + "step": 1515 + }, + { + "epoch": 0.7900207900207901, + "grad_norm": 1.1037871837615967, + "learning_rate": 3.947505197505197e-05, + "loss": 0.2832, + "num_input_tokens_seen": 290400, + "step": 1520 + }, + { + "epoch": 0.7926195426195426, + "grad_norm": 2.035074472427368, + "learning_rate": 3.9604989604989604e-05, + "loss": 0.2966, + "num_input_tokens_seen": 291328, + "step": 1525 + }, + { + "epoch": 0.7952182952182952, + "grad_norm": 0.4147579073905945, + "learning_rate": 3.9734927234927236e-05, + "loss": 0.3205, + "num_input_tokens_seen": 292256, + "step": 1530 + }, + { + "epoch": 0.7978170478170478, + "grad_norm": 0.5884885191917419, + "learning_rate": 3.986486486486487e-05, + "loss": 0.2985, + "num_input_tokens_seen": 293216, + "step": 1535 + }, + { + "epoch": 0.8004158004158004, + "grad_norm": 1.032549500465393, + "learning_rate": 3.99948024948025e-05, + "loss": 0.3101, + "num_input_tokens_seen": 294176, + "step": 1540 + }, + { + "epoch": 0.803014553014553, + "grad_norm": 1.1064029932022095, + "learning_rate": 4.012474012474013e-05, + "loss": 0.3959, + "num_input_tokens_seen": 295232, + "step": 1545 + }, + { + "epoch": 0.8056133056133056, + "grad_norm": 1.9686000347137451, + "learning_rate": 4.0254677754677757e-05, + "loss": 0.2816, + "num_input_tokens_seen": 296160, + "step": 1550 + }, + { + "epoch": 0.8082120582120582, + "grad_norm": 0.8986497521400452, + "learning_rate": 4.038461538461539e-05, + "loss": 0.3017, + "num_input_tokens_seen": 297120, + "step": 1555 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.5490813851356506, + "learning_rate": 4.0514553014553013e-05, + "loss": 0.2761, + "num_input_tokens_seen": 298016, + "step": 1560 + }, + { + "epoch": 0.8134095634095634, + "grad_norm": 0.7572400569915771, + "learning_rate": 4.0644490644490645e-05, + "loss": 0.3144, + "num_input_tokens_seen": 299008, + "step": 1565 + }, + { + "epoch": 0.816008316008316, + "grad_norm": 0.7753134965896606, + "learning_rate": 4.077442827442828e-05, + "loss": 0.2445, + "num_input_tokens_seen": 299872, + "step": 1570 + }, + { + "epoch": 0.8186070686070686, + "grad_norm": 0.6342728137969971, + "learning_rate": 4.09043659043659e-05, + "loss": 0.25, + "num_input_tokens_seen": 300832, + "step": 1575 + }, + { + "epoch": 0.8212058212058212, + "grad_norm": 1.0149288177490234, + "learning_rate": 4.1034303534303534e-05, + "loss": 0.3057, + "num_input_tokens_seen": 301760, + "step": 1580 + }, + { + "epoch": 0.8238045738045738, + "grad_norm": 0.7489494681358337, + "learning_rate": 4.1164241164241166e-05, + "loss": 0.2393, + "num_input_tokens_seen": 302752, + "step": 1585 + }, + { + "epoch": 0.8264033264033264, + "grad_norm": 0.8222891688346863, + "learning_rate": 4.12941787941788e-05, + "loss": 0.2349, + "num_input_tokens_seen": 303648, + "step": 1590 + }, + { + "epoch": 0.829002079002079, + "grad_norm": 1.468519926071167, + "learning_rate": 4.142411642411643e-05, + "loss": 0.3159, + "num_input_tokens_seen": 304576, + "step": 1595 + }, + { + "epoch": 0.8316008316008316, + "grad_norm": 1.2900186777114868, + "learning_rate": 4.1554054054054055e-05, + "loss": 0.2852, + "num_input_tokens_seen": 305504, + "step": 1600 + }, + { + "epoch": 0.8341995841995842, + "grad_norm": 1.8936059474945068, + "learning_rate": 4.1683991683991686e-05, + "loss": 0.3488, + "num_input_tokens_seen": 306400, + "step": 1605 + }, + { + "epoch": 0.8367983367983368, + "grad_norm": 1.2880103588104248, + "learning_rate": 4.181392931392932e-05, + "loss": 0.3554, + "num_input_tokens_seen": 307392, + "step": 1610 + }, + { + "epoch": 0.8393970893970893, + "grad_norm": 0.7342007160186768, + "learning_rate": 4.194386694386694e-05, + "loss": 0.2829, + "num_input_tokens_seen": 308288, + "step": 1615 + }, + { + "epoch": 0.841995841995842, + "grad_norm": 0.697099506855011, + "learning_rate": 4.2073804573804575e-05, + "loss": 0.2809, + "num_input_tokens_seen": 309280, + "step": 1620 + }, + { + "epoch": 0.8445945945945946, + "grad_norm": 0.6651408076286316, + "learning_rate": 4.220374220374221e-05, + "loss": 0.2391, + "num_input_tokens_seen": 310240, + "step": 1625 + }, + { + "epoch": 0.8471933471933472, + "grad_norm": 1.0537163019180298, + "learning_rate": 4.233367983367983e-05, + "loss": 0.2539, + "num_input_tokens_seen": 311168, + "step": 1630 + }, + { + "epoch": 0.8497920997920998, + "grad_norm": 0.7354336380958557, + "learning_rate": 4.2463617463617464e-05, + "loss": 0.2569, + "num_input_tokens_seen": 312192, + "step": 1635 + }, + { + "epoch": 0.8523908523908524, + "grad_norm": 2.139225721359253, + "learning_rate": 4.2593555093555096e-05, + "loss": 0.4299, + "num_input_tokens_seen": 313184, + "step": 1640 + }, + { + "epoch": 0.854989604989605, + "grad_norm": 1.2279760837554932, + "learning_rate": 4.272349272349273e-05, + "loss": 0.2804, + "num_input_tokens_seen": 314176, + "step": 1645 + }, + { + "epoch": 0.8575883575883576, + "grad_norm": 1.0816293954849243, + "learning_rate": 4.285343035343036e-05, + "loss": 0.3545, + "num_input_tokens_seen": 315136, + "step": 1650 + }, + { + "epoch": 0.8601871101871101, + "grad_norm": 0.9918544888496399, + "learning_rate": 4.2983367983367985e-05, + "loss": 0.2955, + "num_input_tokens_seen": 316032, + "step": 1655 + }, + { + "epoch": 0.8627858627858628, + "grad_norm": 1.2132697105407715, + "learning_rate": 4.3113305613305616e-05, + "loss": 0.2697, + "num_input_tokens_seen": 316960, + "step": 1660 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.8042198419570923, + "learning_rate": 4.324324324324325e-05, + "loss": 0.3135, + "num_input_tokens_seen": 317824, + "step": 1665 + }, + { + "epoch": 0.867983367983368, + "grad_norm": 1.0889718532562256, + "learning_rate": 4.337318087318087e-05, + "loss": 0.2581, + "num_input_tokens_seen": 318816, + "step": 1670 + }, + { + "epoch": 0.8705821205821206, + "grad_norm": 1.444066047668457, + "learning_rate": 4.3503118503118505e-05, + "loss": 0.2466, + "num_input_tokens_seen": 319776, + "step": 1675 + }, + { + "epoch": 0.8731808731808732, + "grad_norm": 1.2313957214355469, + "learning_rate": 4.363305613305613e-05, + "loss": 0.3825, + "num_input_tokens_seen": 320704, + "step": 1680 + }, + { + "epoch": 0.8757796257796258, + "grad_norm": 2.1320395469665527, + "learning_rate": 4.376299376299376e-05, + "loss": 0.3362, + "num_input_tokens_seen": 321664, + "step": 1685 + }, + { + "epoch": 0.8783783783783784, + "grad_norm": 1.3161306381225586, + "learning_rate": 4.3892931392931394e-05, + "loss": 0.2963, + "num_input_tokens_seen": 322624, + "step": 1690 + }, + { + "epoch": 0.8809771309771309, + "grad_norm": 1.5668002367019653, + "learning_rate": 4.4022869022869026e-05, + "loss": 0.2981, + "num_input_tokens_seen": 323584, + "step": 1695 + }, + { + "epoch": 0.8835758835758836, + "grad_norm": 0.9030066728591919, + "learning_rate": 4.415280665280666e-05, + "loss": 0.3221, + "num_input_tokens_seen": 324512, + "step": 1700 + }, + { + "epoch": 0.8861746361746362, + "grad_norm": 0.49252182245254517, + "learning_rate": 4.428274428274429e-05, + "loss": 0.3122, + "num_input_tokens_seen": 325504, + "step": 1705 + }, + { + "epoch": 0.8887733887733887, + "grad_norm": 0.584960401058197, + "learning_rate": 4.4412681912681914e-05, + "loss": 0.317, + "num_input_tokens_seen": 326464, + "step": 1710 + }, + { + "epoch": 0.8913721413721414, + "grad_norm": 0.3440428376197815, + "learning_rate": 4.4542619542619546e-05, + "loss": 0.2592, + "num_input_tokens_seen": 327392, + "step": 1715 + }, + { + "epoch": 0.893970893970894, + "grad_norm": 0.5998156070709229, + "learning_rate": 4.467255717255717e-05, + "loss": 0.2184, + "num_input_tokens_seen": 328448, + "step": 1720 + }, + { + "epoch": 0.8965696465696466, + "grad_norm": 1.5345858335494995, + "learning_rate": 4.48024948024948e-05, + "loss": 0.3616, + "num_input_tokens_seen": 329376, + "step": 1725 + }, + { + "epoch": 0.8991683991683992, + "grad_norm": 0.54049152135849, + "learning_rate": 4.4932432432432435e-05, + "loss": 0.335, + "num_input_tokens_seen": 330304, + "step": 1730 + }, + { + "epoch": 0.9017671517671517, + "grad_norm": 0.5417428016662598, + "learning_rate": 4.506237006237006e-05, + "loss": 0.3126, + "num_input_tokens_seen": 331328, + "step": 1735 + }, + { + "epoch": 0.9043659043659044, + "grad_norm": 0.9967828989028931, + "learning_rate": 4.519230769230769e-05, + "loss": 0.3433, + "num_input_tokens_seen": 332192, + "step": 1740 + }, + { + "epoch": 0.906964656964657, + "grad_norm": 0.9364327788352966, + "learning_rate": 4.5322245322245324e-05, + "loss": 0.2839, + "num_input_tokens_seen": 333120, + "step": 1745 + }, + { + "epoch": 0.9095634095634095, + "grad_norm": 0.9667032957077026, + "learning_rate": 4.5452182952182956e-05, + "loss": 0.2836, + "num_input_tokens_seen": 334080, + "step": 1750 + }, + { + "epoch": 0.9121621621621622, + "grad_norm": 1.1321288347244263, + "learning_rate": 4.558212058212059e-05, + "loss": 0.2692, + "num_input_tokens_seen": 335008, + "step": 1755 + }, + { + "epoch": 0.9147609147609148, + "grad_norm": 1.010988473892212, + "learning_rate": 4.571205821205821e-05, + "loss": 0.2474, + "num_input_tokens_seen": 335936, + "step": 1760 + }, + { + "epoch": 0.9173596673596673, + "grad_norm": 1.18883216381073, + "learning_rate": 4.5841995841995844e-05, + "loss": 0.2693, + "num_input_tokens_seen": 336896, + "step": 1765 + }, + { + "epoch": 0.91995841995842, + "grad_norm": 0.3347911536693573, + "learning_rate": 4.5971933471933476e-05, + "loss": 0.3266, + "num_input_tokens_seen": 337856, + "step": 1770 + }, + { + "epoch": 0.9225571725571725, + "grad_norm": 0.7850611805915833, + "learning_rate": 4.61018711018711e-05, + "loss": 0.3004, + "num_input_tokens_seen": 338848, + "step": 1775 + }, + { + "epoch": 0.9251559251559252, + "grad_norm": 1.1355574131011963, + "learning_rate": 4.623180873180873e-05, + "loss": 0.321, + "num_input_tokens_seen": 339744, + "step": 1780 + }, + { + "epoch": 0.9277546777546778, + "grad_norm": 0.7107558846473694, + "learning_rate": 4.6361746361746365e-05, + "loss": 0.2867, + "num_input_tokens_seen": 340704, + "step": 1785 + }, + { + "epoch": 0.9303534303534303, + "grad_norm": 1.991003155708313, + "learning_rate": 4.649168399168399e-05, + "loss": 0.2891, + "num_input_tokens_seen": 341600, + "step": 1790 + }, + { + "epoch": 0.932952182952183, + "grad_norm": 1.0557947158813477, + "learning_rate": 4.662162162162162e-05, + "loss": 0.3084, + "num_input_tokens_seen": 342592, + "step": 1795 + }, + { + "epoch": 0.9355509355509356, + "grad_norm": 0.8586711883544922, + "learning_rate": 4.6751559251559254e-05, + "loss": 0.1819, + "num_input_tokens_seen": 343552, + "step": 1800 + }, + { + "epoch": 0.9381496881496881, + "grad_norm": 1.3815573453903198, + "learning_rate": 4.6881496881496886e-05, + "loss": 0.4858, + "num_input_tokens_seen": 344576, + "step": 1805 + }, + { + "epoch": 0.9407484407484408, + "grad_norm": 0.9473904371261597, + "learning_rate": 4.701143451143452e-05, + "loss": 0.3469, + "num_input_tokens_seen": 345536, + "step": 1810 + }, + { + "epoch": 0.9433471933471933, + "grad_norm": 0.9963656067848206, + "learning_rate": 4.714137214137214e-05, + "loss": 0.2992, + "num_input_tokens_seen": 346528, + "step": 1815 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 1.459944248199463, + "learning_rate": 4.7271309771309774e-05, + "loss": 0.3586, + "num_input_tokens_seen": 347552, + "step": 1820 + }, + { + "epoch": 0.9485446985446986, + "grad_norm": 0.833815336227417, + "learning_rate": 4.7401247401247406e-05, + "loss": 0.3177, + "num_input_tokens_seen": 348480, + "step": 1825 + }, + { + "epoch": 0.9511434511434511, + "grad_norm": 0.6951180100440979, + "learning_rate": 4.753118503118503e-05, + "loss": 0.2649, + "num_input_tokens_seen": 349504, + "step": 1830 + }, + { + "epoch": 0.9537422037422038, + "grad_norm": 0.9805010557174683, + "learning_rate": 4.766112266112266e-05, + "loss": 0.2752, + "num_input_tokens_seen": 350400, + "step": 1835 + }, + { + "epoch": 0.9563409563409564, + "grad_norm": 0.9687503576278687, + "learning_rate": 4.779106029106029e-05, + "loss": 0.298, + "num_input_tokens_seen": 351392, + "step": 1840 + }, + { + "epoch": 0.9589397089397089, + "grad_norm": 0.4621194005012512, + "learning_rate": 4.792099792099792e-05, + "loss": 0.2299, + "num_input_tokens_seen": 352320, + "step": 1845 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.7286187410354614, + "learning_rate": 4.805093555093555e-05, + "loss": 0.3293, + "num_input_tokens_seen": 353280, + "step": 1850 + }, + { + "epoch": 0.9641372141372141, + "grad_norm": 1.281572937965393, + "learning_rate": 4.8180873180873184e-05, + "loss": 0.337, + "num_input_tokens_seen": 354240, + "step": 1855 + }, + { + "epoch": 0.9667359667359667, + "grad_norm": 0.5388261675834656, + "learning_rate": 4.8310810810810816e-05, + "loss": 0.3098, + "num_input_tokens_seen": 355168, + "step": 1860 + }, + { + "epoch": 0.9693347193347194, + "grad_norm": 1.5835912227630615, + "learning_rate": 4.844074844074845e-05, + "loss": 0.3138, + "num_input_tokens_seen": 356096, + "step": 1865 + }, + { + "epoch": 0.9719334719334719, + "grad_norm": 0.9060671925544739, + "learning_rate": 4.857068607068607e-05, + "loss": 0.2816, + "num_input_tokens_seen": 357088, + "step": 1870 + }, + { + "epoch": 0.9745322245322245, + "grad_norm": 1.17020583152771, + "learning_rate": 4.8700623700623704e-05, + "loss": 0.2447, + "num_input_tokens_seen": 357984, + "step": 1875 + }, + { + "epoch": 0.9771309771309772, + "grad_norm": 1.4168078899383545, + "learning_rate": 4.883056133056133e-05, + "loss": 0.3664, + "num_input_tokens_seen": 358976, + "step": 1880 + }, + { + "epoch": 0.9797297297297297, + "grad_norm": 0.7208994626998901, + "learning_rate": 4.896049896049896e-05, + "loss": 0.2671, + "num_input_tokens_seen": 359936, + "step": 1885 + }, + { + "epoch": 0.9823284823284824, + "grad_norm": 1.1165350675582886, + "learning_rate": 4.909043659043659e-05, + "loss": 0.2554, + "num_input_tokens_seen": 360864, + "step": 1890 + }, + { + "epoch": 0.9849272349272349, + "grad_norm": 0.9531814455986023, + "learning_rate": 4.922037422037422e-05, + "loss": 0.3064, + "num_input_tokens_seen": 361792, + "step": 1895 + }, + { + "epoch": 0.9875259875259875, + "grad_norm": 1.264581322669983, + "learning_rate": 4.935031185031185e-05, + "loss": 0.2832, + "num_input_tokens_seen": 362720, + "step": 1900 + }, + { + "epoch": 0.9901247401247402, + "grad_norm": 1.0691028833389282, + "learning_rate": 4.948024948024949e-05, + "loss": 0.2626, + "num_input_tokens_seen": 363648, + "step": 1905 + }, + { + "epoch": 0.9927234927234927, + "grad_norm": 1.0655323266983032, + "learning_rate": 4.9610187110187114e-05, + "loss": 0.4231, + "num_input_tokens_seen": 364704, + "step": 1910 + }, + { + "epoch": 0.9953222453222453, + "grad_norm": 0.9488716721534729, + "learning_rate": 4.9740124740124745e-05, + "loss": 0.2158, + "num_input_tokens_seen": 365664, + "step": 1915 + }, + { + "epoch": 0.997920997920998, + "grad_norm": 0.7670338749885559, + "learning_rate": 4.987006237006237e-05, + "loss": 0.1788, + "num_input_tokens_seen": 366592, + "step": 1920 + }, + { + "epoch": 1.0, + "eval_loss": 0.30842578411102295, + "eval_runtime": 8.017, + "eval_samples_per_second": 106.773, + "eval_steps_per_second": 26.693, + "num_input_tokens_seen": 367320, + "step": 1924 + }, + { + "epoch": 1.0005197505197505, + "grad_norm": 1.6953619718551636, + "learning_rate": 5e-05, + "loss": 0.2614, + "num_input_tokens_seen": 367512, + "step": 1925 + }, + { + "epoch": 1.003118503118503, + "grad_norm": 1.6042208671569824, + "learning_rate": 4.9999989713809036e-05, + "loss": 0.3569, + "num_input_tokens_seen": 368472, + "step": 1930 + }, + { + "epoch": 1.0057172557172558, + "grad_norm": 0.8512699007987976, + "learning_rate": 4.999995885524459e-05, + "loss": 0.3423, + "num_input_tokens_seen": 369496, + "step": 1935 + }, + { + "epoch": 1.0083160083160083, + "grad_norm": 1.611057996749878, + "learning_rate": 4.999990742433206e-05, + "loss": 0.283, + "num_input_tokens_seen": 370456, + "step": 1940 + }, + { + "epoch": 1.0109147609147608, + "grad_norm": 0.8350038528442383, + "learning_rate": 4.9999835421113784e-05, + "loss": 0.3157, + "num_input_tokens_seen": 371416, + "step": 1945 + }, + { + "epoch": 1.0135135135135136, + "grad_norm": 1.074404001235962, + "learning_rate": 4.999974284564899e-05, + "loss": 0.314, + "num_input_tokens_seen": 372376, + "step": 1950 + }, + { + "epoch": 1.0161122661122661, + "grad_norm": 0.480262815952301, + "learning_rate": 4.999962969801387e-05, + "loss": 0.3124, + "num_input_tokens_seen": 373336, + "step": 1955 + }, + { + "epoch": 1.0187110187110187, + "grad_norm": 0.9194876551628113, + "learning_rate": 4.9999495978301534e-05, + "loss": 0.3001, + "num_input_tokens_seen": 374296, + "step": 1960 + }, + { + "epoch": 1.0213097713097714, + "grad_norm": 0.9150740504264832, + "learning_rate": 4.999934168662201e-05, + "loss": 0.3196, + "num_input_tokens_seen": 375320, + "step": 1965 + }, + { + "epoch": 1.023908523908524, + "grad_norm": 1.6183803081512451, + "learning_rate": 4.9999166823102275e-05, + "loss": 0.257, + "num_input_tokens_seen": 376280, + "step": 1970 + }, + { + "epoch": 1.0265072765072765, + "grad_norm": 0.7896539568901062, + "learning_rate": 4.9998971387886217e-05, + "loss": 0.3345, + "num_input_tokens_seen": 377208, + "step": 1975 + }, + { + "epoch": 1.0291060291060292, + "grad_norm": 0.7898284792900085, + "learning_rate": 4.9998755381134655e-05, + "loss": 0.2497, + "num_input_tokens_seen": 378168, + "step": 1980 + }, + { + "epoch": 1.0317047817047817, + "grad_norm": 0.6070805788040161, + "learning_rate": 4.999851880302535e-05, + "loss": 0.3142, + "num_input_tokens_seen": 379032, + "step": 1985 + }, + { + "epoch": 1.0343035343035343, + "grad_norm": 0.6123949885368347, + "learning_rate": 4.999826165375298e-05, + "loss": 0.2704, + "num_input_tokens_seen": 379992, + "step": 1990 + }, + { + "epoch": 1.0369022869022868, + "grad_norm": 0.686897337436676, + "learning_rate": 4.999798393352914e-05, + "loss": 0.2945, + "num_input_tokens_seen": 380952, + "step": 1995 + }, + { + "epoch": 1.0395010395010396, + "grad_norm": 0.9358550310134888, + "learning_rate": 4.999768564258238e-05, + "loss": 0.2921, + "num_input_tokens_seen": 381944, + "step": 2000 + }, + { + "epoch": 1.042099792099792, + "grad_norm": 1.1482809782028198, + "learning_rate": 4.999736678115815e-05, + "loss": 0.2943, + "num_input_tokens_seen": 382872, + "step": 2005 + }, + { + "epoch": 1.0446985446985446, + "grad_norm": 0.6205877661705017, + "learning_rate": 4.9997027349518845e-05, + "loss": 0.2664, + "num_input_tokens_seen": 383832, + "step": 2010 + }, + { + "epoch": 1.0472972972972974, + "grad_norm": 0.852140486240387, + "learning_rate": 4.999666734794378e-05, + "loss": 0.2984, + "num_input_tokens_seen": 384792, + "step": 2015 + }, + { + "epoch": 1.04989604989605, + "grad_norm": 1.1937357187271118, + "learning_rate": 4.999628677672921e-05, + "loss": 0.2583, + "num_input_tokens_seen": 385720, + "step": 2020 + }, + { + "epoch": 1.0524948024948024, + "grad_norm": 0.7869382500648499, + "learning_rate": 4.999588563618828e-05, + "loss": 0.2313, + "num_input_tokens_seen": 386712, + "step": 2025 + }, + { + "epoch": 1.0550935550935552, + "grad_norm": 1.6340159177780151, + "learning_rate": 4.999546392665111e-05, + "loss": 0.2917, + "num_input_tokens_seen": 387768, + "step": 2030 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 1.15883207321167, + "learning_rate": 4.999502164846471e-05, + "loss": 0.2419, + "num_input_tokens_seen": 388760, + "step": 2035 + }, + { + "epoch": 1.0602910602910602, + "grad_norm": 1.386920690536499, + "learning_rate": 4.9994558801993043e-05, + "loss": 0.3631, + "num_input_tokens_seen": 389688, + "step": 2040 + }, + { + "epoch": 1.062889812889813, + "grad_norm": 1.953059196472168, + "learning_rate": 4.999407538761696e-05, + "loss": 0.4423, + "num_input_tokens_seen": 390648, + "step": 2045 + }, + { + "epoch": 1.0654885654885655, + "grad_norm": 0.6277247667312622, + "learning_rate": 4.999357140573428e-05, + "loss": 0.3478, + "num_input_tokens_seen": 391544, + "step": 2050 + }, + { + "epoch": 1.068087318087318, + "grad_norm": 2.5407204627990723, + "learning_rate": 4.999304685675972e-05, + "loss": 0.4002, + "num_input_tokens_seen": 392536, + "step": 2055 + }, + { + "epoch": 1.0706860706860706, + "grad_norm": 0.4049489498138428, + "learning_rate": 4.999250174112493e-05, + "loss": 0.2906, + "num_input_tokens_seen": 393496, + "step": 2060 + }, + { + "epoch": 1.0732848232848233, + "grad_norm": 1.0730319023132324, + "learning_rate": 4.999193605927848e-05, + "loss": 0.2602, + "num_input_tokens_seen": 394456, + "step": 2065 + }, + { + "epoch": 1.0758835758835759, + "grad_norm": 1.255232334136963, + "learning_rate": 4.9991349811685874e-05, + "loss": 0.5061, + "num_input_tokens_seen": 395512, + "step": 2070 + }, + { + "epoch": 1.0784823284823284, + "grad_norm": 1.0518909692764282, + "learning_rate": 4.999074299882953e-05, + "loss": 0.2234, + "num_input_tokens_seen": 396504, + "step": 2075 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7315625548362732, + "learning_rate": 4.999011562120879e-05, + "loss": 0.2854, + "num_input_tokens_seen": 397464, + "step": 2080 + }, + { + "epoch": 1.0836798336798337, + "grad_norm": 0.7328015565872192, + "learning_rate": 4.9989467679339915e-05, + "loss": 0.2197, + "num_input_tokens_seen": 398392, + "step": 2085 + }, + { + "epoch": 1.0862785862785862, + "grad_norm": 1.1111507415771484, + "learning_rate": 4.99887991737561e-05, + "loss": 0.3081, + "num_input_tokens_seen": 399320, + "step": 2090 + }, + { + "epoch": 1.088877338877339, + "grad_norm": 1.4713317155838013, + "learning_rate": 4.9988110105007444e-05, + "loss": 0.349, + "num_input_tokens_seen": 400248, + "step": 2095 + }, + { + "epoch": 1.0914760914760915, + "grad_norm": 0.5578085780143738, + "learning_rate": 4.9987400473661e-05, + "loss": 0.3154, + "num_input_tokens_seen": 401144, + "step": 2100 + }, + { + "epoch": 1.094074844074844, + "grad_norm": 0.7205425500869751, + "learning_rate": 4.998667028030071e-05, + "loss": 0.2843, + "num_input_tokens_seen": 402072, + "step": 2105 + }, + { + "epoch": 1.0966735966735968, + "grad_norm": 1.708716630935669, + "learning_rate": 4.9985919525527434e-05, + "loss": 0.2771, + "num_input_tokens_seen": 403032, + "step": 2110 + }, + { + "epoch": 1.0992723492723493, + "grad_norm": 0.6950733661651611, + "learning_rate": 4.998514820995898e-05, + "loss": 0.3202, + "num_input_tokens_seen": 403992, + "step": 2115 + }, + { + "epoch": 1.1018711018711018, + "grad_norm": 1.1045575141906738, + "learning_rate": 4.9984356334230055e-05, + "loss": 0.2918, + "num_input_tokens_seen": 404984, + "step": 2120 + }, + { + "epoch": 1.1044698544698546, + "grad_norm": 1.0533231496810913, + "learning_rate": 4.9983543898992284e-05, + "loss": 0.2517, + "num_input_tokens_seen": 405976, + "step": 2125 + }, + { + "epoch": 1.107068607068607, + "grad_norm": 0.6189938187599182, + "learning_rate": 4.9982710904914224e-05, + "loss": 0.2745, + "num_input_tokens_seen": 406968, + "step": 2130 + }, + { + "epoch": 1.1096673596673596, + "grad_norm": 1.0420933961868286, + "learning_rate": 4.998185735268135e-05, + "loss": 0.3125, + "num_input_tokens_seen": 407928, + "step": 2135 + }, + { + "epoch": 1.1122661122661124, + "grad_norm": 0.912861168384552, + "learning_rate": 4.998098324299603e-05, + "loss": 0.3343, + "num_input_tokens_seen": 408856, + "step": 2140 + }, + { + "epoch": 1.114864864864865, + "grad_norm": 1.660348653793335, + "learning_rate": 4.998008857657756e-05, + "loss": 0.2748, + "num_input_tokens_seen": 409816, + "step": 2145 + }, + { + "epoch": 1.1174636174636174, + "grad_norm": 0.9695132374763489, + "learning_rate": 4.997917335416218e-05, + "loss": 0.322, + "num_input_tokens_seen": 410776, + "step": 2150 + }, + { + "epoch": 1.12006237006237, + "grad_norm": 0.6110017895698547, + "learning_rate": 4.997823757650301e-05, + "loss": 0.2863, + "num_input_tokens_seen": 411768, + "step": 2155 + }, + { + "epoch": 1.1226611226611227, + "grad_norm": 1.0712707042694092, + "learning_rate": 4.997728124437009e-05, + "loss": 0.2341, + "num_input_tokens_seen": 412696, + "step": 2160 + }, + { + "epoch": 1.1252598752598753, + "grad_norm": 1.2357128858566284, + "learning_rate": 4.9976304358550384e-05, + "loss": 0.3458, + "num_input_tokens_seen": 413624, + "step": 2165 + }, + { + "epoch": 1.1278586278586278, + "grad_norm": 0.8273010849952698, + "learning_rate": 4.9975306919847774e-05, + "loss": 0.2952, + "num_input_tokens_seen": 414584, + "step": 2170 + }, + { + "epoch": 1.1304573804573805, + "grad_norm": 1.1393890380859375, + "learning_rate": 4.997428892908305e-05, + "loss": 0.2596, + "num_input_tokens_seen": 415512, + "step": 2175 + }, + { + "epoch": 1.133056133056133, + "grad_norm": 1.4200376272201538, + "learning_rate": 4.997325038709391e-05, + "loss": 0.2334, + "num_input_tokens_seen": 416440, + "step": 2180 + }, + { + "epoch": 1.1356548856548856, + "grad_norm": 0.5034695267677307, + "learning_rate": 4.997219129473495e-05, + "loss": 0.3781, + "num_input_tokens_seen": 417560, + "step": 2185 + }, + { + "epoch": 1.1382536382536383, + "grad_norm": 0.7023857831954956, + "learning_rate": 4.9971111652877705e-05, + "loss": 0.288, + "num_input_tokens_seen": 418520, + "step": 2190 + }, + { + "epoch": 1.1408523908523909, + "grad_norm": 0.7489503026008606, + "learning_rate": 4.99700114624106e-05, + "loss": 0.2859, + "num_input_tokens_seen": 419384, + "step": 2195 + }, + { + "epoch": 1.1434511434511434, + "grad_norm": 0.8036942481994629, + "learning_rate": 4.9968890724238996e-05, + "loss": 0.3194, + "num_input_tokens_seen": 420312, + "step": 2200 + }, + { + "epoch": 1.1460498960498962, + "grad_norm": 1.7001214027404785, + "learning_rate": 4.996774943928513e-05, + "loss": 0.2714, + "num_input_tokens_seen": 421208, + "step": 2205 + }, + { + "epoch": 1.1486486486486487, + "grad_norm": 1.1444615125656128, + "learning_rate": 4.996658760848815e-05, + "loss": 0.2794, + "num_input_tokens_seen": 422200, + "step": 2210 + }, + { + "epoch": 1.1512474012474012, + "grad_norm": 0.47216010093688965, + "learning_rate": 4.996540523280413e-05, + "loss": 0.2432, + "num_input_tokens_seen": 423192, + "step": 2215 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.8575963973999023, + "learning_rate": 4.996420231320604e-05, + "loss": 0.3699, + "num_input_tokens_seen": 424152, + "step": 2220 + }, + { + "epoch": 1.1564449064449065, + "grad_norm": 1.1986695528030396, + "learning_rate": 4.996297885068376e-05, + "loss": 0.3147, + "num_input_tokens_seen": 425208, + "step": 2225 + }, + { + "epoch": 1.159043659043659, + "grad_norm": 1.0484544038772583, + "learning_rate": 4.996173484624408e-05, + "loss": 0.2147, + "num_input_tokens_seen": 426168, + "step": 2230 + }, + { + "epoch": 1.1616424116424116, + "grad_norm": 1.02173912525177, + "learning_rate": 4.9960470300910665e-05, + "loss": 0.2472, + "num_input_tokens_seen": 427128, + "step": 2235 + }, + { + "epoch": 1.1642411642411643, + "grad_norm": 1.1825315952301025, + "learning_rate": 4.995918521572411e-05, + "loss": 0.3695, + "num_input_tokens_seen": 428152, + "step": 2240 + }, + { + "epoch": 1.1668399168399168, + "grad_norm": 0.5554243326187134, + "learning_rate": 4.995787959174192e-05, + "loss": 0.2033, + "num_input_tokens_seen": 429112, + "step": 2245 + }, + { + "epoch": 1.1694386694386694, + "grad_norm": 0.9597379565238953, + "learning_rate": 4.995655343003847e-05, + "loss": 0.2465, + "num_input_tokens_seen": 430072, + "step": 2250 + }, + { + "epoch": 1.1720374220374221, + "grad_norm": 0.6463460326194763, + "learning_rate": 4.995520673170506e-05, + "loss": 0.3078, + "num_input_tokens_seen": 430968, + "step": 2255 + }, + { + "epoch": 1.1746361746361746, + "grad_norm": 1.5485886335372925, + "learning_rate": 4.9953839497849886e-05, + "loss": 0.2724, + "num_input_tokens_seen": 431896, + "step": 2260 + }, + { + "epoch": 1.1772349272349272, + "grad_norm": 1.4209892749786377, + "learning_rate": 4.995245172959802e-05, + "loss": 0.327, + "num_input_tokens_seen": 432856, + "step": 2265 + }, + { + "epoch": 1.17983367983368, + "grad_norm": 0.6004291772842407, + "learning_rate": 4.995104342809147e-05, + "loss": 0.2587, + "num_input_tokens_seen": 433816, + "step": 2270 + }, + { + "epoch": 1.1824324324324325, + "grad_norm": 0.7910385131835938, + "learning_rate": 4.994961459448911e-05, + "loss": 0.2941, + "num_input_tokens_seen": 434776, + "step": 2275 + }, + { + "epoch": 1.185031185031185, + "grad_norm": 1.7051739692687988, + "learning_rate": 4.994816522996672e-05, + "loss": 0.2918, + "num_input_tokens_seen": 435704, + "step": 2280 + }, + { + "epoch": 1.1876299376299375, + "grad_norm": 0.5318970084190369, + "learning_rate": 4.994669533571699e-05, + "loss": 0.2938, + "num_input_tokens_seen": 436600, + "step": 2285 + }, + { + "epoch": 1.1902286902286903, + "grad_norm": 1.2849063873291016, + "learning_rate": 4.994520491294947e-05, + "loss": 0.2843, + "num_input_tokens_seen": 437528, + "step": 2290 + }, + { + "epoch": 1.1928274428274428, + "grad_norm": 1.0622130632400513, + "learning_rate": 4.994369396289063e-05, + "loss": 0.286, + "num_input_tokens_seen": 438456, + "step": 2295 + }, + { + "epoch": 1.1954261954261955, + "grad_norm": 1.4648094177246094, + "learning_rate": 4.9942162486783825e-05, + "loss": 0.3108, + "num_input_tokens_seen": 439544, + "step": 2300 + }, + { + "epoch": 1.198024948024948, + "grad_norm": 0.6956253051757812, + "learning_rate": 4.994061048588929e-05, + "loss": 0.311, + "num_input_tokens_seen": 440568, + "step": 2305 + }, + { + "epoch": 1.2006237006237006, + "grad_norm": 0.46893075108528137, + "learning_rate": 4.993903796148418e-05, + "loss": 0.263, + "num_input_tokens_seen": 441496, + "step": 2310 + }, + { + "epoch": 1.2032224532224531, + "grad_norm": 0.7511025071144104, + "learning_rate": 4.99374449148625e-05, + "loss": 0.4163, + "num_input_tokens_seen": 442456, + "step": 2315 + }, + { + "epoch": 1.2058212058212059, + "grad_norm": 0.8320274353027344, + "learning_rate": 4.993583134733516e-05, + "loss": 0.2927, + "num_input_tokens_seen": 443480, + "step": 2320 + }, + { + "epoch": 1.2084199584199584, + "grad_norm": 2.678215503692627, + "learning_rate": 4.993419726022997e-05, + "loss": 0.3494, + "num_input_tokens_seen": 444472, + "step": 2325 + }, + { + "epoch": 1.211018711018711, + "grad_norm": 1.04775869846344, + "learning_rate": 4.993254265489159e-05, + "loss": 0.3416, + "num_input_tokens_seen": 445432, + "step": 2330 + }, + { + "epoch": 1.2136174636174637, + "grad_norm": 0.6267170906066895, + "learning_rate": 4.9930867532681615e-05, + "loss": 0.2602, + "num_input_tokens_seen": 446424, + "step": 2335 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.35720688104629517, + "learning_rate": 4.992917189497848e-05, + "loss": 0.1558, + "num_input_tokens_seen": 447320, + "step": 2340 + }, + { + "epoch": 1.2188149688149688, + "grad_norm": 0.4909346103668213, + "learning_rate": 4.9927455743177515e-05, + "loss": 0.3667, + "num_input_tokens_seen": 448216, + "step": 2345 + }, + { + "epoch": 1.2214137214137215, + "grad_norm": 0.3823666274547577, + "learning_rate": 4.9925719078690934e-05, + "loss": 0.4244, + "num_input_tokens_seen": 449176, + "step": 2350 + }, + { + "epoch": 1.224012474012474, + "grad_norm": 1.3324514627456665, + "learning_rate": 4.992396190294785e-05, + "loss": 0.2718, + "num_input_tokens_seen": 450168, + "step": 2355 + }, + { + "epoch": 1.2266112266112266, + "grad_norm": 1.5537874698638916, + "learning_rate": 4.99221842173942e-05, + "loss": 0.3332, + "num_input_tokens_seen": 451096, + "step": 2360 + }, + { + "epoch": 1.2292099792099793, + "grad_norm": 1.8406767845153809, + "learning_rate": 4.992038602349286e-05, + "loss": 0.3006, + "num_input_tokens_seen": 452024, + "step": 2365 + }, + { + "epoch": 1.2318087318087318, + "grad_norm": 2.221665143966675, + "learning_rate": 4.991856732272354e-05, + "loss": 0.3312, + "num_input_tokens_seen": 452984, + "step": 2370 + }, + { + "epoch": 1.2344074844074844, + "grad_norm": 0.6601800918579102, + "learning_rate": 4.9916728116582856e-05, + "loss": 0.284, + "num_input_tokens_seen": 453944, + "step": 2375 + }, + { + "epoch": 1.237006237006237, + "grad_norm": 0.29944461584091187, + "learning_rate": 4.991486840658427e-05, + "loss": 0.2878, + "num_input_tokens_seen": 454872, + "step": 2380 + }, + { + "epoch": 1.2396049896049897, + "grad_norm": 0.25481995940208435, + "learning_rate": 4.9912988194258125e-05, + "loss": 0.2726, + "num_input_tokens_seen": 455832, + "step": 2385 + }, + { + "epoch": 1.2422037422037422, + "grad_norm": 0.7090577483177185, + "learning_rate": 4.991108748115165e-05, + "loss": 0.2605, + "num_input_tokens_seen": 456792, + "step": 2390 + }, + { + "epoch": 1.2448024948024947, + "grad_norm": 1.5450897216796875, + "learning_rate": 4.990916626882893e-05, + "loss": 0.3113, + "num_input_tokens_seen": 457656, + "step": 2395 + }, + { + "epoch": 1.2474012474012475, + "grad_norm": 1.0139020681381226, + "learning_rate": 4.990722455887091e-05, + "loss": 0.271, + "num_input_tokens_seen": 458648, + "step": 2400 + }, + { + "epoch": 1.25, + "grad_norm": 0.7366583943367004, + "learning_rate": 4.990526235287544e-05, + "loss": 0.3733, + "num_input_tokens_seen": 459576, + "step": 2405 + }, + { + "epoch": 1.2525987525987525, + "grad_norm": 0.6619210243225098, + "learning_rate": 4.9903279652457177e-05, + "loss": 0.2674, + "num_input_tokens_seen": 460504, + "step": 2410 + }, + { + "epoch": 1.255197505197505, + "grad_norm": 1.159008502960205, + "learning_rate": 4.99012764592477e-05, + "loss": 0.252, + "num_input_tokens_seen": 461432, + "step": 2415 + }, + { + "epoch": 1.2577962577962578, + "grad_norm": 0.5548393726348877, + "learning_rate": 4.989925277489542e-05, + "loss": 0.3378, + "num_input_tokens_seen": 462360, + "step": 2420 + }, + { + "epoch": 1.2603950103950103, + "grad_norm": 0.8888547420501709, + "learning_rate": 4.9897208601065614e-05, + "loss": 0.256, + "num_input_tokens_seen": 463224, + "step": 2425 + }, + { + "epoch": 1.262993762993763, + "grad_norm": 0.8702475428581238, + "learning_rate": 4.9895143939440434e-05, + "loss": 0.2237, + "num_input_tokens_seen": 464184, + "step": 2430 + }, + { + "epoch": 1.2655925155925156, + "grad_norm": 1.3862464427947998, + "learning_rate": 4.989305879171886e-05, + "loss": 0.3114, + "num_input_tokens_seen": 465112, + "step": 2435 + }, + { + "epoch": 1.2681912681912682, + "grad_norm": 1.0997538566589355, + "learning_rate": 4.989095315961677e-05, + "loss": 0.2819, + "num_input_tokens_seen": 466008, + "step": 2440 + }, + { + "epoch": 1.2707900207900207, + "grad_norm": 0.7198344469070435, + "learning_rate": 4.988882704486687e-05, + "loss": 0.292, + "num_input_tokens_seen": 467032, + "step": 2445 + }, + { + "epoch": 1.2733887733887734, + "grad_norm": 0.5923174023628235, + "learning_rate": 4.988668044921872e-05, + "loss": 0.2354, + "num_input_tokens_seen": 467960, + "step": 2450 + }, + { + "epoch": 1.275987525987526, + "grad_norm": 1.212507724761963, + "learning_rate": 4.988451337443877e-05, + "loss": 0.2219, + "num_input_tokens_seen": 468920, + "step": 2455 + }, + { + "epoch": 1.2785862785862787, + "grad_norm": 1.2855085134506226, + "learning_rate": 4.9882325822310275e-05, + "loss": 0.252, + "num_input_tokens_seen": 469816, + "step": 2460 + }, + { + "epoch": 1.2811850311850312, + "grad_norm": 1.6069612503051758, + "learning_rate": 4.9880117794633365e-05, + "loss": 0.2931, + "num_input_tokens_seen": 470744, + "step": 2465 + }, + { + "epoch": 1.2837837837837838, + "grad_norm": 0.3687765300273895, + "learning_rate": 4.9877889293225014e-05, + "loss": 0.267, + "num_input_tokens_seen": 471672, + "step": 2470 + }, + { + "epoch": 1.2863825363825363, + "grad_norm": 0.752976655960083, + "learning_rate": 4.987564031991905e-05, + "loss": 0.2529, + "num_input_tokens_seen": 472664, + "step": 2475 + }, + { + "epoch": 1.288981288981289, + "grad_norm": 0.3206794559955597, + "learning_rate": 4.987337087656614e-05, + "loss": 0.2757, + "num_input_tokens_seen": 473592, + "step": 2480 + }, + { + "epoch": 1.2915800415800416, + "grad_norm": 0.2756587266921997, + "learning_rate": 4.98710809650338e-05, + "loss": 0.2398, + "num_input_tokens_seen": 474552, + "step": 2485 + }, + { + "epoch": 1.2941787941787941, + "grad_norm": 0.6186290979385376, + "learning_rate": 4.9868770587206394e-05, + "loss": 0.3464, + "num_input_tokens_seen": 475608, + "step": 2490 + }, + { + "epoch": 1.2967775467775469, + "grad_norm": 1.7409343719482422, + "learning_rate": 4.98664397449851e-05, + "loss": 0.3174, + "num_input_tokens_seen": 476536, + "step": 2495 + }, + { + "epoch": 1.2993762993762994, + "grad_norm": 0.4945911467075348, + "learning_rate": 4.986408844028797e-05, + "loss": 0.3114, + "num_input_tokens_seen": 477496, + "step": 2500 + }, + { + "epoch": 1.301975051975052, + "grad_norm": 1.018936038017273, + "learning_rate": 4.986171667504989e-05, + "loss": 0.2192, + "num_input_tokens_seen": 478392, + "step": 2505 + }, + { + "epoch": 1.3045738045738045, + "grad_norm": 0.7040171027183533, + "learning_rate": 4.985932445122257e-05, + "loss": 0.3472, + "num_input_tokens_seen": 479416, + "step": 2510 + }, + { + "epoch": 1.3071725571725572, + "grad_norm": 0.37676411867141724, + "learning_rate": 4.985691177077454e-05, + "loss": 0.2406, + "num_input_tokens_seen": 480312, + "step": 2515 + }, + { + "epoch": 1.3097713097713097, + "grad_norm": 0.8835750818252563, + "learning_rate": 4.9854478635691215e-05, + "loss": 0.3286, + "num_input_tokens_seen": 481304, + "step": 2520 + }, + { + "epoch": 1.3123700623700625, + "grad_norm": 1.3793997764587402, + "learning_rate": 4.985202504797478e-05, + "loss": 0.2956, + "num_input_tokens_seen": 482328, + "step": 2525 + }, + { + "epoch": 1.314968814968815, + "grad_norm": 0.2739459276199341, + "learning_rate": 4.984955100964431e-05, + "loss": 0.2926, + "num_input_tokens_seen": 483320, + "step": 2530 + }, + { + "epoch": 1.3175675675675675, + "grad_norm": 0.9354133009910583, + "learning_rate": 4.9847056522735655e-05, + "loss": 0.2841, + "num_input_tokens_seen": 484344, + "step": 2535 + }, + { + "epoch": 1.32016632016632, + "grad_norm": 0.9515479207038879, + "learning_rate": 4.984454158930153e-05, + "loss": 0.2375, + "num_input_tokens_seen": 485336, + "step": 2540 + }, + { + "epoch": 1.3227650727650728, + "grad_norm": 1.775568962097168, + "learning_rate": 4.984200621141145e-05, + "loss": 0.3389, + "num_input_tokens_seen": 486264, + "step": 2545 + }, + { + "epoch": 1.3253638253638254, + "grad_norm": 0.34738653898239136, + "learning_rate": 4.9839450391151785e-05, + "loss": 0.3024, + "num_input_tokens_seen": 487288, + "step": 2550 + }, + { + "epoch": 1.3279625779625779, + "grad_norm": 1.0480157136917114, + "learning_rate": 4.983687413062569e-05, + "loss": 0.287, + "num_input_tokens_seen": 488120, + "step": 2555 + }, + { + "epoch": 1.3305613305613306, + "grad_norm": 0.6742106676101685, + "learning_rate": 4.983427743195317e-05, + "loss": 0.3013, + "num_input_tokens_seen": 489080, + "step": 2560 + }, + { + "epoch": 1.3331600831600832, + "grad_norm": 1.4146372079849243, + "learning_rate": 4.983166029727102e-05, + "loss": 0.2852, + "num_input_tokens_seen": 489976, + "step": 2565 + }, + { + "epoch": 1.3357588357588357, + "grad_norm": 0.43428176641464233, + "learning_rate": 4.9829022728732884e-05, + "loss": 0.2262, + "num_input_tokens_seen": 490904, + "step": 2570 + }, + { + "epoch": 1.3383575883575882, + "grad_norm": 0.943998396396637, + "learning_rate": 4.9826364728509195e-05, + "loss": 0.2578, + "num_input_tokens_seen": 491832, + "step": 2575 + }, + { + "epoch": 1.340956340956341, + "grad_norm": 0.6028611660003662, + "learning_rate": 4.982368629878722e-05, + "loss": 0.3436, + "num_input_tokens_seen": 492760, + "step": 2580 + }, + { + "epoch": 1.3435550935550935, + "grad_norm": 1.029421329498291, + "learning_rate": 4.9820987441771e-05, + "loss": 0.2437, + "num_input_tokens_seen": 493720, + "step": 2585 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.9970633387565613, + "learning_rate": 4.981826815968145e-05, + "loss": 0.2915, + "num_input_tokens_seen": 494680, + "step": 2590 + }, + { + "epoch": 1.3487525987525988, + "grad_norm": 1.0982328653335571, + "learning_rate": 4.981552845475622e-05, + "loss": 0.3168, + "num_input_tokens_seen": 495672, + "step": 2595 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.5646061301231384, + "learning_rate": 4.981276832924982e-05, + "loss": 0.2858, + "num_input_tokens_seen": 496664, + "step": 2600 + }, + { + "epoch": 1.3539501039501038, + "grad_norm": 0.5978178381919861, + "learning_rate": 4.9809987785433544e-05, + "loss": 0.2851, + "num_input_tokens_seen": 497592, + "step": 2605 + }, + { + "epoch": 1.3565488565488566, + "grad_norm": 0.47902145981788635, + "learning_rate": 4.980718682559547e-05, + "loss": 0.2881, + "num_input_tokens_seen": 498520, + "step": 2610 + }, + { + "epoch": 1.3591476091476091, + "grad_norm": 1.2402440309524536, + "learning_rate": 4.9804365452040516e-05, + "loss": 0.3127, + "num_input_tokens_seen": 499416, + "step": 2615 + }, + { + "epoch": 1.3617463617463619, + "grad_norm": 1.6433082818984985, + "learning_rate": 4.980152366709037e-05, + "loss": 0.2821, + "num_input_tokens_seen": 500376, + "step": 2620 + }, + { + "epoch": 1.3643451143451144, + "grad_norm": 0.5708093047142029, + "learning_rate": 4.979866147308352e-05, + "loss": 0.2883, + "num_input_tokens_seen": 501304, + "step": 2625 + }, + { + "epoch": 1.366943866943867, + "grad_norm": 0.25612059235572815, + "learning_rate": 4.979577887237525e-05, + "loss": 0.3116, + "num_input_tokens_seen": 502200, + "step": 2630 + }, + { + "epoch": 1.3695426195426195, + "grad_norm": 0.3833748400211334, + "learning_rate": 4.979287586733765e-05, + "loss": 0.29, + "num_input_tokens_seen": 503128, + "step": 2635 + }, + { + "epoch": 1.3721413721413722, + "grad_norm": 0.6129119396209717, + "learning_rate": 4.978995246035958e-05, + "loss": 0.2727, + "num_input_tokens_seen": 504056, + "step": 2640 + }, + { + "epoch": 1.3747401247401247, + "grad_norm": 0.46993499994277954, + "learning_rate": 4.97870086538467e-05, + "loss": 0.2378, + "num_input_tokens_seen": 505016, + "step": 2645 + }, + { + "epoch": 1.3773388773388773, + "grad_norm": 0.4871598482131958, + "learning_rate": 4.9784044450221454e-05, + "loss": 0.2522, + "num_input_tokens_seen": 506008, + "step": 2650 + }, + { + "epoch": 1.37993762993763, + "grad_norm": 0.4822857677936554, + "learning_rate": 4.978105985192306e-05, + "loss": 0.2631, + "num_input_tokens_seen": 506904, + "step": 2655 + }, + { + "epoch": 1.3825363825363826, + "grad_norm": 0.9913327097892761, + "learning_rate": 4.9778054861407555e-05, + "loss": 0.2829, + "num_input_tokens_seen": 507864, + "step": 2660 + }, + { + "epoch": 1.385135135135135, + "grad_norm": 0.6456615328788757, + "learning_rate": 4.977502948114772e-05, + "loss": 0.1943, + "num_input_tokens_seen": 508792, + "step": 2665 + }, + { + "epoch": 1.3877338877338876, + "grad_norm": 0.8305045962333679, + "learning_rate": 4.977198371363311e-05, + "loss": 0.2905, + "num_input_tokens_seen": 509720, + "step": 2670 + }, + { + "epoch": 1.3903326403326404, + "grad_norm": 0.9083090424537659, + "learning_rate": 4.9768917561370093e-05, + "loss": 0.3002, + "num_input_tokens_seen": 510648, + "step": 2675 + }, + { + "epoch": 1.392931392931393, + "grad_norm": 1.6049450635910034, + "learning_rate": 4.9765831026881785e-05, + "loss": 0.3173, + "num_input_tokens_seen": 511608, + "step": 2680 + }, + { + "epoch": 1.3955301455301456, + "grad_norm": 0.8487095236778259, + "learning_rate": 4.9762724112708084e-05, + "loss": 0.2796, + "num_input_tokens_seen": 512536, + "step": 2685 + }, + { + "epoch": 1.3981288981288982, + "grad_norm": 1.099241852760315, + "learning_rate": 4.975959682140564e-05, + "loss": 0.3084, + "num_input_tokens_seen": 513496, + "step": 2690 + }, + { + "epoch": 1.4007276507276507, + "grad_norm": 0.4378984272480011, + "learning_rate": 4.97564491555479e-05, + "loss": 0.2813, + "num_input_tokens_seen": 514456, + "step": 2695 + }, + { + "epoch": 1.4033264033264032, + "grad_norm": 0.6011500954627991, + "learning_rate": 4.975328111772507e-05, + "loss": 0.2792, + "num_input_tokens_seen": 515416, + "step": 2700 + }, + { + "epoch": 1.405925155925156, + "grad_norm": 0.612916111946106, + "learning_rate": 4.975009271054409e-05, + "loss": 0.3297, + "num_input_tokens_seen": 516376, + "step": 2705 + }, + { + "epoch": 1.4085239085239085, + "grad_norm": 0.7329040765762329, + "learning_rate": 4.974688393662872e-05, + "loss": 0.2778, + "num_input_tokens_seen": 517336, + "step": 2710 + }, + { + "epoch": 1.411122661122661, + "grad_norm": 0.7556671500205994, + "learning_rate": 4.974365479861941e-05, + "loss": 0.2855, + "num_input_tokens_seen": 518232, + "step": 2715 + }, + { + "epoch": 1.4137214137214138, + "grad_norm": 1.1903578042984009, + "learning_rate": 4.974040529917342e-05, + "loss": 0.3071, + "num_input_tokens_seen": 519192, + "step": 2720 + }, + { + "epoch": 1.4163201663201663, + "grad_norm": 0.695177435874939, + "learning_rate": 4.973713544096475e-05, + "loss": 0.3234, + "num_input_tokens_seen": 520152, + "step": 2725 + }, + { + "epoch": 1.4189189189189189, + "grad_norm": 0.9272096157073975, + "learning_rate": 4.973384522668413e-05, + "loss": 0.262, + "num_input_tokens_seen": 521176, + "step": 2730 + }, + { + "epoch": 1.4215176715176714, + "grad_norm": 0.7031478881835938, + "learning_rate": 4.973053465903909e-05, + "loss": 0.315, + "num_input_tokens_seen": 522072, + "step": 2735 + }, + { + "epoch": 1.4241164241164241, + "grad_norm": 0.4153922200202942, + "learning_rate": 4.9727203740753855e-05, + "loss": 0.2814, + "num_input_tokens_seen": 522968, + "step": 2740 + }, + { + "epoch": 1.4267151767151767, + "grad_norm": 0.6267372369766235, + "learning_rate": 4.972385247456945e-05, + "loss": 0.2329, + "num_input_tokens_seen": 523896, + "step": 2745 + }, + { + "epoch": 1.4293139293139294, + "grad_norm": 0.7980009317398071, + "learning_rate": 4.972048086324359e-05, + "loss": 0.2381, + "num_input_tokens_seen": 524920, + "step": 2750 + }, + { + "epoch": 1.431912681912682, + "grad_norm": 0.6800652742385864, + "learning_rate": 4.9717088909550775e-05, + "loss": 0.3543, + "num_input_tokens_seen": 525784, + "step": 2755 + }, + { + "epoch": 1.4345114345114345, + "grad_norm": 0.7802736163139343, + "learning_rate": 4.971367661628222e-05, + "loss": 0.2957, + "num_input_tokens_seen": 526712, + "step": 2760 + }, + { + "epoch": 1.437110187110187, + "grad_norm": 0.5125477313995361, + "learning_rate": 4.971024398624588e-05, + "loss": 0.2417, + "num_input_tokens_seen": 527672, + "step": 2765 + }, + { + "epoch": 1.4397089397089398, + "grad_norm": 0.7476266622543335, + "learning_rate": 4.970679102226646e-05, + "loss": 0.1624, + "num_input_tokens_seen": 528632, + "step": 2770 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 1.3508342504501343, + "learning_rate": 4.97033177271854e-05, + "loss": 0.3172, + "num_input_tokens_seen": 529592, + "step": 2775 + }, + { + "epoch": 1.444906444906445, + "grad_norm": 0.5194209218025208, + "learning_rate": 4.9699824103860815e-05, + "loss": 0.4208, + "num_input_tokens_seen": 530552, + "step": 2780 + }, + { + "epoch": 1.4475051975051976, + "grad_norm": 1.1884872913360596, + "learning_rate": 4.9696310155167635e-05, + "loss": 0.2251, + "num_input_tokens_seen": 531544, + "step": 2785 + }, + { + "epoch": 1.45010395010395, + "grad_norm": 0.45123809576034546, + "learning_rate": 4.9692775883997456e-05, + "loss": 0.2541, + "num_input_tokens_seen": 532440, + "step": 2790 + }, + { + "epoch": 1.4527027027027026, + "grad_norm": 0.29192110896110535, + "learning_rate": 4.9689221293258605e-05, + "loss": 0.2382, + "num_input_tokens_seen": 533368, + "step": 2795 + }, + { + "epoch": 1.4553014553014554, + "grad_norm": 0.8465179800987244, + "learning_rate": 4.968564638587615e-05, + "loss": 0.211, + "num_input_tokens_seen": 534360, + "step": 2800 + }, + { + "epoch": 1.457900207900208, + "grad_norm": 0.818423867225647, + "learning_rate": 4.9682051164791855e-05, + "loss": 0.3541, + "num_input_tokens_seen": 535320, + "step": 2805 + }, + { + "epoch": 1.4604989604989604, + "grad_norm": 1.0466898679733276, + "learning_rate": 4.967843563296422e-05, + "loss": 0.2499, + "num_input_tokens_seen": 536248, + "step": 2810 + }, + { + "epoch": 1.4630977130977132, + "grad_norm": 1.319502592086792, + "learning_rate": 4.967479979336844e-05, + "loss": 0.2829, + "num_input_tokens_seen": 537144, + "step": 2815 + }, + { + "epoch": 1.4656964656964657, + "grad_norm": 0.446092814207077, + "learning_rate": 4.9671143648996445e-05, + "loss": 0.4121, + "num_input_tokens_seen": 538232, + "step": 2820 + }, + { + "epoch": 1.4682952182952183, + "grad_norm": 0.5495761036872864, + "learning_rate": 4.9667467202856844e-05, + "loss": 0.3075, + "num_input_tokens_seen": 539192, + "step": 2825 + }, + { + "epoch": 1.4708939708939708, + "grad_norm": 1.1115831136703491, + "learning_rate": 4.966377045797498e-05, + "loss": 0.2724, + "num_input_tokens_seen": 540152, + "step": 2830 + }, + { + "epoch": 1.4734927234927235, + "grad_norm": 0.5472722053527832, + "learning_rate": 4.9660053417392866e-05, + "loss": 0.2864, + "num_input_tokens_seen": 541080, + "step": 2835 + }, + { + "epoch": 1.476091476091476, + "grad_norm": 0.8084115982055664, + "learning_rate": 4.9656316084169255e-05, + "loss": 0.2156, + "num_input_tokens_seen": 542008, + "step": 2840 + }, + { + "epoch": 1.4786902286902288, + "grad_norm": 1.1350737810134888, + "learning_rate": 4.965255846137958e-05, + "loss": 0.2884, + "num_input_tokens_seen": 542968, + "step": 2845 + }, + { + "epoch": 1.4812889812889813, + "grad_norm": 1.0140743255615234, + "learning_rate": 4.964878055211597e-05, + "loss": 0.3779, + "num_input_tokens_seen": 543960, + "step": 2850 + }, + { + "epoch": 1.4838877338877339, + "grad_norm": 0.29176580905914307, + "learning_rate": 4.9644982359487234e-05, + "loss": 0.2819, + "num_input_tokens_seen": 544856, + "step": 2855 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.40480250120162964, + "learning_rate": 4.964116388661891e-05, + "loss": 0.3307, + "num_input_tokens_seen": 545848, + "step": 2860 + }, + { + "epoch": 1.4890852390852392, + "grad_norm": 0.7600014209747314, + "learning_rate": 4.963732513665319e-05, + "loss": 0.354, + "num_input_tokens_seen": 546840, + "step": 2865 + }, + { + "epoch": 1.4916839916839917, + "grad_norm": 0.4127037525177002, + "learning_rate": 4.963346611274896e-05, + "loss": 0.2702, + "num_input_tokens_seen": 547736, + "step": 2870 + }, + { + "epoch": 1.4942827442827442, + "grad_norm": 0.45990654826164246, + "learning_rate": 4.96295868180818e-05, + "loss": 0.2527, + "num_input_tokens_seen": 548728, + "step": 2875 + }, + { + "epoch": 1.496881496881497, + "grad_norm": 0.6772897839546204, + "learning_rate": 4.962568725584395e-05, + "loss": 0.3105, + "num_input_tokens_seen": 549688, + "step": 2880 + }, + { + "epoch": 1.4994802494802495, + "grad_norm": 0.4570785164833069, + "learning_rate": 4.962176742924436e-05, + "loss": 0.2984, + "num_input_tokens_seen": 550648, + "step": 2885 + }, + { + "epoch": 1.5, + "eval_loss": 0.2642279267311096, + "eval_runtime": 7.9677, + "eval_samples_per_second": 107.433, + "eval_steps_per_second": 26.858, + "num_input_tokens_seen": 550840, + "step": 2886 + }, + { + "epoch": 1.502079002079002, + "grad_norm": 0.5995020866394043, + "learning_rate": 4.961782734150862e-05, + "loss": 0.243, + "num_input_tokens_seen": 551544, + "step": 2890 + }, + { + "epoch": 1.5046777546777546, + "grad_norm": 1.626114845275879, + "learning_rate": 4.961386699587902e-05, + "loss": 0.2853, + "num_input_tokens_seen": 552504, + "step": 2895 + }, + { + "epoch": 1.5072765072765073, + "grad_norm": 0.3899468183517456, + "learning_rate": 4.96098863956145e-05, + "loss": 0.2497, + "num_input_tokens_seen": 553400, + "step": 2900 + }, + { + "epoch": 1.5098752598752598, + "grad_norm": 0.4763009250164032, + "learning_rate": 4.960588554399069e-05, + "loss": 0.2375, + "num_input_tokens_seen": 554392, + "step": 2905 + }, + { + "epoch": 1.5124740124740126, + "grad_norm": 0.9925509095191956, + "learning_rate": 4.9601864444299875e-05, + "loss": 0.3165, + "num_input_tokens_seen": 555416, + "step": 2910 + }, + { + "epoch": 1.5150727650727651, + "grad_norm": 0.5701366662979126, + "learning_rate": 4.959782309985098e-05, + "loss": 0.3716, + "num_input_tokens_seen": 556472, + "step": 2915 + }, + { + "epoch": 1.5176715176715176, + "grad_norm": 1.2018920183181763, + "learning_rate": 4.959376151396962e-05, + "loss": 0.2955, + "num_input_tokens_seen": 557368, + "step": 2920 + }, + { + "epoch": 1.5202702702702702, + "grad_norm": 1.7024717330932617, + "learning_rate": 4.9589679689998046e-05, + "loss": 0.3048, + "num_input_tokens_seen": 558328, + "step": 2925 + }, + { + "epoch": 1.5228690228690227, + "grad_norm": 0.5451721549034119, + "learning_rate": 4.9585577631295186e-05, + "loss": 0.2664, + "num_input_tokens_seen": 559288, + "step": 2930 + }, + { + "epoch": 1.5254677754677755, + "grad_norm": 0.22056831419467926, + "learning_rate": 4.958145534123659e-05, + "loss": 0.2642, + "num_input_tokens_seen": 560216, + "step": 2935 + }, + { + "epoch": 1.5280665280665282, + "grad_norm": 0.2637401223182678, + "learning_rate": 4.957731282321449e-05, + "loss": 0.3234, + "num_input_tokens_seen": 561176, + "step": 2940 + }, + { + "epoch": 1.5306652806652807, + "grad_norm": 1.0724984407424927, + "learning_rate": 4.957315008063773e-05, + "loss": 0.2984, + "num_input_tokens_seen": 562136, + "step": 2945 + }, + { + "epoch": 1.5332640332640333, + "grad_norm": 1.1551004648208618, + "learning_rate": 4.956896711693181e-05, + "loss": 0.2809, + "num_input_tokens_seen": 563032, + "step": 2950 + }, + { + "epoch": 1.5358627858627858, + "grad_norm": 0.4570561945438385, + "learning_rate": 4.956476393553887e-05, + "loss": 0.2437, + "num_input_tokens_seen": 564056, + "step": 2955 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.4867783486843109, + "learning_rate": 4.9560540539917697e-05, + "loss": 0.2591, + "num_input_tokens_seen": 565048, + "step": 2960 + }, + { + "epoch": 1.541060291060291, + "grad_norm": 0.9933836460113525, + "learning_rate": 4.95562969335437e-05, + "loss": 0.3211, + "num_input_tokens_seen": 565944, + "step": 2965 + }, + { + "epoch": 1.5436590436590436, + "grad_norm": 0.40049684047698975, + "learning_rate": 4.9552033119908924e-05, + "loss": 0.2702, + "num_input_tokens_seen": 566936, + "step": 2970 + }, + { + "epoch": 1.5462577962577964, + "grad_norm": 0.24814783036708832, + "learning_rate": 4.954774910252204e-05, + "loss": 0.2893, + "num_input_tokens_seen": 567896, + "step": 2975 + }, + { + "epoch": 1.5488565488565489, + "grad_norm": 0.6694278717041016, + "learning_rate": 4.954344488490834e-05, + "loss": 0.3179, + "num_input_tokens_seen": 568728, + "step": 2980 + }, + { + "epoch": 1.5514553014553014, + "grad_norm": 0.686793327331543, + "learning_rate": 4.953912047060976e-05, + "loss": 0.3178, + "num_input_tokens_seen": 569688, + "step": 2985 + }, + { + "epoch": 1.554054054054054, + "grad_norm": 0.6896393895149231, + "learning_rate": 4.953477586318482e-05, + "loss": 0.2752, + "num_input_tokens_seen": 570680, + "step": 2990 + }, + { + "epoch": 1.5566528066528067, + "grad_norm": 0.33549898862838745, + "learning_rate": 4.953041106620869e-05, + "loss": 0.2397, + "num_input_tokens_seen": 571576, + "step": 2995 + }, + { + "epoch": 1.5592515592515592, + "grad_norm": 0.26489168405532837, + "learning_rate": 4.952602608327313e-05, + "loss": 0.2937, + "num_input_tokens_seen": 572536, + "step": 3000 + }, + { + "epoch": 1.561850311850312, + "grad_norm": 0.6121596693992615, + "learning_rate": 4.952162091798653e-05, + "loss": 0.2702, + "num_input_tokens_seen": 573496, + "step": 3005 + }, + { + "epoch": 1.5644490644490645, + "grad_norm": 0.5922102332115173, + "learning_rate": 4.9517195573973886e-05, + "loss": 0.258, + "num_input_tokens_seen": 574488, + "step": 3010 + }, + { + "epoch": 1.567047817047817, + "grad_norm": 0.7546965479850769, + "learning_rate": 4.9512750054876786e-05, + "loss": 0.2997, + "num_input_tokens_seen": 575416, + "step": 3015 + }, + { + "epoch": 1.5696465696465696, + "grad_norm": 0.6487696170806885, + "learning_rate": 4.9508284364353416e-05, + "loss": 0.272, + "num_input_tokens_seen": 576408, + "step": 3020 + }, + { + "epoch": 1.572245322245322, + "grad_norm": 0.7608407735824585, + "learning_rate": 4.950379850607859e-05, + "loss": 0.2796, + "num_input_tokens_seen": 577336, + "step": 3025 + }, + { + "epoch": 1.5748440748440748, + "grad_norm": 0.29079118371009827, + "learning_rate": 4.949929248374369e-05, + "loss": 0.2489, + "num_input_tokens_seen": 578328, + "step": 3030 + }, + { + "epoch": 1.5774428274428276, + "grad_norm": 1.1091995239257812, + "learning_rate": 4.949476630105669e-05, + "loss": 0.2524, + "num_input_tokens_seen": 579288, + "step": 3035 + }, + { + "epoch": 1.5800415800415801, + "grad_norm": 0.7921031713485718, + "learning_rate": 4.949021996174219e-05, + "loss": 0.2015, + "num_input_tokens_seen": 580216, + "step": 3040 + }, + { + "epoch": 1.5826403326403327, + "grad_norm": 0.34616631269454956, + "learning_rate": 4.9485653469541335e-05, + "loss": 0.3022, + "num_input_tokens_seen": 581144, + "step": 3045 + }, + { + "epoch": 1.5852390852390852, + "grad_norm": 0.8674325942993164, + "learning_rate": 4.9481066828211865e-05, + "loss": 0.3326, + "num_input_tokens_seen": 582104, + "step": 3050 + }, + { + "epoch": 1.5878378378378377, + "grad_norm": 0.334204763174057, + "learning_rate": 4.947646004152812e-05, + "loss": 0.3075, + "num_input_tokens_seen": 583064, + "step": 3055 + }, + { + "epoch": 1.5904365904365905, + "grad_norm": 1.0170400142669678, + "learning_rate": 4.9471833113280994e-05, + "loss": 0.3061, + "num_input_tokens_seen": 584024, + "step": 3060 + }, + { + "epoch": 1.593035343035343, + "grad_norm": 0.9500819444656372, + "learning_rate": 4.9467186047277965e-05, + "loss": 0.3267, + "num_input_tokens_seen": 584984, + "step": 3065 + }, + { + "epoch": 1.5956340956340958, + "grad_norm": 1.386699914932251, + "learning_rate": 4.9462518847343075e-05, + "loss": 0.2859, + "num_input_tokens_seen": 585880, + "step": 3070 + }, + { + "epoch": 1.5982328482328483, + "grad_norm": 0.2954603433609009, + "learning_rate": 4.945783151731696e-05, + "loss": 0.26, + "num_input_tokens_seen": 586776, + "step": 3075 + }, + { + "epoch": 1.6008316008316008, + "grad_norm": 0.8837693929672241, + "learning_rate": 4.9453124061056786e-05, + "loss": 0.271, + "num_input_tokens_seen": 587736, + "step": 3080 + }, + { + "epoch": 1.6034303534303533, + "grad_norm": 1.2492313385009766, + "learning_rate": 4.94483964824363e-05, + "loss": 0.2572, + "num_input_tokens_seen": 588760, + "step": 3085 + }, + { + "epoch": 1.6060291060291059, + "grad_norm": 1.4197447299957275, + "learning_rate": 4.94436487853458e-05, + "loss": 0.3399, + "num_input_tokens_seen": 589752, + "step": 3090 + }, + { + "epoch": 1.6086278586278586, + "grad_norm": 0.4415820837020874, + "learning_rate": 4.943888097369216e-05, + "loss": 0.2174, + "num_input_tokens_seen": 590776, + "step": 3095 + }, + { + "epoch": 1.6112266112266114, + "grad_norm": 0.7854027152061462, + "learning_rate": 4.943409305139877e-05, + "loss": 0.3053, + "num_input_tokens_seen": 591704, + "step": 3100 + }, + { + "epoch": 1.613825363825364, + "grad_norm": 0.7600734233856201, + "learning_rate": 4.94292850224056e-05, + "loss": 0.2935, + "num_input_tokens_seen": 592696, + "step": 3105 + }, + { + "epoch": 1.6164241164241164, + "grad_norm": 0.6488826274871826, + "learning_rate": 4.9424456890669144e-05, + "loss": 0.2516, + "num_input_tokens_seen": 593624, + "step": 3110 + }, + { + "epoch": 1.619022869022869, + "grad_norm": 1.0808137655258179, + "learning_rate": 4.941960866016246e-05, + "loss": 0.3125, + "num_input_tokens_seen": 594584, + "step": 3115 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 1.1360338926315308, + "learning_rate": 4.941474033487513e-05, + "loss": 0.2749, + "num_input_tokens_seen": 595576, + "step": 3120 + }, + { + "epoch": 1.6242203742203742, + "grad_norm": 0.8757058382034302, + "learning_rate": 4.940985191881328e-05, + "loss": 0.2225, + "num_input_tokens_seen": 596472, + "step": 3125 + }, + { + "epoch": 1.6268191268191268, + "grad_norm": 0.6202439665794373, + "learning_rate": 4.940494341599955e-05, + "loss": 0.2498, + "num_input_tokens_seen": 597432, + "step": 3130 + }, + { + "epoch": 1.6294178794178795, + "grad_norm": 1.2758415937423706, + "learning_rate": 4.940001483047314e-05, + "loss": 0.4025, + "num_input_tokens_seen": 598456, + "step": 3135 + }, + { + "epoch": 1.632016632016632, + "grad_norm": 0.3873748779296875, + "learning_rate": 4.939506616628976e-05, + "loss": 0.2687, + "num_input_tokens_seen": 599384, + "step": 3140 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 1.095473289489746, + "learning_rate": 4.939009742752162e-05, + "loss": 0.2609, + "num_input_tokens_seen": 600312, + "step": 3145 + }, + { + "epoch": 1.637214137214137, + "grad_norm": 0.4651813507080078, + "learning_rate": 4.9385108618257505e-05, + "loss": 0.2432, + "num_input_tokens_seen": 601176, + "step": 3150 + }, + { + "epoch": 1.6398128898128899, + "grad_norm": 1.3987767696380615, + "learning_rate": 4.938009974260265e-05, + "loss": 0.3055, + "num_input_tokens_seen": 602136, + "step": 3155 + }, + { + "epoch": 1.6424116424116424, + "grad_norm": 0.4927736818790436, + "learning_rate": 4.9375070804678866e-05, + "loss": 0.2594, + "num_input_tokens_seen": 603032, + "step": 3160 + }, + { + "epoch": 1.6450103950103951, + "grad_norm": 1.2129144668579102, + "learning_rate": 4.937002180862441e-05, + "loss": 0.2597, + "num_input_tokens_seen": 603960, + "step": 3165 + }, + { + "epoch": 1.6476091476091477, + "grad_norm": 0.6148203015327454, + "learning_rate": 4.936495275859411e-05, + "loss": 0.2473, + "num_input_tokens_seen": 604952, + "step": 3170 + }, + { + "epoch": 1.6502079002079002, + "grad_norm": 0.6275754570960999, + "learning_rate": 4.9359863658759235e-05, + "loss": 0.2208, + "num_input_tokens_seen": 605976, + "step": 3175 + }, + { + "epoch": 1.6528066528066527, + "grad_norm": 0.7180606126785278, + "learning_rate": 4.93547545133076e-05, + "loss": 0.2802, + "num_input_tokens_seen": 606904, + "step": 3180 + }, + { + "epoch": 1.6554054054054053, + "grad_norm": 0.575066328048706, + "learning_rate": 4.9349625326443483e-05, + "loss": 0.337, + "num_input_tokens_seen": 607800, + "step": 3185 + }, + { + "epoch": 1.658004158004158, + "grad_norm": 0.36835190653800964, + "learning_rate": 4.9344476102387685e-05, + "loss": 0.295, + "num_input_tokens_seen": 608792, + "step": 3190 + }, + { + "epoch": 1.6606029106029108, + "grad_norm": 0.576571524143219, + "learning_rate": 4.933930684537746e-05, + "loss": 0.2445, + "num_input_tokens_seen": 609720, + "step": 3195 + }, + { + "epoch": 1.6632016632016633, + "grad_norm": 0.5010712146759033, + "learning_rate": 4.933411755966657e-05, + "loss": 0.2598, + "num_input_tokens_seen": 610680, + "step": 3200 + }, + { + "epoch": 1.6658004158004158, + "grad_norm": 0.7644681930541992, + "learning_rate": 4.9328908249525264e-05, + "loss": 0.2398, + "num_input_tokens_seen": 611704, + "step": 3205 + }, + { + "epoch": 1.6683991683991684, + "grad_norm": 1.5516656637191772, + "learning_rate": 4.9323678919240246e-05, + "loss": 0.3392, + "num_input_tokens_seen": 612600, + "step": 3210 + }, + { + "epoch": 1.6709979209979209, + "grad_norm": 0.41959768533706665, + "learning_rate": 4.931842957311472e-05, + "loss": 0.2826, + "num_input_tokens_seen": 613560, + "step": 3215 + }, + { + "epoch": 1.6735966735966736, + "grad_norm": 0.3877437710762024, + "learning_rate": 4.9313160215468334e-05, + "loss": 0.2406, + "num_input_tokens_seen": 614552, + "step": 3220 + }, + { + "epoch": 1.6761954261954262, + "grad_norm": 0.9397563338279724, + "learning_rate": 4.930787085063723e-05, + "loss": 0.2844, + "num_input_tokens_seen": 615480, + "step": 3225 + }, + { + "epoch": 1.678794178794179, + "grad_norm": 0.4473220705986023, + "learning_rate": 4.930256148297398e-05, + "loss": 0.3294, + "num_input_tokens_seen": 616440, + "step": 3230 + }, + { + "epoch": 1.6813929313929314, + "grad_norm": 0.5405673384666443, + "learning_rate": 4.929723211684767e-05, + "loss": 0.2264, + "num_input_tokens_seen": 617336, + "step": 3235 + }, + { + "epoch": 1.683991683991684, + "grad_norm": 0.5070677995681763, + "learning_rate": 4.929188275664379e-05, + "loss": 0.2921, + "num_input_tokens_seen": 618328, + "step": 3240 + }, + { + "epoch": 1.6865904365904365, + "grad_norm": 0.527340292930603, + "learning_rate": 4.928651340676431e-05, + "loss": 0.3186, + "num_input_tokens_seen": 619320, + "step": 3245 + }, + { + "epoch": 1.689189189189189, + "grad_norm": 0.6441925764083862, + "learning_rate": 4.9281124071627624e-05, + "loss": 0.2593, + "num_input_tokens_seen": 620184, + "step": 3250 + }, + { + "epoch": 1.6917879417879418, + "grad_norm": 0.5347278118133545, + "learning_rate": 4.9275714755668624e-05, + "loss": 0.2536, + "num_input_tokens_seen": 621112, + "step": 3255 + }, + { + "epoch": 1.6943866943866945, + "grad_norm": 0.9809896349906921, + "learning_rate": 4.927028546333858e-05, + "loss": 0.2908, + "num_input_tokens_seen": 622072, + "step": 3260 + }, + { + "epoch": 1.696985446985447, + "grad_norm": 0.8245409727096558, + "learning_rate": 4.926483619910525e-05, + "loss": 0.3041, + "num_input_tokens_seen": 623000, + "step": 3265 + }, + { + "epoch": 1.6995841995841996, + "grad_norm": 0.6835474371910095, + "learning_rate": 4.9259366967452794e-05, + "loss": 0.2837, + "num_input_tokens_seen": 623928, + "step": 3270 + }, + { + "epoch": 1.7021829521829521, + "grad_norm": 0.8764484524726868, + "learning_rate": 4.925387777288183e-05, + "loss": 0.3033, + "num_input_tokens_seen": 624952, + "step": 3275 + }, + { + "epoch": 1.7047817047817047, + "grad_norm": 0.9921683073043823, + "learning_rate": 4.924836861990938e-05, + "loss": 0.2348, + "num_input_tokens_seen": 625912, + "step": 3280 + }, + { + "epoch": 1.7073804573804574, + "grad_norm": 0.3873283565044403, + "learning_rate": 4.9242839513068906e-05, + "loss": 0.2373, + "num_input_tokens_seen": 626840, + "step": 3285 + }, + { + "epoch": 1.70997920997921, + "grad_norm": 0.6283779144287109, + "learning_rate": 4.923729045691028e-05, + "loss": 0.2827, + "num_input_tokens_seen": 627800, + "step": 3290 + }, + { + "epoch": 1.7125779625779627, + "grad_norm": 0.47826698422431946, + "learning_rate": 4.92317214559998e-05, + "loss": 0.3034, + "num_input_tokens_seen": 628664, + "step": 3295 + }, + { + "epoch": 1.7151767151767152, + "grad_norm": 1.0392546653747559, + "learning_rate": 4.9226132514920165e-05, + "loss": 0.3374, + "num_input_tokens_seen": 629560, + "step": 3300 + }, + { + "epoch": 1.7177754677754677, + "grad_norm": 0.40837082266807556, + "learning_rate": 4.9220523638270494e-05, + "loss": 0.3239, + "num_input_tokens_seen": 630456, + "step": 3305 + }, + { + "epoch": 1.7203742203742203, + "grad_norm": 0.6127018332481384, + "learning_rate": 4.92148948306663e-05, + "loss": 0.2875, + "num_input_tokens_seen": 631384, + "step": 3310 + }, + { + "epoch": 1.722972972972973, + "grad_norm": 0.4681112468242645, + "learning_rate": 4.92092460967395e-05, + "loss": 0.2607, + "num_input_tokens_seen": 632376, + "step": 3315 + }, + { + "epoch": 1.7255717255717256, + "grad_norm": 0.5008119940757751, + "learning_rate": 4.920357744113841e-05, + "loss": 0.2461, + "num_input_tokens_seen": 633304, + "step": 3320 + }, + { + "epoch": 1.7281704781704783, + "grad_norm": 0.6784627437591553, + "learning_rate": 4.9197888868527756e-05, + "loss": 0.287, + "num_input_tokens_seen": 634232, + "step": 3325 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.8454640507698059, + "learning_rate": 4.919218038358861e-05, + "loss": 0.2939, + "num_input_tokens_seen": 635192, + "step": 3330 + }, + { + "epoch": 1.7333679833679834, + "grad_norm": 0.6023103594779968, + "learning_rate": 4.918645199101848e-05, + "loss": 0.2801, + "num_input_tokens_seen": 636152, + "step": 3335 + }, + { + "epoch": 1.735966735966736, + "grad_norm": 0.9337441921234131, + "learning_rate": 4.918070369553123e-05, + "loss": 0.2197, + "num_input_tokens_seen": 637112, + "step": 3340 + }, + { + "epoch": 1.7385654885654884, + "grad_norm": 0.7493476271629333, + "learning_rate": 4.917493550185709e-05, + "loss": 0.2276, + "num_input_tokens_seen": 638040, + "step": 3345 + }, + { + "epoch": 1.7411642411642412, + "grad_norm": 1.4211945533752441, + "learning_rate": 4.91691474147427e-05, + "loss": 0.2966, + "num_input_tokens_seen": 638968, + "step": 3350 + }, + { + "epoch": 1.743762993762994, + "grad_norm": 0.5147002339363098, + "learning_rate": 4.916333943895104e-05, + "loss": 0.1932, + "num_input_tokens_seen": 639896, + "step": 3355 + }, + { + "epoch": 1.7463617463617465, + "grad_norm": 0.5183929204940796, + "learning_rate": 4.915751157926146e-05, + "loss": 0.251, + "num_input_tokens_seen": 640888, + "step": 3360 + }, + { + "epoch": 1.748960498960499, + "grad_norm": 0.4086575210094452, + "learning_rate": 4.9151663840469687e-05, + "loss": 0.2163, + "num_input_tokens_seen": 641816, + "step": 3365 + }, + { + "epoch": 1.7515592515592515, + "grad_norm": 0.7804684638977051, + "learning_rate": 4.914579622738779e-05, + "loss": 0.3251, + "num_input_tokens_seen": 642776, + "step": 3370 + }, + { + "epoch": 1.754158004158004, + "grad_norm": 0.536431074142456, + "learning_rate": 4.913990874484421e-05, + "loss": 0.2067, + "num_input_tokens_seen": 643672, + "step": 3375 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.4579595625400543, + "learning_rate": 4.913400139768372e-05, + "loss": 0.3046, + "num_input_tokens_seen": 644632, + "step": 3380 + }, + { + "epoch": 1.7593555093555093, + "grad_norm": 0.5672430992126465, + "learning_rate": 4.9128074190767456e-05, + "loss": 0.3004, + "num_input_tokens_seen": 645528, + "step": 3385 + }, + { + "epoch": 1.761954261954262, + "grad_norm": 1.436991810798645, + "learning_rate": 4.912212712897288e-05, + "loss": 0.2738, + "num_input_tokens_seen": 646552, + "step": 3390 + }, + { + "epoch": 1.7645530145530146, + "grad_norm": 1.2954567670822144, + "learning_rate": 4.911616021719381e-05, + "loss": 0.2477, + "num_input_tokens_seen": 647448, + "step": 3395 + }, + { + "epoch": 1.7671517671517671, + "grad_norm": 0.5461776852607727, + "learning_rate": 4.911017346034037e-05, + "loss": 0.2858, + "num_input_tokens_seen": 648440, + "step": 3400 + }, + { + "epoch": 1.7697505197505197, + "grad_norm": 0.9758524894714355, + "learning_rate": 4.910416686333906e-05, + "loss": 0.2415, + "num_input_tokens_seen": 649304, + "step": 3405 + }, + { + "epoch": 1.7723492723492722, + "grad_norm": 0.4500974714756012, + "learning_rate": 4.909814043113267e-05, + "loss": 0.2187, + "num_input_tokens_seen": 650200, + "step": 3410 + }, + { + "epoch": 1.774948024948025, + "grad_norm": 0.42791879177093506, + "learning_rate": 4.909209416868032e-05, + "loss": 0.235, + "num_input_tokens_seen": 651128, + "step": 3415 + }, + { + "epoch": 1.7775467775467777, + "grad_norm": 0.42734813690185547, + "learning_rate": 4.9086028080957445e-05, + "loss": 0.3002, + "num_input_tokens_seen": 652088, + "step": 3420 + }, + { + "epoch": 1.7801455301455302, + "grad_norm": 0.5106260776519775, + "learning_rate": 4.907994217295582e-05, + "loss": 0.2482, + "num_input_tokens_seen": 652984, + "step": 3425 + }, + { + "epoch": 1.7827442827442828, + "grad_norm": 1.4451203346252441, + "learning_rate": 4.9073836449683486e-05, + "loss": 0.4111, + "num_input_tokens_seen": 653976, + "step": 3430 + }, + { + "epoch": 1.7853430353430353, + "grad_norm": 0.1918172687292099, + "learning_rate": 4.906771091616483e-05, + "loss": 0.2961, + "num_input_tokens_seen": 655032, + "step": 3435 + }, + { + "epoch": 1.7879417879417878, + "grad_norm": 0.7378954887390137, + "learning_rate": 4.9061565577440516e-05, + "loss": 0.3072, + "num_input_tokens_seen": 655992, + "step": 3440 + }, + { + "epoch": 1.7905405405405406, + "grad_norm": 0.6790567636489868, + "learning_rate": 4.9055400438567515e-05, + "loss": 0.3277, + "num_input_tokens_seen": 656920, + "step": 3445 + }, + { + "epoch": 1.793139293139293, + "grad_norm": 0.9287056922912598, + "learning_rate": 4.90492155046191e-05, + "loss": 0.2765, + "num_input_tokens_seen": 657880, + "step": 3450 + }, + { + "epoch": 1.7957380457380459, + "grad_norm": 1.0909225940704346, + "learning_rate": 4.9043010780684814e-05, + "loss": 0.28, + "num_input_tokens_seen": 658840, + "step": 3455 + }, + { + "epoch": 1.7983367983367984, + "grad_norm": 0.3975485563278198, + "learning_rate": 4.9036786271870504e-05, + "loss": 0.293, + "num_input_tokens_seen": 659800, + "step": 3460 + }, + { + "epoch": 1.800935550935551, + "grad_norm": 1.120675802230835, + "learning_rate": 4.903054198329827e-05, + "loss": 0.2796, + "num_input_tokens_seen": 660824, + "step": 3465 + }, + { + "epoch": 1.8035343035343034, + "grad_norm": 0.6125587821006775, + "learning_rate": 4.902427792010653e-05, + "loss": 0.16, + "num_input_tokens_seen": 661816, + "step": 3470 + }, + { + "epoch": 1.806133056133056, + "grad_norm": 0.27888065576553345, + "learning_rate": 4.9017994087449946e-05, + "loss": 0.2577, + "num_input_tokens_seen": 662744, + "step": 3475 + }, + { + "epoch": 1.8087318087318087, + "grad_norm": 0.27413657307624817, + "learning_rate": 4.901169049049945e-05, + "loss": 0.3468, + "num_input_tokens_seen": 663672, + "step": 3480 + }, + { + "epoch": 1.8113305613305615, + "grad_norm": 0.7460250854492188, + "learning_rate": 4.9005367134442235e-05, + "loss": 0.324, + "num_input_tokens_seen": 664568, + "step": 3485 + }, + { + "epoch": 1.813929313929314, + "grad_norm": 0.983891487121582, + "learning_rate": 4.8999024024481775e-05, + "loss": 0.2561, + "num_input_tokens_seen": 665560, + "step": 3490 + }, + { + "epoch": 1.8165280665280665, + "grad_norm": 1.054861068725586, + "learning_rate": 4.8992661165837785e-05, + "loss": 0.2713, + "num_input_tokens_seen": 666488, + "step": 3495 + }, + { + "epoch": 1.819126819126819, + "grad_norm": 0.27063408493995667, + "learning_rate": 4.8986278563746216e-05, + "loss": 0.2587, + "num_input_tokens_seen": 667512, + "step": 3500 + }, + { + "epoch": 1.8217255717255716, + "grad_norm": 0.42453834414482117, + "learning_rate": 4.8979876223459295e-05, + "loss": 0.2341, + "num_input_tokens_seen": 668472, + "step": 3505 + }, + { + "epoch": 1.8243243243243243, + "grad_norm": 0.8347375392913818, + "learning_rate": 4.8973454150245466e-05, + "loss": 0.2655, + "num_input_tokens_seen": 669464, + "step": 3510 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 0.48473769426345825, + "learning_rate": 4.896701234938944e-05, + "loss": 0.3482, + "num_input_tokens_seen": 670424, + "step": 3515 + }, + { + "epoch": 1.8295218295218296, + "grad_norm": 1.0082907676696777, + "learning_rate": 4.896055082619213e-05, + "loss": 0.283, + "num_input_tokens_seen": 671480, + "step": 3520 + }, + { + "epoch": 1.8321205821205822, + "grad_norm": 0.5217125415802002, + "learning_rate": 4.89540695859707e-05, + "loss": 0.3162, + "num_input_tokens_seen": 672536, + "step": 3525 + }, + { + "epoch": 1.8347193347193347, + "grad_norm": 0.5085710883140564, + "learning_rate": 4.8947568634058525e-05, + "loss": 0.2861, + "num_input_tokens_seen": 673528, + "step": 3530 + }, + { + "epoch": 1.8373180873180872, + "grad_norm": 0.6158746480941772, + "learning_rate": 4.894104797580522e-05, + "loss": 0.2679, + "num_input_tokens_seen": 674456, + "step": 3535 + }, + { + "epoch": 1.83991683991684, + "grad_norm": 0.34147578477859497, + "learning_rate": 4.893450761657658e-05, + "loss": 0.2524, + "num_input_tokens_seen": 675416, + "step": 3540 + }, + { + "epoch": 1.8425155925155925, + "grad_norm": 0.24734939634799957, + "learning_rate": 4.8927947561754675e-05, + "loss": 0.3583, + "num_input_tokens_seen": 676280, + "step": 3545 + }, + { + "epoch": 1.8451143451143452, + "grad_norm": 0.5315392017364502, + "learning_rate": 4.892136781673771e-05, + "loss": 0.2662, + "num_input_tokens_seen": 677176, + "step": 3550 + }, + { + "epoch": 1.8477130977130978, + "grad_norm": 0.8005232810974121, + "learning_rate": 4.891476838694012e-05, + "loss": 0.2665, + "num_input_tokens_seen": 678104, + "step": 3555 + }, + { + "epoch": 1.8503118503118503, + "grad_norm": 1.2881752252578735, + "learning_rate": 4.890814927779258e-05, + "loss": 0.3178, + "num_input_tokens_seen": 679096, + "step": 3560 + }, + { + "epoch": 1.8529106029106028, + "grad_norm": 0.734790027141571, + "learning_rate": 4.8901510494741895e-05, + "loss": 0.2728, + "num_input_tokens_seen": 680088, + "step": 3565 + }, + { + "epoch": 1.8555093555093554, + "grad_norm": 0.9197729825973511, + "learning_rate": 4.88948520432511e-05, + "loss": 0.2547, + "num_input_tokens_seen": 681016, + "step": 3570 + }, + { + "epoch": 1.8581081081081081, + "grad_norm": 0.5162913203239441, + "learning_rate": 4.88881739287994e-05, + "loss": 0.2484, + "num_input_tokens_seen": 681976, + "step": 3575 + }, + { + "epoch": 1.8607068607068609, + "grad_norm": 0.352763831615448, + "learning_rate": 4.888147615688219e-05, + "loss": 0.3024, + "num_input_tokens_seen": 683000, + "step": 3580 + }, + { + "epoch": 1.8633056133056134, + "grad_norm": 0.6642034649848938, + "learning_rate": 4.8874758733011023e-05, + "loss": 0.3009, + "num_input_tokens_seen": 684024, + "step": 3585 + }, + { + "epoch": 1.865904365904366, + "grad_norm": 0.5377672910690308, + "learning_rate": 4.886802166271364e-05, + "loss": 0.2606, + "num_input_tokens_seen": 684984, + "step": 3590 + }, + { + "epoch": 1.8685031185031185, + "grad_norm": 0.5754424333572388, + "learning_rate": 4.886126495153395e-05, + "loss": 0.3155, + "num_input_tokens_seen": 685912, + "step": 3595 + }, + { + "epoch": 1.871101871101871, + "grad_norm": 0.6143356561660767, + "learning_rate": 4.8854488605032014e-05, + "loss": 0.2525, + "num_input_tokens_seen": 686904, + "step": 3600 + }, + { + "epoch": 1.8737006237006237, + "grad_norm": 0.4269965589046478, + "learning_rate": 4.884769262878406e-05, + "loss": 0.2643, + "num_input_tokens_seen": 687768, + "step": 3605 + }, + { + "epoch": 1.8762993762993763, + "grad_norm": 1.5063530206680298, + "learning_rate": 4.884087702838246e-05, + "loss": 0.3122, + "num_input_tokens_seen": 688696, + "step": 3610 + }, + { + "epoch": 1.878898128898129, + "grad_norm": 0.7017138004302979, + "learning_rate": 4.8834041809435736e-05, + "loss": 0.2273, + "num_input_tokens_seen": 689624, + "step": 3615 + }, + { + "epoch": 1.8814968814968815, + "grad_norm": 0.7258045077323914, + "learning_rate": 4.8827186977568565e-05, + "loss": 0.3417, + "num_input_tokens_seen": 690552, + "step": 3620 + }, + { + "epoch": 1.884095634095634, + "grad_norm": 0.7345150709152222, + "learning_rate": 4.8820312538421755e-05, + "loss": 0.2826, + "num_input_tokens_seen": 691512, + "step": 3625 + }, + { + "epoch": 1.8866943866943866, + "grad_norm": 0.7665963172912598, + "learning_rate": 4.881341849765224e-05, + "loss": 0.2704, + "num_input_tokens_seen": 692536, + "step": 3630 + }, + { + "epoch": 1.8892931392931391, + "grad_norm": 0.7616263031959534, + "learning_rate": 4.88065048609331e-05, + "loss": 0.2548, + "num_input_tokens_seen": 693560, + "step": 3635 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8291826844215393, + "learning_rate": 4.879957163395354e-05, + "loss": 0.2618, + "num_input_tokens_seen": 694456, + "step": 3640 + }, + { + "epoch": 1.8944906444906446, + "grad_norm": 0.8603946566581726, + "learning_rate": 4.879261882241888e-05, + "loss": 0.2537, + "num_input_tokens_seen": 695448, + "step": 3645 + }, + { + "epoch": 1.8970893970893972, + "grad_norm": 1.0172308683395386, + "learning_rate": 4.878564643205054e-05, + "loss": 0.3319, + "num_input_tokens_seen": 696408, + "step": 3650 + }, + { + "epoch": 1.8996881496881497, + "grad_norm": 0.8178093433380127, + "learning_rate": 4.877865446858608e-05, + "loss": 0.3199, + "num_input_tokens_seen": 697336, + "step": 3655 + }, + { + "epoch": 1.9022869022869022, + "grad_norm": 0.5206460356712341, + "learning_rate": 4.877164293777916e-05, + "loss": 0.2448, + "num_input_tokens_seen": 698200, + "step": 3660 + }, + { + "epoch": 1.9048856548856548, + "grad_norm": 0.8099747896194458, + "learning_rate": 4.8764611845399516e-05, + "loss": 0.2271, + "num_input_tokens_seen": 699256, + "step": 3665 + }, + { + "epoch": 1.9074844074844075, + "grad_norm": 1.1116304397583008, + "learning_rate": 4.875756119723301e-05, + "loss": 0.3736, + "num_input_tokens_seen": 700280, + "step": 3670 + }, + { + "epoch": 1.91008316008316, + "grad_norm": 0.4751640260219574, + "learning_rate": 4.87504909990816e-05, + "loss": 0.3383, + "num_input_tokens_seen": 701272, + "step": 3675 + }, + { + "epoch": 1.9126819126819128, + "grad_norm": 0.3998151421546936, + "learning_rate": 4.87434012567633e-05, + "loss": 0.318, + "num_input_tokens_seen": 702200, + "step": 3680 + }, + { + "epoch": 1.9152806652806653, + "grad_norm": 1.1473153829574585, + "learning_rate": 4.8736291976112235e-05, + "loss": 0.2402, + "num_input_tokens_seen": 703160, + "step": 3685 + }, + { + "epoch": 1.9178794178794178, + "grad_norm": 0.8054749369621277, + "learning_rate": 4.872916316297859e-05, + "loss": 0.1959, + "num_input_tokens_seen": 704152, + "step": 3690 + }, + { + "epoch": 1.9204781704781704, + "grad_norm": 0.600105881690979, + "learning_rate": 4.872201482322865e-05, + "loss": 0.2779, + "num_input_tokens_seen": 705048, + "step": 3695 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.5062316656112671, + "learning_rate": 4.8714846962744725e-05, + "loss": 0.2236, + "num_input_tokens_seen": 706072, + "step": 3700 + }, + { + "epoch": 1.9256756756756757, + "grad_norm": 0.6063704490661621, + "learning_rate": 4.870765958742523e-05, + "loss": 0.3071, + "num_input_tokens_seen": 706936, + "step": 3705 + }, + { + "epoch": 1.9282744282744284, + "grad_norm": 0.4505123794078827, + "learning_rate": 4.8700452703184616e-05, + "loss": 0.2729, + "num_input_tokens_seen": 707864, + "step": 3710 + }, + { + "epoch": 1.930873180873181, + "grad_norm": 1.1904067993164062, + "learning_rate": 4.869322631595341e-05, + "loss": 0.2928, + "num_input_tokens_seen": 708888, + "step": 3715 + }, + { + "epoch": 1.9334719334719335, + "grad_norm": 0.4216737151145935, + "learning_rate": 4.8685980431678144e-05, + "loss": 0.2983, + "num_input_tokens_seen": 709816, + "step": 3720 + }, + { + "epoch": 1.936070686070686, + "grad_norm": 0.668410062789917, + "learning_rate": 4.867871505632144e-05, + "loss": 0.2743, + "num_input_tokens_seen": 710840, + "step": 3725 + }, + { + "epoch": 1.9386694386694385, + "grad_norm": 0.5120556950569153, + "learning_rate": 4.867143019586195e-05, + "loss": 0.2664, + "num_input_tokens_seen": 711832, + "step": 3730 + }, + { + "epoch": 1.9412681912681913, + "grad_norm": 0.6808879375457764, + "learning_rate": 4.866412585629432e-05, + "loss": 0.2854, + "num_input_tokens_seen": 712824, + "step": 3735 + }, + { + "epoch": 1.943866943866944, + "grad_norm": 0.4110320508480072, + "learning_rate": 4.865680204362928e-05, + "loss": 0.2303, + "num_input_tokens_seen": 713784, + "step": 3740 + }, + { + "epoch": 1.9464656964656966, + "grad_norm": 0.8332298994064331, + "learning_rate": 4.864945876389356e-05, + "loss": 0.2126, + "num_input_tokens_seen": 714776, + "step": 3745 + }, + { + "epoch": 1.949064449064449, + "grad_norm": 0.32694095373153687, + "learning_rate": 4.864209602312991e-05, + "loss": 0.3216, + "num_input_tokens_seen": 715704, + "step": 3750 + }, + { + "epoch": 1.9516632016632016, + "grad_norm": 0.45137980580329895, + "learning_rate": 4.863471382739708e-05, + "loss": 0.2403, + "num_input_tokens_seen": 716632, + "step": 3755 + }, + { + "epoch": 1.9542619542619541, + "grad_norm": 0.41432908177375793, + "learning_rate": 4.862731218276987e-05, + "loss": 0.1773, + "num_input_tokens_seen": 717592, + "step": 3760 + }, + { + "epoch": 1.956860706860707, + "grad_norm": 0.3941957652568817, + "learning_rate": 4.8619891095339034e-05, + "loss": 0.263, + "num_input_tokens_seen": 718552, + "step": 3765 + }, + { + "epoch": 1.9594594594594594, + "grad_norm": 0.37639331817626953, + "learning_rate": 4.861245057121135e-05, + "loss": 0.2814, + "num_input_tokens_seen": 719480, + "step": 3770 + }, + { + "epoch": 1.9620582120582122, + "grad_norm": 0.5133543014526367, + "learning_rate": 4.8604990616509616e-05, + "loss": 0.2373, + "num_input_tokens_seen": 720440, + "step": 3775 + }, + { + "epoch": 1.9646569646569647, + "grad_norm": 1.2452272176742554, + "learning_rate": 4.8597511237372574e-05, + "loss": 0.3135, + "num_input_tokens_seen": 721400, + "step": 3780 + }, + { + "epoch": 1.9672557172557172, + "grad_norm": 0.30282530188560486, + "learning_rate": 4.859001243995497e-05, + "loss": 0.2575, + "num_input_tokens_seen": 722328, + "step": 3785 + }, + { + "epoch": 1.9698544698544698, + "grad_norm": 0.32523196935653687, + "learning_rate": 4.858249423042753e-05, + "loss": 0.3468, + "num_input_tokens_seen": 723320, + "step": 3790 + }, + { + "epoch": 1.9724532224532223, + "grad_norm": 0.7080844044685364, + "learning_rate": 4.857495661497695e-05, + "loss": 0.2991, + "num_input_tokens_seen": 724280, + "step": 3795 + }, + { + "epoch": 1.975051975051975, + "grad_norm": 1.135201334953308, + "learning_rate": 4.856739959980591e-05, + "loss": 0.2722, + "num_input_tokens_seen": 725240, + "step": 3800 + }, + { + "epoch": 1.9776507276507278, + "grad_norm": 0.6793693900108337, + "learning_rate": 4.855982319113304e-05, + "loss": 0.2603, + "num_input_tokens_seen": 726200, + "step": 3805 + }, + { + "epoch": 1.9802494802494803, + "grad_norm": 0.27769699692726135, + "learning_rate": 4.855222739519292e-05, + "loss": 0.2763, + "num_input_tokens_seen": 727192, + "step": 3810 + }, + { + "epoch": 1.9828482328482329, + "grad_norm": 0.33137866854667664, + "learning_rate": 4.8544612218236096e-05, + "loss": 0.2961, + "num_input_tokens_seen": 728184, + "step": 3815 + }, + { + "epoch": 1.9854469854469854, + "grad_norm": 0.34216591715812683, + "learning_rate": 4.853697766652907e-05, + "loss": 0.2425, + "num_input_tokens_seen": 729176, + "step": 3820 + }, + { + "epoch": 1.988045738045738, + "grad_norm": 0.3952418267726898, + "learning_rate": 4.852932374635427e-05, + "loss": 0.2486, + "num_input_tokens_seen": 730168, + "step": 3825 + }, + { + "epoch": 1.9906444906444907, + "grad_norm": 0.6853535175323486, + "learning_rate": 4.852165046401008e-05, + "loss": 0.261, + "num_input_tokens_seen": 731096, + "step": 3830 + }, + { + "epoch": 1.9932432432432432, + "grad_norm": 0.6423739790916443, + "learning_rate": 4.85139578258108e-05, + "loss": 0.2716, + "num_input_tokens_seen": 732120, + "step": 3835 + }, + { + "epoch": 1.995841995841996, + "grad_norm": 0.5096138119697571, + "learning_rate": 4.850624583808667e-05, + "loss": 0.3015, + "num_input_tokens_seen": 733080, + "step": 3840 + }, + { + "epoch": 1.9984407484407485, + "grad_norm": 1.219369649887085, + "learning_rate": 4.849851450718385e-05, + "loss": 0.2195, + "num_input_tokens_seen": 734072, + "step": 3845 + }, + { + "epoch": 2.0, + "eval_loss": 0.2571273446083069, + "eval_runtime": 7.921, + "eval_samples_per_second": 108.067, + "eval_steps_per_second": 27.017, + "num_input_tokens_seen": 734600, + "step": 3848 + }, + { + "epoch": 2.001039501039501, + "grad_norm": 0.34900781512260437, + "learning_rate": 4.849076383946441e-05, + "loss": 0.3281, + "num_input_tokens_seen": 734984, + "step": 3850 + }, + { + "epoch": 2.0036382536382535, + "grad_norm": 0.5847567915916443, + "learning_rate": 4.848299384130634e-05, + "loss": 0.2293, + "num_input_tokens_seen": 735912, + "step": 3855 + }, + { + "epoch": 2.006237006237006, + "grad_norm": 0.32783377170562744, + "learning_rate": 4.8475204519103536e-05, + "loss": 0.2558, + "num_input_tokens_seen": 736776, + "step": 3860 + }, + { + "epoch": 2.008835758835759, + "grad_norm": 0.19909125566482544, + "learning_rate": 4.8467395879265786e-05, + "loss": 0.2449, + "num_input_tokens_seen": 737768, + "step": 3865 + }, + { + "epoch": 2.0114345114345116, + "grad_norm": 0.48175621032714844, + "learning_rate": 4.8459567928218794e-05, + "loss": 0.2403, + "num_input_tokens_seen": 738728, + "step": 3870 + }, + { + "epoch": 2.014033264033264, + "grad_norm": 0.7447880506515503, + "learning_rate": 4.845172067240415e-05, + "loss": 0.283, + "num_input_tokens_seen": 739656, + "step": 3875 + }, + { + "epoch": 2.0166320166320166, + "grad_norm": 0.47836923599243164, + "learning_rate": 4.844385411827931e-05, + "loss": 0.2261, + "num_input_tokens_seen": 740584, + "step": 3880 + }, + { + "epoch": 2.019230769230769, + "grad_norm": 0.6352177858352661, + "learning_rate": 4.8435968272317624e-05, + "loss": 0.2433, + "num_input_tokens_seen": 741512, + "step": 3885 + }, + { + "epoch": 2.0218295218295217, + "grad_norm": 0.7203357219696045, + "learning_rate": 4.8428063141008316e-05, + "loss": 0.3217, + "num_input_tokens_seen": 742472, + "step": 3890 + }, + { + "epoch": 2.024428274428274, + "grad_norm": 0.8496769070625305, + "learning_rate": 4.8420138730856495e-05, + "loss": 0.2788, + "num_input_tokens_seen": 743464, + "step": 3895 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.5050995349884033, + "learning_rate": 4.8412195048383115e-05, + "loss": 0.2768, + "num_input_tokens_seen": 744392, + "step": 3900 + }, + { + "epoch": 2.0296257796257797, + "grad_norm": 1.1059150695800781, + "learning_rate": 4.8404232100124994e-05, + "loss": 0.3185, + "num_input_tokens_seen": 745352, + "step": 3905 + }, + { + "epoch": 2.0322245322245323, + "grad_norm": 0.6574660539627075, + "learning_rate": 4.839624989263479e-05, + "loss": 0.2397, + "num_input_tokens_seen": 746312, + "step": 3910 + }, + { + "epoch": 2.034823284823285, + "grad_norm": 0.3694811761379242, + "learning_rate": 4.838824843248104e-05, + "loss": 0.2708, + "num_input_tokens_seen": 747272, + "step": 3915 + }, + { + "epoch": 2.0374220374220373, + "grad_norm": 0.5332977771759033, + "learning_rate": 4.838022772624812e-05, + "loss": 0.3675, + "num_input_tokens_seen": 748264, + "step": 3920 + }, + { + "epoch": 2.04002079002079, + "grad_norm": 0.7569249868392944, + "learning_rate": 4.837218778053621e-05, + "loss": 0.3079, + "num_input_tokens_seen": 749256, + "step": 3925 + }, + { + "epoch": 2.042619542619543, + "grad_norm": 0.6346331834793091, + "learning_rate": 4.8364128601961335e-05, + "loss": 0.2869, + "num_input_tokens_seen": 750248, + "step": 3930 + }, + { + "epoch": 2.0452182952182953, + "grad_norm": 0.3340613842010498, + "learning_rate": 4.835605019715538e-05, + "loss": 0.29, + "num_input_tokens_seen": 751272, + "step": 3935 + }, + { + "epoch": 2.047817047817048, + "grad_norm": 0.5226474404335022, + "learning_rate": 4.834795257276601e-05, + "loss": 0.2795, + "num_input_tokens_seen": 752296, + "step": 3940 + }, + { + "epoch": 2.0504158004158004, + "grad_norm": 0.7366021871566772, + "learning_rate": 4.8339835735456736e-05, + "loss": 0.2888, + "num_input_tokens_seen": 753256, + "step": 3945 + }, + { + "epoch": 2.053014553014553, + "grad_norm": 1.0441032648086548, + "learning_rate": 4.833169969190685e-05, + "loss": 0.2159, + "num_input_tokens_seen": 754344, + "step": 3950 + }, + { + "epoch": 2.0556133056133055, + "grad_norm": 0.44628211855888367, + "learning_rate": 4.832354444881147e-05, + "loss": 0.2167, + "num_input_tokens_seen": 755304, + "step": 3955 + }, + { + "epoch": 2.0582120582120584, + "grad_norm": 0.3708413243293762, + "learning_rate": 4.8315370012881514e-05, + "loss": 0.2563, + "num_input_tokens_seen": 756296, + "step": 3960 + }, + { + "epoch": 2.060810810810811, + "grad_norm": 0.5447730422019958, + "learning_rate": 4.830717639084367e-05, + "loss": 0.1811, + "num_input_tokens_seen": 757256, + "step": 3965 + }, + { + "epoch": 2.0634095634095635, + "grad_norm": 1.0856915712356567, + "learning_rate": 4.829896358944044e-05, + "loss": 0.2911, + "num_input_tokens_seen": 758216, + "step": 3970 + }, + { + "epoch": 2.066008316008316, + "grad_norm": 0.9282616972923279, + "learning_rate": 4.8290731615430104e-05, + "loss": 0.3729, + "num_input_tokens_seen": 759176, + "step": 3975 + }, + { + "epoch": 2.0686070686070686, + "grad_norm": 0.23787830770015717, + "learning_rate": 4.828248047558672e-05, + "loss": 0.2525, + "num_input_tokens_seen": 760072, + "step": 3980 + }, + { + "epoch": 2.071205821205821, + "grad_norm": 0.28281369805336, + "learning_rate": 4.82742101767001e-05, + "loss": 0.2779, + "num_input_tokens_seen": 761032, + "step": 3985 + }, + { + "epoch": 2.0738045738045736, + "grad_norm": 1.1236748695373535, + "learning_rate": 4.8265920725575825e-05, + "loss": 0.3051, + "num_input_tokens_seen": 761960, + "step": 3990 + }, + { + "epoch": 2.0764033264033266, + "grad_norm": 0.4832697808742523, + "learning_rate": 4.825761212903527e-05, + "loss": 0.2932, + "num_input_tokens_seen": 762984, + "step": 3995 + }, + { + "epoch": 2.079002079002079, + "grad_norm": 0.24737945199012756, + "learning_rate": 4.824928439391552e-05, + "loss": 0.323, + "num_input_tokens_seen": 763880, + "step": 4000 + }, + { + "epoch": 2.0816008316008316, + "grad_norm": 0.565021276473999, + "learning_rate": 4.824093752706943e-05, + "loss": 0.3057, + "num_input_tokens_seen": 764808, + "step": 4005 + }, + { + "epoch": 2.084199584199584, + "grad_norm": 1.0909640789031982, + "learning_rate": 4.823257153536561e-05, + "loss": 0.2662, + "num_input_tokens_seen": 765704, + "step": 4010 + }, + { + "epoch": 2.0867983367983367, + "grad_norm": 0.8373414874076843, + "learning_rate": 4.822418642568839e-05, + "loss": 0.2853, + "num_input_tokens_seen": 766664, + "step": 4015 + }, + { + "epoch": 2.0893970893970892, + "grad_norm": 0.8584372401237488, + "learning_rate": 4.821578220493783e-05, + "loss": 0.2847, + "num_input_tokens_seen": 767656, + "step": 4020 + }, + { + "epoch": 2.091995841995842, + "grad_norm": 1.100738286972046, + "learning_rate": 4.8207358880029726e-05, + "loss": 0.3006, + "num_input_tokens_seen": 768648, + "step": 4025 + }, + { + "epoch": 2.0945945945945947, + "grad_norm": 0.5923410058021545, + "learning_rate": 4.8198916457895604e-05, + "loss": 0.2618, + "num_input_tokens_seen": 769576, + "step": 4030 + }, + { + "epoch": 2.0971933471933473, + "grad_norm": 0.8775808811187744, + "learning_rate": 4.819045494548268e-05, + "loss": 0.23, + "num_input_tokens_seen": 770504, + "step": 4035 + }, + { + "epoch": 2.0997920997921, + "grad_norm": 0.508751392364502, + "learning_rate": 4.81819743497539e-05, + "loss": 0.2907, + "num_input_tokens_seen": 771464, + "step": 4040 + }, + { + "epoch": 2.1023908523908523, + "grad_norm": 0.8956941366195679, + "learning_rate": 4.8173474677687904e-05, + "loss": 0.2658, + "num_input_tokens_seen": 772424, + "step": 4045 + }, + { + "epoch": 2.104989604989605, + "grad_norm": 0.8308512568473816, + "learning_rate": 4.816495593627902e-05, + "loss": 0.2789, + "num_input_tokens_seen": 773416, + "step": 4050 + }, + { + "epoch": 2.1075883575883574, + "grad_norm": 1.0011343955993652, + "learning_rate": 4.81564181325373e-05, + "loss": 0.225, + "num_input_tokens_seen": 774408, + "step": 4055 + }, + { + "epoch": 2.1101871101871104, + "grad_norm": 0.33379319310188293, + "learning_rate": 4.814786127348845e-05, + "loss": 0.2513, + "num_input_tokens_seen": 775304, + "step": 4060 + }, + { + "epoch": 2.112785862785863, + "grad_norm": 0.46509823203086853, + "learning_rate": 4.813928536617388e-05, + "loss": 0.2563, + "num_input_tokens_seen": 776328, + "step": 4065 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.8041447997093201, + "learning_rate": 4.813069041765065e-05, + "loss": 0.3123, + "num_input_tokens_seen": 777256, + "step": 4070 + }, + { + "epoch": 2.117983367983368, + "grad_norm": 0.3440883457660675, + "learning_rate": 4.8122076434991506e-05, + "loss": 0.2367, + "num_input_tokens_seen": 778216, + "step": 4075 + }, + { + "epoch": 2.1205821205821205, + "grad_norm": 0.7994845509529114, + "learning_rate": 4.8113443425284865e-05, + "loss": 0.2548, + "num_input_tokens_seen": 779240, + "step": 4080 + }, + { + "epoch": 2.123180873180873, + "grad_norm": 0.8685004115104675, + "learning_rate": 4.8104791395634774e-05, + "loss": 0.2574, + "num_input_tokens_seen": 780168, + "step": 4085 + }, + { + "epoch": 2.125779625779626, + "grad_norm": 1.1677322387695312, + "learning_rate": 4.809612035316096e-05, + "loss": 0.3087, + "num_input_tokens_seen": 781128, + "step": 4090 + }, + { + "epoch": 2.1283783783783785, + "grad_norm": 0.4499652087688446, + "learning_rate": 4.808743030499877e-05, + "loss": 0.2052, + "num_input_tokens_seen": 782088, + "step": 4095 + }, + { + "epoch": 2.130977130977131, + "grad_norm": 0.9271240234375, + "learning_rate": 4.807872125829922e-05, + "loss": 0.3118, + "num_input_tokens_seen": 783080, + "step": 4100 + }, + { + "epoch": 2.1335758835758836, + "grad_norm": 0.4212406575679779, + "learning_rate": 4.8069993220228925e-05, + "loss": 0.2222, + "num_input_tokens_seen": 784008, + "step": 4105 + }, + { + "epoch": 2.136174636174636, + "grad_norm": 0.6127902269363403, + "learning_rate": 4.806124619797016e-05, + "loss": 0.4249, + "num_input_tokens_seen": 785064, + "step": 4110 + }, + { + "epoch": 2.1387733887733886, + "grad_norm": 0.3438217043876648, + "learning_rate": 4.805248019872081e-05, + "loss": 0.2604, + "num_input_tokens_seen": 786024, + "step": 4115 + }, + { + "epoch": 2.141372141372141, + "grad_norm": 0.5289342403411865, + "learning_rate": 4.8043695229694365e-05, + "loss": 0.2669, + "num_input_tokens_seen": 786952, + "step": 4120 + }, + { + "epoch": 2.143970893970894, + "grad_norm": 0.7696058750152588, + "learning_rate": 4.803489129811993e-05, + "loss": 0.3283, + "num_input_tokens_seen": 787944, + "step": 4125 + }, + { + "epoch": 2.1465696465696467, + "grad_norm": 0.7946153283119202, + "learning_rate": 4.802606841124223e-05, + "loss": 0.295, + "num_input_tokens_seen": 788904, + "step": 4130 + }, + { + "epoch": 2.149168399168399, + "grad_norm": 0.6079053282737732, + "learning_rate": 4.8017226576321586e-05, + "loss": 0.2887, + "num_input_tokens_seen": 789864, + "step": 4135 + }, + { + "epoch": 2.1517671517671517, + "grad_norm": 1.214202880859375, + "learning_rate": 4.8008365800633875e-05, + "loss": 0.3038, + "num_input_tokens_seen": 790728, + "step": 4140 + }, + { + "epoch": 2.1543659043659042, + "grad_norm": 0.4739745259284973, + "learning_rate": 4.799948609147061e-05, + "loss": 0.2843, + "num_input_tokens_seen": 791720, + "step": 4145 + }, + { + "epoch": 2.156964656964657, + "grad_norm": 0.5472787618637085, + "learning_rate": 4.799058745613885e-05, + "loss": 0.2907, + "num_input_tokens_seen": 792616, + "step": 4150 + }, + { + "epoch": 2.1595634095634098, + "grad_norm": 1.2286006212234497, + "learning_rate": 4.798166990196125e-05, + "loss": 0.2731, + "num_input_tokens_seen": 793576, + "step": 4155 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.47798317670822144, + "learning_rate": 4.797273343627601e-05, + "loss": 0.268, + "num_input_tokens_seen": 794536, + "step": 4160 + }, + { + "epoch": 2.164760914760915, + "grad_norm": 0.25360408425331116, + "learning_rate": 4.796377806643692e-05, + "loss": 0.3182, + "num_input_tokens_seen": 795528, + "step": 4165 + }, + { + "epoch": 2.1673596673596673, + "grad_norm": 0.664707362651825, + "learning_rate": 4.79548037998133e-05, + "loss": 0.3033, + "num_input_tokens_seen": 796552, + "step": 4170 + }, + { + "epoch": 2.16995841995842, + "grad_norm": 0.36841338872909546, + "learning_rate": 4.7945810643790026e-05, + "loss": 0.2755, + "num_input_tokens_seen": 797544, + "step": 4175 + }, + { + "epoch": 2.1725571725571724, + "grad_norm": 0.5712988376617432, + "learning_rate": 4.793679860576755e-05, + "loss": 0.24, + "num_input_tokens_seen": 798472, + "step": 4180 + }, + { + "epoch": 2.1751559251559254, + "grad_norm": 0.4060610830783844, + "learning_rate": 4.7927767693161805e-05, + "loss": 0.2868, + "num_input_tokens_seen": 799464, + "step": 4185 + }, + { + "epoch": 2.177754677754678, + "grad_norm": 0.4252876341342926, + "learning_rate": 4.791871791340431e-05, + "loss": 0.3347, + "num_input_tokens_seen": 800392, + "step": 4190 + }, + { + "epoch": 2.1803534303534304, + "grad_norm": 0.36064624786376953, + "learning_rate": 4.7909649273942083e-05, + "loss": 0.2914, + "num_input_tokens_seen": 801352, + "step": 4195 + }, + { + "epoch": 2.182952182952183, + "grad_norm": 0.7849377989768982, + "learning_rate": 4.790056178223764e-05, + "loss": 0.3056, + "num_input_tokens_seen": 802440, + "step": 4200 + }, + { + "epoch": 2.1855509355509355, + "grad_norm": 0.5107596516609192, + "learning_rate": 4.789145544576906e-05, + "loss": 0.2957, + "num_input_tokens_seen": 803400, + "step": 4205 + }, + { + "epoch": 2.188149688149688, + "grad_norm": 0.5083978772163391, + "learning_rate": 4.7882330272029906e-05, + "loss": 0.2621, + "num_input_tokens_seen": 804392, + "step": 4210 + }, + { + "epoch": 2.1907484407484406, + "grad_norm": 0.5207892060279846, + "learning_rate": 4.787318626852923e-05, + "loss": 0.2775, + "num_input_tokens_seen": 805352, + "step": 4215 + }, + { + "epoch": 2.1933471933471935, + "grad_norm": 0.7574901580810547, + "learning_rate": 4.7864023442791587e-05, + "loss": 0.2705, + "num_input_tokens_seen": 806376, + "step": 4220 + }, + { + "epoch": 2.195945945945946, + "grad_norm": 0.7870606780052185, + "learning_rate": 4.785484180235702e-05, + "loss": 0.2351, + "num_input_tokens_seen": 807336, + "step": 4225 + }, + { + "epoch": 2.1985446985446986, + "grad_norm": 1.3117356300354004, + "learning_rate": 4.7845641354781065e-05, + "loss": 0.2593, + "num_input_tokens_seen": 808296, + "step": 4230 + }, + { + "epoch": 2.201143451143451, + "grad_norm": 0.5404126644134521, + "learning_rate": 4.7836422107634735e-05, + "loss": 0.2558, + "num_input_tokens_seen": 809288, + "step": 4235 + }, + { + "epoch": 2.2037422037422036, + "grad_norm": 0.3839046061038971, + "learning_rate": 4.782718406850449e-05, + "loss": 0.2598, + "num_input_tokens_seen": 810184, + "step": 4240 + }, + { + "epoch": 2.206340956340956, + "grad_norm": 0.5097280144691467, + "learning_rate": 4.781792724499228e-05, + "loss": 0.3122, + "num_input_tokens_seen": 811080, + "step": 4245 + }, + { + "epoch": 2.208939708939709, + "grad_norm": 0.5014235973358154, + "learning_rate": 4.78086516447155e-05, + "loss": 0.2554, + "num_input_tokens_seen": 812072, + "step": 4250 + }, + { + "epoch": 2.2115384615384617, + "grad_norm": 1.0921064615249634, + "learning_rate": 4.779935727530699e-05, + "loss": 0.2774, + "num_input_tokens_seen": 813000, + "step": 4255 + }, + { + "epoch": 2.214137214137214, + "grad_norm": 0.699111819267273, + "learning_rate": 4.779004414441504e-05, + "loss": 0.2609, + "num_input_tokens_seen": 814056, + "step": 4260 + }, + { + "epoch": 2.2167359667359667, + "grad_norm": 0.7333725690841675, + "learning_rate": 4.77807122597034e-05, + "loss": 0.2329, + "num_input_tokens_seen": 815080, + "step": 4265 + }, + { + "epoch": 2.2193347193347193, + "grad_norm": 0.7627071738243103, + "learning_rate": 4.777136162885121e-05, + "loss": 0.3089, + "num_input_tokens_seen": 816008, + "step": 4270 + }, + { + "epoch": 2.221933471933472, + "grad_norm": 0.7013311386108398, + "learning_rate": 4.776199225955308e-05, + "loss": 0.2378, + "num_input_tokens_seen": 816936, + "step": 4275 + }, + { + "epoch": 2.2245322245322248, + "grad_norm": 0.6426093578338623, + "learning_rate": 4.775260415951901e-05, + "loss": 0.2213, + "num_input_tokens_seen": 817864, + "step": 4280 + }, + { + "epoch": 2.2271309771309773, + "grad_norm": 0.4759265184402466, + "learning_rate": 4.774319733647442e-05, + "loss": 0.2733, + "num_input_tokens_seen": 818792, + "step": 4285 + }, + { + "epoch": 2.22972972972973, + "grad_norm": 0.3027482032775879, + "learning_rate": 4.773377179816016e-05, + "loss": 0.2825, + "num_input_tokens_seen": 819624, + "step": 4290 + }, + { + "epoch": 2.2323284823284824, + "grad_norm": 0.45447322726249695, + "learning_rate": 4.7724327552332425e-05, + "loss": 0.2822, + "num_input_tokens_seen": 820552, + "step": 4295 + }, + { + "epoch": 2.234927234927235, + "grad_norm": 0.9490477442741394, + "learning_rate": 4.771486460676288e-05, + "loss": 0.2979, + "num_input_tokens_seen": 821480, + "step": 4300 + }, + { + "epoch": 2.2375259875259874, + "grad_norm": 0.31688135862350464, + "learning_rate": 4.770538296923851e-05, + "loss": 0.2719, + "num_input_tokens_seen": 822312, + "step": 4305 + }, + { + "epoch": 2.24012474012474, + "grad_norm": 0.7761601209640503, + "learning_rate": 4.769588264756172e-05, + "loss": 0.2963, + "num_input_tokens_seen": 823336, + "step": 4310 + }, + { + "epoch": 2.242723492723493, + "grad_norm": 0.47245991230010986, + "learning_rate": 4.768636364955028e-05, + "loss": 0.2764, + "num_input_tokens_seen": 824264, + "step": 4315 + }, + { + "epoch": 2.2453222453222454, + "grad_norm": 0.6422507166862488, + "learning_rate": 4.7676825983037334e-05, + "loss": 0.2583, + "num_input_tokens_seen": 825192, + "step": 4320 + }, + { + "epoch": 2.247920997920998, + "grad_norm": 0.39522191882133484, + "learning_rate": 4.766726965587137e-05, + "loss": 0.2077, + "num_input_tokens_seen": 826152, + "step": 4325 + }, + { + "epoch": 2.2505197505197505, + "grad_norm": 0.40997564792633057, + "learning_rate": 4.765769467591625e-05, + "loss": 0.1404, + "num_input_tokens_seen": 827016, + "step": 4330 + }, + { + "epoch": 2.253118503118503, + "grad_norm": 1.424136757850647, + "learning_rate": 4.764810105105119e-05, + "loss": 0.3636, + "num_input_tokens_seen": 828008, + "step": 4335 + }, + { + "epoch": 2.2557172557172556, + "grad_norm": 0.3726789653301239, + "learning_rate": 4.763848878917072e-05, + "loss": 0.3178, + "num_input_tokens_seen": 828936, + "step": 4340 + }, + { + "epoch": 2.258316008316008, + "grad_norm": 0.43089884519577026, + "learning_rate": 4.762885789818473e-05, + "loss": 0.2906, + "num_input_tokens_seen": 829832, + "step": 4345 + }, + { + "epoch": 2.260914760914761, + "grad_norm": 0.470176637172699, + "learning_rate": 4.7619208386018455e-05, + "loss": 0.3154, + "num_input_tokens_seen": 830824, + "step": 4350 + }, + { + "epoch": 2.2635135135135136, + "grad_norm": 1.011785864830017, + "learning_rate": 4.760954026061241e-05, + "loss": 0.2728, + "num_input_tokens_seen": 831752, + "step": 4355 + }, + { + "epoch": 2.266112266112266, + "grad_norm": 0.36527371406555176, + "learning_rate": 4.759985352992245e-05, + "loss": 0.2561, + "num_input_tokens_seen": 832744, + "step": 4360 + }, + { + "epoch": 2.2687110187110187, + "grad_norm": 0.9806233048439026, + "learning_rate": 4.759014820191975e-05, + "loss": 0.2888, + "num_input_tokens_seen": 833704, + "step": 4365 + }, + { + "epoch": 2.271309771309771, + "grad_norm": 0.9077515602111816, + "learning_rate": 4.758042428459078e-05, + "loss": 0.2652, + "num_input_tokens_seen": 834632, + "step": 4370 + }, + { + "epoch": 2.2739085239085237, + "grad_norm": 0.5495890378952026, + "learning_rate": 4.75706817859373e-05, + "loss": 0.2631, + "num_input_tokens_seen": 835624, + "step": 4375 + }, + { + "epoch": 2.2765072765072767, + "grad_norm": 0.5224711298942566, + "learning_rate": 4.7560920713976365e-05, + "loss": 0.2647, + "num_input_tokens_seen": 836584, + "step": 4380 + }, + { + "epoch": 2.279106029106029, + "grad_norm": 0.6707485914230347, + "learning_rate": 4.7551141076740316e-05, + "loss": 0.208, + "num_input_tokens_seen": 837512, + "step": 4385 + }, + { + "epoch": 2.2817047817047817, + "grad_norm": 0.8580771088600159, + "learning_rate": 4.7541342882276775e-05, + "loss": 0.2152, + "num_input_tokens_seen": 838472, + "step": 4390 + }, + { + "epoch": 2.2843035343035343, + "grad_norm": 0.43018800020217896, + "learning_rate": 4.7531526138648616e-05, + "loss": 0.259, + "num_input_tokens_seen": 839464, + "step": 4395 + }, + { + "epoch": 2.286902286902287, + "grad_norm": 0.4959452748298645, + "learning_rate": 4.752169085393401e-05, + "loss": 0.2023, + "num_input_tokens_seen": 840456, + "step": 4400 + }, + { + "epoch": 2.2895010395010393, + "grad_norm": 0.8968576192855835, + "learning_rate": 4.751183703622636e-05, + "loss": 0.3581, + "num_input_tokens_seen": 841384, + "step": 4405 + }, + { + "epoch": 2.2920997920997923, + "grad_norm": 0.7652654051780701, + "learning_rate": 4.750196469363432e-05, + "loss": 0.2504, + "num_input_tokens_seen": 842344, + "step": 4410 + }, + { + "epoch": 2.294698544698545, + "grad_norm": 0.3006027936935425, + "learning_rate": 4.74920738342818e-05, + "loss": 0.3025, + "num_input_tokens_seen": 843240, + "step": 4415 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.5633500814437866, + "learning_rate": 4.748216446630794e-05, + "loss": 0.3084, + "num_input_tokens_seen": 844200, + "step": 4420 + }, + { + "epoch": 2.29989604989605, + "grad_norm": 0.822457492351532, + "learning_rate": 4.747223659786711e-05, + "loss": 0.3064, + "num_input_tokens_seen": 845160, + "step": 4425 + }, + { + "epoch": 2.3024948024948024, + "grad_norm": 0.5318806171417236, + "learning_rate": 4.74622902371289e-05, + "loss": 0.281, + "num_input_tokens_seen": 846088, + "step": 4430 + }, + { + "epoch": 2.305093555093555, + "grad_norm": 0.5538321137428284, + "learning_rate": 4.7452325392278144e-05, + "loss": 0.3106, + "num_input_tokens_seen": 847016, + "step": 4435 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.967134952545166, + "learning_rate": 4.744234207151484e-05, + "loss": 0.2977, + "num_input_tokens_seen": 847976, + "step": 4440 + }, + { + "epoch": 2.3102910602910605, + "grad_norm": 1.2764509916305542, + "learning_rate": 4.743234028305424e-05, + "loss": 0.2575, + "num_input_tokens_seen": 848936, + "step": 4445 + }, + { + "epoch": 2.312889812889813, + "grad_norm": 0.5660247802734375, + "learning_rate": 4.742232003512675e-05, + "loss": 0.32, + "num_input_tokens_seen": 850024, + "step": 4450 + }, + { + "epoch": 2.3154885654885655, + "grad_norm": 0.4603048861026764, + "learning_rate": 4.741228133597799e-05, + "loss": 0.2616, + "num_input_tokens_seen": 850984, + "step": 4455 + }, + { + "epoch": 2.318087318087318, + "grad_norm": 0.749518096446991, + "learning_rate": 4.740222419386876e-05, + "loss": 0.255, + "num_input_tokens_seen": 851944, + "step": 4460 + }, + { + "epoch": 2.3206860706860706, + "grad_norm": 0.7117644548416138, + "learning_rate": 4.7392148617075036e-05, + "loss": 0.2123, + "num_input_tokens_seen": 852904, + "step": 4465 + }, + { + "epoch": 2.323284823284823, + "grad_norm": 0.7945102453231812, + "learning_rate": 4.738205461388796e-05, + "loss": 0.3115, + "num_input_tokens_seen": 853896, + "step": 4470 + }, + { + "epoch": 2.3258835758835756, + "grad_norm": 0.7686260938644409, + "learning_rate": 4.7371942192613836e-05, + "loss": 0.2909, + "num_input_tokens_seen": 854824, + "step": 4475 + }, + { + "epoch": 2.3284823284823286, + "grad_norm": 0.5958372950553894, + "learning_rate": 4.7361811361574134e-05, + "loss": 0.2632, + "num_input_tokens_seen": 855720, + "step": 4480 + }, + { + "epoch": 2.331081081081081, + "grad_norm": 0.8689013123512268, + "learning_rate": 4.735166212910547e-05, + "loss": 0.3126, + "num_input_tokens_seen": 856648, + "step": 4485 + }, + { + "epoch": 2.3336798336798337, + "grad_norm": 1.1813690662384033, + "learning_rate": 4.7341494503559594e-05, + "loss": 0.277, + "num_input_tokens_seen": 857608, + "step": 4490 + }, + { + "epoch": 2.336278586278586, + "grad_norm": 0.5662037134170532, + "learning_rate": 4.73313084933034e-05, + "loss": 0.2802, + "num_input_tokens_seen": 858536, + "step": 4495 + }, + { + "epoch": 2.3388773388773387, + "grad_norm": 0.569011390209198, + "learning_rate": 4.7321104106718895e-05, + "loss": 0.292, + "num_input_tokens_seen": 859560, + "step": 4500 + }, + { + "epoch": 2.3414760914760917, + "grad_norm": 0.5877837538719177, + "learning_rate": 4.731088135220324e-05, + "loss": 0.327, + "num_input_tokens_seen": 860456, + "step": 4505 + }, + { + "epoch": 2.3440748440748442, + "grad_norm": 1.0624452829360962, + "learning_rate": 4.730064023816867e-05, + "loss": 0.2658, + "num_input_tokens_seen": 861384, + "step": 4510 + }, + { + "epoch": 2.3466735966735968, + "grad_norm": 0.6704313158988953, + "learning_rate": 4.7290380773042575e-05, + "loss": 0.2612, + "num_input_tokens_seen": 862280, + "step": 4515 + }, + { + "epoch": 2.3492723492723493, + "grad_norm": 0.5709783434867859, + "learning_rate": 4.72801029652674e-05, + "loss": 0.2672, + "num_input_tokens_seen": 863240, + "step": 4520 + }, + { + "epoch": 2.351871101871102, + "grad_norm": 0.6644476056098938, + "learning_rate": 4.726980682330071e-05, + "loss": 0.2311, + "num_input_tokens_seen": 864200, + "step": 4525 + }, + { + "epoch": 2.3544698544698544, + "grad_norm": 0.6655598282814026, + "learning_rate": 4.725949235561516e-05, + "loss": 0.2503, + "num_input_tokens_seen": 865128, + "step": 4530 + }, + { + "epoch": 2.357068607068607, + "grad_norm": 0.297323614358902, + "learning_rate": 4.7249159570698466e-05, + "loss": 0.2823, + "num_input_tokens_seen": 866152, + "step": 4535 + }, + { + "epoch": 2.35966735966736, + "grad_norm": 0.6980193853378296, + "learning_rate": 4.723880847705343e-05, + "loss": 0.2892, + "num_input_tokens_seen": 867048, + "step": 4540 + }, + { + "epoch": 2.3622661122661124, + "grad_norm": 0.5317457914352417, + "learning_rate": 4.722843908319792e-05, + "loss": 0.3084, + "num_input_tokens_seen": 868008, + "step": 4545 + }, + { + "epoch": 2.364864864864865, + "grad_norm": 0.25813841819763184, + "learning_rate": 4.721805139766486e-05, + "loss": 0.2295, + "num_input_tokens_seen": 868968, + "step": 4550 + }, + { + "epoch": 2.3674636174636174, + "grad_norm": 0.6290224194526672, + "learning_rate": 4.7207645429002226e-05, + "loss": 0.2726, + "num_input_tokens_seen": 869960, + "step": 4555 + }, + { + "epoch": 2.37006237006237, + "grad_norm": 0.565205454826355, + "learning_rate": 4.719722118577305e-05, + "loss": 0.2436, + "num_input_tokens_seen": 870952, + "step": 4560 + }, + { + "epoch": 2.3726611226611225, + "grad_norm": 0.09687777608633041, + "learning_rate": 4.718677867655538e-05, + "loss": 0.2345, + "num_input_tokens_seen": 871880, + "step": 4565 + }, + { + "epoch": 2.375259875259875, + "grad_norm": 0.6381661891937256, + "learning_rate": 4.717631790994231e-05, + "loss": 0.3214, + "num_input_tokens_seen": 872840, + "step": 4570 + }, + { + "epoch": 2.377858627858628, + "grad_norm": 0.26941829919815063, + "learning_rate": 4.716583889454197e-05, + "loss": 0.2543, + "num_input_tokens_seen": 873800, + "step": 4575 + }, + { + "epoch": 2.3804573804573805, + "grad_norm": 0.5901310443878174, + "learning_rate": 4.7155341638977475e-05, + "loss": 0.2444, + "num_input_tokens_seen": 874760, + "step": 4580 + }, + { + "epoch": 2.383056133056133, + "grad_norm": 0.8718878626823425, + "learning_rate": 4.714482615188697e-05, + "loss": 0.293, + "num_input_tokens_seen": 875656, + "step": 4585 + }, + { + "epoch": 2.3856548856548856, + "grad_norm": 0.19639089703559875, + "learning_rate": 4.71342924419236e-05, + "loss": 0.2432, + "num_input_tokens_seen": 876584, + "step": 4590 + }, + { + "epoch": 2.388253638253638, + "grad_norm": 1.2176142930984497, + "learning_rate": 4.712374051775551e-05, + "loss": 0.2401, + "num_input_tokens_seen": 877544, + "step": 4595 + }, + { + "epoch": 2.390852390852391, + "grad_norm": 0.7186681032180786, + "learning_rate": 4.7113170388065833e-05, + "loss": 0.3281, + "num_input_tokens_seen": 878472, + "step": 4600 + }, + { + "epoch": 2.3934511434511436, + "grad_norm": 0.4756167232990265, + "learning_rate": 4.710258206155266e-05, + "loss": 0.2369, + "num_input_tokens_seen": 879432, + "step": 4605 + }, + { + "epoch": 2.396049896049896, + "grad_norm": 0.5078365206718445, + "learning_rate": 4.7091975546929093e-05, + "loss": 0.2622, + "num_input_tokens_seen": 880328, + "step": 4610 + }, + { + "epoch": 2.3986486486486487, + "grad_norm": 0.5073776245117188, + "learning_rate": 4.7081350852923177e-05, + "loss": 0.253, + "num_input_tokens_seen": 881256, + "step": 4615 + }, + { + "epoch": 2.401247401247401, + "grad_norm": 0.8220692873001099, + "learning_rate": 4.707070798827792e-05, + "loss": 0.3292, + "num_input_tokens_seen": 882216, + "step": 4620 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 0.9538479447364807, + "learning_rate": 4.7060046961751294e-05, + "loss": 0.2322, + "num_input_tokens_seen": 883208, + "step": 4625 + }, + { + "epoch": 2.4064449064449063, + "grad_norm": 0.9834445118904114, + "learning_rate": 4.704936778211619e-05, + "loss": 0.2916, + "num_input_tokens_seen": 884200, + "step": 4630 + }, + { + "epoch": 2.4090436590436592, + "grad_norm": 0.8373116850852966, + "learning_rate": 4.703867045816047e-05, + "loss": 0.2902, + "num_input_tokens_seen": 885192, + "step": 4635 + }, + { + "epoch": 2.4116424116424118, + "grad_norm": 0.9210081696510315, + "learning_rate": 4.702795499868691e-05, + "loss": 0.257, + "num_input_tokens_seen": 886152, + "step": 4640 + }, + { + "epoch": 2.4142411642411643, + "grad_norm": 1.063492774963379, + "learning_rate": 4.70172214125132e-05, + "loss": 0.2974, + "num_input_tokens_seen": 887144, + "step": 4645 + }, + { + "epoch": 2.416839916839917, + "grad_norm": 0.21289251744747162, + "learning_rate": 4.700646970847197e-05, + "loss": 0.2452, + "num_input_tokens_seen": 888136, + "step": 4650 + }, + { + "epoch": 2.4194386694386694, + "grad_norm": 0.930168092250824, + "learning_rate": 4.699569989541074e-05, + "loss": 0.3337, + "num_input_tokens_seen": 889064, + "step": 4655 + }, + { + "epoch": 2.422037422037422, + "grad_norm": 0.4038194417953491, + "learning_rate": 4.6984911982191936e-05, + "loss": 0.2923, + "num_input_tokens_seen": 890024, + "step": 4660 + }, + { + "epoch": 2.4246361746361744, + "grad_norm": 0.608410120010376, + "learning_rate": 4.6974105977692884e-05, + "loss": 0.3098, + "num_input_tokens_seen": 891048, + "step": 4665 + }, + { + "epoch": 2.4272349272349274, + "grad_norm": 0.3455858528614044, + "learning_rate": 4.69632818908058e-05, + "loss": 0.299, + "num_input_tokens_seen": 891944, + "step": 4670 + }, + { + "epoch": 2.42983367983368, + "grad_norm": 1.027152180671692, + "learning_rate": 4.695243973043777e-05, + "loss": 0.2793, + "num_input_tokens_seen": 892936, + "step": 4675 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.4390454590320587, + "learning_rate": 4.694157950551075e-05, + "loss": 0.3129, + "num_input_tokens_seen": 893896, + "step": 4680 + }, + { + "epoch": 2.435031185031185, + "grad_norm": 0.5482154488563538, + "learning_rate": 4.6930701224961573e-05, + "loss": 0.2895, + "num_input_tokens_seen": 894792, + "step": 4685 + }, + { + "epoch": 2.4376299376299375, + "grad_norm": 0.7580490112304688, + "learning_rate": 4.6919804897741925e-05, + "loss": 0.2288, + "num_input_tokens_seen": 895784, + "step": 4690 + }, + { + "epoch": 2.44022869022869, + "grad_norm": 0.535269021987915, + "learning_rate": 4.690889053281834e-05, + "loss": 0.2148, + "num_input_tokens_seen": 896712, + "step": 4695 + }, + { + "epoch": 2.442827442827443, + "grad_norm": 0.30111661553382874, + "learning_rate": 4.68979581391722e-05, + "loss": 0.1792, + "num_input_tokens_seen": 897672, + "step": 4700 + }, + { + "epoch": 2.4454261954261955, + "grad_norm": 0.4973357915878296, + "learning_rate": 4.688700772579972e-05, + "loss": 0.3779, + "num_input_tokens_seen": 898568, + "step": 4705 + }, + { + "epoch": 2.448024948024948, + "grad_norm": 0.5144957900047302, + "learning_rate": 4.687603930171194e-05, + "loss": 0.2975, + "num_input_tokens_seen": 899592, + "step": 4710 + }, + { + "epoch": 2.4506237006237006, + "grad_norm": 0.6291283965110779, + "learning_rate": 4.686505287593473e-05, + "loss": 0.2524, + "num_input_tokens_seen": 900552, + "step": 4715 + }, + { + "epoch": 2.453222453222453, + "grad_norm": 0.7620900869369507, + "learning_rate": 4.685404845750877e-05, + "loss": 0.2592, + "num_input_tokens_seen": 901416, + "step": 4720 + }, + { + "epoch": 2.4558212058212057, + "grad_norm": 0.5049926042556763, + "learning_rate": 4.6843026055489524e-05, + "loss": 0.295, + "num_input_tokens_seen": 902440, + "step": 4725 + }, + { + "epoch": 2.4584199584199586, + "grad_norm": 0.43582355976104736, + "learning_rate": 4.6831985678947296e-05, + "loss": 0.2131, + "num_input_tokens_seen": 903432, + "step": 4730 + }, + { + "epoch": 2.461018711018711, + "grad_norm": 0.3928908407688141, + "learning_rate": 4.682092733696716e-05, + "loss": 0.3402, + "num_input_tokens_seen": 904424, + "step": 4735 + }, + { + "epoch": 2.4636174636174637, + "grad_norm": 0.324194997549057, + "learning_rate": 4.680985103864896e-05, + "loss": 0.2782, + "num_input_tokens_seen": 905352, + "step": 4740 + }, + { + "epoch": 2.4662162162162162, + "grad_norm": 0.935611367225647, + "learning_rate": 4.679875679310734e-05, + "loss": 0.2461, + "num_input_tokens_seen": 906248, + "step": 4745 + }, + { + "epoch": 2.4688149688149688, + "grad_norm": 0.5176529884338379, + "learning_rate": 4.67876446094717e-05, + "loss": 0.3483, + "num_input_tokens_seen": 907176, + "step": 4750 + }, + { + "epoch": 2.4714137214137213, + "grad_norm": 0.9352943301200867, + "learning_rate": 4.677651449688619e-05, + "loss": 0.2661, + "num_input_tokens_seen": 908104, + "step": 4755 + }, + { + "epoch": 2.474012474012474, + "grad_norm": 0.7361578941345215, + "learning_rate": 4.676536646450975e-05, + "loss": 0.2695, + "num_input_tokens_seen": 909096, + "step": 4760 + }, + { + "epoch": 2.476611226611227, + "grad_norm": 0.29520857334136963, + "learning_rate": 4.675420052151603e-05, + "loss": 0.2767, + "num_input_tokens_seen": 910120, + "step": 4765 + }, + { + "epoch": 2.4792099792099793, + "grad_norm": 1.3667218685150146, + "learning_rate": 4.674301667709343e-05, + "loss": 0.3152, + "num_input_tokens_seen": 911144, + "step": 4770 + }, + { + "epoch": 2.481808731808732, + "grad_norm": 0.2163599282503128, + "learning_rate": 4.673181494044509e-05, + "loss": 0.3715, + "num_input_tokens_seen": 912072, + "step": 4775 + }, + { + "epoch": 2.4844074844074844, + "grad_norm": 0.7388083934783936, + "learning_rate": 4.672059532078886e-05, + "loss": 0.2541, + "num_input_tokens_seen": 913064, + "step": 4780 + }, + { + "epoch": 2.487006237006237, + "grad_norm": 1.1265802383422852, + "learning_rate": 4.670935782735732e-05, + "loss": 0.264, + "num_input_tokens_seen": 913960, + "step": 4785 + }, + { + "epoch": 2.4896049896049894, + "grad_norm": 0.36076247692108154, + "learning_rate": 4.669810246939774e-05, + "loss": 0.2994, + "num_input_tokens_seen": 914888, + "step": 4790 + }, + { + "epoch": 2.492203742203742, + "grad_norm": 0.7501949667930603, + "learning_rate": 4.668682925617211e-05, + "loss": 0.2239, + "num_input_tokens_seen": 915752, + "step": 4795 + }, + { + "epoch": 2.494802494802495, + "grad_norm": 0.557384192943573, + "learning_rate": 4.6675538196957096e-05, + "loss": 0.1969, + "num_input_tokens_seen": 916648, + "step": 4800 + }, + { + "epoch": 2.4974012474012475, + "grad_norm": 1.334924340248108, + "learning_rate": 4.666422930104406e-05, + "loss": 0.3207, + "num_input_tokens_seen": 917640, + "step": 4805 + }, + { + "epoch": 2.5, + "grad_norm": 0.5488286018371582, + "learning_rate": 4.665290257773904e-05, + "loss": 0.2629, + "num_input_tokens_seen": 918600, + "step": 4810 + }, + { + "epoch": 2.5, + "eval_loss": 0.2610861659049988, + "eval_runtime": 7.9549, + "eval_samples_per_second": 107.606, + "eval_steps_per_second": 26.902, + "num_input_tokens_seen": 918600, + "step": 4810 + }, + { + "epoch": 2.5025987525987525, + "grad_norm": 0.456437885761261, + "learning_rate": 4.6641558036362754e-05, + "loss": 0.4005, + "num_input_tokens_seen": 919592, + "step": 4815 + }, + { + "epoch": 2.505197505197505, + "grad_norm": 0.2799813449382782, + "learning_rate": 4.663019568625055e-05, + "loss": 0.2978, + "num_input_tokens_seen": 920520, + "step": 4820 + }, + { + "epoch": 2.507796257796258, + "grad_norm": 1.005923867225647, + "learning_rate": 4.661881553675247e-05, + "loss": 0.2612, + "num_input_tokens_seen": 921544, + "step": 4825 + }, + { + "epoch": 2.51039501039501, + "grad_norm": 0.5326134562492371, + "learning_rate": 4.660741759723317e-05, + "loss": 0.3073, + "num_input_tokens_seen": 922504, + "step": 4830 + }, + { + "epoch": 2.512993762993763, + "grad_norm": 0.5496788620948792, + "learning_rate": 4.6596001877071976e-05, + "loss": 0.2596, + "num_input_tokens_seen": 923464, + "step": 4835 + }, + { + "epoch": 2.5155925155925156, + "grad_norm": 0.306947261095047, + "learning_rate": 4.658456838566282e-05, + "loss": 0.2792, + "num_input_tokens_seen": 924456, + "step": 4840 + }, + { + "epoch": 2.518191268191268, + "grad_norm": 0.8909977674484253, + "learning_rate": 4.657311713241427e-05, + "loss": 0.3105, + "num_input_tokens_seen": 925448, + "step": 4845 + }, + { + "epoch": 2.5207900207900207, + "grad_norm": 0.516859233379364, + "learning_rate": 4.656164812674951e-05, + "loss": 0.2548, + "num_input_tokens_seen": 926408, + "step": 4850 + }, + { + "epoch": 2.523388773388773, + "grad_norm": 0.858291506767273, + "learning_rate": 4.655016137810634e-05, + "loss": 0.2853, + "num_input_tokens_seen": 927272, + "step": 4855 + }, + { + "epoch": 2.525987525987526, + "grad_norm": 0.6144601106643677, + "learning_rate": 4.6538656895937135e-05, + "loss": 0.3247, + "num_input_tokens_seen": 928168, + "step": 4860 + }, + { + "epoch": 2.5285862785862787, + "grad_norm": 0.34840139746665955, + "learning_rate": 4.652713468970888e-05, + "loss": 0.243, + "num_input_tokens_seen": 929096, + "step": 4865 + }, + { + "epoch": 2.5311850311850312, + "grad_norm": 0.7707224488258362, + "learning_rate": 4.651559476890315e-05, + "loss": 0.2048, + "num_input_tokens_seen": 930024, + "step": 4870 + }, + { + "epoch": 2.5337837837837838, + "grad_norm": 0.29072585701942444, + "learning_rate": 4.65040371430161e-05, + "loss": 0.2994, + "num_input_tokens_seen": 931016, + "step": 4875 + }, + { + "epoch": 2.5363825363825363, + "grad_norm": 0.8378693461418152, + "learning_rate": 4.6492461821558434e-05, + "loss": 0.3128, + "num_input_tokens_seen": 931944, + "step": 4880 + }, + { + "epoch": 2.538981288981289, + "grad_norm": 0.783228874206543, + "learning_rate": 4.6480868814055424e-05, + "loss": 0.2438, + "num_input_tokens_seen": 932840, + "step": 4885 + }, + { + "epoch": 2.5415800415800414, + "grad_norm": 0.3165428936481476, + "learning_rate": 4.646925813004691e-05, + "loss": 0.3024, + "num_input_tokens_seen": 933768, + "step": 4890 + }, + { + "epoch": 2.5441787941787943, + "grad_norm": 0.6073763370513916, + "learning_rate": 4.645762977908728e-05, + "loss": 0.2807, + "num_input_tokens_seen": 934696, + "step": 4895 + }, + { + "epoch": 2.546777546777547, + "grad_norm": 0.349008172750473, + "learning_rate": 4.644598377074543e-05, + "loss": 0.2755, + "num_input_tokens_seen": 935656, + "step": 4900 + }, + { + "epoch": 2.5493762993762994, + "grad_norm": 0.48542386293411255, + "learning_rate": 4.64343201146048e-05, + "loss": 0.2378, + "num_input_tokens_seen": 936616, + "step": 4905 + }, + { + "epoch": 2.551975051975052, + "grad_norm": 0.7747911214828491, + "learning_rate": 4.642263882026339e-05, + "loss": 0.2238, + "num_input_tokens_seen": 937608, + "step": 4910 + }, + { + "epoch": 2.5545738045738045, + "grad_norm": 0.5993784666061401, + "learning_rate": 4.6410939897333646e-05, + "loss": 0.31, + "num_input_tokens_seen": 938568, + "step": 4915 + }, + { + "epoch": 2.5571725571725574, + "grad_norm": 0.5138455629348755, + "learning_rate": 4.639922335544258e-05, + "loss": 0.1762, + "num_input_tokens_seen": 939528, + "step": 4920 + }, + { + "epoch": 2.5597713097713095, + "grad_norm": 0.5150822401046753, + "learning_rate": 4.638748920423167e-05, + "loss": 0.2844, + "num_input_tokens_seen": 940488, + "step": 4925 + }, + { + "epoch": 2.5623700623700625, + "grad_norm": 0.2948455810546875, + "learning_rate": 4.637573745335691e-05, + "loss": 0.2908, + "num_input_tokens_seen": 941416, + "step": 4930 + }, + { + "epoch": 2.564968814968815, + "grad_norm": 0.5366185307502747, + "learning_rate": 4.636396811248872e-05, + "loss": 0.3569, + "num_input_tokens_seen": 942312, + "step": 4935 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.4963361620903015, + "learning_rate": 4.635218119131207e-05, + "loss": 0.2357, + "num_input_tokens_seen": 943240, + "step": 4940 + }, + { + "epoch": 2.57016632016632, + "grad_norm": 0.44492506980895996, + "learning_rate": 4.6340376699526356e-05, + "loss": 0.251, + "num_input_tokens_seen": 944200, + "step": 4945 + }, + { + "epoch": 2.5727650727650726, + "grad_norm": 0.3754466772079468, + "learning_rate": 4.6328554646845434e-05, + "loss": 0.3475, + "num_input_tokens_seen": 945192, + "step": 4950 + }, + { + "epoch": 2.5753638253638256, + "grad_norm": 0.4711630940437317, + "learning_rate": 4.631671504299762e-05, + "loss": 0.2916, + "num_input_tokens_seen": 946152, + "step": 4955 + }, + { + "epoch": 2.577962577962578, + "grad_norm": 0.3240714967250824, + "learning_rate": 4.6304857897725653e-05, + "loss": 0.2716, + "num_input_tokens_seen": 947176, + "step": 4960 + }, + { + "epoch": 2.5805613305613306, + "grad_norm": 0.5127665996551514, + "learning_rate": 4.629298322078674e-05, + "loss": 0.2556, + "num_input_tokens_seen": 948104, + "step": 4965 + }, + { + "epoch": 2.583160083160083, + "grad_norm": 0.6877671480178833, + "learning_rate": 4.628109102195249e-05, + "loss": 0.2129, + "num_input_tokens_seen": 949064, + "step": 4970 + }, + { + "epoch": 2.5857588357588357, + "grad_norm": 0.5175995230674744, + "learning_rate": 4.626918131100894e-05, + "loss": 0.2206, + "num_input_tokens_seen": 950024, + "step": 4975 + }, + { + "epoch": 2.5883575883575882, + "grad_norm": 0.4945067763328552, + "learning_rate": 4.625725409775652e-05, + "loss": 0.3115, + "num_input_tokens_seen": 950984, + "step": 4980 + }, + { + "epoch": 2.5909563409563408, + "grad_norm": 1.1585469245910645, + "learning_rate": 4.6245309392010094e-05, + "loss": 0.282, + "num_input_tokens_seen": 951880, + "step": 4985 + }, + { + "epoch": 2.5935550935550937, + "grad_norm": 0.6801390647888184, + "learning_rate": 4.6233347203598896e-05, + "loss": 0.3451, + "num_input_tokens_seen": 952936, + "step": 4990 + }, + { + "epoch": 2.5961538461538463, + "grad_norm": 0.9685271382331848, + "learning_rate": 4.622136754236657e-05, + "loss": 0.2218, + "num_input_tokens_seen": 953896, + "step": 4995 + }, + { + "epoch": 2.598752598752599, + "grad_norm": 0.4521665573120117, + "learning_rate": 4.62093704181711e-05, + "loss": 0.278, + "num_input_tokens_seen": 954792, + "step": 5000 + }, + { + "epoch": 2.6013513513513513, + "grad_norm": 1.2031599283218384, + "learning_rate": 4.619735584088487e-05, + "loss": 0.2712, + "num_input_tokens_seen": 955752, + "step": 5005 + }, + { + "epoch": 2.603950103950104, + "grad_norm": 1.0106712579727173, + "learning_rate": 4.618532382039463e-05, + "loss": 0.2932, + "num_input_tokens_seen": 956712, + "step": 5010 + }, + { + "epoch": 2.606548856548857, + "grad_norm": 0.7452159523963928, + "learning_rate": 4.6173274366601466e-05, + "loss": 0.2664, + "num_input_tokens_seen": 957640, + "step": 5015 + }, + { + "epoch": 2.609147609147609, + "grad_norm": 0.7387436032295227, + "learning_rate": 4.616120748942081e-05, + "loss": 0.2846, + "num_input_tokens_seen": 958632, + "step": 5020 + }, + { + "epoch": 2.611746361746362, + "grad_norm": 0.2772707939147949, + "learning_rate": 4.614912319878244e-05, + "loss": 0.3036, + "num_input_tokens_seen": 959592, + "step": 5025 + }, + { + "epoch": 2.6143451143451144, + "grad_norm": 0.7046545743942261, + "learning_rate": 4.6137021504630476e-05, + "loss": 0.2438, + "num_input_tokens_seen": 960520, + "step": 5030 + }, + { + "epoch": 2.616943866943867, + "grad_norm": 0.5565292835235596, + "learning_rate": 4.612490241692332e-05, + "loss": 0.2813, + "num_input_tokens_seen": 961512, + "step": 5035 + }, + { + "epoch": 2.6195426195426195, + "grad_norm": 0.09657760709524155, + "learning_rate": 4.611276594563374e-05, + "loss": 0.2681, + "num_input_tokens_seen": 962472, + "step": 5040 + }, + { + "epoch": 2.622141372141372, + "grad_norm": 0.48732906579971313, + "learning_rate": 4.6100612100748765e-05, + "loss": 0.309, + "num_input_tokens_seen": 963336, + "step": 5045 + }, + { + "epoch": 2.624740124740125, + "grad_norm": 1.2157529592514038, + "learning_rate": 4.608844089226974e-05, + "loss": 0.2914, + "num_input_tokens_seen": 964296, + "step": 5050 + }, + { + "epoch": 2.6273388773388775, + "grad_norm": 0.22283151745796204, + "learning_rate": 4.607625233021228e-05, + "loss": 0.301, + "num_input_tokens_seen": 965256, + "step": 5055 + }, + { + "epoch": 2.62993762993763, + "grad_norm": 0.5035054087638855, + "learning_rate": 4.6064046424606324e-05, + "loss": 0.2909, + "num_input_tokens_seen": 966248, + "step": 5060 + }, + { + "epoch": 2.6325363825363826, + "grad_norm": 0.37133240699768066, + "learning_rate": 4.605182318549602e-05, + "loss": 0.249, + "num_input_tokens_seen": 967272, + "step": 5065 + }, + { + "epoch": 2.635135135135135, + "grad_norm": 0.5307822227478027, + "learning_rate": 4.6039582622939854e-05, + "loss": 0.3258, + "num_input_tokens_seen": 968296, + "step": 5070 + }, + { + "epoch": 2.6377338877338876, + "grad_norm": 0.42061692476272583, + "learning_rate": 4.602732474701049e-05, + "loss": 0.2429, + "num_input_tokens_seen": 969224, + "step": 5075 + }, + { + "epoch": 2.64033264033264, + "grad_norm": 0.3679790794849396, + "learning_rate": 4.60150495677949e-05, + "loss": 0.2387, + "num_input_tokens_seen": 970152, + "step": 5080 + }, + { + "epoch": 2.642931392931393, + "grad_norm": 0.8742953538894653, + "learning_rate": 4.600275709539427e-05, + "loss": 0.2689, + "num_input_tokens_seen": 971208, + "step": 5085 + }, + { + "epoch": 2.6455301455301456, + "grad_norm": 1.0316767692565918, + "learning_rate": 4.5990447339924e-05, + "loss": 0.2827, + "num_input_tokens_seen": 972136, + "step": 5090 + }, + { + "epoch": 2.648128898128898, + "grad_norm": 0.44112807512283325, + "learning_rate": 4.597812031151374e-05, + "loss": 0.1547, + "num_input_tokens_seen": 973096, + "step": 5095 + }, + { + "epoch": 2.6507276507276507, + "grad_norm": 1.4995003938674927, + "learning_rate": 4.5965776020307344e-05, + "loss": 0.3111, + "num_input_tokens_seen": 973992, + "step": 5100 + }, + { + "epoch": 2.6533264033264032, + "grad_norm": 0.7780179381370544, + "learning_rate": 4.5953414476462875e-05, + "loss": 0.2589, + "num_input_tokens_seen": 974920, + "step": 5105 + }, + { + "epoch": 2.6559251559251558, + "grad_norm": 0.4005117118358612, + "learning_rate": 4.594103569015258e-05, + "loss": 0.2487, + "num_input_tokens_seen": 975816, + "step": 5110 + }, + { + "epoch": 2.6585239085239083, + "grad_norm": 0.44125401973724365, + "learning_rate": 4.592863967156291e-05, + "loss": 0.2344, + "num_input_tokens_seen": 976712, + "step": 5115 + }, + { + "epoch": 2.6611226611226613, + "grad_norm": 0.3813053071498871, + "learning_rate": 4.5916226430894494e-05, + "loss": 0.2327, + "num_input_tokens_seen": 977704, + "step": 5120 + }, + { + "epoch": 2.663721413721414, + "grad_norm": 0.27538949251174927, + "learning_rate": 4.590379597836212e-05, + "loss": 0.2716, + "num_input_tokens_seen": 978632, + "step": 5125 + }, + { + "epoch": 2.6663201663201663, + "grad_norm": 0.3745516538619995, + "learning_rate": 4.589134832419475e-05, + "loss": 0.2552, + "num_input_tokens_seen": 979592, + "step": 5130 + }, + { + "epoch": 2.668918918918919, + "grad_norm": 0.28242114186286926, + "learning_rate": 4.58788834786355e-05, + "loss": 0.2498, + "num_input_tokens_seen": 980456, + "step": 5135 + }, + { + "epoch": 2.6715176715176714, + "grad_norm": 0.6986222863197327, + "learning_rate": 4.586640145194164e-05, + "loss": 0.2351, + "num_input_tokens_seen": 981384, + "step": 5140 + }, + { + "epoch": 2.6741164241164244, + "grad_norm": 0.8920523524284363, + "learning_rate": 4.5853902254384564e-05, + "loss": 0.2481, + "num_input_tokens_seen": 982312, + "step": 5145 + }, + { + "epoch": 2.6767151767151764, + "grad_norm": 0.6890351176261902, + "learning_rate": 4.584138589624981e-05, + "loss": 0.3103, + "num_input_tokens_seen": 983272, + "step": 5150 + }, + { + "epoch": 2.6793139293139294, + "grad_norm": 0.5786080360412598, + "learning_rate": 4.582885238783702e-05, + "loss": 0.2024, + "num_input_tokens_seen": 984296, + "step": 5155 + }, + { + "epoch": 2.681912681912682, + "grad_norm": 0.6949089169502258, + "learning_rate": 4.581630173945995e-05, + "loss": 0.2776, + "num_input_tokens_seen": 985288, + "step": 5160 + }, + { + "epoch": 2.6845114345114345, + "grad_norm": 0.39905160665512085, + "learning_rate": 4.58037339614465e-05, + "loss": 0.1591, + "num_input_tokens_seen": 986216, + "step": 5165 + }, + { + "epoch": 2.687110187110187, + "grad_norm": 0.34613093733787537, + "learning_rate": 4.579114906413861e-05, + "loss": 0.2002, + "num_input_tokens_seen": 987112, + "step": 5170 + }, + { + "epoch": 2.6897089397089395, + "grad_norm": 0.30801185965538025, + "learning_rate": 4.577854705789234e-05, + "loss": 0.1841, + "num_input_tokens_seen": 988040, + "step": 5175 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.4995872378349304, + "learning_rate": 4.576592795307782e-05, + "loss": 0.3781, + "num_input_tokens_seen": 988968, + "step": 5180 + }, + { + "epoch": 2.694906444906445, + "grad_norm": 0.8338913321495056, + "learning_rate": 4.5753291760079265e-05, + "loss": 0.2704, + "num_input_tokens_seen": 989896, + "step": 5185 + }, + { + "epoch": 2.6975051975051976, + "grad_norm": 0.5721254944801331, + "learning_rate": 4.5740638489294915e-05, + "loss": 0.1736, + "num_input_tokens_seen": 990760, + "step": 5190 + }, + { + "epoch": 2.70010395010395, + "grad_norm": 1.0354681015014648, + "learning_rate": 4.5727968151137104e-05, + "loss": 0.2209, + "num_input_tokens_seen": 991688, + "step": 5195 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.5675065517425537, + "learning_rate": 4.5715280756032184e-05, + "loss": 0.2444, + "num_input_tokens_seen": 992648, + "step": 5200 + }, + { + "epoch": 2.705301455301455, + "grad_norm": 0.3124942183494568, + "learning_rate": 4.5702576314420554e-05, + "loss": 0.3458, + "num_input_tokens_seen": 993576, + "step": 5205 + }, + { + "epoch": 2.7079002079002077, + "grad_norm": 0.28607234358787537, + "learning_rate": 4.5689854836756654e-05, + "loss": 0.2622, + "num_input_tokens_seen": 994536, + "step": 5210 + }, + { + "epoch": 2.7104989604989607, + "grad_norm": 0.5470243692398071, + "learning_rate": 4.567711633350891e-05, + "loss": 0.2333, + "num_input_tokens_seen": 995464, + "step": 5215 + }, + { + "epoch": 2.713097713097713, + "grad_norm": 0.7189806699752808, + "learning_rate": 4.5664360815159775e-05, + "loss": 0.3021, + "num_input_tokens_seen": 996456, + "step": 5220 + }, + { + "epoch": 2.7156964656964657, + "grad_norm": 0.7225502729415894, + "learning_rate": 4.5651588292205715e-05, + "loss": 0.2551, + "num_input_tokens_seen": 997384, + "step": 5225 + }, + { + "epoch": 2.7182952182952183, + "grad_norm": 0.6541319489479065, + "learning_rate": 4.5638798775157176e-05, + "loss": 0.185, + "num_input_tokens_seen": 998312, + "step": 5230 + }, + { + "epoch": 2.720893970893971, + "grad_norm": 0.3433009386062622, + "learning_rate": 4.562599227453859e-05, + "loss": 0.1771, + "num_input_tokens_seen": 999304, + "step": 5235 + }, + { + "epoch": 2.7234927234927238, + "grad_norm": 1.4416918754577637, + "learning_rate": 4.5613168800888366e-05, + "loss": 0.3699, + "num_input_tokens_seen": 1000200, + "step": 5240 + }, + { + "epoch": 2.726091476091476, + "grad_norm": 0.5422126650810242, + "learning_rate": 4.560032836475888e-05, + "loss": 0.3013, + "num_input_tokens_seen": 1001192, + "step": 5245 + }, + { + "epoch": 2.728690228690229, + "grad_norm": 0.2389257252216339, + "learning_rate": 4.5587470976716464e-05, + "loss": 0.2686, + "num_input_tokens_seen": 1002184, + "step": 5250 + }, + { + "epoch": 2.7312889812889813, + "grad_norm": 1.0520564317703247, + "learning_rate": 4.557459664734141e-05, + "loss": 0.2598, + "num_input_tokens_seen": 1003144, + "step": 5255 + }, + { + "epoch": 2.733887733887734, + "grad_norm": 0.5522788763046265, + "learning_rate": 4.556170538722794e-05, + "loss": 0.2805, + "num_input_tokens_seen": 1004168, + "step": 5260 + }, + { + "epoch": 2.7364864864864864, + "grad_norm": 0.6161827445030212, + "learning_rate": 4.55487972069842e-05, + "loss": 0.2443, + "num_input_tokens_seen": 1005096, + "step": 5265 + }, + { + "epoch": 2.739085239085239, + "grad_norm": 0.2497492879629135, + "learning_rate": 4.553587211723228e-05, + "loss": 0.2511, + "num_input_tokens_seen": 1006056, + "step": 5270 + }, + { + "epoch": 2.741683991683992, + "grad_norm": 0.5117607116699219, + "learning_rate": 4.5522930128608176e-05, + "loss": 0.2679, + "num_input_tokens_seen": 1006984, + "step": 5275 + }, + { + "epoch": 2.7442827442827444, + "grad_norm": 0.8639620542526245, + "learning_rate": 4.550997125176179e-05, + "loss": 0.2802, + "num_input_tokens_seen": 1007976, + "step": 5280 + }, + { + "epoch": 2.746881496881497, + "grad_norm": 0.6852080225944519, + "learning_rate": 4.549699549735692e-05, + "loss": 0.2241, + "num_input_tokens_seen": 1008936, + "step": 5285 + }, + { + "epoch": 2.7494802494802495, + "grad_norm": 0.6523515582084656, + "learning_rate": 4.548400287607124e-05, + "loss": 0.2718, + "num_input_tokens_seen": 1009832, + "step": 5290 + }, + { + "epoch": 2.752079002079002, + "grad_norm": 0.4111681580543518, + "learning_rate": 4.547099339859634e-05, + "loss": 0.2219, + "num_input_tokens_seen": 1010888, + "step": 5295 + }, + { + "epoch": 2.7546777546777546, + "grad_norm": 0.4608582556247711, + "learning_rate": 4.5457967075637644e-05, + "loss": 0.3127, + "num_input_tokens_seen": 1011784, + "step": 5300 + }, + { + "epoch": 2.757276507276507, + "grad_norm": 0.5167219042778015, + "learning_rate": 4.544492391791445e-05, + "loss": 0.2103, + "num_input_tokens_seen": 1012776, + "step": 5305 + }, + { + "epoch": 2.75987525987526, + "grad_norm": 1.0982921123504639, + "learning_rate": 4.543186393615991e-05, + "loss": 0.3729, + "num_input_tokens_seen": 1013672, + "step": 5310 + }, + { + "epoch": 2.7624740124740126, + "grad_norm": 0.4470478296279907, + "learning_rate": 4.5418787141121026e-05, + "loss": 0.2711, + "num_input_tokens_seen": 1014568, + "step": 5315 + }, + { + "epoch": 2.765072765072765, + "grad_norm": 0.4412657916545868, + "learning_rate": 4.540569354355864e-05, + "loss": 0.3456, + "num_input_tokens_seen": 1015560, + "step": 5320 + }, + { + "epoch": 2.7676715176715176, + "grad_norm": 0.4568749964237213, + "learning_rate": 4.539258315424739e-05, + "loss": 0.2923, + "num_input_tokens_seen": 1016488, + "step": 5325 + }, + { + "epoch": 2.77027027027027, + "grad_norm": 0.46912968158721924, + "learning_rate": 4.537945598397577e-05, + "loss": 0.2979, + "num_input_tokens_seen": 1017384, + "step": 5330 + }, + { + "epoch": 2.7728690228690227, + "grad_norm": 0.1977468729019165, + "learning_rate": 4.536631204354606e-05, + "loss": 0.2824, + "num_input_tokens_seen": 1018248, + "step": 5335 + }, + { + "epoch": 2.7754677754677752, + "grad_norm": 0.3067348897457123, + "learning_rate": 4.535315134377435e-05, + "loss": 0.2599, + "num_input_tokens_seen": 1019176, + "step": 5340 + }, + { + "epoch": 2.778066528066528, + "grad_norm": 0.5761695504188538, + "learning_rate": 4.533997389549052e-05, + "loss": 0.2434, + "num_input_tokens_seen": 1020200, + "step": 5345 + }, + { + "epoch": 2.7806652806652807, + "grad_norm": 0.6203577518463135, + "learning_rate": 4.532677970953821e-05, + "loss": 0.3147, + "num_input_tokens_seen": 1021128, + "step": 5350 + }, + { + "epoch": 2.7832640332640333, + "grad_norm": 0.5864941477775574, + "learning_rate": 4.531356879677488e-05, + "loss": 0.3354, + "num_input_tokens_seen": 1022088, + "step": 5355 + }, + { + "epoch": 2.785862785862786, + "grad_norm": 1.0227410793304443, + "learning_rate": 4.530034116807172e-05, + "loss": 0.27, + "num_input_tokens_seen": 1023016, + "step": 5360 + }, + { + "epoch": 2.7884615384615383, + "grad_norm": 0.7428109645843506, + "learning_rate": 4.528709683431368e-05, + "loss": 0.2315, + "num_input_tokens_seen": 1023976, + "step": 5365 + }, + { + "epoch": 2.7910602910602913, + "grad_norm": 0.2394358217716217, + "learning_rate": 4.527383580639946e-05, + "loss": 0.2485, + "num_input_tokens_seen": 1024936, + "step": 5370 + }, + { + "epoch": 2.7936590436590434, + "grad_norm": 0.23326103389263153, + "learning_rate": 4.526055809524149e-05, + "loss": 0.2392, + "num_input_tokens_seen": 1025864, + "step": 5375 + }, + { + "epoch": 2.7962577962577964, + "grad_norm": 0.6290645003318787, + "learning_rate": 4.524726371176594e-05, + "loss": 0.2956, + "num_input_tokens_seen": 1026824, + "step": 5380 + }, + { + "epoch": 2.798856548856549, + "grad_norm": 0.08675947785377502, + "learning_rate": 4.52339526669127e-05, + "loss": 0.2474, + "num_input_tokens_seen": 1027720, + "step": 5385 + }, + { + "epoch": 2.8014553014553014, + "grad_norm": 0.8751652836799622, + "learning_rate": 4.522062497163538e-05, + "loss": 0.3447, + "num_input_tokens_seen": 1028584, + "step": 5390 + }, + { + "epoch": 2.804054054054054, + "grad_norm": 0.1955094039440155, + "learning_rate": 4.5207280636901246e-05, + "loss": 0.2393, + "num_input_tokens_seen": 1029480, + "step": 5395 + }, + { + "epoch": 2.8066528066528065, + "grad_norm": 0.7332314252853394, + "learning_rate": 4.519391967369131e-05, + "loss": 0.2686, + "num_input_tokens_seen": 1030440, + "step": 5400 + }, + { + "epoch": 2.8092515592515594, + "grad_norm": 0.1494261771440506, + "learning_rate": 4.5180542093000234e-05, + "loss": 0.2476, + "num_input_tokens_seen": 1031400, + "step": 5405 + }, + { + "epoch": 2.811850311850312, + "grad_norm": 0.22501179575920105, + "learning_rate": 4.516714790583637e-05, + "loss": 0.249, + "num_input_tokens_seen": 1032392, + "step": 5410 + }, + { + "epoch": 2.8144490644490645, + "grad_norm": 0.32134851813316345, + "learning_rate": 4.515373712322174e-05, + "loss": 0.2502, + "num_input_tokens_seen": 1033256, + "step": 5415 + }, + { + "epoch": 2.817047817047817, + "grad_norm": 0.6391826272010803, + "learning_rate": 4.5140309756192e-05, + "loss": 0.197, + "num_input_tokens_seen": 1034248, + "step": 5420 + }, + { + "epoch": 2.8196465696465696, + "grad_norm": 0.6987336874008179, + "learning_rate": 4.5126865815796474e-05, + "loss": 0.3878, + "num_input_tokens_seen": 1035272, + "step": 5425 + }, + { + "epoch": 2.822245322245322, + "grad_norm": 0.2990476191043854, + "learning_rate": 4.511340531309812e-05, + "loss": 0.1568, + "num_input_tokens_seen": 1036264, + "step": 5430 + }, + { + "epoch": 2.8248440748440746, + "grad_norm": 0.5897757411003113, + "learning_rate": 4.5099928259173516e-05, + "loss": 0.2468, + "num_input_tokens_seen": 1037160, + "step": 5435 + }, + { + "epoch": 2.8274428274428276, + "grad_norm": 0.37370580434799194, + "learning_rate": 4.5086434665112864e-05, + "loss": 0.2792, + "num_input_tokens_seen": 1038152, + "step": 5440 + }, + { + "epoch": 2.83004158004158, + "grad_norm": 0.3158928155899048, + "learning_rate": 4.507292454201999e-05, + "loss": 0.2446, + "num_input_tokens_seen": 1039112, + "step": 5445 + }, + { + "epoch": 2.8326403326403327, + "grad_norm": 0.5866453647613525, + "learning_rate": 4.50593979010123e-05, + "loss": 0.3112, + "num_input_tokens_seen": 1040072, + "step": 5450 + }, + { + "epoch": 2.835239085239085, + "grad_norm": 0.35597407817840576, + "learning_rate": 4.5045854753220805e-05, + "loss": 0.3315, + "num_input_tokens_seen": 1041000, + "step": 5455 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 1.1861082315444946, + "learning_rate": 4.50322951097901e-05, + "loss": 0.3053, + "num_input_tokens_seen": 1041928, + "step": 5460 + }, + { + "epoch": 2.8404365904365907, + "grad_norm": 0.49162399768829346, + "learning_rate": 4.501871898187836e-05, + "loss": 0.2837, + "num_input_tokens_seen": 1042888, + "step": 5465 + }, + { + "epoch": 2.8430353430353428, + "grad_norm": 0.3285665810108185, + "learning_rate": 4.5005126380657296e-05, + "loss": 0.2928, + "num_input_tokens_seen": 1043752, + "step": 5470 + }, + { + "epoch": 2.8456340956340958, + "grad_norm": 0.2186887115240097, + "learning_rate": 4.499151731731221e-05, + "loss": 0.2813, + "num_input_tokens_seen": 1044712, + "step": 5475 + }, + { + "epoch": 2.8482328482328483, + "grad_norm": 0.2468162626028061, + "learning_rate": 4.497789180304193e-05, + "loss": 0.2017, + "num_input_tokens_seen": 1045608, + "step": 5480 + }, + { + "epoch": 2.850831600831601, + "grad_norm": 0.3973219096660614, + "learning_rate": 4.496424984905883e-05, + "loss": 0.1572, + "num_input_tokens_seen": 1046568, + "step": 5485 + }, + { + "epoch": 2.8534303534303533, + "grad_norm": 0.3917061984539032, + "learning_rate": 4.495059146658881e-05, + "loss": 0.3198, + "num_input_tokens_seen": 1047560, + "step": 5490 + }, + { + "epoch": 2.856029106029106, + "grad_norm": 0.5772581100463867, + "learning_rate": 4.493691666687129e-05, + "loss": 0.3142, + "num_input_tokens_seen": 1048552, + "step": 5495 + }, + { + "epoch": 2.858627858627859, + "grad_norm": 0.16836020350456238, + "learning_rate": 4.49232254611592e-05, + "loss": 0.2933, + "num_input_tokens_seen": 1049544, + "step": 5500 + }, + { + "epoch": 2.8612266112266114, + "grad_norm": 0.5668784379959106, + "learning_rate": 4.4909517860718954e-05, + "loss": 0.2198, + "num_input_tokens_seen": 1050536, + "step": 5505 + }, + { + "epoch": 2.863825363825364, + "grad_norm": 0.3535847067832947, + "learning_rate": 4.489579387683048e-05, + "loss": 0.2218, + "num_input_tokens_seen": 1051464, + "step": 5510 + }, + { + "epoch": 2.8664241164241164, + "grad_norm": 0.7826998233795166, + "learning_rate": 4.4882053520787196e-05, + "loss": 0.2972, + "num_input_tokens_seen": 1052360, + "step": 5515 + }, + { + "epoch": 2.869022869022869, + "grad_norm": 0.5403087139129639, + "learning_rate": 4.4868296803895946e-05, + "loss": 0.3058, + "num_input_tokens_seen": 1053416, + "step": 5520 + }, + { + "epoch": 2.8716216216216215, + "grad_norm": 0.2033681571483612, + "learning_rate": 4.4854523737477094e-05, + "loss": 0.2924, + "num_input_tokens_seen": 1054312, + "step": 5525 + }, + { + "epoch": 2.874220374220374, + "grad_norm": 0.5118981599807739, + "learning_rate": 4.484073433286442e-05, + "loss": 0.2817, + "num_input_tokens_seen": 1055208, + "step": 5530 + }, + { + "epoch": 2.876819126819127, + "grad_norm": 0.783509373664856, + "learning_rate": 4.4826928601405155e-05, + "loss": 0.2778, + "num_input_tokens_seen": 1056136, + "step": 5535 + }, + { + "epoch": 2.8794178794178795, + "grad_norm": 0.7982479333877563, + "learning_rate": 4.481310655445998e-05, + "loss": 0.2523, + "num_input_tokens_seen": 1057160, + "step": 5540 + }, + { + "epoch": 2.882016632016632, + "grad_norm": 0.7285820841789246, + "learning_rate": 4.479926820340298e-05, + "loss": 0.3149, + "num_input_tokens_seen": 1058120, + "step": 5545 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 0.48240089416503906, + "learning_rate": 4.478541355962168e-05, + "loss": 0.292, + "num_input_tokens_seen": 1059048, + "step": 5550 + }, + { + "epoch": 2.887214137214137, + "grad_norm": 0.3602878451347351, + "learning_rate": 4.4771542634517e-05, + "loss": 0.2733, + "num_input_tokens_seen": 1059976, + "step": 5555 + }, + { + "epoch": 2.88981288981289, + "grad_norm": 0.4422951340675354, + "learning_rate": 4.4757655439503255e-05, + "loss": 0.2531, + "num_input_tokens_seen": 1060936, + "step": 5560 + }, + { + "epoch": 2.892411642411642, + "grad_norm": 0.9722403287887573, + "learning_rate": 4.474375198600815e-05, + "loss": 0.2394, + "num_input_tokens_seen": 1061928, + "step": 5565 + }, + { + "epoch": 2.895010395010395, + "grad_norm": 0.5750762224197388, + "learning_rate": 4.472983228547278e-05, + "loss": 0.293, + "num_input_tokens_seen": 1062920, + "step": 5570 + }, + { + "epoch": 2.8976091476091477, + "grad_norm": 0.7011445164680481, + "learning_rate": 4.4715896349351596e-05, + "loss": 0.2756, + "num_input_tokens_seen": 1063880, + "step": 5575 + }, + { + "epoch": 2.9002079002079, + "grad_norm": 0.9123721122741699, + "learning_rate": 4.4701944189112404e-05, + "loss": 0.2946, + "num_input_tokens_seen": 1064808, + "step": 5580 + }, + { + "epoch": 2.9028066528066527, + "grad_norm": 0.6428107023239136, + "learning_rate": 4.468797581623638e-05, + "loss": 0.2276, + "num_input_tokens_seen": 1065736, + "step": 5585 + }, + { + "epoch": 2.9054054054054053, + "grad_norm": 0.34355059266090393, + "learning_rate": 4.4673991242218045e-05, + "loss": 0.2763, + "num_input_tokens_seen": 1066600, + "step": 5590 + }, + { + "epoch": 2.9080041580041582, + "grad_norm": 0.541581392288208, + "learning_rate": 4.4659990478565215e-05, + "loss": 0.3042, + "num_input_tokens_seen": 1067592, + "step": 5595 + }, + { + "epoch": 2.9106029106029108, + "grad_norm": 0.9522217512130737, + "learning_rate": 4.4645973536799065e-05, + "loss": 0.2985, + "num_input_tokens_seen": 1068552, + "step": 5600 + }, + { + "epoch": 2.9132016632016633, + "grad_norm": 0.9402531981468201, + "learning_rate": 4.463194042845408e-05, + "loss": 0.2975, + "num_input_tokens_seen": 1069544, + "step": 5605 + }, + { + "epoch": 2.915800415800416, + "grad_norm": 0.264253169298172, + "learning_rate": 4.4617891165078014e-05, + "loss": 0.2831, + "num_input_tokens_seen": 1070472, + "step": 5610 + }, + { + "epoch": 2.9183991683991684, + "grad_norm": 1.131881594657898, + "learning_rate": 4.4603825758231954e-05, + "loss": 0.2813, + "num_input_tokens_seen": 1071336, + "step": 5615 + }, + { + "epoch": 2.920997920997921, + "grad_norm": 0.37732380628585815, + "learning_rate": 4.4589744219490256e-05, + "loss": 0.279, + "num_input_tokens_seen": 1072232, + "step": 5620 + }, + { + "epoch": 2.9235966735966734, + "grad_norm": 0.37327703833580017, + "learning_rate": 4.457564656044056e-05, + "loss": 0.2775, + "num_input_tokens_seen": 1073192, + "step": 5625 + }, + { + "epoch": 2.9261954261954264, + "grad_norm": 0.5716499090194702, + "learning_rate": 4.456153279268375e-05, + "loss": 0.2833, + "num_input_tokens_seen": 1074216, + "step": 5630 + }, + { + "epoch": 2.928794178794179, + "grad_norm": 0.5937663912773132, + "learning_rate": 4.454740292783397e-05, + "loss": 0.2569, + "num_input_tokens_seen": 1075176, + "step": 5635 + }, + { + "epoch": 2.9313929313929314, + "grad_norm": 0.9296860098838806, + "learning_rate": 4.4533256977518646e-05, + "loss": 0.2779, + "num_input_tokens_seen": 1076072, + "step": 5640 + }, + { + "epoch": 2.933991683991684, + "grad_norm": 0.5887002348899841, + "learning_rate": 4.45190949533784e-05, + "loss": 0.2844, + "num_input_tokens_seen": 1077032, + "step": 5645 + }, + { + "epoch": 2.9365904365904365, + "grad_norm": 0.5401278734207153, + "learning_rate": 4.450491686706709e-05, + "loss": 0.2697, + "num_input_tokens_seen": 1078024, + "step": 5650 + }, + { + "epoch": 2.939189189189189, + "grad_norm": 0.26259908080101013, + "learning_rate": 4.44907227302518e-05, + "loss": 0.2419, + "num_input_tokens_seen": 1078952, + "step": 5655 + }, + { + "epoch": 2.9417879417879416, + "grad_norm": 0.6824623346328735, + "learning_rate": 4.4476512554612826e-05, + "loss": 0.1944, + "num_input_tokens_seen": 1079880, + "step": 5660 + }, + { + "epoch": 2.9443866943866945, + "grad_norm": 1.0353506803512573, + "learning_rate": 4.446228635184364e-05, + "loss": 0.2778, + "num_input_tokens_seen": 1080872, + "step": 5665 + }, + { + "epoch": 2.946985446985447, + "grad_norm": 0.2624112665653229, + "learning_rate": 4.444804413365093e-05, + "loss": 0.2733, + "num_input_tokens_seen": 1081832, + "step": 5670 + }, + { + "epoch": 2.9495841995841996, + "grad_norm": 0.63267982006073, + "learning_rate": 4.4433785911754545e-05, + "loss": 0.2287, + "num_input_tokens_seen": 1082760, + "step": 5675 + }, + { + "epoch": 2.952182952182952, + "grad_norm": 0.4225483536720276, + "learning_rate": 4.44195116978875e-05, + "loss": 0.2286, + "num_input_tokens_seen": 1083752, + "step": 5680 + }, + { + "epoch": 2.9547817047817047, + "grad_norm": 0.6189302206039429, + "learning_rate": 4.440522150379599e-05, + "loss": 0.3043, + "num_input_tokens_seen": 1084776, + "step": 5685 + }, + { + "epoch": 2.9573804573804576, + "grad_norm": 0.6396567821502686, + "learning_rate": 4.439091534123935e-05, + "loss": 0.2545, + "num_input_tokens_seen": 1085704, + "step": 5690 + }, + { + "epoch": 2.9599792099792097, + "grad_norm": 0.6529777646064758, + "learning_rate": 4.437659322199004e-05, + "loss": 0.2777, + "num_input_tokens_seen": 1086632, + "step": 5695 + }, + { + "epoch": 2.9625779625779627, + "grad_norm": 0.40983209013938904, + "learning_rate": 4.436225515783368e-05, + "loss": 0.2715, + "num_input_tokens_seen": 1087656, + "step": 5700 + }, + { + "epoch": 2.965176715176715, + "grad_norm": 1.153260588645935, + "learning_rate": 4.4347901160568985e-05, + "loss": 0.2963, + "num_input_tokens_seen": 1088584, + "step": 5705 + }, + { + "epoch": 2.9677754677754677, + "grad_norm": 1.0041351318359375, + "learning_rate": 4.43335312420078e-05, + "loss": 0.263, + "num_input_tokens_seen": 1089640, + "step": 5710 + }, + { + "epoch": 2.9703742203742203, + "grad_norm": 0.6464055180549622, + "learning_rate": 4.4319145413975044e-05, + "loss": 0.3193, + "num_input_tokens_seen": 1090632, + "step": 5715 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.5804903507232666, + "learning_rate": 4.430474368830877e-05, + "loss": 0.3368, + "num_input_tokens_seen": 1091560, + "step": 5720 + }, + { + "epoch": 2.975571725571726, + "grad_norm": 0.3330296277999878, + "learning_rate": 4.429032607686008e-05, + "loss": 0.2706, + "num_input_tokens_seen": 1092520, + "step": 5725 + }, + { + "epoch": 2.9781704781704783, + "grad_norm": 0.8270687460899353, + "learning_rate": 4.427589259149315e-05, + "loss": 0.3227, + "num_input_tokens_seen": 1093384, + "step": 5730 + }, + { + "epoch": 2.980769230769231, + "grad_norm": 1.4093852043151855, + "learning_rate": 4.426144324408524e-05, + "loss": 0.3151, + "num_input_tokens_seen": 1094376, + "step": 5735 + }, + { + "epoch": 2.9833679833679834, + "grad_norm": 0.47368648648262024, + "learning_rate": 4.424697804652666e-05, + "loss": 0.283, + "num_input_tokens_seen": 1095336, + "step": 5740 + }, + { + "epoch": 2.985966735966736, + "grad_norm": 0.5204001665115356, + "learning_rate": 4.423249701072072e-05, + "loss": 0.2934, + "num_input_tokens_seen": 1096264, + "step": 5745 + }, + { + "epoch": 2.9885654885654884, + "grad_norm": 0.4455412030220032, + "learning_rate": 4.421800014858382e-05, + "loss": 0.3017, + "num_input_tokens_seen": 1097160, + "step": 5750 + }, + { + "epoch": 2.991164241164241, + "grad_norm": 0.5371845364570618, + "learning_rate": 4.420348747204536e-05, + "loss": 0.2933, + "num_input_tokens_seen": 1098024, + "step": 5755 + }, + { + "epoch": 2.993762993762994, + "grad_norm": 0.4886581301689148, + "learning_rate": 4.418895899304774e-05, + "loss": 0.2579, + "num_input_tokens_seen": 1099016, + "step": 5760 + }, + { + "epoch": 2.9963617463617465, + "grad_norm": 0.32530465722084045, + "learning_rate": 4.417441472354638e-05, + "loss": 0.2945, + "num_input_tokens_seen": 1100008, + "step": 5765 + }, + { + "epoch": 2.998960498960499, + "grad_norm": 0.3901521563529968, + "learning_rate": 4.41598546755097e-05, + "loss": 0.2186, + "num_input_tokens_seen": 1100904, + "step": 5770 + }, + { + "epoch": 3.0, + "eval_loss": 0.2508894205093384, + "eval_runtime": 7.938, + "eval_samples_per_second": 107.836, + "eval_steps_per_second": 26.959, + "num_input_tokens_seen": 1101216, + "step": 5772 + }, + { + "epoch": 3.0015592515592515, + "grad_norm": 0.7301241755485535, + "learning_rate": 4.414527886091909e-05, + "loss": 0.2482, + "num_input_tokens_seen": 1101824, + "step": 5775 + }, + { + "epoch": 3.004158004158004, + "grad_norm": 0.5054182410240173, + "learning_rate": 4.413068729176891e-05, + "loss": 0.3172, + "num_input_tokens_seen": 1102784, + "step": 5780 + }, + { + "epoch": 3.0067567567567566, + "grad_norm": 0.9938554167747498, + "learning_rate": 4.4116079980066504e-05, + "loss": 0.3396, + "num_input_tokens_seen": 1103904, + "step": 5785 + }, + { + "epoch": 3.0093555093555096, + "grad_norm": 0.24478013813495636, + "learning_rate": 4.4101456937832166e-05, + "loss": 0.289, + "num_input_tokens_seen": 1104800, + "step": 5790 + }, + { + "epoch": 3.011954261954262, + "grad_norm": 0.6129480004310608, + "learning_rate": 4.408681817709911e-05, + "loss": 0.2858, + "num_input_tokens_seen": 1105760, + "step": 5795 + }, + { + "epoch": 3.0145530145530146, + "grad_norm": 1.0303103923797607, + "learning_rate": 4.407216370991351e-05, + "loss": 0.2663, + "num_input_tokens_seen": 1106656, + "step": 5800 + }, + { + "epoch": 3.017151767151767, + "grad_norm": 0.9177419543266296, + "learning_rate": 4.405749354833447e-05, + "loss": 0.2406, + "num_input_tokens_seen": 1107584, + "step": 5805 + }, + { + "epoch": 3.0197505197505197, + "grad_norm": 0.3323848247528076, + "learning_rate": 4.404280770443398e-05, + "loss": 0.274, + "num_input_tokens_seen": 1108512, + "step": 5810 + }, + { + "epoch": 3.022349272349272, + "grad_norm": 0.5137077569961548, + "learning_rate": 4.402810619029696e-05, + "loss": 0.3084, + "num_input_tokens_seen": 1109504, + "step": 5815 + }, + { + "epoch": 3.024948024948025, + "grad_norm": 0.6259181499481201, + "learning_rate": 4.401338901802122e-05, + "loss": 0.2817, + "num_input_tokens_seen": 1110464, + "step": 5820 + }, + { + "epoch": 3.0275467775467777, + "grad_norm": 0.8927936553955078, + "learning_rate": 4.3998656199717435e-05, + "loss": 0.2954, + "num_input_tokens_seen": 1111456, + "step": 5825 + }, + { + "epoch": 3.0301455301455302, + "grad_norm": 0.3378528654575348, + "learning_rate": 4.3983907747509195e-05, + "loss": 0.2793, + "num_input_tokens_seen": 1112512, + "step": 5830 + }, + { + "epoch": 3.0327442827442828, + "grad_norm": 0.4075321853160858, + "learning_rate": 4.396914367353292e-05, + "loss": 0.2724, + "num_input_tokens_seen": 1113504, + "step": 5835 + }, + { + "epoch": 3.0353430353430353, + "grad_norm": 0.44639143347740173, + "learning_rate": 4.3954363989937894e-05, + "loss": 0.2569, + "num_input_tokens_seen": 1114400, + "step": 5840 + }, + { + "epoch": 3.037941787941788, + "grad_norm": 0.6547308564186096, + "learning_rate": 4.3939568708886245e-05, + "loss": 0.3019, + "num_input_tokens_seen": 1115360, + "step": 5845 + }, + { + "epoch": 3.0405405405405403, + "grad_norm": 0.4280237555503845, + "learning_rate": 4.3924757842552955e-05, + "loss": 0.2806, + "num_input_tokens_seen": 1116288, + "step": 5850 + }, + { + "epoch": 3.0431392931392933, + "grad_norm": 0.8460020422935486, + "learning_rate": 4.3909931403125805e-05, + "loss": 0.291, + "num_input_tokens_seen": 1117184, + "step": 5855 + }, + { + "epoch": 3.045738045738046, + "grad_norm": 0.5826412439346313, + "learning_rate": 4.38950894028054e-05, + "loss": 0.2566, + "num_input_tokens_seen": 1118080, + "step": 5860 + }, + { + "epoch": 3.0483367983367984, + "grad_norm": 0.6432111263275146, + "learning_rate": 4.388023185380516e-05, + "loss": 0.3276, + "num_input_tokens_seen": 1119072, + "step": 5865 + }, + { + "epoch": 3.050935550935551, + "grad_norm": 0.3591099977493286, + "learning_rate": 4.386535876835129e-05, + "loss": 0.282, + "num_input_tokens_seen": 1119968, + "step": 5870 + }, + { + "epoch": 3.0535343035343034, + "grad_norm": 0.3366011679172516, + "learning_rate": 4.3850470158682774e-05, + "loss": 0.2804, + "num_input_tokens_seen": 1120864, + "step": 5875 + }, + { + "epoch": 3.056133056133056, + "grad_norm": 0.2743311822414398, + "learning_rate": 4.383556603705139e-05, + "loss": 0.2829, + "num_input_tokens_seen": 1121792, + "step": 5880 + }, + { + "epoch": 3.058731808731809, + "grad_norm": 0.4461887776851654, + "learning_rate": 4.382064641572167e-05, + "loss": 0.2934, + "num_input_tokens_seen": 1122784, + "step": 5885 + }, + { + "epoch": 3.0613305613305615, + "grad_norm": 0.5543252229690552, + "learning_rate": 4.380571130697088e-05, + "loss": 0.2889, + "num_input_tokens_seen": 1123776, + "step": 5890 + }, + { + "epoch": 3.063929313929314, + "grad_norm": 0.8890363574028015, + "learning_rate": 4.3790760723089074e-05, + "loss": 0.2608, + "num_input_tokens_seen": 1124704, + "step": 5895 + }, + { + "epoch": 3.0665280665280665, + "grad_norm": 0.6426643133163452, + "learning_rate": 4.3775794676379e-05, + "loss": 0.2273, + "num_input_tokens_seen": 1125600, + "step": 5900 + }, + { + "epoch": 3.069126819126819, + "grad_norm": 0.4320303201675415, + "learning_rate": 4.376081317915616e-05, + "loss": 0.2083, + "num_input_tokens_seen": 1126496, + "step": 5905 + }, + { + "epoch": 3.0717255717255716, + "grad_norm": 0.4203374683856964, + "learning_rate": 4.3745816243748755e-05, + "loss": 0.1854, + "num_input_tokens_seen": 1127584, + "step": 5910 + }, + { + "epoch": 3.074324324324324, + "grad_norm": 0.37838613986968994, + "learning_rate": 4.373080388249768e-05, + "loss": 0.3348, + "num_input_tokens_seen": 1128480, + "step": 5915 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.5219151973724365, + "learning_rate": 4.371577610775656e-05, + "loss": 0.2401, + "num_input_tokens_seen": 1129472, + "step": 5920 + }, + { + "epoch": 3.0795218295218296, + "grad_norm": 0.5141345262527466, + "learning_rate": 4.370073293189165e-05, + "loss": 0.2628, + "num_input_tokens_seen": 1130528, + "step": 5925 + }, + { + "epoch": 3.082120582120582, + "grad_norm": 0.4681874215602875, + "learning_rate": 4.3685674367281925e-05, + "loss": 0.2882, + "num_input_tokens_seen": 1131488, + "step": 5930 + }, + { + "epoch": 3.0847193347193347, + "grad_norm": 0.4304925203323364, + "learning_rate": 4.367060042631901e-05, + "loss": 0.2634, + "num_input_tokens_seen": 1132416, + "step": 5935 + }, + { + "epoch": 3.087318087318087, + "grad_norm": 0.286306232213974, + "learning_rate": 4.3655511121407176e-05, + "loss": 0.2155, + "num_input_tokens_seen": 1133408, + "step": 5940 + }, + { + "epoch": 3.0899168399168397, + "grad_norm": 0.26281315088272095, + "learning_rate": 4.3640406464963333e-05, + "loss": 0.287, + "num_input_tokens_seen": 1134400, + "step": 5945 + }, + { + "epoch": 3.0925155925155927, + "grad_norm": 0.5768784284591675, + "learning_rate": 4.3625286469417046e-05, + "loss": 0.3093, + "num_input_tokens_seen": 1135392, + "step": 5950 + }, + { + "epoch": 3.0951143451143452, + "grad_norm": 0.5917097926139832, + "learning_rate": 4.3610151147210475e-05, + "loss": 0.2288, + "num_input_tokens_seen": 1136352, + "step": 5955 + }, + { + "epoch": 3.0977130977130978, + "grad_norm": 0.2914988100528717, + "learning_rate": 4.359500051079841e-05, + "loss": 0.297, + "num_input_tokens_seen": 1137280, + "step": 5960 + }, + { + "epoch": 3.1003118503118503, + "grad_norm": 0.9243279099464417, + "learning_rate": 4.357983457264825e-05, + "loss": 0.2621, + "num_input_tokens_seen": 1138208, + "step": 5965 + }, + { + "epoch": 3.102910602910603, + "grad_norm": 0.9076776504516602, + "learning_rate": 4.356465334523995e-05, + "loss": 0.2602, + "num_input_tokens_seen": 1139168, + "step": 5970 + }, + { + "epoch": 3.1055093555093554, + "grad_norm": 0.2813416123390198, + "learning_rate": 4.354945684106608e-05, + "loss": 0.2956, + "num_input_tokens_seen": 1140224, + "step": 5975 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.7063618898391724, + "learning_rate": 4.3534245072631785e-05, + "loss": 0.222, + "num_input_tokens_seen": 1141184, + "step": 5980 + }, + { + "epoch": 3.110706860706861, + "grad_norm": 0.7842854261398315, + "learning_rate": 4.351901805245474e-05, + "loss": 0.2838, + "num_input_tokens_seen": 1142176, + "step": 5985 + }, + { + "epoch": 3.1133056133056134, + "grad_norm": 0.2798095941543579, + "learning_rate": 4.350377579306519e-05, + "loss": 0.2455, + "num_input_tokens_seen": 1143072, + "step": 5990 + }, + { + "epoch": 3.115904365904366, + "grad_norm": 0.6332041621208191, + "learning_rate": 4.348851830700593e-05, + "loss": 0.2784, + "num_input_tokens_seen": 1144064, + "step": 5995 + }, + { + "epoch": 3.1185031185031185, + "grad_norm": 0.4091799259185791, + "learning_rate": 4.347324560683227e-05, + "loss": 0.2914, + "num_input_tokens_seen": 1145056, + "step": 6000 + }, + { + "epoch": 3.121101871101871, + "grad_norm": 0.2648732662200928, + "learning_rate": 4.3457957705112034e-05, + "loss": 0.2321, + "num_input_tokens_seen": 1145984, + "step": 6005 + }, + { + "epoch": 3.1237006237006235, + "grad_norm": 0.608970046043396, + "learning_rate": 4.344265461442557e-05, + "loss": 0.2645, + "num_input_tokens_seen": 1146912, + "step": 6010 + }, + { + "epoch": 3.1262993762993765, + "grad_norm": 0.24588671326637268, + "learning_rate": 4.342733634736571e-05, + "loss": 0.2392, + "num_input_tokens_seen": 1147840, + "step": 6015 + }, + { + "epoch": 3.128898128898129, + "grad_norm": 0.44663292169570923, + "learning_rate": 4.341200291653781e-05, + "loss": 0.2732, + "num_input_tokens_seen": 1148832, + "step": 6020 + }, + { + "epoch": 3.1314968814968815, + "grad_norm": 0.780484139919281, + "learning_rate": 4.339665433455965e-05, + "loss": 0.2681, + "num_input_tokens_seen": 1149792, + "step": 6025 + }, + { + "epoch": 3.134095634095634, + "grad_norm": 0.12660792469978333, + "learning_rate": 4.338129061406151e-05, + "loss": 0.2506, + "num_input_tokens_seen": 1150720, + "step": 6030 + }, + { + "epoch": 3.1366943866943866, + "grad_norm": 0.4987092614173889, + "learning_rate": 4.336591176768613e-05, + "loss": 0.2529, + "num_input_tokens_seen": 1151680, + "step": 6035 + }, + { + "epoch": 3.139293139293139, + "grad_norm": 0.587579607963562, + "learning_rate": 4.3350517808088694e-05, + "loss": 0.2637, + "num_input_tokens_seen": 1152640, + "step": 6040 + }, + { + "epoch": 3.141891891891892, + "grad_norm": 0.7026330232620239, + "learning_rate": 4.333510874793681e-05, + "loss": 0.2784, + "num_input_tokens_seen": 1153600, + "step": 6045 + }, + { + "epoch": 3.1444906444906446, + "grad_norm": 0.24185624718666077, + "learning_rate": 4.331968459991052e-05, + "loss": 0.3127, + "num_input_tokens_seen": 1154560, + "step": 6050 + }, + { + "epoch": 3.147089397089397, + "grad_norm": 0.34318724274635315, + "learning_rate": 4.330424537670229e-05, + "loss": 0.2677, + "num_input_tokens_seen": 1155520, + "step": 6055 + }, + { + "epoch": 3.1496881496881497, + "grad_norm": 1.0020076036453247, + "learning_rate": 4.3288791091016983e-05, + "loss": 0.3338, + "num_input_tokens_seen": 1156512, + "step": 6060 + }, + { + "epoch": 3.1522869022869022, + "grad_norm": 0.34176328778266907, + "learning_rate": 4.327332175557185e-05, + "loss": 0.2436, + "num_input_tokens_seen": 1157440, + "step": 6065 + }, + { + "epoch": 3.1548856548856548, + "grad_norm": 0.3484143912792206, + "learning_rate": 4.325783738309654e-05, + "loss": 0.2795, + "num_input_tokens_seen": 1158368, + "step": 6070 + }, + { + "epoch": 3.1574844074844073, + "grad_norm": 0.8932015895843506, + "learning_rate": 4.324233798633308e-05, + "loss": 0.2926, + "num_input_tokens_seen": 1159264, + "step": 6075 + }, + { + "epoch": 3.1600831600831603, + "grad_norm": 0.7192901372909546, + "learning_rate": 4.322682357803582e-05, + "loss": 0.2187, + "num_input_tokens_seen": 1160192, + "step": 6080 + }, + { + "epoch": 3.162681912681913, + "grad_norm": 0.5698877573013306, + "learning_rate": 4.321129417097153e-05, + "loss": 0.2385, + "num_input_tokens_seen": 1161184, + "step": 6085 + }, + { + "epoch": 3.1652806652806653, + "grad_norm": 0.5282605290412903, + "learning_rate": 4.319574977791926e-05, + "loss": 0.227, + "num_input_tokens_seen": 1162176, + "step": 6090 + }, + { + "epoch": 3.167879417879418, + "grad_norm": 0.3263411521911621, + "learning_rate": 4.318019041167042e-05, + "loss": 0.2978, + "num_input_tokens_seen": 1163200, + "step": 6095 + }, + { + "epoch": 3.1704781704781704, + "grad_norm": 0.36691609025001526, + "learning_rate": 4.316461608502875e-05, + "loss": 0.1993, + "num_input_tokens_seen": 1164128, + "step": 6100 + }, + { + "epoch": 3.173076923076923, + "grad_norm": 0.16560353338718414, + "learning_rate": 4.314902681081029e-05, + "loss": 0.2336, + "num_input_tokens_seen": 1165056, + "step": 6105 + }, + { + "epoch": 3.175675675675676, + "grad_norm": 0.5950994491577148, + "learning_rate": 4.313342260184337e-05, + "loss": 0.2769, + "num_input_tokens_seen": 1166048, + "step": 6110 + }, + { + "epoch": 3.1782744282744284, + "grad_norm": 0.5281665921211243, + "learning_rate": 4.311780347096863e-05, + "loss": 0.2094, + "num_input_tokens_seen": 1167040, + "step": 6115 + }, + { + "epoch": 3.180873180873181, + "grad_norm": 0.10456261783838272, + "learning_rate": 4.310216943103898e-05, + "loss": 0.3314, + "num_input_tokens_seen": 1167936, + "step": 6120 + }, + { + "epoch": 3.1834719334719335, + "grad_norm": 0.7163777947425842, + "learning_rate": 4.308652049491957e-05, + "loss": 0.2547, + "num_input_tokens_seen": 1168864, + "step": 6125 + }, + { + "epoch": 3.186070686070686, + "grad_norm": 0.4587853252887726, + "learning_rate": 4.307085667548788e-05, + "loss": 0.2908, + "num_input_tokens_seen": 1169824, + "step": 6130 + }, + { + "epoch": 3.1886694386694385, + "grad_norm": 1.1332672834396362, + "learning_rate": 4.3055177985633556e-05, + "loss": 0.2947, + "num_input_tokens_seen": 1170720, + "step": 6135 + }, + { + "epoch": 3.1912681912681915, + "grad_norm": 0.536065936088562, + "learning_rate": 4.3039484438258536e-05, + "loss": 0.3107, + "num_input_tokens_seen": 1171648, + "step": 6140 + }, + { + "epoch": 3.193866943866944, + "grad_norm": 0.5142977237701416, + "learning_rate": 4.302377604627696e-05, + "loss": 0.2846, + "num_input_tokens_seen": 1172576, + "step": 6145 + }, + { + "epoch": 3.1964656964656966, + "grad_norm": 0.5178808569908142, + "learning_rate": 4.30080528226152e-05, + "loss": 0.2919, + "num_input_tokens_seen": 1173568, + "step": 6150 + }, + { + "epoch": 3.199064449064449, + "grad_norm": 0.5807610154151917, + "learning_rate": 4.299231478021181e-05, + "loss": 0.24, + "num_input_tokens_seen": 1174496, + "step": 6155 + }, + { + "epoch": 3.2016632016632016, + "grad_norm": 0.2202247530221939, + "learning_rate": 4.297656193201755e-05, + "loss": 0.3185, + "num_input_tokens_seen": 1175456, + "step": 6160 + }, + { + "epoch": 3.204261954261954, + "grad_norm": 0.4761686623096466, + "learning_rate": 4.296079429099538e-05, + "loss": 0.3301, + "num_input_tokens_seen": 1176480, + "step": 6165 + }, + { + "epoch": 3.2068607068607067, + "grad_norm": 0.352522611618042, + "learning_rate": 4.2945011870120395e-05, + "loss": 0.2774, + "num_input_tokens_seen": 1177440, + "step": 6170 + }, + { + "epoch": 3.2094594594594597, + "grad_norm": 0.39831334352493286, + "learning_rate": 4.2929214682379894e-05, + "loss": 0.2932, + "num_input_tokens_seen": 1178400, + "step": 6175 + }, + { + "epoch": 3.212058212058212, + "grad_norm": 0.9114214181900024, + "learning_rate": 4.2913402740773294e-05, + "loss": 0.2375, + "num_input_tokens_seen": 1179360, + "step": 6180 + }, + { + "epoch": 3.2146569646569647, + "grad_norm": 0.5286975502967834, + "learning_rate": 4.2897576058312176e-05, + "loss": 0.2584, + "num_input_tokens_seen": 1180384, + "step": 6185 + }, + { + "epoch": 3.2172557172557172, + "grad_norm": 0.5389646887779236, + "learning_rate": 4.2881734648020245e-05, + "loss": 0.2041, + "num_input_tokens_seen": 1181312, + "step": 6190 + }, + { + "epoch": 3.2198544698544698, + "grad_norm": 0.3579500913619995, + "learning_rate": 4.286587852293331e-05, + "loss": 0.3434, + "num_input_tokens_seen": 1182240, + "step": 6195 + }, + { + "epoch": 3.2224532224532223, + "grad_norm": 0.15561674535274506, + "learning_rate": 4.285000769609932e-05, + "loss": 0.2214, + "num_input_tokens_seen": 1183264, + "step": 6200 + }, + { + "epoch": 3.225051975051975, + "grad_norm": 0.6668947339057922, + "learning_rate": 4.283412218057829e-05, + "loss": 0.2305, + "num_input_tokens_seen": 1184192, + "step": 6205 + }, + { + "epoch": 3.227650727650728, + "grad_norm": 0.8829325437545776, + "learning_rate": 4.281822198944233e-05, + "loss": 0.3683, + "num_input_tokens_seen": 1185088, + "step": 6210 + }, + { + "epoch": 3.2302494802494803, + "grad_norm": 0.6072591543197632, + "learning_rate": 4.280230713577564e-05, + "loss": 0.2721, + "num_input_tokens_seen": 1186016, + "step": 6215 + }, + { + "epoch": 3.232848232848233, + "grad_norm": 0.6624928116798401, + "learning_rate": 4.278637763267448e-05, + "loss": 0.28, + "num_input_tokens_seen": 1187040, + "step": 6220 + }, + { + "epoch": 3.2354469854469854, + "grad_norm": 0.8285497426986694, + "learning_rate": 4.277043349324716e-05, + "loss": 0.2704, + "num_input_tokens_seen": 1187936, + "step": 6225 + }, + { + "epoch": 3.238045738045738, + "grad_norm": 0.6012800335884094, + "learning_rate": 4.275447473061405e-05, + "loss": 0.2689, + "num_input_tokens_seen": 1188864, + "step": 6230 + }, + { + "epoch": 3.2406444906444904, + "grad_norm": 0.0799122080206871, + "learning_rate": 4.273850135790752e-05, + "loss": 0.2508, + "num_input_tokens_seen": 1189760, + "step": 6235 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.286553293466568, + "learning_rate": 4.272251338827199e-05, + "loss": 0.2498, + "num_input_tokens_seen": 1190656, + "step": 6240 + }, + { + "epoch": 3.245841995841996, + "grad_norm": 0.6660816669464111, + "learning_rate": 4.270651083486389e-05, + "loss": 0.2441, + "num_input_tokens_seen": 1191584, + "step": 6245 + }, + { + "epoch": 3.2484407484407485, + "grad_norm": 0.3380957543849945, + "learning_rate": 4.269049371085164e-05, + "loss": 0.2152, + "num_input_tokens_seen": 1192480, + "step": 6250 + }, + { + "epoch": 3.251039501039501, + "grad_norm": 0.7067977786064148, + "learning_rate": 4.2674462029415654e-05, + "loss": 0.3112, + "num_input_tokens_seen": 1193472, + "step": 6255 + }, + { + "epoch": 3.2536382536382535, + "grad_norm": 0.42244410514831543, + "learning_rate": 4.265841580374834e-05, + "loss": 0.239, + "num_input_tokens_seen": 1194400, + "step": 6260 + }, + { + "epoch": 3.256237006237006, + "grad_norm": 0.3071938157081604, + "learning_rate": 4.264235504705404e-05, + "loss": 0.2517, + "num_input_tokens_seen": 1195328, + "step": 6265 + }, + { + "epoch": 3.258835758835759, + "grad_norm": 0.49675703048706055, + "learning_rate": 4.2626279772549096e-05, + "loss": 0.1855, + "num_input_tokens_seen": 1196224, + "step": 6270 + }, + { + "epoch": 3.2614345114345116, + "grad_norm": 0.5173097252845764, + "learning_rate": 4.2610189993461766e-05, + "loss": 0.2538, + "num_input_tokens_seen": 1197088, + "step": 6275 + }, + { + "epoch": 3.264033264033264, + "grad_norm": 0.7833018898963928, + "learning_rate": 4.259408572303225e-05, + "loss": 0.2588, + "num_input_tokens_seen": 1198016, + "step": 6280 + }, + { + "epoch": 3.2666320166320166, + "grad_norm": 0.3659665286540985, + "learning_rate": 4.2577966974512685e-05, + "loss": 0.2186, + "num_input_tokens_seen": 1198976, + "step": 6285 + }, + { + "epoch": 3.269230769230769, + "grad_norm": 0.3831707239151001, + "learning_rate": 4.25618337611671e-05, + "loss": 0.2331, + "num_input_tokens_seen": 1199936, + "step": 6290 + }, + { + "epoch": 3.2718295218295217, + "grad_norm": 0.8017318844795227, + "learning_rate": 4.254568609627145e-05, + "loss": 0.197, + "num_input_tokens_seen": 1200896, + "step": 6295 + }, + { + "epoch": 3.274428274428274, + "grad_norm": 0.43842869997024536, + "learning_rate": 4.2529523993113574e-05, + "loss": 0.2504, + "num_input_tokens_seen": 1201792, + "step": 6300 + }, + { + "epoch": 3.277027027027027, + "grad_norm": 0.6353688836097717, + "learning_rate": 4.2513347464993184e-05, + "loss": 0.3522, + "num_input_tokens_seen": 1202720, + "step": 6305 + }, + { + "epoch": 3.2796257796257797, + "grad_norm": 0.24459891021251678, + "learning_rate": 4.249715652522187e-05, + "loss": 0.1916, + "num_input_tokens_seen": 1203680, + "step": 6310 + }, + { + "epoch": 3.2822245322245323, + "grad_norm": 0.5622727274894714, + "learning_rate": 4.2480951187123084e-05, + "loss": 0.2468, + "num_input_tokens_seen": 1204736, + "step": 6315 + }, + { + "epoch": 3.284823284823285, + "grad_norm": 0.9172669649124146, + "learning_rate": 4.246473146403212e-05, + "loss": 0.3046, + "num_input_tokens_seen": 1205728, + "step": 6320 + }, + { + "epoch": 3.2874220374220373, + "grad_norm": 0.8818272948265076, + "learning_rate": 4.2448497369296096e-05, + "loss": 0.2751, + "num_input_tokens_seen": 1206624, + "step": 6325 + }, + { + "epoch": 3.29002079002079, + "grad_norm": 0.6839228868484497, + "learning_rate": 4.2432248916273996e-05, + "loss": 0.1955, + "num_input_tokens_seen": 1207648, + "step": 6330 + }, + { + "epoch": 3.2926195426195424, + "grad_norm": 0.355372816324234, + "learning_rate": 4.241598611833659e-05, + "loss": 0.2652, + "num_input_tokens_seen": 1208672, + "step": 6335 + }, + { + "epoch": 3.2952182952182953, + "grad_norm": 0.5932354927062988, + "learning_rate": 4.239970898886645e-05, + "loss": 0.3313, + "num_input_tokens_seen": 1209664, + "step": 6340 + }, + { + "epoch": 3.297817047817048, + "grad_norm": 0.2944807708263397, + "learning_rate": 4.2383417541257954e-05, + "loss": 0.1933, + "num_input_tokens_seen": 1210624, + "step": 6345 + }, + { + "epoch": 3.3004158004158004, + "grad_norm": 0.6497434377670288, + "learning_rate": 4.236711178891725e-05, + "loss": 0.3281, + "num_input_tokens_seen": 1211552, + "step": 6350 + }, + { + "epoch": 3.303014553014553, + "grad_norm": 0.6208869814872742, + "learning_rate": 4.2350791745262274e-05, + "loss": 0.2679, + "num_input_tokens_seen": 1212512, + "step": 6355 + }, + { + "epoch": 3.3056133056133055, + "grad_norm": 0.4351276755332947, + "learning_rate": 4.2334457423722704e-05, + "loss": 0.241, + "num_input_tokens_seen": 1213408, + "step": 6360 + }, + { + "epoch": 3.3082120582120584, + "grad_norm": 0.4177756905555725, + "learning_rate": 4.231810883773999e-05, + "loss": 0.2804, + "num_input_tokens_seen": 1214304, + "step": 6365 + }, + { + "epoch": 3.310810810810811, + "grad_norm": 0.7166165709495544, + "learning_rate": 4.230174600076729e-05, + "loss": 0.2969, + "num_input_tokens_seen": 1215264, + "step": 6370 + }, + { + "epoch": 3.3134095634095635, + "grad_norm": 0.39749354124069214, + "learning_rate": 4.228536892626951e-05, + "loss": 0.2351, + "num_input_tokens_seen": 1216224, + "step": 6375 + }, + { + "epoch": 3.316008316008316, + "grad_norm": 0.30825215578079224, + "learning_rate": 4.2268977627723285e-05, + "loss": 0.2628, + "num_input_tokens_seen": 1217152, + "step": 6380 + }, + { + "epoch": 3.3186070686070686, + "grad_norm": 0.2551487386226654, + "learning_rate": 4.225257211861691e-05, + "loss": 0.2622, + "num_input_tokens_seen": 1218112, + "step": 6385 + }, + { + "epoch": 3.321205821205821, + "grad_norm": 0.6877274513244629, + "learning_rate": 4.223615241245041e-05, + "loss": 0.2149, + "num_input_tokens_seen": 1219040, + "step": 6390 + }, + { + "epoch": 3.3238045738045736, + "grad_norm": 0.5523968935012817, + "learning_rate": 4.221971852273549e-05, + "loss": 0.2838, + "num_input_tokens_seen": 1219936, + "step": 6395 + }, + { + "epoch": 3.3264033264033266, + "grad_norm": 0.612987220287323, + "learning_rate": 4.2203270462995515e-05, + "loss": 0.1929, + "num_input_tokens_seen": 1220928, + "step": 6400 + }, + { + "epoch": 3.329002079002079, + "grad_norm": 0.2847176492214203, + "learning_rate": 4.218680824676552e-05, + "loss": 0.2815, + "num_input_tokens_seen": 1221888, + "step": 6405 + }, + { + "epoch": 3.3316008316008316, + "grad_norm": 0.8514449596405029, + "learning_rate": 4.217033188759219e-05, + "loss": 0.2935, + "num_input_tokens_seen": 1222816, + "step": 6410 + }, + { + "epoch": 3.334199584199584, + "grad_norm": 0.3052251935005188, + "learning_rate": 4.215384139903382e-05, + "loss": 0.2779, + "num_input_tokens_seen": 1223712, + "step": 6415 + }, + { + "epoch": 3.3367983367983367, + "grad_norm": 0.3936839699745178, + "learning_rate": 4.2137336794660384e-05, + "loss": 0.3096, + "num_input_tokens_seen": 1224736, + "step": 6420 + }, + { + "epoch": 3.3393970893970892, + "grad_norm": 0.31647178530693054, + "learning_rate": 4.212081808805342e-05, + "loss": 0.2993, + "num_input_tokens_seen": 1225696, + "step": 6425 + }, + { + "epoch": 3.3419958419958418, + "grad_norm": 0.26095709204673767, + "learning_rate": 4.210428529280611e-05, + "loss": 0.2982, + "num_input_tokens_seen": 1226688, + "step": 6430 + }, + { + "epoch": 3.3445945945945947, + "grad_norm": 0.2813906669616699, + "learning_rate": 4.2087738422523206e-05, + "loss": 0.2573, + "num_input_tokens_seen": 1227616, + "step": 6435 + }, + { + "epoch": 3.3471933471933473, + "grad_norm": 0.6789572238922119, + "learning_rate": 4.207117749082104e-05, + "loss": 0.211, + "num_input_tokens_seen": 1228512, + "step": 6440 + }, + { + "epoch": 3.3497920997921, + "grad_norm": 0.4868243336677551, + "learning_rate": 4.205460251132755e-05, + "loss": 0.2673, + "num_input_tokens_seen": 1229504, + "step": 6445 + }, + { + "epoch": 3.3523908523908523, + "grad_norm": 0.2619650661945343, + "learning_rate": 4.2038013497682186e-05, + "loss": 0.2984, + "num_input_tokens_seen": 1230496, + "step": 6450 + }, + { + "epoch": 3.354989604989605, + "grad_norm": 0.7592352032661438, + "learning_rate": 4.202141046353597e-05, + "loss": 0.2482, + "num_input_tokens_seen": 1231456, + "step": 6455 + }, + { + "epoch": 3.357588357588358, + "grad_norm": 0.21154488623142242, + "learning_rate": 4.2004793422551475e-05, + "loss": 0.2972, + "num_input_tokens_seen": 1232416, + "step": 6460 + }, + { + "epoch": 3.3601871101871104, + "grad_norm": 0.536598801612854, + "learning_rate": 4.198816238840277e-05, + "loss": 0.25, + "num_input_tokens_seen": 1233312, + "step": 6465 + }, + { + "epoch": 3.362785862785863, + "grad_norm": 0.7502046823501587, + "learning_rate": 4.197151737477547e-05, + "loss": 0.2618, + "num_input_tokens_seen": 1234272, + "step": 6470 + }, + { + "epoch": 3.3653846153846154, + "grad_norm": 0.5725829005241394, + "learning_rate": 4.195485839536666e-05, + "loss": 0.2843, + "num_input_tokens_seen": 1235168, + "step": 6475 + }, + { + "epoch": 3.367983367983368, + "grad_norm": 0.321350634098053, + "learning_rate": 4.193818546388495e-05, + "loss": 0.2592, + "num_input_tokens_seen": 1236096, + "step": 6480 + }, + { + "epoch": 3.3705821205821205, + "grad_norm": 0.3761528730392456, + "learning_rate": 4.192149859405041e-05, + "loss": 0.3019, + "num_input_tokens_seen": 1237024, + "step": 6485 + }, + { + "epoch": 3.373180873180873, + "grad_norm": 0.4425995349884033, + "learning_rate": 4.190479779959459e-05, + "loss": 0.2538, + "num_input_tokens_seen": 1237984, + "step": 6490 + }, + { + "epoch": 3.375779625779626, + "grad_norm": 0.7664889097213745, + "learning_rate": 4.1888083094260486e-05, + "loss": 0.2454, + "num_input_tokens_seen": 1238976, + "step": 6495 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.5602294206619263, + "learning_rate": 4.187135449180256e-05, + "loss": 0.2791, + "num_input_tokens_seen": 1239840, + "step": 6500 + }, + { + "epoch": 3.380977130977131, + "grad_norm": 0.2898153066635132, + "learning_rate": 4.1854612005986704e-05, + "loss": 0.2229, + "num_input_tokens_seen": 1240768, + "step": 6505 + }, + { + "epoch": 3.3835758835758836, + "grad_norm": 0.5452187657356262, + "learning_rate": 4.1837855650590216e-05, + "loss": 0.3034, + "num_input_tokens_seen": 1241728, + "step": 6510 + }, + { + "epoch": 3.386174636174636, + "grad_norm": 0.6665269136428833, + "learning_rate": 4.182108543940183e-05, + "loss": 0.308, + "num_input_tokens_seen": 1242688, + "step": 6515 + }, + { + "epoch": 3.3887733887733886, + "grad_norm": 0.3627178370952606, + "learning_rate": 4.1804301386221665e-05, + "loss": 0.2304, + "num_input_tokens_seen": 1243648, + "step": 6520 + }, + { + "epoch": 3.391372141372141, + "grad_norm": 0.6012855172157288, + "learning_rate": 4.1787503504861256e-05, + "loss": 0.2861, + "num_input_tokens_seen": 1244704, + "step": 6525 + }, + { + "epoch": 3.393970893970894, + "grad_norm": 0.5437318682670593, + "learning_rate": 4.1770691809143495e-05, + "loss": 0.204, + "num_input_tokens_seen": 1245664, + "step": 6530 + }, + { + "epoch": 3.3965696465696467, + "grad_norm": 0.3299700915813446, + "learning_rate": 4.175386631290263e-05, + "loss": 0.2591, + "num_input_tokens_seen": 1246592, + "step": 6535 + }, + { + "epoch": 3.399168399168399, + "grad_norm": 0.5089454054832458, + "learning_rate": 4.1737027029984307e-05, + "loss": 0.2741, + "num_input_tokens_seen": 1247584, + "step": 6540 + }, + { + "epoch": 3.4017671517671517, + "grad_norm": 0.5116890072822571, + "learning_rate": 4.172017397424548e-05, + "loss": 0.285, + "num_input_tokens_seen": 1248544, + "step": 6545 + }, + { + "epoch": 3.4043659043659042, + "grad_norm": 0.3318401277065277, + "learning_rate": 4.170330715955444e-05, + "loss": 0.2937, + "num_input_tokens_seen": 1249504, + "step": 6550 + }, + { + "epoch": 3.406964656964657, + "grad_norm": 0.7964585423469543, + "learning_rate": 4.1686426599790826e-05, + "loss": 0.2468, + "num_input_tokens_seen": 1250560, + "step": 6555 + }, + { + "epoch": 3.4095634095634098, + "grad_norm": 0.7262808680534363, + "learning_rate": 4.166953230884556e-05, + "loss": 0.2705, + "num_input_tokens_seen": 1251488, + "step": 6560 + }, + { + "epoch": 3.4121621621621623, + "grad_norm": 0.3527669906616211, + "learning_rate": 4.165262430062088e-05, + "loss": 0.2636, + "num_input_tokens_seen": 1252512, + "step": 6565 + }, + { + "epoch": 3.414760914760915, + "grad_norm": 0.6378412842750549, + "learning_rate": 4.16357025890303e-05, + "loss": 0.272, + "num_input_tokens_seen": 1253472, + "step": 6570 + }, + { + "epoch": 3.4173596673596673, + "grad_norm": 0.33244985342025757, + "learning_rate": 4.161876718799863e-05, + "loss": 0.3118, + "num_input_tokens_seen": 1254432, + "step": 6575 + }, + { + "epoch": 3.41995841995842, + "grad_norm": 0.8901306390762329, + "learning_rate": 4.160181811146192e-05, + "loss": 0.2322, + "num_input_tokens_seen": 1255328, + "step": 6580 + }, + { + "epoch": 3.4225571725571724, + "grad_norm": 0.9161574244499207, + "learning_rate": 4.158485537336748e-05, + "loss": 0.2919, + "num_input_tokens_seen": 1256256, + "step": 6585 + }, + { + "epoch": 3.4251559251559254, + "grad_norm": 0.6717404723167419, + "learning_rate": 4.156787898767388e-05, + "loss": 0.3241, + "num_input_tokens_seen": 1257216, + "step": 6590 + }, + { + "epoch": 3.427754677754678, + "grad_norm": 0.5649703145027161, + "learning_rate": 4.15508889683509e-05, + "loss": 0.2893, + "num_input_tokens_seen": 1258208, + "step": 6595 + }, + { + "epoch": 3.4303534303534304, + "grad_norm": 0.7496435046195984, + "learning_rate": 4.153388532937955e-05, + "loss": 0.2021, + "num_input_tokens_seen": 1259104, + "step": 6600 + }, + { + "epoch": 3.432952182952183, + "grad_norm": 0.2893180251121521, + "learning_rate": 4.151686808475204e-05, + "loss": 0.2621, + "num_input_tokens_seen": 1260064, + "step": 6605 + }, + { + "epoch": 3.4355509355509355, + "grad_norm": 0.528916597366333, + "learning_rate": 4.149983724847178e-05, + "loss": 0.2723, + "num_input_tokens_seen": 1261056, + "step": 6610 + }, + { + "epoch": 3.438149688149688, + "grad_norm": 0.35607999563217163, + "learning_rate": 4.1482792834553374e-05, + "loss": 0.2557, + "num_input_tokens_seen": 1261920, + "step": 6615 + }, + { + "epoch": 3.4407484407484406, + "grad_norm": 0.5891088247299194, + "learning_rate": 4.146573485702258e-05, + "loss": 0.2379, + "num_input_tokens_seen": 1262848, + "step": 6620 + }, + { + "epoch": 3.4433471933471935, + "grad_norm": 0.6090258359909058, + "learning_rate": 4.144866332991634e-05, + "loss": 0.2805, + "num_input_tokens_seen": 1263808, + "step": 6625 + }, + { + "epoch": 3.445945945945946, + "grad_norm": 0.23232793807983398, + "learning_rate": 4.143157826728271e-05, + "loss": 0.2817, + "num_input_tokens_seen": 1264736, + "step": 6630 + }, + { + "epoch": 3.4485446985446986, + "grad_norm": 0.29723450541496277, + "learning_rate": 4.1414479683180926e-05, + "loss": 0.2615, + "num_input_tokens_seen": 1265696, + "step": 6635 + }, + { + "epoch": 3.451143451143451, + "grad_norm": 0.4356515109539032, + "learning_rate": 4.139736759168133e-05, + "loss": 0.293, + "num_input_tokens_seen": 1266624, + "step": 6640 + }, + { + "epoch": 3.4537422037422036, + "grad_norm": 0.5577291250228882, + "learning_rate": 4.138024200686538e-05, + "loss": 0.2704, + "num_input_tokens_seen": 1267616, + "step": 6645 + }, + { + "epoch": 3.456340956340956, + "grad_norm": 0.3743303120136261, + "learning_rate": 4.1363102942825634e-05, + "loss": 0.2187, + "num_input_tokens_seen": 1268576, + "step": 6650 + }, + { + "epoch": 3.4589397089397087, + "grad_norm": 0.2990421652793884, + "learning_rate": 4.134595041366575e-05, + "loss": 0.2332, + "num_input_tokens_seen": 1269472, + "step": 6655 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 1.0416231155395508, + "learning_rate": 4.1328784433500464e-05, + "loss": 0.3319, + "num_input_tokens_seen": 1270400, + "step": 6660 + }, + { + "epoch": 3.464137214137214, + "grad_norm": 0.4140661656856537, + "learning_rate": 4.131160501645558e-05, + "loss": 0.2832, + "num_input_tokens_seen": 1271392, + "step": 6665 + }, + { + "epoch": 3.4667359667359667, + "grad_norm": 0.4780081808567047, + "learning_rate": 4.1294412176667954e-05, + "loss": 0.2248, + "num_input_tokens_seen": 1272288, + "step": 6670 + }, + { + "epoch": 3.4693347193347193, + "grad_norm": 0.20280249416828156, + "learning_rate": 4.12772059282855e-05, + "loss": 0.2308, + "num_input_tokens_seen": 1273248, + "step": 6675 + }, + { + "epoch": 3.471933471933472, + "grad_norm": 0.7095723152160645, + "learning_rate": 4.1259986285467155e-05, + "loss": 0.3153, + "num_input_tokens_seen": 1274176, + "step": 6680 + }, + { + "epoch": 3.4745322245322248, + "grad_norm": 0.6861981749534607, + "learning_rate": 4.1242753262382884e-05, + "loss": 0.2569, + "num_input_tokens_seen": 1275072, + "step": 6685 + }, + { + "epoch": 3.4771309771309773, + "grad_norm": 0.29297709465026855, + "learning_rate": 4.122550687321366e-05, + "loss": 0.2725, + "num_input_tokens_seen": 1275904, + "step": 6690 + }, + { + "epoch": 3.47972972972973, + "grad_norm": 0.7322605848312378, + "learning_rate": 4.1208247132151456e-05, + "loss": 0.2584, + "num_input_tokens_seen": 1276864, + "step": 6695 + }, + { + "epoch": 3.4823284823284824, + "grad_norm": 0.4313810467720032, + "learning_rate": 4.119097405339922e-05, + "loss": 0.3092, + "num_input_tokens_seen": 1277792, + "step": 6700 + }, + { + "epoch": 3.484927234927235, + "grad_norm": 0.9072105884552002, + "learning_rate": 4.11736876511709e-05, + "loss": 0.2702, + "num_input_tokens_seen": 1278752, + "step": 6705 + }, + { + "epoch": 3.4875259875259874, + "grad_norm": 0.4516713321208954, + "learning_rate": 4.11563879396914e-05, + "loss": 0.3144, + "num_input_tokens_seen": 1279744, + "step": 6710 + }, + { + "epoch": 3.49012474012474, + "grad_norm": 0.645341157913208, + "learning_rate": 4.113907493319655e-05, + "loss": 0.2839, + "num_input_tokens_seen": 1280640, + "step": 6715 + }, + { + "epoch": 3.492723492723493, + "grad_norm": 1.0402089357376099, + "learning_rate": 4.1121748645933164e-05, + "loss": 0.2866, + "num_input_tokens_seen": 1281600, + "step": 6720 + }, + { + "epoch": 3.4953222453222454, + "grad_norm": 0.3762689530849457, + "learning_rate": 4.1104409092158943e-05, + "loss": 0.2691, + "num_input_tokens_seen": 1282560, + "step": 6725 + }, + { + "epoch": 3.497920997920998, + "grad_norm": 0.28401458263397217, + "learning_rate": 4.1087056286142544e-05, + "loss": 0.266, + "num_input_tokens_seen": 1283552, + "step": 6730 + }, + { + "epoch": 3.5, + "eval_loss": 0.2540164589881897, + "eval_runtime": 7.928, + "eval_samples_per_second": 107.971, + "eval_steps_per_second": 26.993, + "num_input_tokens_seen": 1284288, + "step": 6734 + }, + { + "epoch": 3.5005197505197505, + "grad_norm": 0.26122185587882996, + "learning_rate": 4.1069690242163484e-05, + "loss": 0.2259, + "num_input_tokens_seen": 1284480, + "step": 6735 + }, + { + "epoch": 3.503118503118503, + "grad_norm": 0.7034456729888916, + "learning_rate": 4.105231097451222e-05, + "loss": 0.3031, + "num_input_tokens_seen": 1285376, + "step": 6740 + }, + { + "epoch": 3.5057172557172556, + "grad_norm": 0.34566769003868103, + "learning_rate": 4.103491849749006e-05, + "loss": 0.2783, + "num_input_tokens_seen": 1286304, + "step": 6745 + }, + { + "epoch": 3.508316008316008, + "grad_norm": 0.3360481560230255, + "learning_rate": 4.101751282540919e-05, + "loss": 0.2535, + "num_input_tokens_seen": 1287328, + "step": 6750 + }, + { + "epoch": 3.510914760914761, + "grad_norm": 0.6026784181594849, + "learning_rate": 4.1000093972592654e-05, + "loss": 0.1956, + "num_input_tokens_seen": 1288320, + "step": 6755 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.7346345782279968, + "learning_rate": 4.098266195337436e-05, + "loss": 0.2882, + "num_input_tokens_seen": 1289216, + "step": 6760 + }, + { + "epoch": 3.516112266112266, + "grad_norm": 0.6012426018714905, + "learning_rate": 4.0965216782099004e-05, + "loss": 0.2592, + "num_input_tokens_seen": 1290144, + "step": 6765 + }, + { + "epoch": 3.5187110187110187, + "grad_norm": 0.41626226902008057, + "learning_rate": 4.0947758473122165e-05, + "loss": 0.3164, + "num_input_tokens_seen": 1291136, + "step": 6770 + }, + { + "epoch": 3.521309771309771, + "grad_norm": 1.1086417436599731, + "learning_rate": 4.093028704081019e-05, + "loss": 0.2605, + "num_input_tokens_seen": 1292128, + "step": 6775 + }, + { + "epoch": 3.523908523908524, + "grad_norm": 0.4711829423904419, + "learning_rate": 4.091280249954024e-05, + "loss": 0.3023, + "num_input_tokens_seen": 1293056, + "step": 6780 + }, + { + "epoch": 3.5265072765072762, + "grad_norm": 0.42376139760017395, + "learning_rate": 4.089530486370025e-05, + "loss": 0.2706, + "num_input_tokens_seen": 1294048, + "step": 6785 + }, + { + "epoch": 3.529106029106029, + "grad_norm": 0.2120097577571869, + "learning_rate": 4.087779414768896e-05, + "loss": 0.283, + "num_input_tokens_seen": 1295008, + "step": 6790 + }, + { + "epoch": 3.5317047817047817, + "grad_norm": 0.27670517563819885, + "learning_rate": 4.086027036591585e-05, + "loss": 0.2493, + "num_input_tokens_seen": 1296032, + "step": 6795 + }, + { + "epoch": 3.5343035343035343, + "grad_norm": 0.9904405474662781, + "learning_rate": 4.084273353280115e-05, + "loss": 0.2663, + "num_input_tokens_seen": 1296992, + "step": 6800 + }, + { + "epoch": 3.536902286902287, + "grad_norm": 0.15938298404216766, + "learning_rate": 4.082518366277585e-05, + "loss": 0.2937, + "num_input_tokens_seen": 1297952, + "step": 6805 + }, + { + "epoch": 3.5395010395010393, + "grad_norm": 0.23322562873363495, + "learning_rate": 4.080762077028164e-05, + "loss": 0.2827, + "num_input_tokens_seen": 1298880, + "step": 6810 + }, + { + "epoch": 3.5420997920997923, + "grad_norm": 0.835925817489624, + "learning_rate": 4.079004486977095e-05, + "loss": 0.2751, + "num_input_tokens_seen": 1299808, + "step": 6815 + }, + { + "epoch": 3.544698544698545, + "grad_norm": 0.47740915417671204, + "learning_rate": 4.077245597570691e-05, + "loss": 0.2805, + "num_input_tokens_seen": 1300832, + "step": 6820 + }, + { + "epoch": 3.5472972972972974, + "grad_norm": 0.3363986313343048, + "learning_rate": 4.075485410256332e-05, + "loss": 0.2661, + "num_input_tokens_seen": 1301792, + "step": 6825 + }, + { + "epoch": 3.54989604989605, + "grad_norm": 0.4355182349681854, + "learning_rate": 4.07372392648247e-05, + "loss": 0.2009, + "num_input_tokens_seen": 1302752, + "step": 6830 + }, + { + "epoch": 3.5524948024948024, + "grad_norm": 0.6832343339920044, + "learning_rate": 4.071961147698621e-05, + "loss": 0.2589, + "num_input_tokens_seen": 1303776, + "step": 6835 + }, + { + "epoch": 3.555093555093555, + "grad_norm": 0.6973892450332642, + "learning_rate": 4.070197075355366e-05, + "loss": 0.4213, + "num_input_tokens_seen": 1304704, + "step": 6840 + }, + { + "epoch": 3.5576923076923075, + "grad_norm": 0.2425023466348648, + "learning_rate": 4.068431710904354e-05, + "loss": 0.2545, + "num_input_tokens_seen": 1305600, + "step": 6845 + }, + { + "epoch": 3.5602910602910605, + "grad_norm": 0.5912318825721741, + "learning_rate": 4.066665055798293e-05, + "loss": 0.2255, + "num_input_tokens_seen": 1306560, + "step": 6850 + }, + { + "epoch": 3.562889812889813, + "grad_norm": 0.74749356508255, + "learning_rate": 4.0648971114909564e-05, + "loss": 0.2642, + "num_input_tokens_seen": 1307584, + "step": 6855 + }, + { + "epoch": 3.5654885654885655, + "grad_norm": 0.8479193449020386, + "learning_rate": 4.0631278794371776e-05, + "loss": 0.2838, + "num_input_tokens_seen": 1308512, + "step": 6860 + }, + { + "epoch": 3.568087318087318, + "grad_norm": 0.7993515133857727, + "learning_rate": 4.0613573610928476e-05, + "loss": 0.3169, + "num_input_tokens_seen": 1309504, + "step": 6865 + }, + { + "epoch": 3.5706860706860706, + "grad_norm": 0.7580817341804504, + "learning_rate": 4.059585557914919e-05, + "loss": 0.2476, + "num_input_tokens_seen": 1310464, + "step": 6870 + }, + { + "epoch": 3.5732848232848236, + "grad_norm": 0.3561457693576813, + "learning_rate": 4.0578124713614e-05, + "loss": 0.2356, + "num_input_tokens_seen": 1311392, + "step": 6875 + }, + { + "epoch": 3.5758835758835756, + "grad_norm": 0.13899242877960205, + "learning_rate": 4.0560381028913544e-05, + "loss": 0.2962, + "num_input_tokens_seen": 1312320, + "step": 6880 + }, + { + "epoch": 3.5784823284823286, + "grad_norm": 0.5511555671691895, + "learning_rate": 4.054262453964902e-05, + "loss": 0.2496, + "num_input_tokens_seen": 1313312, + "step": 6885 + }, + { + "epoch": 3.581081081081081, + "grad_norm": 0.39073124527931213, + "learning_rate": 4.052485526043217e-05, + "loss": 0.3172, + "num_input_tokens_seen": 1314272, + "step": 6890 + }, + { + "epoch": 3.5836798336798337, + "grad_norm": 0.23326092958450317, + "learning_rate": 4.050707320588524e-05, + "loss": 0.2844, + "num_input_tokens_seen": 1315264, + "step": 6895 + }, + { + "epoch": 3.586278586278586, + "grad_norm": 0.5561765432357788, + "learning_rate": 4.0489278390640996e-05, + "loss": 0.2932, + "num_input_tokens_seen": 1316320, + "step": 6900 + }, + { + "epoch": 3.5888773388773387, + "grad_norm": 0.8926471471786499, + "learning_rate": 4.047147082934272e-05, + "loss": 0.2492, + "num_input_tokens_seen": 1317312, + "step": 6905 + }, + { + "epoch": 3.5914760914760917, + "grad_norm": 0.5349783897399902, + "learning_rate": 4.045365053664415e-05, + "loss": 0.1875, + "num_input_tokens_seen": 1318272, + "step": 6910 + }, + { + "epoch": 3.5940748440748442, + "grad_norm": 0.4611491858959198, + "learning_rate": 4.043581752720954e-05, + "loss": 0.2318, + "num_input_tokens_seen": 1319296, + "step": 6915 + }, + { + "epoch": 3.5966735966735968, + "grad_norm": 1.0269519090652466, + "learning_rate": 4.0417971815713584e-05, + "loss": 0.2882, + "num_input_tokens_seen": 1320192, + "step": 6920 + }, + { + "epoch": 3.5992723492723493, + "grad_norm": 0.7806605696678162, + "learning_rate": 4.040011341684142e-05, + "loss": 0.2253, + "num_input_tokens_seen": 1321152, + "step": 6925 + }, + { + "epoch": 3.601871101871102, + "grad_norm": 0.4586956799030304, + "learning_rate": 4.038224234528866e-05, + "loss": 0.2541, + "num_input_tokens_seen": 1322048, + "step": 6930 + }, + { + "epoch": 3.6044698544698544, + "grad_norm": 0.4343246519565582, + "learning_rate": 4.036435861576131e-05, + "loss": 0.1995, + "num_input_tokens_seen": 1322976, + "step": 6935 + }, + { + "epoch": 3.607068607068607, + "grad_norm": 0.30712515115737915, + "learning_rate": 4.0346462242975826e-05, + "loss": 0.2589, + "num_input_tokens_seen": 1323936, + "step": 6940 + }, + { + "epoch": 3.60966735966736, + "grad_norm": 0.5768503546714783, + "learning_rate": 4.032855324165902e-05, + "loss": 0.1917, + "num_input_tokens_seen": 1324992, + "step": 6945 + }, + { + "epoch": 3.6122661122661124, + "grad_norm": 0.7085437178611755, + "learning_rate": 4.031063162654815e-05, + "loss": 0.2639, + "num_input_tokens_seen": 1325952, + "step": 6950 + }, + { + "epoch": 3.614864864864865, + "grad_norm": 0.3351232707500458, + "learning_rate": 4.029269741239081e-05, + "loss": 0.2678, + "num_input_tokens_seen": 1326912, + "step": 6955 + }, + { + "epoch": 3.6174636174636174, + "grad_norm": 0.6512705683708191, + "learning_rate": 4.027475061394499e-05, + "loss": 0.2812, + "num_input_tokens_seen": 1327904, + "step": 6960 + }, + { + "epoch": 3.62006237006237, + "grad_norm": 0.31831592321395874, + "learning_rate": 4.0256791245979024e-05, + "loss": 0.1684, + "num_input_tokens_seen": 1328832, + "step": 6965 + }, + { + "epoch": 3.6226611226611225, + "grad_norm": 0.19433963298797607, + "learning_rate": 4.023881932327159e-05, + "loss": 0.3088, + "num_input_tokens_seen": 1329728, + "step": 6970 + }, + { + "epoch": 3.625259875259875, + "grad_norm": 0.2738890051841736, + "learning_rate": 4.0220834860611705e-05, + "loss": 0.2248, + "num_input_tokens_seen": 1330656, + "step": 6975 + }, + { + "epoch": 3.627858627858628, + "grad_norm": 0.5409745573997498, + "learning_rate": 4.0202837872798695e-05, + "loss": 0.2498, + "num_input_tokens_seen": 1331616, + "step": 6980 + }, + { + "epoch": 3.6304573804573805, + "grad_norm": 0.6487385630607605, + "learning_rate": 4.018482837464219e-05, + "loss": 0.2771, + "num_input_tokens_seen": 1332544, + "step": 6985 + }, + { + "epoch": 3.633056133056133, + "grad_norm": 0.749728262424469, + "learning_rate": 4.016680638096212e-05, + "loss": 0.2874, + "num_input_tokens_seen": 1333632, + "step": 6990 + }, + { + "epoch": 3.6356548856548856, + "grad_norm": 0.44904825091362, + "learning_rate": 4.0148771906588706e-05, + "loss": 0.2488, + "num_input_tokens_seen": 1334528, + "step": 6995 + }, + { + "epoch": 3.638253638253638, + "grad_norm": 0.19491326808929443, + "learning_rate": 4.013072496636243e-05, + "loss": 0.2871, + "num_input_tokens_seen": 1335488, + "step": 7000 + }, + { + "epoch": 3.640852390852391, + "grad_norm": 0.43629661202430725, + "learning_rate": 4.0112665575134033e-05, + "loss": 0.2915, + "num_input_tokens_seen": 1336416, + "step": 7005 + }, + { + "epoch": 3.643451143451143, + "grad_norm": 0.40088263154029846, + "learning_rate": 4.009459374776451e-05, + "loss": 0.2601, + "num_input_tokens_seen": 1337344, + "step": 7010 + }, + { + "epoch": 3.646049896049896, + "grad_norm": 0.45676952600479126, + "learning_rate": 4.007650949912506e-05, + "loss": 0.2837, + "num_input_tokens_seen": 1338336, + "step": 7015 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.455421507358551, + "learning_rate": 4.0058412844097153e-05, + "loss": 0.3024, + "num_input_tokens_seen": 1339296, + "step": 7020 + }, + { + "epoch": 3.651247401247401, + "grad_norm": 0.7707850337028503, + "learning_rate": 4.004030379757243e-05, + "loss": 0.2804, + "num_input_tokens_seen": 1340224, + "step": 7025 + }, + { + "epoch": 3.6538461538461537, + "grad_norm": 0.4824521839618683, + "learning_rate": 4.0022182374452736e-05, + "loss": 0.2913, + "num_input_tokens_seen": 1341216, + "step": 7030 + }, + { + "epoch": 3.6564449064449063, + "grad_norm": 0.883143961429596, + "learning_rate": 4.0004048589650104e-05, + "loss": 0.2489, + "num_input_tokens_seen": 1342112, + "step": 7035 + }, + { + "epoch": 3.6590436590436592, + "grad_norm": 0.6461930871009827, + "learning_rate": 3.9985902458086746e-05, + "loss": 0.2126, + "num_input_tokens_seen": 1343040, + "step": 7040 + }, + { + "epoch": 3.6616424116424118, + "grad_norm": 0.4628750681877136, + "learning_rate": 3.996774399469502e-05, + "loss": 0.316, + "num_input_tokens_seen": 1343968, + "step": 7045 + }, + { + "epoch": 3.6642411642411643, + "grad_norm": 0.9992142915725708, + "learning_rate": 3.9949573214417447e-05, + "loss": 0.2489, + "num_input_tokens_seen": 1344832, + "step": 7050 + }, + { + "epoch": 3.666839916839917, + "grad_norm": 0.2938157916069031, + "learning_rate": 3.993139013220668e-05, + "loss": 0.3663, + "num_input_tokens_seen": 1345792, + "step": 7055 + }, + { + "epoch": 3.6694386694386694, + "grad_norm": 0.5989623665809631, + "learning_rate": 3.9913194763025486e-05, + "loss": 0.2999, + "num_input_tokens_seen": 1346720, + "step": 7060 + }, + { + "epoch": 3.672037422037422, + "grad_norm": 0.25279501080513, + "learning_rate": 3.989498712184674e-05, + "loss": 0.2546, + "num_input_tokens_seen": 1347680, + "step": 7065 + }, + { + "epoch": 3.6746361746361744, + "grad_norm": 0.37163302302360535, + "learning_rate": 3.9876767223653446e-05, + "loss": 0.2698, + "num_input_tokens_seen": 1348704, + "step": 7070 + }, + { + "epoch": 3.6772349272349274, + "grad_norm": 0.7391625642776489, + "learning_rate": 3.985853508343865e-05, + "loss": 0.2217, + "num_input_tokens_seen": 1349728, + "step": 7075 + }, + { + "epoch": 3.67983367983368, + "grad_norm": 0.3211708962917328, + "learning_rate": 3.9840290716205495e-05, + "loss": 0.2979, + "num_input_tokens_seen": 1350720, + "step": 7080 + }, + { + "epoch": 3.6824324324324325, + "grad_norm": 0.5216235518455505, + "learning_rate": 3.98220341369672e-05, + "loss": 0.2503, + "num_input_tokens_seen": 1351808, + "step": 7085 + }, + { + "epoch": 3.685031185031185, + "grad_norm": 0.2507557272911072, + "learning_rate": 3.980376536074701e-05, + "loss": 0.1743, + "num_input_tokens_seen": 1352736, + "step": 7090 + }, + { + "epoch": 3.6876299376299375, + "grad_norm": 0.4401816129684448, + "learning_rate": 3.9785484402578216e-05, + "loss": 0.2304, + "num_input_tokens_seen": 1353728, + "step": 7095 + }, + { + "epoch": 3.6902286902286905, + "grad_norm": 0.7707040309906006, + "learning_rate": 3.976719127750413e-05, + "loss": 0.28, + "num_input_tokens_seen": 1354688, + "step": 7100 + }, + { + "epoch": 3.6928274428274426, + "grad_norm": 0.4677363932132721, + "learning_rate": 3.974888600057808e-05, + "loss": 0.1799, + "num_input_tokens_seen": 1355744, + "step": 7105 + }, + { + "epoch": 3.6954261954261955, + "grad_norm": 0.6858312487602234, + "learning_rate": 3.9730568586863384e-05, + "loss": 0.3481, + "num_input_tokens_seen": 1356704, + "step": 7110 + }, + { + "epoch": 3.698024948024948, + "grad_norm": 0.5458012223243713, + "learning_rate": 3.971223905143336e-05, + "loss": 0.2916, + "num_input_tokens_seen": 1357632, + "step": 7115 + }, + { + "epoch": 3.7006237006237006, + "grad_norm": 0.6073631048202515, + "learning_rate": 3.9693897409371316e-05, + "loss": 0.2646, + "num_input_tokens_seen": 1358560, + "step": 7120 + }, + { + "epoch": 3.703222453222453, + "grad_norm": 0.810498833656311, + "learning_rate": 3.967554367577047e-05, + "loss": 0.2502, + "num_input_tokens_seen": 1359616, + "step": 7125 + }, + { + "epoch": 3.7058212058212057, + "grad_norm": 0.48573145270347595, + "learning_rate": 3.965717786573404e-05, + "loss": 0.2604, + "num_input_tokens_seen": 1360640, + "step": 7130 + }, + { + "epoch": 3.7084199584199586, + "grad_norm": 0.3712937831878662, + "learning_rate": 3.963879999437516e-05, + "loss": 0.2914, + "num_input_tokens_seen": 1361600, + "step": 7135 + }, + { + "epoch": 3.711018711018711, + "grad_norm": 0.9090651273727417, + "learning_rate": 3.962041007681691e-05, + "loss": 0.2554, + "num_input_tokens_seen": 1362528, + "step": 7140 + }, + { + "epoch": 3.7136174636174637, + "grad_norm": 0.2794598937034607, + "learning_rate": 3.960200812819223e-05, + "loss": 0.2406, + "num_input_tokens_seen": 1363520, + "step": 7145 + }, + { + "epoch": 3.7162162162162162, + "grad_norm": 0.10991623252630234, + "learning_rate": 3.9583594163644036e-05, + "loss": 0.2168, + "num_input_tokens_seen": 1364416, + "step": 7150 + }, + { + "epoch": 3.7188149688149688, + "grad_norm": 1.0446041822433472, + "learning_rate": 3.9565168198325064e-05, + "loss": 0.3195, + "num_input_tokens_seen": 1365344, + "step": 7155 + }, + { + "epoch": 3.7214137214137213, + "grad_norm": 0.23861949145793915, + "learning_rate": 3.954673024739797e-05, + "loss": 0.2365, + "num_input_tokens_seen": 1366240, + "step": 7160 + }, + { + "epoch": 3.724012474012474, + "grad_norm": 0.35845547914505005, + "learning_rate": 3.952828032603525e-05, + "loss": 0.3143, + "num_input_tokens_seen": 1367200, + "step": 7165 + }, + { + "epoch": 3.726611226611227, + "grad_norm": 0.4110104441642761, + "learning_rate": 3.950981844941926e-05, + "loss": 0.2736, + "num_input_tokens_seen": 1368064, + "step": 7170 + }, + { + "epoch": 3.7292099792099793, + "grad_norm": 0.2994477450847626, + "learning_rate": 3.949134463274218e-05, + "loss": 0.2334, + "num_input_tokens_seen": 1369088, + "step": 7175 + }, + { + "epoch": 3.731808731808732, + "grad_norm": 0.5532571077346802, + "learning_rate": 3.947285889120605e-05, + "loss": 0.2964, + "num_input_tokens_seen": 1370080, + "step": 7180 + }, + { + "epoch": 3.7344074844074844, + "grad_norm": 0.16474415361881256, + "learning_rate": 3.945436124002268e-05, + "loss": 0.2639, + "num_input_tokens_seen": 1371040, + "step": 7185 + }, + { + "epoch": 3.737006237006237, + "grad_norm": 0.6277565360069275, + "learning_rate": 3.94358516944137e-05, + "loss": 0.1894, + "num_input_tokens_seen": 1372000, + "step": 7190 + }, + { + "epoch": 3.73960498960499, + "grad_norm": 0.6450810432434082, + "learning_rate": 3.941733026961054e-05, + "loss": 0.3118, + "num_input_tokens_seen": 1372992, + "step": 7195 + }, + { + "epoch": 3.742203742203742, + "grad_norm": 0.2922810912132263, + "learning_rate": 3.939879698085439e-05, + "loss": 0.2483, + "num_input_tokens_seen": 1373952, + "step": 7200 + }, + { + "epoch": 3.744802494802495, + "grad_norm": 0.4408545196056366, + "learning_rate": 3.93802518433962e-05, + "loss": 0.1398, + "num_input_tokens_seen": 1374816, + "step": 7205 + }, + { + "epoch": 3.7474012474012475, + "grad_norm": 0.6438238620758057, + "learning_rate": 3.936169487249667e-05, + "loss": 0.2883, + "num_input_tokens_seen": 1375776, + "step": 7210 + }, + { + "epoch": 3.75, + "grad_norm": 0.6368221640586853, + "learning_rate": 3.9343126083426264e-05, + "loss": 0.305, + "num_input_tokens_seen": 1376736, + "step": 7215 + }, + { + "epoch": 3.7525987525987525, + "grad_norm": 0.5218846797943115, + "learning_rate": 3.932454549146513e-05, + "loss": 0.2136, + "num_input_tokens_seen": 1377664, + "step": 7220 + }, + { + "epoch": 3.755197505197505, + "grad_norm": 0.6391943097114563, + "learning_rate": 3.930595311190316e-05, + "loss": 0.3427, + "num_input_tokens_seen": 1378592, + "step": 7225 + }, + { + "epoch": 3.757796257796258, + "grad_norm": 0.29417499899864197, + "learning_rate": 3.9287348960039926e-05, + "loss": 0.3336, + "num_input_tokens_seen": 1379552, + "step": 7230 + }, + { + "epoch": 3.76039501039501, + "grad_norm": 0.34143537282943726, + "learning_rate": 3.926873305118471e-05, + "loss": 0.2795, + "num_input_tokens_seen": 1380448, + "step": 7235 + }, + { + "epoch": 3.762993762993763, + "grad_norm": 0.5157080292701721, + "learning_rate": 3.9250105400656456e-05, + "loss": 0.2733, + "num_input_tokens_seen": 1381376, + "step": 7240 + }, + { + "epoch": 3.7655925155925156, + "grad_norm": 0.38931432366371155, + "learning_rate": 3.9231466023783756e-05, + "loss": 0.3005, + "num_input_tokens_seen": 1382368, + "step": 7245 + }, + { + "epoch": 3.768191268191268, + "grad_norm": 0.8407925367355347, + "learning_rate": 3.9212814935904874e-05, + "loss": 0.2653, + "num_input_tokens_seen": 1383328, + "step": 7250 + }, + { + "epoch": 3.7707900207900207, + "grad_norm": 0.31431519985198975, + "learning_rate": 3.9194152152367695e-05, + "loss": 0.2077, + "num_input_tokens_seen": 1384256, + "step": 7255 + }, + { + "epoch": 3.773388773388773, + "grad_norm": 0.6329095363616943, + "learning_rate": 3.917547768852975e-05, + "loss": 0.2776, + "num_input_tokens_seen": 1385248, + "step": 7260 + }, + { + "epoch": 3.775987525987526, + "grad_norm": 0.1885174959897995, + "learning_rate": 3.915679155975815e-05, + "loss": 0.3125, + "num_input_tokens_seen": 1386208, + "step": 7265 + }, + { + "epoch": 3.7785862785862787, + "grad_norm": 0.6240255832672119, + "learning_rate": 3.913809378142964e-05, + "loss": 0.3073, + "num_input_tokens_seen": 1387168, + "step": 7270 + }, + { + "epoch": 3.7811850311850312, + "grad_norm": 0.5409123301506042, + "learning_rate": 3.911938436893051e-05, + "loss": 0.2082, + "num_input_tokens_seen": 1388128, + "step": 7275 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.31268444657325745, + "learning_rate": 3.9100663337656676e-05, + "loss": 0.2685, + "num_input_tokens_seen": 1389056, + "step": 7280 + }, + { + "epoch": 3.7863825363825363, + "grad_norm": 0.5846900939941406, + "learning_rate": 3.908193070301356e-05, + "loss": 0.2636, + "num_input_tokens_seen": 1390016, + "step": 7285 + }, + { + "epoch": 3.788981288981289, + "grad_norm": 0.6987483501434326, + "learning_rate": 3.906318648041617e-05, + "loss": 0.3005, + "num_input_tokens_seen": 1390912, + "step": 7290 + }, + { + "epoch": 3.7915800415800414, + "grad_norm": 1.0621354579925537, + "learning_rate": 3.904443068528905e-05, + "loss": 0.2558, + "num_input_tokens_seen": 1391872, + "step": 7295 + }, + { + "epoch": 3.7941787941787943, + "grad_norm": 0.48554283380508423, + "learning_rate": 3.902566333306623e-05, + "loss": 0.2654, + "num_input_tokens_seen": 1392832, + "step": 7300 + }, + { + "epoch": 3.796777546777547, + "grad_norm": 0.598369836807251, + "learning_rate": 3.900688443919129e-05, + "loss": 0.213, + "num_input_tokens_seen": 1393824, + "step": 7305 + }, + { + "epoch": 3.7993762993762994, + "grad_norm": 0.3082329332828522, + "learning_rate": 3.8988094019117294e-05, + "loss": 0.2326, + "num_input_tokens_seen": 1394752, + "step": 7310 + }, + { + "epoch": 3.801975051975052, + "grad_norm": 0.5977778434753418, + "learning_rate": 3.896929208830679e-05, + "loss": 0.2893, + "num_input_tokens_seen": 1395616, + "step": 7315 + }, + { + "epoch": 3.8045738045738045, + "grad_norm": 0.16928720474243164, + "learning_rate": 3.895047866223179e-05, + "loss": 0.2817, + "num_input_tokens_seen": 1396576, + "step": 7320 + }, + { + "epoch": 3.8071725571725574, + "grad_norm": 0.46952158212661743, + "learning_rate": 3.893165375637378e-05, + "loss": 0.2755, + "num_input_tokens_seen": 1397536, + "step": 7325 + }, + { + "epoch": 3.8097713097713095, + "grad_norm": 0.5953111052513123, + "learning_rate": 3.891281738622369e-05, + "loss": 0.2579, + "num_input_tokens_seen": 1398496, + "step": 7330 + }, + { + "epoch": 3.8123700623700625, + "grad_norm": 0.7574178576469421, + "learning_rate": 3.889396956728187e-05, + "loss": 0.3206, + "num_input_tokens_seen": 1399488, + "step": 7335 + }, + { + "epoch": 3.814968814968815, + "grad_norm": 0.12627577781677246, + "learning_rate": 3.887511031505811e-05, + "loss": 0.2718, + "num_input_tokens_seen": 1400384, + "step": 7340 + }, + { + "epoch": 3.8175675675675675, + "grad_norm": 0.21827027201652527, + "learning_rate": 3.8856239645071604e-05, + "loss": 0.1926, + "num_input_tokens_seen": 1401376, + "step": 7345 + }, + { + "epoch": 3.82016632016632, + "grad_norm": 0.30479469895362854, + "learning_rate": 3.883735757285092e-05, + "loss": 0.3086, + "num_input_tokens_seen": 1402304, + "step": 7350 + }, + { + "epoch": 3.8227650727650726, + "grad_norm": 0.5041913390159607, + "learning_rate": 3.881846411393403e-05, + "loss": 0.3013, + "num_input_tokens_seen": 1403296, + "step": 7355 + }, + { + "epoch": 3.8253638253638256, + "grad_norm": 0.2724202573299408, + "learning_rate": 3.879955928386829e-05, + "loss": 0.2851, + "num_input_tokens_seen": 1404256, + "step": 7360 + }, + { + "epoch": 3.827962577962578, + "grad_norm": 0.7427892684936523, + "learning_rate": 3.878064309821038e-05, + "loss": 0.2923, + "num_input_tokens_seen": 1405280, + "step": 7365 + }, + { + "epoch": 3.8305613305613306, + "grad_norm": 0.5057810544967651, + "learning_rate": 3.876171557252633e-05, + "loss": 0.2532, + "num_input_tokens_seen": 1406240, + "step": 7370 + }, + { + "epoch": 3.833160083160083, + "grad_norm": 0.5084899663925171, + "learning_rate": 3.874277672239154e-05, + "loss": 0.2885, + "num_input_tokens_seen": 1407168, + "step": 7375 + }, + { + "epoch": 3.8357588357588357, + "grad_norm": 0.43706148862838745, + "learning_rate": 3.872382656339068e-05, + "loss": 0.2636, + "num_input_tokens_seen": 1408064, + "step": 7380 + }, + { + "epoch": 3.8383575883575882, + "grad_norm": 0.5472244620323181, + "learning_rate": 3.8704865111117746e-05, + "loss": 0.2949, + "num_input_tokens_seen": 1409056, + "step": 7385 + }, + { + "epoch": 3.8409563409563408, + "grad_norm": 0.4099772572517395, + "learning_rate": 3.8685892381176034e-05, + "loss": 0.2884, + "num_input_tokens_seen": 1410016, + "step": 7390 + }, + { + "epoch": 3.8435550935550937, + "grad_norm": 0.4299851953983307, + "learning_rate": 3.8666908389178127e-05, + "loss": 0.291, + "num_input_tokens_seen": 1410912, + "step": 7395 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.4300438165664673, + "learning_rate": 3.864791315074583e-05, + "loss": 0.2792, + "num_input_tokens_seen": 1411840, + "step": 7400 + }, + { + "epoch": 3.848752598752599, + "grad_norm": 0.30818331241607666, + "learning_rate": 3.862890668151025e-05, + "loss": 0.2772, + "num_input_tokens_seen": 1412768, + "step": 7405 + }, + { + "epoch": 3.8513513513513513, + "grad_norm": 0.2694954574108124, + "learning_rate": 3.8609888997111734e-05, + "loss": 0.236, + "num_input_tokens_seen": 1413760, + "step": 7410 + }, + { + "epoch": 3.853950103950104, + "grad_norm": 0.5757660865783691, + "learning_rate": 3.8590860113199835e-05, + "loss": 0.2725, + "num_input_tokens_seen": 1414688, + "step": 7415 + }, + { + "epoch": 3.856548856548857, + "grad_norm": 0.40677401423454285, + "learning_rate": 3.8571820045433326e-05, + "loss": 0.2963, + "num_input_tokens_seen": 1415648, + "step": 7420 + }, + { + "epoch": 3.859147609147609, + "grad_norm": 0.611247718334198, + "learning_rate": 3.85527688094802e-05, + "loss": 0.3058, + "num_input_tokens_seen": 1416640, + "step": 7425 + }, + { + "epoch": 3.861746361746362, + "grad_norm": 0.3816022574901581, + "learning_rate": 3.8533706421017614e-05, + "loss": 0.2919, + "num_input_tokens_seen": 1417536, + "step": 7430 + }, + { + "epoch": 3.8643451143451144, + "grad_norm": 0.5135728120803833, + "learning_rate": 3.851463289573193e-05, + "loss": 0.3017, + "num_input_tokens_seen": 1418432, + "step": 7435 + }, + { + "epoch": 3.866943866943867, + "grad_norm": 0.4545844793319702, + "learning_rate": 3.8495548249318655e-05, + "loss": 0.2791, + "num_input_tokens_seen": 1419328, + "step": 7440 + }, + { + "epoch": 3.8695426195426195, + "grad_norm": 0.5021741986274719, + "learning_rate": 3.8476452497482466e-05, + "loss": 0.3031, + "num_input_tokens_seen": 1420288, + "step": 7445 + }, + { + "epoch": 3.872141372141372, + "grad_norm": 0.6796638369560242, + "learning_rate": 3.845734565593716e-05, + "loss": 0.2981, + "num_input_tokens_seen": 1421216, + "step": 7450 + }, + { + "epoch": 3.874740124740125, + "grad_norm": 0.3684832453727722, + "learning_rate": 3.843822774040565e-05, + "loss": 0.2905, + "num_input_tokens_seen": 1422240, + "step": 7455 + }, + { + "epoch": 3.8773388773388775, + "grad_norm": 0.3018234372138977, + "learning_rate": 3.841909876662001e-05, + "loss": 0.2941, + "num_input_tokens_seen": 1423200, + "step": 7460 + }, + { + "epoch": 3.87993762993763, + "grad_norm": 0.30805909633636475, + "learning_rate": 3.839995875032135e-05, + "loss": 0.2625, + "num_input_tokens_seen": 1424160, + "step": 7465 + }, + { + "epoch": 3.8825363825363826, + "grad_norm": 0.8054865002632141, + "learning_rate": 3.8380807707259923e-05, + "loss": 0.2379, + "num_input_tokens_seen": 1425056, + "step": 7470 + }, + { + "epoch": 3.885135135135135, + "grad_norm": 0.12726379930973053, + "learning_rate": 3.8361645653195026e-05, + "loss": 0.2405, + "num_input_tokens_seen": 1426016, + "step": 7475 + }, + { + "epoch": 3.8877338877338876, + "grad_norm": 0.4250637888908386, + "learning_rate": 3.8342472603895024e-05, + "loss": 0.2414, + "num_input_tokens_seen": 1427072, + "step": 7480 + }, + { + "epoch": 3.89033264033264, + "grad_norm": 0.29089605808258057, + "learning_rate": 3.8323288575137316e-05, + "loss": 0.2701, + "num_input_tokens_seen": 1428064, + "step": 7485 + }, + { + "epoch": 3.892931392931393, + "grad_norm": 0.9702491164207458, + "learning_rate": 3.8304093582708366e-05, + "loss": 0.3388, + "num_input_tokens_seen": 1429088, + "step": 7490 + }, + { + "epoch": 3.8955301455301456, + "grad_norm": 0.5680705904960632, + "learning_rate": 3.828488764240363e-05, + "loss": 0.2849, + "num_input_tokens_seen": 1430080, + "step": 7495 + }, + { + "epoch": 3.898128898128898, + "grad_norm": 0.8594683408737183, + "learning_rate": 3.826567077002759e-05, + "loss": 0.2354, + "num_input_tokens_seen": 1430976, + "step": 7500 + }, + { + "epoch": 3.9007276507276507, + "grad_norm": 0.42320939898490906, + "learning_rate": 3.824644298139371e-05, + "loss": 0.2921, + "num_input_tokens_seen": 1431872, + "step": 7505 + }, + { + "epoch": 3.9033264033264032, + "grad_norm": 0.3183545172214508, + "learning_rate": 3.8227204292324484e-05, + "loss": 0.2625, + "num_input_tokens_seen": 1432832, + "step": 7510 + }, + { + "epoch": 3.9059251559251558, + "grad_norm": 0.3866598308086395, + "learning_rate": 3.820795471865129e-05, + "loss": 0.2731, + "num_input_tokens_seen": 1433824, + "step": 7515 + }, + { + "epoch": 3.9085239085239083, + "grad_norm": 0.3482164442539215, + "learning_rate": 3.818869427621453e-05, + "loss": 0.2385, + "num_input_tokens_seen": 1434720, + "step": 7520 + }, + { + "epoch": 3.9111226611226613, + "grad_norm": 0.32367533445358276, + "learning_rate": 3.8169422980863544e-05, + "loss": 0.217, + "num_input_tokens_seen": 1435680, + "step": 7525 + }, + { + "epoch": 3.913721413721414, + "grad_norm": 0.3274195194244385, + "learning_rate": 3.8150140848456574e-05, + "loss": 0.276, + "num_input_tokens_seen": 1436640, + "step": 7530 + }, + { + "epoch": 3.9163201663201663, + "grad_norm": 0.2682936191558838, + "learning_rate": 3.81308478948608e-05, + "loss": 0.2345, + "num_input_tokens_seen": 1437504, + "step": 7535 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.5205312967300415, + "learning_rate": 3.81115441359523e-05, + "loss": 0.2887, + "num_input_tokens_seen": 1438432, + "step": 7540 + }, + { + "epoch": 3.9215176715176714, + "grad_norm": 0.5195420980453491, + "learning_rate": 3.809222958761605e-05, + "loss": 0.2653, + "num_input_tokens_seen": 1439328, + "step": 7545 + }, + { + "epoch": 3.9241164241164244, + "grad_norm": 0.5438607335090637, + "learning_rate": 3.80729042657459e-05, + "loss": 0.2142, + "num_input_tokens_seen": 1440288, + "step": 7550 + }, + { + "epoch": 3.9267151767151764, + "grad_norm": 0.6354686617851257, + "learning_rate": 3.805356818624457e-05, + "loss": 0.2718, + "num_input_tokens_seen": 1441312, + "step": 7555 + }, + { + "epoch": 3.9293139293139294, + "grad_norm": 0.4868507981300354, + "learning_rate": 3.8034221365023624e-05, + "loss": 0.2518, + "num_input_tokens_seen": 1442272, + "step": 7560 + }, + { + "epoch": 3.931912681912682, + "grad_norm": 0.6008889079093933, + "learning_rate": 3.801486381800347e-05, + "loss": 0.2376, + "num_input_tokens_seen": 1443264, + "step": 7565 + }, + { + "epoch": 3.9345114345114345, + "grad_norm": 0.3171575963497162, + "learning_rate": 3.7995495561113336e-05, + "loss": 0.2619, + "num_input_tokens_seen": 1444224, + "step": 7570 + }, + { + "epoch": 3.937110187110187, + "grad_norm": 0.5235034823417664, + "learning_rate": 3.797611661029128e-05, + "loss": 0.3274, + "num_input_tokens_seen": 1445152, + "step": 7575 + }, + { + "epoch": 3.9397089397089395, + "grad_norm": 0.39506208896636963, + "learning_rate": 3.795672698148415e-05, + "loss": 0.287, + "num_input_tokens_seen": 1446016, + "step": 7580 + }, + { + "epoch": 3.9423076923076925, + "grad_norm": 0.4598883092403412, + "learning_rate": 3.7937326690647556e-05, + "loss": 0.2352, + "num_input_tokens_seen": 1447072, + "step": 7585 + }, + { + "epoch": 3.944906444906445, + "grad_norm": 0.3456081748008728, + "learning_rate": 3.7917915753745935e-05, + "loss": 0.2416, + "num_input_tokens_seen": 1448032, + "step": 7590 + }, + { + "epoch": 3.9475051975051976, + "grad_norm": 0.35404956340789795, + "learning_rate": 3.789849418675245e-05, + "loss": 0.3355, + "num_input_tokens_seen": 1448992, + "step": 7595 + }, + { + "epoch": 3.95010395010395, + "grad_norm": 0.5233042240142822, + "learning_rate": 3.7879062005649e-05, + "loss": 0.3144, + "num_input_tokens_seen": 1449920, + "step": 7600 + }, + { + "epoch": 3.9527027027027026, + "grad_norm": 0.40119054913520813, + "learning_rate": 3.785961922642626e-05, + "loss": 0.2563, + "num_input_tokens_seen": 1450816, + "step": 7605 + }, + { + "epoch": 3.955301455301455, + "grad_norm": 0.228755384683609, + "learning_rate": 3.784016586508357e-05, + "loss": 0.2815, + "num_input_tokens_seen": 1451840, + "step": 7610 + }, + { + "epoch": 3.9579002079002077, + "grad_norm": 0.833567202091217, + "learning_rate": 3.782070193762904e-05, + "loss": 0.2858, + "num_input_tokens_seen": 1452768, + "step": 7615 + }, + { + "epoch": 3.9604989604989607, + "grad_norm": 0.7445682883262634, + "learning_rate": 3.7801227460079424e-05, + "loss": 0.2623, + "num_input_tokens_seen": 1453792, + "step": 7620 + }, + { + "epoch": 3.963097713097713, + "grad_norm": 0.7549951672554016, + "learning_rate": 3.778174244846019e-05, + "loss": 0.2449, + "num_input_tokens_seen": 1454688, + "step": 7625 + }, + { + "epoch": 3.9656964656964657, + "grad_norm": 0.29562893509864807, + "learning_rate": 3.776224691880545e-05, + "loss": 0.2862, + "num_input_tokens_seen": 1455616, + "step": 7630 + }, + { + "epoch": 3.9682952182952183, + "grad_norm": 0.3384116590023041, + "learning_rate": 3.7742740887158e-05, + "loss": 0.295, + "num_input_tokens_seen": 1456544, + "step": 7635 + }, + { + "epoch": 3.970893970893971, + "grad_norm": 0.427617609500885, + "learning_rate": 3.772322436956924e-05, + "loss": 0.2871, + "num_input_tokens_seen": 1457536, + "step": 7640 + }, + { + "epoch": 3.9734927234927238, + "grad_norm": 0.2648871839046478, + "learning_rate": 3.7703697382099234e-05, + "loss": 0.2354, + "num_input_tokens_seen": 1458464, + "step": 7645 + }, + { + "epoch": 3.976091476091476, + "grad_norm": 0.6151626706123352, + "learning_rate": 3.768415994081664e-05, + "loss": 0.2335, + "num_input_tokens_seen": 1459456, + "step": 7650 + }, + { + "epoch": 3.978690228690229, + "grad_norm": 1.2134990692138672, + "learning_rate": 3.766461206179874e-05, + "loss": 0.3016, + "num_input_tokens_seen": 1460448, + "step": 7655 + }, + { + "epoch": 3.9812889812889813, + "grad_norm": 0.45823463797569275, + "learning_rate": 3.764505376113138e-05, + "loss": 0.254, + "num_input_tokens_seen": 1461408, + "step": 7660 + }, + { + "epoch": 3.983887733887734, + "grad_norm": 0.5032821893692017, + "learning_rate": 3.762548505490899e-05, + "loss": 0.312, + "num_input_tokens_seen": 1462368, + "step": 7665 + }, + { + "epoch": 3.9864864864864864, + "grad_norm": 0.38291558623313904, + "learning_rate": 3.7605905959234576e-05, + "loss": 0.3386, + "num_input_tokens_seen": 1463360, + "step": 7670 + }, + { + "epoch": 3.989085239085239, + "grad_norm": 0.8862304091453552, + "learning_rate": 3.758631649021968e-05, + "loss": 0.2728, + "num_input_tokens_seen": 1464384, + "step": 7675 + }, + { + "epoch": 3.991683991683992, + "grad_norm": 0.3495965301990509, + "learning_rate": 3.756671666398438e-05, + "loss": 0.2771, + "num_input_tokens_seen": 1465376, + "step": 7680 + }, + { + "epoch": 3.9942827442827444, + "grad_norm": 0.5066314339637756, + "learning_rate": 3.754710649665728e-05, + "loss": 0.2424, + "num_input_tokens_seen": 1466400, + "step": 7685 + }, + { + "epoch": 3.996881496881497, + "grad_norm": 0.6459429860115051, + "learning_rate": 3.7527486004375506e-05, + "loss": 0.2879, + "num_input_tokens_seen": 1467424, + "step": 7690 + }, + { + "epoch": 3.9994802494802495, + "grad_norm": 0.5735722780227661, + "learning_rate": 3.750785520328465e-05, + "loss": 0.2514, + "num_input_tokens_seen": 1468384, + "step": 7695 + }, + { + "epoch": 4.0, + "eval_loss": 0.2522806227207184, + "eval_runtime": 7.9246, + "eval_samples_per_second": 108.018, + "eval_steps_per_second": 27.004, + "num_input_tokens_seen": 1468552, + "step": 7696 + }, + { + "epoch": 4.002079002079002, + "grad_norm": 0.33928152918815613, + "learning_rate": 3.748821410953882e-05, + "loss": 0.2029, + "num_input_tokens_seen": 1469320, + "step": 7700 + }, + { + "epoch": 4.004677754677755, + "grad_norm": 0.2966035008430481, + "learning_rate": 3.746856273930058e-05, + "loss": 0.2738, + "num_input_tokens_seen": 1470248, + "step": 7705 + }, + { + "epoch": 4.007276507276507, + "grad_norm": 0.3500209152698517, + "learning_rate": 3.744890110874093e-05, + "loss": 0.2161, + "num_input_tokens_seen": 1471304, + "step": 7710 + }, + { + "epoch": 4.00987525987526, + "grad_norm": 0.32959428429603577, + "learning_rate": 3.742922923403935e-05, + "loss": 0.255, + "num_input_tokens_seen": 1472232, + "step": 7715 + }, + { + "epoch": 4.012474012474012, + "grad_norm": 0.17104996740818024, + "learning_rate": 3.740954713138373e-05, + "loss": 0.3057, + "num_input_tokens_seen": 1473192, + "step": 7720 + }, + { + "epoch": 4.015072765072765, + "grad_norm": 0.5476298332214355, + "learning_rate": 3.7389854816970386e-05, + "loss": 0.2521, + "num_input_tokens_seen": 1474120, + "step": 7725 + }, + { + "epoch": 4.017671517671518, + "grad_norm": 0.23354721069335938, + "learning_rate": 3.737015230700402e-05, + "loss": 0.2653, + "num_input_tokens_seen": 1475016, + "step": 7730 + }, + { + "epoch": 4.02027027027027, + "grad_norm": 0.6190209984779358, + "learning_rate": 3.7350439617697734e-05, + "loss": 0.2069, + "num_input_tokens_seen": 1475944, + "step": 7735 + }, + { + "epoch": 4.022869022869023, + "grad_norm": 0.3804231584072113, + "learning_rate": 3.733071676527302e-05, + "loss": 0.2371, + "num_input_tokens_seen": 1476840, + "step": 7740 + }, + { + "epoch": 4.025467775467775, + "grad_norm": 0.5974214673042297, + "learning_rate": 3.73109837659597e-05, + "loss": 0.2936, + "num_input_tokens_seen": 1477736, + "step": 7745 + }, + { + "epoch": 4.028066528066528, + "grad_norm": 0.5583809614181519, + "learning_rate": 3.7291240635995985e-05, + "loss": 0.2924, + "num_input_tokens_seen": 1478664, + "step": 7750 + }, + { + "epoch": 4.03066528066528, + "grad_norm": 0.5017284750938416, + "learning_rate": 3.727148739162839e-05, + "loss": 0.2766, + "num_input_tokens_seen": 1479624, + "step": 7755 + }, + { + "epoch": 4.033264033264033, + "grad_norm": 0.950586199760437, + "learning_rate": 3.725172404911177e-05, + "loss": 0.2251, + "num_input_tokens_seen": 1480584, + "step": 7760 + }, + { + "epoch": 4.035862785862786, + "grad_norm": 0.6722506284713745, + "learning_rate": 3.723195062470929e-05, + "loss": 0.2628, + "num_input_tokens_seen": 1481512, + "step": 7765 + }, + { + "epoch": 4.038461538461538, + "grad_norm": 0.13438855111598969, + "learning_rate": 3.7212167134692414e-05, + "loss": 0.243, + "num_input_tokens_seen": 1482472, + "step": 7770 + }, + { + "epoch": 4.041060291060291, + "grad_norm": 0.08356055617332458, + "learning_rate": 3.719237359534087e-05, + "loss": 0.2902, + "num_input_tokens_seen": 1483400, + "step": 7775 + }, + { + "epoch": 4.043659043659043, + "grad_norm": 0.6139398813247681, + "learning_rate": 3.717257002294267e-05, + "loss": 0.2746, + "num_input_tokens_seen": 1484328, + "step": 7780 + }, + { + "epoch": 4.046257796257796, + "grad_norm": 0.27322089672088623, + "learning_rate": 3.715275643379408e-05, + "loss": 0.3458, + "num_input_tokens_seen": 1485416, + "step": 7785 + }, + { + "epoch": 4.048856548856548, + "grad_norm": 0.3742868900299072, + "learning_rate": 3.7132932844199614e-05, + "loss": 0.2815, + "num_input_tokens_seen": 1486376, + "step": 7790 + }, + { + "epoch": 4.051455301455301, + "grad_norm": 1.031416654586792, + "learning_rate": 3.7113099270472005e-05, + "loss": 0.2521, + "num_input_tokens_seen": 1487368, + "step": 7795 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 0.4036618769168854, + "learning_rate": 3.709325572893221e-05, + "loss": 0.2615, + "num_input_tokens_seen": 1488328, + "step": 7800 + }, + { + "epoch": 4.0566528066528065, + "grad_norm": 0.68915855884552, + "learning_rate": 3.707340223590939e-05, + "loss": 0.2812, + "num_input_tokens_seen": 1489224, + "step": 7805 + }, + { + "epoch": 4.0592515592515594, + "grad_norm": 0.5036302208900452, + "learning_rate": 3.705353880774088e-05, + "loss": 0.2963, + "num_input_tokens_seen": 1490088, + "step": 7810 + }, + { + "epoch": 4.0618503118503115, + "grad_norm": 0.8123134970664978, + "learning_rate": 3.703366546077221e-05, + "loss": 0.2495, + "num_input_tokens_seen": 1491048, + "step": 7815 + }, + { + "epoch": 4.0644490644490645, + "grad_norm": 0.5972231030464172, + "learning_rate": 3.701378221135707e-05, + "loss": 0.2459, + "num_input_tokens_seen": 1492008, + "step": 7820 + }, + { + "epoch": 4.0670478170478175, + "grad_norm": 0.6133677959442139, + "learning_rate": 3.699388907585727e-05, + "loss": 0.2742, + "num_input_tokens_seen": 1493000, + "step": 7825 + }, + { + "epoch": 4.06964656964657, + "grad_norm": 0.204736590385437, + "learning_rate": 3.697398607064279e-05, + "loss": 0.2918, + "num_input_tokens_seen": 1493960, + "step": 7830 + }, + { + "epoch": 4.0722453222453225, + "grad_norm": 0.7015913724899292, + "learning_rate": 3.695407321209172e-05, + "loss": 0.2474, + "num_input_tokens_seen": 1494920, + "step": 7835 + }, + { + "epoch": 4.074844074844075, + "grad_norm": 0.37043821811676025, + "learning_rate": 3.693415051659026e-05, + "loss": 0.3059, + "num_input_tokens_seen": 1495880, + "step": 7840 + }, + { + "epoch": 4.077442827442828, + "grad_norm": 0.42031970620155334, + "learning_rate": 3.69142180005327e-05, + "loss": 0.2938, + "num_input_tokens_seen": 1496840, + "step": 7845 + }, + { + "epoch": 4.08004158004158, + "grad_norm": 0.30051887035369873, + "learning_rate": 3.689427568032141e-05, + "loss": 0.2625, + "num_input_tokens_seen": 1497864, + "step": 7850 + }, + { + "epoch": 4.082640332640333, + "grad_norm": 0.38147860765457153, + "learning_rate": 3.687432357236683e-05, + "loss": 0.2813, + "num_input_tokens_seen": 1498920, + "step": 7855 + }, + { + "epoch": 4.085239085239086, + "grad_norm": 0.36278143525123596, + "learning_rate": 3.685436169308746e-05, + "loss": 0.2616, + "num_input_tokens_seen": 1499784, + "step": 7860 + }, + { + "epoch": 4.087837837837838, + "grad_norm": 0.4983363449573517, + "learning_rate": 3.683439005890983e-05, + "loss": 0.2906, + "num_input_tokens_seen": 1500712, + "step": 7865 + }, + { + "epoch": 4.090436590436591, + "grad_norm": 0.6475929021835327, + "learning_rate": 3.681440868626851e-05, + "loss": 0.2519, + "num_input_tokens_seen": 1501608, + "step": 7870 + }, + { + "epoch": 4.093035343035343, + "grad_norm": 0.8947976231575012, + "learning_rate": 3.679441759160608e-05, + "loss": 0.2946, + "num_input_tokens_seen": 1502600, + "step": 7875 + }, + { + "epoch": 4.095634095634096, + "grad_norm": 0.49472156167030334, + "learning_rate": 3.677441679137311e-05, + "loss": 0.3048, + "num_input_tokens_seen": 1503496, + "step": 7880 + }, + { + "epoch": 4.098232848232848, + "grad_norm": 0.3443291187286377, + "learning_rate": 3.675440630202817e-05, + "loss": 0.2965, + "num_input_tokens_seen": 1504392, + "step": 7885 + }, + { + "epoch": 4.100831600831601, + "grad_norm": 0.770571768283844, + "learning_rate": 3.673438614003778e-05, + "loss": 0.2361, + "num_input_tokens_seen": 1505384, + "step": 7890 + }, + { + "epoch": 4.103430353430354, + "grad_norm": 0.6795474886894226, + "learning_rate": 3.671435632187646e-05, + "loss": 0.2568, + "num_input_tokens_seen": 1506248, + "step": 7895 + }, + { + "epoch": 4.106029106029106, + "grad_norm": 0.5227054953575134, + "learning_rate": 3.669431686402664e-05, + "loss": 0.2254, + "num_input_tokens_seen": 1507240, + "step": 7900 + }, + { + "epoch": 4.108627858627859, + "grad_norm": 0.38214975595474243, + "learning_rate": 3.667426778297871e-05, + "loss": 0.2315, + "num_input_tokens_seen": 1508136, + "step": 7905 + }, + { + "epoch": 4.111226611226611, + "grad_norm": 0.5751751065254211, + "learning_rate": 3.6654209095230935e-05, + "loss": 0.2761, + "num_input_tokens_seen": 1509128, + "step": 7910 + }, + { + "epoch": 4.113825363825364, + "grad_norm": 0.5091524720191956, + "learning_rate": 3.663414081728954e-05, + "loss": 0.3023, + "num_input_tokens_seen": 1510056, + "step": 7915 + }, + { + "epoch": 4.116424116424117, + "grad_norm": 0.21647648513317108, + "learning_rate": 3.6614062965668614e-05, + "loss": 0.2585, + "num_input_tokens_seen": 1511016, + "step": 7920 + }, + { + "epoch": 4.119022869022869, + "grad_norm": 0.15782567858695984, + "learning_rate": 3.6593975556890106e-05, + "loss": 0.2865, + "num_input_tokens_seen": 1511976, + "step": 7925 + }, + { + "epoch": 4.121621621621622, + "grad_norm": 0.8455075025558472, + "learning_rate": 3.657387860748387e-05, + "loss": 0.2666, + "num_input_tokens_seen": 1512936, + "step": 7930 + }, + { + "epoch": 4.124220374220374, + "grad_norm": 0.5270572900772095, + "learning_rate": 3.655377213398759e-05, + "loss": 0.2557, + "num_input_tokens_seen": 1513832, + "step": 7935 + }, + { + "epoch": 4.126819126819127, + "grad_norm": 0.6137606501579285, + "learning_rate": 3.653365615294678e-05, + "loss": 0.2469, + "num_input_tokens_seen": 1514792, + "step": 7940 + }, + { + "epoch": 4.129417879417879, + "grad_norm": 0.5814701318740845, + "learning_rate": 3.651353068091479e-05, + "loss": 0.2522, + "num_input_tokens_seen": 1515784, + "step": 7945 + }, + { + "epoch": 4.132016632016632, + "grad_norm": 0.1994076818227768, + "learning_rate": 3.649339573445277e-05, + "loss": 0.2579, + "num_input_tokens_seen": 1516744, + "step": 7950 + }, + { + "epoch": 4.134615384615385, + "grad_norm": 0.556069016456604, + "learning_rate": 3.647325133012969e-05, + "loss": 0.3144, + "num_input_tokens_seen": 1517736, + "step": 7955 + }, + { + "epoch": 4.137214137214137, + "grad_norm": 0.36066436767578125, + "learning_rate": 3.6453097484522257e-05, + "loss": 0.2448, + "num_input_tokens_seen": 1518600, + "step": 7960 + }, + { + "epoch": 4.13981288981289, + "grad_norm": 0.3099408447742462, + "learning_rate": 3.6432934214215e-05, + "loss": 0.2609, + "num_input_tokens_seen": 1519560, + "step": 7965 + }, + { + "epoch": 4.142411642411642, + "grad_norm": 0.40106847882270813, + "learning_rate": 3.641276153580016e-05, + "loss": 0.2626, + "num_input_tokens_seen": 1520552, + "step": 7970 + }, + { + "epoch": 4.145010395010395, + "grad_norm": 0.5704149603843689, + "learning_rate": 3.6392579465877754e-05, + "loss": 0.2568, + "num_input_tokens_seen": 1521480, + "step": 7975 + }, + { + "epoch": 4.147609147609147, + "grad_norm": 0.5518651008605957, + "learning_rate": 3.63723880210555e-05, + "loss": 0.2469, + "num_input_tokens_seen": 1522408, + "step": 7980 + }, + { + "epoch": 4.1502079002079, + "grad_norm": 0.26731470227241516, + "learning_rate": 3.635218721794886e-05, + "loss": 0.2628, + "num_input_tokens_seen": 1523400, + "step": 7985 + }, + { + "epoch": 4.152806652806653, + "grad_norm": 0.5047889351844788, + "learning_rate": 3.6331977073180964e-05, + "loss": 0.2911, + "num_input_tokens_seen": 1524360, + "step": 7990 + }, + { + "epoch": 4.155405405405405, + "grad_norm": 0.6159191131591797, + "learning_rate": 3.631175760338265e-05, + "loss": 0.2743, + "num_input_tokens_seen": 1525320, + "step": 7995 + }, + { + "epoch": 4.158004158004158, + "grad_norm": 0.4881305396556854, + "learning_rate": 3.629152882519242e-05, + "loss": 0.226, + "num_input_tokens_seen": 1526312, + "step": 8000 + }, + { + "epoch": 4.16060291060291, + "grad_norm": 0.40187251567840576, + "learning_rate": 3.627129075525645e-05, + "loss": 0.206, + "num_input_tokens_seen": 1527272, + "step": 8005 + }, + { + "epoch": 4.163201663201663, + "grad_norm": 0.46067968010902405, + "learning_rate": 3.625104341022854e-05, + "loss": 0.2263, + "num_input_tokens_seen": 1528296, + "step": 8010 + }, + { + "epoch": 4.165800415800415, + "grad_norm": 0.3744903802871704, + "learning_rate": 3.623078680677014e-05, + "loss": 0.2949, + "num_input_tokens_seen": 1529224, + "step": 8015 + }, + { + "epoch": 4.168399168399168, + "grad_norm": 1.0743458271026611, + "learning_rate": 3.6210520961550314e-05, + "loss": 0.3215, + "num_input_tokens_seen": 1530184, + "step": 8020 + }, + { + "epoch": 4.170997920997921, + "grad_norm": 0.28036805987358093, + "learning_rate": 3.619024589124573e-05, + "loss": 0.2502, + "num_input_tokens_seen": 1531144, + "step": 8025 + }, + { + "epoch": 4.173596673596673, + "grad_norm": 0.5047534704208374, + "learning_rate": 3.6169961612540645e-05, + "loss": 0.2833, + "num_input_tokens_seen": 1532136, + "step": 8030 + }, + { + "epoch": 4.176195426195426, + "grad_norm": 0.29835519194602966, + "learning_rate": 3.61496681421269e-05, + "loss": 0.2528, + "num_input_tokens_seen": 1533064, + "step": 8035 + }, + { + "epoch": 4.1787941787941785, + "grad_norm": 0.6970093250274658, + "learning_rate": 3.61293654967039e-05, + "loss": 0.2525, + "num_input_tokens_seen": 1534024, + "step": 8040 + }, + { + "epoch": 4.1813929313929314, + "grad_norm": 0.573638916015625, + "learning_rate": 3.610905369297859e-05, + "loss": 0.2041, + "num_input_tokens_seen": 1534952, + "step": 8045 + }, + { + "epoch": 4.183991683991684, + "grad_norm": 0.3438487648963928, + "learning_rate": 3.608873274766545e-05, + "loss": 0.2094, + "num_input_tokens_seen": 1535944, + "step": 8050 + }, + { + "epoch": 4.1865904365904365, + "grad_norm": 0.1862974315881729, + "learning_rate": 3.60684026774865e-05, + "loss": 0.2151, + "num_input_tokens_seen": 1536840, + "step": 8055 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.9484788179397583, + "learning_rate": 3.604806349917126e-05, + "loss": 0.344, + "num_input_tokens_seen": 1537672, + "step": 8060 + }, + { + "epoch": 4.191787941787942, + "grad_norm": 0.618593156337738, + "learning_rate": 3.6027715229456734e-05, + "loss": 0.3378, + "num_input_tokens_seen": 1538728, + "step": 8065 + }, + { + "epoch": 4.1943866943866945, + "grad_norm": 0.6992475390434265, + "learning_rate": 3.600735788508743e-05, + "loss": 0.2635, + "num_input_tokens_seen": 1539688, + "step": 8070 + }, + { + "epoch": 4.196985446985447, + "grad_norm": 0.5928134918212891, + "learning_rate": 3.59869914828153e-05, + "loss": 0.2865, + "num_input_tokens_seen": 1540744, + "step": 8075 + }, + { + "epoch": 4.1995841995842, + "grad_norm": 0.2233302742242813, + "learning_rate": 3.596661603939977e-05, + "loss": 0.287, + "num_input_tokens_seen": 1541672, + "step": 8080 + }, + { + "epoch": 4.202182952182953, + "grad_norm": 0.7811470627784729, + "learning_rate": 3.594623157160769e-05, + "loss": 0.2904, + "num_input_tokens_seen": 1542632, + "step": 8085 + }, + { + "epoch": 4.204781704781705, + "grad_norm": 0.6113465428352356, + "learning_rate": 3.592583809621334e-05, + "loss": 0.2257, + "num_input_tokens_seen": 1543560, + "step": 8090 + }, + { + "epoch": 4.207380457380458, + "grad_norm": 0.5303565263748169, + "learning_rate": 3.590543562999841e-05, + "loss": 0.2335, + "num_input_tokens_seen": 1544488, + "step": 8095 + }, + { + "epoch": 4.20997920997921, + "grad_norm": 0.15386508405208588, + "learning_rate": 3.588502418975201e-05, + "loss": 0.2739, + "num_input_tokens_seen": 1545416, + "step": 8100 + }, + { + "epoch": 4.212577962577963, + "grad_norm": 0.3631255030632019, + "learning_rate": 3.5864603792270604e-05, + "loss": 0.2753, + "num_input_tokens_seen": 1546344, + "step": 8105 + }, + { + "epoch": 4.215176715176715, + "grad_norm": 0.3587839901447296, + "learning_rate": 3.584417445435805e-05, + "loss": 0.22, + "num_input_tokens_seen": 1547304, + "step": 8110 + }, + { + "epoch": 4.217775467775468, + "grad_norm": 0.48485398292541504, + "learning_rate": 3.5823736192825545e-05, + "loss": 0.208, + "num_input_tokens_seen": 1548232, + "step": 8115 + }, + { + "epoch": 4.220374220374221, + "grad_norm": 0.5268808007240295, + "learning_rate": 3.580328902449164e-05, + "loss": 0.2799, + "num_input_tokens_seen": 1549256, + "step": 8120 + }, + { + "epoch": 4.222972972972973, + "grad_norm": 0.5642616748809814, + "learning_rate": 3.578283296618221e-05, + "loss": 0.1669, + "num_input_tokens_seen": 1550248, + "step": 8125 + }, + { + "epoch": 4.225571725571726, + "grad_norm": 0.47949185967445374, + "learning_rate": 3.5762368034730466e-05, + "loss": 0.2174, + "num_input_tokens_seen": 1551112, + "step": 8130 + }, + { + "epoch": 4.228170478170478, + "grad_norm": 0.33524471521377563, + "learning_rate": 3.574189424697688e-05, + "loss": 0.2317, + "num_input_tokens_seen": 1552040, + "step": 8135 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 0.37763145565986633, + "learning_rate": 3.5721411619769254e-05, + "loss": 0.3156, + "num_input_tokens_seen": 1552968, + "step": 8140 + }, + { + "epoch": 4.233367983367984, + "grad_norm": 0.39073190093040466, + "learning_rate": 3.5700920169962626e-05, + "loss": 0.3014, + "num_input_tokens_seen": 1553896, + "step": 8145 + }, + { + "epoch": 4.235966735966736, + "grad_norm": 0.3576788604259491, + "learning_rate": 3.568041991441934e-05, + "loss": 0.2437, + "num_input_tokens_seen": 1554888, + "step": 8150 + }, + { + "epoch": 4.238565488565489, + "grad_norm": 0.5503053069114685, + "learning_rate": 3.5659910870008934e-05, + "loss": 0.2352, + "num_input_tokens_seen": 1555816, + "step": 8155 + }, + { + "epoch": 4.241164241164241, + "grad_norm": 0.22198157012462616, + "learning_rate": 3.563939305360822e-05, + "loss": 0.2178, + "num_input_tokens_seen": 1556808, + "step": 8160 + }, + { + "epoch": 4.243762993762994, + "grad_norm": 0.3251372277736664, + "learning_rate": 3.56188664821012e-05, + "loss": 0.2762, + "num_input_tokens_seen": 1557736, + "step": 8165 + }, + { + "epoch": 4.246361746361746, + "grad_norm": 0.5582057237625122, + "learning_rate": 3.55983311723791e-05, + "loss": 0.3114, + "num_input_tokens_seen": 1558664, + "step": 8170 + }, + { + "epoch": 4.248960498960499, + "grad_norm": 0.5381997227668762, + "learning_rate": 3.557778714134033e-05, + "loss": 0.3073, + "num_input_tokens_seen": 1559624, + "step": 8175 + }, + { + "epoch": 4.251559251559252, + "grad_norm": 0.4234178960323334, + "learning_rate": 3.555723440589047e-05, + "loss": 0.2874, + "num_input_tokens_seen": 1560552, + "step": 8180 + }, + { + "epoch": 4.254158004158004, + "grad_norm": 0.34970882534980774, + "learning_rate": 3.5536672982942275e-05, + "loss": 0.2865, + "num_input_tokens_seen": 1561544, + "step": 8185 + }, + { + "epoch": 4.256756756756757, + "grad_norm": 0.40590977668762207, + "learning_rate": 3.551610288941564e-05, + "loss": 0.2768, + "num_input_tokens_seen": 1562568, + "step": 8190 + }, + { + "epoch": 4.259355509355509, + "grad_norm": 0.23491252958774567, + "learning_rate": 3.549552414223761e-05, + "loss": 0.2348, + "num_input_tokens_seen": 1563464, + "step": 8195 + }, + { + "epoch": 4.261954261954262, + "grad_norm": 0.4701528251171112, + "learning_rate": 3.547493675834232e-05, + "loss": 0.2487, + "num_input_tokens_seen": 1564392, + "step": 8200 + }, + { + "epoch": 4.264553014553014, + "grad_norm": 0.2575961947441101, + "learning_rate": 3.545434075467103e-05, + "loss": 0.2378, + "num_input_tokens_seen": 1565320, + "step": 8205 + }, + { + "epoch": 4.267151767151767, + "grad_norm": 0.19670826196670532, + "learning_rate": 3.543373614817212e-05, + "loss": 0.2819, + "num_input_tokens_seen": 1566248, + "step": 8210 + }, + { + "epoch": 4.26975051975052, + "grad_norm": 0.45278769731521606, + "learning_rate": 3.5413122955801005e-05, + "loss": 0.2626, + "num_input_tokens_seen": 1567144, + "step": 8215 + }, + { + "epoch": 4.272349272349272, + "grad_norm": 0.2934087812900543, + "learning_rate": 3.5392501194520174e-05, + "loss": 0.3152, + "num_input_tokens_seen": 1568104, + "step": 8220 + }, + { + "epoch": 4.274948024948025, + "grad_norm": 0.7013996839523315, + "learning_rate": 3.537187088129919e-05, + "loss": 0.257, + "num_input_tokens_seen": 1569064, + "step": 8225 + }, + { + "epoch": 4.277546777546777, + "grad_norm": 0.275242418050766, + "learning_rate": 3.535123203311464e-05, + "loss": 0.2876, + "num_input_tokens_seen": 1569992, + "step": 8230 + }, + { + "epoch": 4.28014553014553, + "grad_norm": 0.33148086071014404, + "learning_rate": 3.533058466695013e-05, + "loss": 0.3008, + "num_input_tokens_seen": 1570952, + "step": 8235 + }, + { + "epoch": 4.282744282744282, + "grad_norm": 0.471040815114975, + "learning_rate": 3.530992879979629e-05, + "loss": 0.2552, + "num_input_tokens_seen": 1571880, + "step": 8240 + }, + { + "epoch": 4.285343035343035, + "grad_norm": 0.4318445324897766, + "learning_rate": 3.528926444865073e-05, + "loss": 0.2943, + "num_input_tokens_seen": 1572840, + "step": 8245 + }, + { + "epoch": 4.287941787941788, + "grad_norm": 0.2394144982099533, + "learning_rate": 3.5268591630518036e-05, + "loss": 0.2625, + "num_input_tokens_seen": 1573704, + "step": 8250 + }, + { + "epoch": 4.29054054054054, + "grad_norm": 0.23465891182422638, + "learning_rate": 3.524791036240979e-05, + "loss": 0.2482, + "num_input_tokens_seen": 1574664, + "step": 8255 + }, + { + "epoch": 4.293139293139293, + "grad_norm": 0.48307761549949646, + "learning_rate": 3.52272206613445e-05, + "loss": 0.2526, + "num_input_tokens_seen": 1575624, + "step": 8260 + }, + { + "epoch": 4.295738045738045, + "grad_norm": 0.30827662348747253, + "learning_rate": 3.520652254434762e-05, + "loss": 0.2668, + "num_input_tokens_seen": 1576552, + "step": 8265 + }, + { + "epoch": 4.298336798336798, + "grad_norm": 0.3367508351802826, + "learning_rate": 3.518581602845154e-05, + "loss": 0.3025, + "num_input_tokens_seen": 1577512, + "step": 8270 + }, + { + "epoch": 4.3009355509355505, + "grad_norm": 1.061774492263794, + "learning_rate": 3.516510113069555e-05, + "loss": 0.2552, + "num_input_tokens_seen": 1578440, + "step": 8275 + }, + { + "epoch": 4.303534303534303, + "grad_norm": 0.7282994389533997, + "learning_rate": 3.5144377868125855e-05, + "loss": 0.2954, + "num_input_tokens_seen": 1579368, + "step": 8280 + }, + { + "epoch": 4.306133056133056, + "grad_norm": 0.4292987287044525, + "learning_rate": 3.512364625779551e-05, + "loss": 0.2748, + "num_input_tokens_seen": 1580232, + "step": 8285 + }, + { + "epoch": 4.3087318087318085, + "grad_norm": 0.8506305813789368, + "learning_rate": 3.510290631676447e-05, + "loss": 0.2991, + "num_input_tokens_seen": 1581160, + "step": 8290 + }, + { + "epoch": 4.3113305613305615, + "grad_norm": 0.2620040774345398, + "learning_rate": 3.5082158062099536e-05, + "loss": 0.2768, + "num_input_tokens_seen": 1582152, + "step": 8295 + }, + { + "epoch": 4.313929313929314, + "grad_norm": 0.8220937848091125, + "learning_rate": 3.506140151087434e-05, + "loss": 0.2503, + "num_input_tokens_seen": 1583176, + "step": 8300 + }, + { + "epoch": 4.3165280665280665, + "grad_norm": 0.6856548190116882, + "learning_rate": 3.504063668016936e-05, + "loss": 0.2398, + "num_input_tokens_seen": 1584072, + "step": 8305 + }, + { + "epoch": 4.3191268191268195, + "grad_norm": 0.7885428667068481, + "learning_rate": 3.5019863587071867e-05, + "loss": 0.3231, + "num_input_tokens_seen": 1585032, + "step": 8310 + }, + { + "epoch": 4.321725571725572, + "grad_norm": 0.34771090745925903, + "learning_rate": 3.499908224867594e-05, + "loss": 0.2649, + "num_input_tokens_seen": 1585992, + "step": 8315 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.4239509105682373, + "learning_rate": 3.497829268208246e-05, + "loss": 0.2327, + "num_input_tokens_seen": 1586952, + "step": 8320 + }, + { + "epoch": 4.326923076923077, + "grad_norm": 0.263679176568985, + "learning_rate": 3.495749490439905e-05, + "loss": 0.2119, + "num_input_tokens_seen": 1587880, + "step": 8325 + }, + { + "epoch": 4.32952182952183, + "grad_norm": 0.8631793856620789, + "learning_rate": 3.493668893274011e-05, + "loss": 0.3043, + "num_input_tokens_seen": 1588872, + "step": 8330 + }, + { + "epoch": 4.332120582120582, + "grad_norm": 0.2129317671060562, + "learning_rate": 3.491587478422677e-05, + "loss": 0.2899, + "num_input_tokens_seen": 1589832, + "step": 8335 + }, + { + "epoch": 4.334719334719335, + "grad_norm": 0.369442343711853, + "learning_rate": 3.48950524759869e-05, + "loss": 0.2689, + "num_input_tokens_seen": 1590792, + "step": 8340 + }, + { + "epoch": 4.337318087318088, + "grad_norm": 0.410076379776001, + "learning_rate": 3.487422202515508e-05, + "loss": 0.2971, + "num_input_tokens_seen": 1591656, + "step": 8345 + }, + { + "epoch": 4.33991683991684, + "grad_norm": 0.4437035620212555, + "learning_rate": 3.485338344887258e-05, + "loss": 0.2866, + "num_input_tokens_seen": 1592616, + "step": 8350 + }, + { + "epoch": 4.342515592515593, + "grad_norm": 0.9339492917060852, + "learning_rate": 3.483253676428737e-05, + "loss": 0.2882, + "num_input_tokens_seen": 1593544, + "step": 8355 + }, + { + "epoch": 4.345114345114345, + "grad_norm": 0.5658653378486633, + "learning_rate": 3.481168198855409e-05, + "loss": 0.2811, + "num_input_tokens_seen": 1594504, + "step": 8360 + }, + { + "epoch": 4.347713097713098, + "grad_norm": 0.5081551671028137, + "learning_rate": 3.4790819138834044e-05, + "loss": 0.2739, + "num_input_tokens_seen": 1595368, + "step": 8365 + }, + { + "epoch": 4.350311850311851, + "grad_norm": 0.7029102444648743, + "learning_rate": 3.4769948232295166e-05, + "loss": 0.2591, + "num_input_tokens_seen": 1596392, + "step": 8370 + }, + { + "epoch": 4.352910602910603, + "grad_norm": 0.6194694638252258, + "learning_rate": 3.4749069286112027e-05, + "loss": 0.2398, + "num_input_tokens_seen": 1597320, + "step": 8375 + }, + { + "epoch": 4.355509355509356, + "grad_norm": 0.3036273717880249, + "learning_rate": 3.4728182317465795e-05, + "loss": 0.2962, + "num_input_tokens_seen": 1598280, + "step": 8380 + }, + { + "epoch": 4.358108108108108, + "grad_norm": 0.505592405796051, + "learning_rate": 3.470728734354429e-05, + "loss": 0.3057, + "num_input_tokens_seen": 1599208, + "step": 8385 + }, + { + "epoch": 4.360706860706861, + "grad_norm": 0.3488521873950958, + "learning_rate": 3.468638438154186e-05, + "loss": 0.2629, + "num_input_tokens_seen": 1600104, + "step": 8390 + }, + { + "epoch": 4.363305613305613, + "grad_norm": 0.5946361422538757, + "learning_rate": 3.466547344865948e-05, + "loss": 0.2253, + "num_input_tokens_seen": 1601000, + "step": 8395 + }, + { + "epoch": 4.365904365904366, + "grad_norm": 0.16172383725643158, + "learning_rate": 3.4644554562104634e-05, + "loss": 0.2309, + "num_input_tokens_seen": 1601928, + "step": 8400 + }, + { + "epoch": 4.368503118503119, + "grad_norm": 0.5654312968254089, + "learning_rate": 3.4623627739091384e-05, + "loss": 0.2776, + "num_input_tokens_seen": 1602920, + "step": 8405 + }, + { + "epoch": 4.371101871101871, + "grad_norm": 0.2474515289068222, + "learning_rate": 3.4602692996840324e-05, + "loss": 0.2146, + "num_input_tokens_seen": 1603880, + "step": 8410 + }, + { + "epoch": 4.373700623700624, + "grad_norm": 0.2278396636247635, + "learning_rate": 3.458175035257854e-05, + "loss": 0.2626, + "num_input_tokens_seen": 1604840, + "step": 8415 + }, + { + "epoch": 4.376299376299376, + "grad_norm": 0.20129051804542542, + "learning_rate": 3.4560799823539635e-05, + "loss": 0.2532, + "num_input_tokens_seen": 1605832, + "step": 8420 + }, + { + "epoch": 4.378898128898129, + "grad_norm": 0.3645390570163727, + "learning_rate": 3.453984142696372e-05, + "loss": 0.2498, + "num_input_tokens_seen": 1606792, + "step": 8425 + }, + { + "epoch": 4.381496881496881, + "grad_norm": 0.6108091473579407, + "learning_rate": 3.4518875180097335e-05, + "loss": 0.2929, + "num_input_tokens_seen": 1607816, + "step": 8430 + }, + { + "epoch": 4.384095634095634, + "grad_norm": 0.4264501929283142, + "learning_rate": 3.449790110019351e-05, + "loss": 0.2888, + "num_input_tokens_seen": 1608808, + "step": 8435 + }, + { + "epoch": 4.386694386694387, + "grad_norm": 0.549279510974884, + "learning_rate": 3.4476919204511735e-05, + "loss": 0.1905, + "num_input_tokens_seen": 1609736, + "step": 8440 + }, + { + "epoch": 4.389293139293139, + "grad_norm": 0.4441266655921936, + "learning_rate": 3.44559295103179e-05, + "loss": 0.3251, + "num_input_tokens_seen": 1610696, + "step": 8445 + }, + { + "epoch": 4.391891891891892, + "grad_norm": 0.6157656908035278, + "learning_rate": 3.443493203488431e-05, + "loss": 0.1955, + "num_input_tokens_seen": 1611560, + "step": 8450 + }, + { + "epoch": 4.394490644490644, + "grad_norm": 0.48113101720809937, + "learning_rate": 3.441392679548973e-05, + "loss": 0.3116, + "num_input_tokens_seen": 1612552, + "step": 8455 + }, + { + "epoch": 4.397089397089397, + "grad_norm": 0.6199339032173157, + "learning_rate": 3.439291380941923e-05, + "loss": 0.2003, + "num_input_tokens_seen": 1613512, + "step": 8460 + }, + { + "epoch": 4.399688149688149, + "grad_norm": 0.5686926245689392, + "learning_rate": 3.437189309396432e-05, + "loss": 0.2712, + "num_input_tokens_seen": 1614440, + "step": 8465 + }, + { + "epoch": 4.402286902286902, + "grad_norm": 0.5326319336891174, + "learning_rate": 3.435086466642284e-05, + "loss": 0.3031, + "num_input_tokens_seen": 1615432, + "step": 8470 + }, + { + "epoch": 4.404885654885655, + "grad_norm": 0.24904616177082062, + "learning_rate": 3.432982854409899e-05, + "loss": 0.2847, + "num_input_tokens_seen": 1616360, + "step": 8475 + }, + { + "epoch": 4.407484407484407, + "grad_norm": 0.7157745361328125, + "learning_rate": 3.4308784744303276e-05, + "loss": 0.2863, + "num_input_tokens_seen": 1617320, + "step": 8480 + }, + { + "epoch": 4.41008316008316, + "grad_norm": 0.6017506718635559, + "learning_rate": 3.4287733284352556e-05, + "loss": 0.2237, + "num_input_tokens_seen": 1618248, + "step": 8485 + }, + { + "epoch": 4.412681912681912, + "grad_norm": 0.6091199517250061, + "learning_rate": 3.426667418156999e-05, + "loss": 0.2682, + "num_input_tokens_seen": 1619176, + "step": 8490 + }, + { + "epoch": 4.415280665280665, + "grad_norm": 0.2705000936985016, + "learning_rate": 3.4245607453285e-05, + "loss": 0.2711, + "num_input_tokens_seen": 1620072, + "step": 8495 + }, + { + "epoch": 4.417879417879418, + "grad_norm": 0.48754507303237915, + "learning_rate": 3.422453311683329e-05, + "loss": 0.2392, + "num_input_tokens_seen": 1621096, + "step": 8500 + }, + { + "epoch": 4.42047817047817, + "grad_norm": 0.5500050187110901, + "learning_rate": 3.4203451189556844e-05, + "loss": 0.2786, + "num_input_tokens_seen": 1622056, + "step": 8505 + }, + { + "epoch": 4.423076923076923, + "grad_norm": 0.6106411218643188, + "learning_rate": 3.4182361688803886e-05, + "loss": 0.2545, + "num_input_tokens_seen": 1623048, + "step": 8510 + }, + { + "epoch": 4.425675675675675, + "grad_norm": 0.3030518591403961, + "learning_rate": 3.416126463192885e-05, + "loss": 0.3018, + "num_input_tokens_seen": 1623976, + "step": 8515 + }, + { + "epoch": 4.428274428274428, + "grad_norm": 0.12734399735927582, + "learning_rate": 3.4140160036292414e-05, + "loss": 0.2101, + "num_input_tokens_seen": 1624936, + "step": 8520 + }, + { + "epoch": 4.4308731808731805, + "grad_norm": 0.632735550403595, + "learning_rate": 3.4119047919261444e-05, + "loss": 0.2718, + "num_input_tokens_seen": 1625992, + "step": 8525 + }, + { + "epoch": 4.4334719334719335, + "grad_norm": 0.664923906326294, + "learning_rate": 3.4097928298209e-05, + "loss": 0.3012, + "num_input_tokens_seen": 1626952, + "step": 8530 + }, + { + "epoch": 4.436070686070686, + "grad_norm": 0.393841028213501, + "learning_rate": 3.4076801190514334e-05, + "loss": 0.289, + "num_input_tokens_seen": 1627944, + "step": 8535 + }, + { + "epoch": 4.4386694386694385, + "grad_norm": 0.35504573583602905, + "learning_rate": 3.405566661356284e-05, + "loss": 0.2689, + "num_input_tokens_seen": 1628904, + "step": 8540 + }, + { + "epoch": 4.4412681912681915, + "grad_norm": 0.4040564298629761, + "learning_rate": 3.4034524584746044e-05, + "loss": 0.2853, + "num_input_tokens_seen": 1629864, + "step": 8545 + }, + { + "epoch": 4.443866943866944, + "grad_norm": 0.43177545070648193, + "learning_rate": 3.4013375121461625e-05, + "loss": 0.2575, + "num_input_tokens_seen": 1630888, + "step": 8550 + }, + { + "epoch": 4.446465696465697, + "grad_norm": 0.5011608004570007, + "learning_rate": 3.39922182411134e-05, + "loss": 0.2829, + "num_input_tokens_seen": 1631816, + "step": 8555 + }, + { + "epoch": 4.4490644490644495, + "grad_norm": 0.5039840936660767, + "learning_rate": 3.3971053961111245e-05, + "loss": 0.2423, + "num_input_tokens_seen": 1632712, + "step": 8560 + }, + { + "epoch": 4.451663201663202, + "grad_norm": 0.2931845486164093, + "learning_rate": 3.394988229887114e-05, + "loss": 0.2411, + "num_input_tokens_seen": 1633704, + "step": 8565 + }, + { + "epoch": 4.454261954261955, + "grad_norm": 0.5595581531524658, + "learning_rate": 3.392870327181516e-05, + "loss": 0.1866, + "num_input_tokens_seen": 1634696, + "step": 8570 + }, + { + "epoch": 4.456860706860707, + "grad_norm": 0.33892035484313965, + "learning_rate": 3.390751689737143e-05, + "loss": 0.2718, + "num_input_tokens_seen": 1635752, + "step": 8575 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 0.3223518431186676, + "learning_rate": 3.3886323192974106e-05, + "loss": 0.2715, + "num_input_tokens_seen": 1636712, + "step": 8580 + }, + { + "epoch": 4.462058212058212, + "grad_norm": 0.44267648458480835, + "learning_rate": 3.386512217606339e-05, + "loss": 0.2458, + "num_input_tokens_seen": 1637608, + "step": 8585 + }, + { + "epoch": 4.464656964656965, + "grad_norm": 0.2320842444896698, + "learning_rate": 3.384391386408551e-05, + "loss": 0.2315, + "num_input_tokens_seen": 1638600, + "step": 8590 + }, + { + "epoch": 4.467255717255718, + "grad_norm": 0.6310675144195557, + "learning_rate": 3.382269827449267e-05, + "loss": 0.332, + "num_input_tokens_seen": 1639624, + "step": 8595 + }, + { + "epoch": 4.46985446985447, + "grad_norm": 0.45571136474609375, + "learning_rate": 3.3801475424743075e-05, + "loss": 0.3386, + "num_input_tokens_seen": 1640552, + "step": 8600 + }, + { + "epoch": 4.472453222453223, + "grad_norm": 0.5964590907096863, + "learning_rate": 3.378024533230093e-05, + "loss": 0.2394, + "num_input_tokens_seen": 1641512, + "step": 8605 + }, + { + "epoch": 4.475051975051975, + "grad_norm": 0.15430568158626556, + "learning_rate": 3.3759008014636365e-05, + "loss": 0.2443, + "num_input_tokens_seen": 1642440, + "step": 8610 + }, + { + "epoch": 4.477650727650728, + "grad_norm": 0.5627594590187073, + "learning_rate": 3.373776348922546e-05, + "loss": 0.288, + "num_input_tokens_seen": 1643400, + "step": 8615 + }, + { + "epoch": 4.48024948024948, + "grad_norm": 0.24863019585609436, + "learning_rate": 3.3716511773550256e-05, + "loss": 0.2779, + "num_input_tokens_seen": 1644328, + "step": 8620 + }, + { + "epoch": 4.482848232848233, + "grad_norm": 0.19959214329719543, + "learning_rate": 3.369525288509867e-05, + "loss": 0.2534, + "num_input_tokens_seen": 1645224, + "step": 8625 + }, + { + "epoch": 4.485446985446986, + "grad_norm": 0.4385533928871155, + "learning_rate": 3.367398684136454e-05, + "loss": 0.2634, + "num_input_tokens_seen": 1646120, + "step": 8630 + }, + { + "epoch": 4.488045738045738, + "grad_norm": 0.2662065923213959, + "learning_rate": 3.365271365984761e-05, + "loss": 0.3047, + "num_input_tokens_seen": 1647112, + "step": 8635 + }, + { + "epoch": 4.490644490644491, + "grad_norm": 0.6684717535972595, + "learning_rate": 3.363143335805347e-05, + "loss": 0.2715, + "num_input_tokens_seen": 1648104, + "step": 8640 + }, + { + "epoch": 4.493243243243243, + "grad_norm": 0.6485357880592346, + "learning_rate": 3.361014595349358e-05, + "loss": 0.1993, + "num_input_tokens_seen": 1649064, + "step": 8645 + }, + { + "epoch": 4.495841995841996, + "grad_norm": 0.5444954633712769, + "learning_rate": 3.358885146368524e-05, + "loss": 0.2152, + "num_input_tokens_seen": 1650056, + "step": 8650 + }, + { + "epoch": 4.498440748440748, + "grad_norm": 0.3615078330039978, + "learning_rate": 3.35675499061516e-05, + "loss": 0.2335, + "num_input_tokens_seen": 1650952, + "step": 8655 + }, + { + "epoch": 4.5, + "eval_loss": 0.25682446360588074, + "eval_runtime": 7.9252, + "eval_samples_per_second": 108.009, + "eval_steps_per_second": 27.002, + "num_input_tokens_seen": 1651528, + "step": 8658 + }, + { + "epoch": 4.501039501039501, + "grad_norm": 0.6088489294052124, + "learning_rate": 3.35462412984216e-05, + "loss": 0.3047, + "num_input_tokens_seen": 1651944, + "step": 8660 + }, + { + "epoch": 4.503638253638254, + "grad_norm": 0.607962429523468, + "learning_rate": 3.352492565802999e-05, + "loss": 0.3465, + "num_input_tokens_seen": 1652968, + "step": 8665 + }, + { + "epoch": 4.506237006237006, + "grad_norm": 0.3525327444076538, + "learning_rate": 3.350360300251732e-05, + "loss": 0.2861, + "num_input_tokens_seen": 1653928, + "step": 8670 + }, + { + "epoch": 4.508835758835759, + "grad_norm": 0.6369516849517822, + "learning_rate": 3.348227334942989e-05, + "loss": 0.2777, + "num_input_tokens_seen": 1654856, + "step": 8675 + }, + { + "epoch": 4.511434511434511, + "grad_norm": 0.32750818133354187, + "learning_rate": 3.346093671631979e-05, + "loss": 0.288, + "num_input_tokens_seen": 1655752, + "step": 8680 + }, + { + "epoch": 4.514033264033264, + "grad_norm": 0.2833538055419922, + "learning_rate": 3.3439593120744816e-05, + "loss": 0.2902, + "num_input_tokens_seen": 1656712, + "step": 8685 + }, + { + "epoch": 4.516632016632016, + "grad_norm": 0.4180823564529419, + "learning_rate": 3.341824258026851e-05, + "loss": 0.2732, + "num_input_tokens_seen": 1657608, + "step": 8690 + }, + { + "epoch": 4.519230769230769, + "grad_norm": 0.2480800300836563, + "learning_rate": 3.339688511246014e-05, + "loss": 0.2683, + "num_input_tokens_seen": 1658568, + "step": 8695 + }, + { + "epoch": 4.521829521829522, + "grad_norm": 0.30809834599494934, + "learning_rate": 3.337552073489467e-05, + "loss": 0.3026, + "num_input_tokens_seen": 1659528, + "step": 8700 + }, + { + "epoch": 4.524428274428274, + "grad_norm": 0.49703747034072876, + "learning_rate": 3.335414946515275e-05, + "loss": 0.2898, + "num_input_tokens_seen": 1660616, + "step": 8705 + }, + { + "epoch": 4.527027027027027, + "grad_norm": 0.8000227212905884, + "learning_rate": 3.3332771320820676e-05, + "loss": 0.2391, + "num_input_tokens_seen": 1661576, + "step": 8710 + }, + { + "epoch": 4.529625779625779, + "grad_norm": 0.23773467540740967, + "learning_rate": 3.3311386319490436e-05, + "loss": 0.2556, + "num_input_tokens_seen": 1662408, + "step": 8715 + }, + { + "epoch": 4.532224532224532, + "grad_norm": 0.6644234657287598, + "learning_rate": 3.328999447875965e-05, + "loss": 0.3163, + "num_input_tokens_seen": 1663336, + "step": 8720 + }, + { + "epoch": 4.534823284823284, + "grad_norm": 0.30792123079299927, + "learning_rate": 3.326859581623155e-05, + "loss": 0.2576, + "num_input_tokens_seen": 1664264, + "step": 8725 + }, + { + "epoch": 4.537422037422037, + "grad_norm": 0.6473323106765747, + "learning_rate": 3.3247190349515e-05, + "loss": 0.2393, + "num_input_tokens_seen": 1665160, + "step": 8730 + }, + { + "epoch": 4.54002079002079, + "grad_norm": 0.7801569104194641, + "learning_rate": 3.322577809622446e-05, + "loss": 0.2716, + "num_input_tokens_seen": 1666120, + "step": 8735 + }, + { + "epoch": 4.542619542619542, + "grad_norm": 0.28381532430648804, + "learning_rate": 3.3204359073979964e-05, + "loss": 0.2459, + "num_input_tokens_seen": 1667048, + "step": 8740 + }, + { + "epoch": 4.545218295218295, + "grad_norm": 0.3881625533103943, + "learning_rate": 3.318293330040714e-05, + "loss": 0.2313, + "num_input_tokens_seen": 1667880, + "step": 8745 + }, + { + "epoch": 4.547817047817047, + "grad_norm": 0.5682137608528137, + "learning_rate": 3.316150079313713e-05, + "loss": 0.2959, + "num_input_tokens_seen": 1668840, + "step": 8750 + }, + { + "epoch": 4.5504158004158, + "grad_norm": 0.5810385942459106, + "learning_rate": 3.3140061569806685e-05, + "loss": 0.2372, + "num_input_tokens_seen": 1669800, + "step": 8755 + }, + { + "epoch": 4.553014553014553, + "grad_norm": 0.7682206034660339, + "learning_rate": 3.3118615648058e-05, + "loss": 0.311, + "num_input_tokens_seen": 1670728, + "step": 8760 + }, + { + "epoch": 4.5556133056133055, + "grad_norm": 0.32110852003097534, + "learning_rate": 3.309716304553884e-05, + "loss": 0.2475, + "num_input_tokens_seen": 1671720, + "step": 8765 + }, + { + "epoch": 4.558212058212058, + "grad_norm": 0.6304294466972351, + "learning_rate": 3.307570377990245e-05, + "loss": 0.2145, + "num_input_tokens_seen": 1672616, + "step": 8770 + }, + { + "epoch": 4.5608108108108105, + "grad_norm": 0.25167036056518555, + "learning_rate": 3.3054237868807556e-05, + "loss": 0.3131, + "num_input_tokens_seen": 1673576, + "step": 8775 + }, + { + "epoch": 4.5634095634095635, + "grad_norm": 0.4197852909564972, + "learning_rate": 3.303276532991835e-05, + "loss": 0.2686, + "num_input_tokens_seen": 1674408, + "step": 8780 + }, + { + "epoch": 4.5660083160083165, + "grad_norm": 0.30084213614463806, + "learning_rate": 3.3011286180904494e-05, + "loss": 0.2847, + "num_input_tokens_seen": 1675336, + "step": 8785 + }, + { + "epoch": 4.5686070686070686, + "grad_norm": 0.7033189535140991, + "learning_rate": 3.298980043944107e-05, + "loss": 0.2556, + "num_input_tokens_seen": 1676328, + "step": 8790 + }, + { + "epoch": 4.5712058212058215, + "grad_norm": 0.5281503796577454, + "learning_rate": 3.2968308123208595e-05, + "loss": 0.2601, + "num_input_tokens_seen": 1677320, + "step": 8795 + }, + { + "epoch": 4.573804573804574, + "grad_norm": 0.3083471357822418, + "learning_rate": 3.2946809249893e-05, + "loss": 0.2913, + "num_input_tokens_seen": 1678248, + "step": 8800 + }, + { + "epoch": 4.576403326403327, + "grad_norm": 0.2610066533088684, + "learning_rate": 3.29253038371856e-05, + "loss": 0.3024, + "num_input_tokens_seen": 1679208, + "step": 8805 + }, + { + "epoch": 4.579002079002079, + "grad_norm": 0.6044509410858154, + "learning_rate": 3.29037919027831e-05, + "loss": 0.2361, + "num_input_tokens_seen": 1680232, + "step": 8810 + }, + { + "epoch": 4.581600831600832, + "grad_norm": 0.5266008973121643, + "learning_rate": 3.288227346438756e-05, + "loss": 0.2365, + "num_input_tokens_seen": 1681192, + "step": 8815 + }, + { + "epoch": 4.584199584199585, + "grad_norm": 0.5582547783851624, + "learning_rate": 3.286074853970642e-05, + "loss": 0.2197, + "num_input_tokens_seen": 1682152, + "step": 8820 + }, + { + "epoch": 4.586798336798337, + "grad_norm": 0.5311879515647888, + "learning_rate": 3.2839217146452426e-05, + "loss": 0.2156, + "num_input_tokens_seen": 1683048, + "step": 8825 + }, + { + "epoch": 4.58939708939709, + "grad_norm": 0.25787127017974854, + "learning_rate": 3.281767930234366e-05, + "loss": 0.2899, + "num_input_tokens_seen": 1684008, + "step": 8830 + }, + { + "epoch": 4.591995841995842, + "grad_norm": 0.23378416895866394, + "learning_rate": 3.279613502510352e-05, + "loss": 0.2925, + "num_input_tokens_seen": 1684968, + "step": 8835 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.8267520666122437, + "learning_rate": 3.277458433246068e-05, + "loss": 0.3272, + "num_input_tokens_seen": 1685992, + "step": 8840 + }, + { + "epoch": 4.597193347193347, + "grad_norm": 0.4422950744628906, + "learning_rate": 3.2753027242149105e-05, + "loss": 0.2883, + "num_input_tokens_seen": 1686920, + "step": 8845 + }, + { + "epoch": 4.5997920997921, + "grad_norm": 0.463366836309433, + "learning_rate": 3.273146377190803e-05, + "loss": 0.29, + "num_input_tokens_seen": 1687880, + "step": 8850 + }, + { + "epoch": 4.602390852390853, + "grad_norm": 0.2652297914028168, + "learning_rate": 3.270989393948193e-05, + "loss": 0.2657, + "num_input_tokens_seen": 1688776, + "step": 8855 + }, + { + "epoch": 4.604989604989605, + "grad_norm": 0.7605202794075012, + "learning_rate": 3.2688317762620513e-05, + "loss": 0.2637, + "num_input_tokens_seen": 1689672, + "step": 8860 + }, + { + "epoch": 4.607588357588358, + "grad_norm": 0.6647170782089233, + "learning_rate": 3.266673525907872e-05, + "loss": 0.2384, + "num_input_tokens_seen": 1690632, + "step": 8865 + }, + { + "epoch": 4.61018711018711, + "grad_norm": 0.3085973858833313, + "learning_rate": 3.2645146446616684e-05, + "loss": 0.237, + "num_input_tokens_seen": 1691592, + "step": 8870 + }, + { + "epoch": 4.612785862785863, + "grad_norm": 0.4431568682193756, + "learning_rate": 3.2623551342999734e-05, + "loss": 0.2012, + "num_input_tokens_seen": 1692488, + "step": 8875 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.4227140247821808, + "learning_rate": 3.2601949965998404e-05, + "loss": 0.2603, + "num_input_tokens_seen": 1693352, + "step": 8880 + }, + { + "epoch": 4.617983367983368, + "grad_norm": 0.7142307758331299, + "learning_rate": 3.258034233338834e-05, + "loss": 0.3083, + "num_input_tokens_seen": 1694248, + "step": 8885 + }, + { + "epoch": 4.620582120582121, + "grad_norm": 0.2296992987394333, + "learning_rate": 3.2558728462950364e-05, + "loss": 0.2523, + "num_input_tokens_seen": 1695176, + "step": 8890 + }, + { + "epoch": 4.623180873180873, + "grad_norm": 0.8130151629447937, + "learning_rate": 3.2537108372470425e-05, + "loss": 0.2632, + "num_input_tokens_seen": 1696136, + "step": 8895 + }, + { + "epoch": 4.625779625779626, + "grad_norm": 0.4692815840244293, + "learning_rate": 3.2515482079739615e-05, + "loss": 0.1985, + "num_input_tokens_seen": 1697064, + "step": 8900 + }, + { + "epoch": 4.628378378378378, + "grad_norm": 0.37024834752082825, + "learning_rate": 3.2493849602554076e-05, + "loss": 0.1999, + "num_input_tokens_seen": 1698024, + "step": 8905 + }, + { + "epoch": 4.630977130977131, + "grad_norm": 0.4442990720272064, + "learning_rate": 3.24722109587151e-05, + "loss": 0.2158, + "num_input_tokens_seen": 1698856, + "step": 8910 + }, + { + "epoch": 4.633575883575883, + "grad_norm": 0.19106918573379517, + "learning_rate": 3.245056616602901e-05, + "loss": 0.2183, + "num_input_tokens_seen": 1699816, + "step": 8915 + }, + { + "epoch": 4.636174636174636, + "grad_norm": 0.3146427273750305, + "learning_rate": 3.242891524230721e-05, + "loss": 0.3363, + "num_input_tokens_seen": 1700712, + "step": 8920 + }, + { + "epoch": 4.638773388773389, + "grad_norm": 0.5166797637939453, + "learning_rate": 3.2407258205366136e-05, + "loss": 0.2177, + "num_input_tokens_seen": 1701672, + "step": 8925 + }, + { + "epoch": 4.641372141372141, + "grad_norm": 0.5910977721214294, + "learning_rate": 3.238559507302726e-05, + "loss": 0.2211, + "num_input_tokens_seen": 1702696, + "step": 8930 + }, + { + "epoch": 4.643970893970894, + "grad_norm": 0.520616888999939, + "learning_rate": 3.236392586311709e-05, + "loss": 0.2651, + "num_input_tokens_seen": 1703688, + "step": 8935 + }, + { + "epoch": 4.646569646569646, + "grad_norm": 0.350982129573822, + "learning_rate": 3.23422505934671e-05, + "loss": 0.2952, + "num_input_tokens_seen": 1704648, + "step": 8940 + }, + { + "epoch": 4.649168399168399, + "grad_norm": 0.5879229307174683, + "learning_rate": 3.232056928191376e-05, + "loss": 0.2697, + "num_input_tokens_seen": 1705544, + "step": 8945 + }, + { + "epoch": 4.651767151767151, + "grad_norm": 0.6433422565460205, + "learning_rate": 3.229888194629854e-05, + "loss": 0.2654, + "num_input_tokens_seen": 1706440, + "step": 8950 + }, + { + "epoch": 4.654365904365904, + "grad_norm": 0.23641496896743774, + "learning_rate": 3.227718860446782e-05, + "loss": 0.2836, + "num_input_tokens_seen": 1707336, + "step": 8955 + }, + { + "epoch": 4.656964656964657, + "grad_norm": 0.49931254982948303, + "learning_rate": 3.2255489274272975e-05, + "loss": 0.265, + "num_input_tokens_seen": 1708296, + "step": 8960 + }, + { + "epoch": 4.659563409563409, + "grad_norm": 0.6781687140464783, + "learning_rate": 3.2233783973570274e-05, + "loss": 0.2523, + "num_input_tokens_seen": 1709288, + "step": 8965 + }, + { + "epoch": 4.662162162162162, + "grad_norm": 0.529626190662384, + "learning_rate": 3.22120727202209e-05, + "loss": 0.2448, + "num_input_tokens_seen": 1710184, + "step": 8970 + }, + { + "epoch": 4.664760914760915, + "grad_norm": 0.4363968074321747, + "learning_rate": 3.219035553209093e-05, + "loss": 0.3106, + "num_input_tokens_seen": 1711112, + "step": 8975 + }, + { + "epoch": 4.667359667359667, + "grad_norm": 0.2720423936843872, + "learning_rate": 3.216863242705136e-05, + "loss": 0.2196, + "num_input_tokens_seen": 1712040, + "step": 8980 + }, + { + "epoch": 4.66995841995842, + "grad_norm": 0.5254648327827454, + "learning_rate": 3.214690342297802e-05, + "loss": 0.3061, + "num_input_tokens_seen": 1713000, + "step": 8985 + }, + { + "epoch": 4.672557172557172, + "grad_norm": 0.26519426703453064, + "learning_rate": 3.212516853775161e-05, + "loss": 0.2169, + "num_input_tokens_seen": 1713928, + "step": 8990 + }, + { + "epoch": 4.675155925155925, + "grad_norm": 0.653466522693634, + "learning_rate": 3.210342778925763e-05, + "loss": 0.2571, + "num_input_tokens_seen": 1714888, + "step": 8995 + }, + { + "epoch": 4.6777546777546775, + "grad_norm": 0.5399028658866882, + "learning_rate": 3.2081681195386496e-05, + "loss": 0.2805, + "num_input_tokens_seen": 1715912, + "step": 9000 + }, + { + "epoch": 4.68035343035343, + "grad_norm": 0.5589391589164734, + "learning_rate": 3.205992877403334e-05, + "loss": 0.181, + "num_input_tokens_seen": 1716872, + "step": 9005 + }, + { + "epoch": 4.682952182952183, + "grad_norm": 0.3922744691371918, + "learning_rate": 3.203817054309813e-05, + "loss": 0.279, + "num_input_tokens_seen": 1717864, + "step": 9010 + }, + { + "epoch": 4.6855509355509355, + "grad_norm": 0.6602903604507446, + "learning_rate": 3.2016406520485636e-05, + "loss": 0.2591, + "num_input_tokens_seen": 1718824, + "step": 9015 + }, + { + "epoch": 4.6881496881496885, + "grad_norm": 0.42451801896095276, + "learning_rate": 3.199463672410534e-05, + "loss": 0.3326, + "num_input_tokens_seen": 1719880, + "step": 9020 + }, + { + "epoch": 4.6907484407484406, + "grad_norm": 0.3741670846939087, + "learning_rate": 3.197286117187151e-05, + "loss": 0.337, + "num_input_tokens_seen": 1720840, + "step": 9025 + }, + { + "epoch": 4.6933471933471935, + "grad_norm": 0.32705333828926086, + "learning_rate": 3.195107988170315e-05, + "loss": 0.2973, + "num_input_tokens_seen": 1721800, + "step": 9030 + }, + { + "epoch": 4.695945945945946, + "grad_norm": 0.4684741497039795, + "learning_rate": 3.1929292871523994e-05, + "loss": 0.2527, + "num_input_tokens_seen": 1722856, + "step": 9035 + }, + { + "epoch": 4.698544698544699, + "grad_norm": 0.4980649948120117, + "learning_rate": 3.190750015926244e-05, + "loss": 0.2602, + "num_input_tokens_seen": 1723720, + "step": 9040 + }, + { + "epoch": 4.701143451143452, + "grad_norm": 0.6037728190422058, + "learning_rate": 3.188570176285164e-05, + "loss": 0.2435, + "num_input_tokens_seen": 1724680, + "step": 9045 + }, + { + "epoch": 4.703742203742204, + "grad_norm": 0.2597607970237732, + "learning_rate": 3.1863897700229375e-05, + "loss": 0.2272, + "num_input_tokens_seen": 1725672, + "step": 9050 + }, + { + "epoch": 4.706340956340957, + "grad_norm": 0.3331742286682129, + "learning_rate": 3.18420879893381e-05, + "loss": 0.2949, + "num_input_tokens_seen": 1726696, + "step": 9055 + }, + { + "epoch": 4.708939708939709, + "grad_norm": 0.22155295312404633, + "learning_rate": 3.182027264812494e-05, + "loss": 0.2456, + "num_input_tokens_seen": 1727688, + "step": 9060 + }, + { + "epoch": 4.711538461538462, + "grad_norm": 0.18942470848560333, + "learning_rate": 3.179845169454162e-05, + "loss": 0.281, + "num_input_tokens_seen": 1728680, + "step": 9065 + }, + { + "epoch": 4.714137214137214, + "grad_norm": 0.4309619665145874, + "learning_rate": 3.1776625146544504e-05, + "loss": 0.2281, + "num_input_tokens_seen": 1729672, + "step": 9070 + }, + { + "epoch": 4.716735966735967, + "grad_norm": 0.11862193048000336, + "learning_rate": 3.175479302209455e-05, + "loss": 0.2746, + "num_input_tokens_seen": 1730600, + "step": 9075 + }, + { + "epoch": 4.71933471933472, + "grad_norm": 0.5858727097511292, + "learning_rate": 3.173295533915733e-05, + "loss": 0.2369, + "num_input_tokens_seen": 1731656, + "step": 9080 + }, + { + "epoch": 4.721933471933472, + "grad_norm": 0.5284850597381592, + "learning_rate": 3.1711112115702954e-05, + "loss": 0.2664, + "num_input_tokens_seen": 1732584, + "step": 9085 + }, + { + "epoch": 4.724532224532225, + "grad_norm": 0.28869590163230896, + "learning_rate": 3.1689263369706104e-05, + "loss": 0.2374, + "num_input_tokens_seen": 1733640, + "step": 9090 + }, + { + "epoch": 4.727130977130977, + "grad_norm": 0.25578606128692627, + "learning_rate": 3.166740911914603e-05, + "loss": 0.2619, + "num_input_tokens_seen": 1734536, + "step": 9095 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.8487573266029358, + "learning_rate": 3.164554938200647e-05, + "loss": 0.2877, + "num_input_tokens_seen": 1735464, + "step": 9100 + }, + { + "epoch": 4.732328482328482, + "grad_norm": 0.40920907258987427, + "learning_rate": 3.162368417627571e-05, + "loss": 0.2423, + "num_input_tokens_seen": 1736520, + "step": 9105 + }, + { + "epoch": 4.734927234927235, + "grad_norm": 0.17665216326713562, + "learning_rate": 3.1601813519946514e-05, + "loss": 0.251, + "num_input_tokens_seen": 1737512, + "step": 9110 + }, + { + "epoch": 4.737525987525988, + "grad_norm": 0.6316314339637756, + "learning_rate": 3.157993743101616e-05, + "loss": 0.3084, + "num_input_tokens_seen": 1738504, + "step": 9115 + }, + { + "epoch": 4.74012474012474, + "grad_norm": 0.39096421003341675, + "learning_rate": 3.1558055927486355e-05, + "loss": 0.3092, + "num_input_tokens_seen": 1739432, + "step": 9120 + }, + { + "epoch": 4.742723492723493, + "grad_norm": 0.5713022351264954, + "learning_rate": 3.1536169027363304e-05, + "loss": 0.2482, + "num_input_tokens_seen": 1740424, + "step": 9125 + }, + { + "epoch": 4.745322245322245, + "grad_norm": 0.8029695153236389, + "learning_rate": 3.151427674865763e-05, + "loss": 0.3228, + "num_input_tokens_seen": 1741384, + "step": 9130 + }, + { + "epoch": 4.747920997920998, + "grad_norm": 0.3339039981365204, + "learning_rate": 3.149237910938438e-05, + "loss": 0.2582, + "num_input_tokens_seen": 1742344, + "step": 9135 + }, + { + "epoch": 4.75051975051975, + "grad_norm": 1.0043460130691528, + "learning_rate": 3.147047612756302e-05, + "loss": 0.3049, + "num_input_tokens_seen": 1743272, + "step": 9140 + }, + { + "epoch": 4.753118503118503, + "grad_norm": 0.30551573634147644, + "learning_rate": 3.1448567821217415e-05, + "loss": 0.2541, + "num_input_tokens_seen": 1744232, + "step": 9145 + }, + { + "epoch": 4.755717255717256, + "grad_norm": 0.29834339022636414, + "learning_rate": 3.14266542083758e-05, + "loss": 0.2571, + "num_input_tokens_seen": 1745128, + "step": 9150 + }, + { + "epoch": 4.758316008316008, + "grad_norm": 0.6555430293083191, + "learning_rate": 3.1404735307070785e-05, + "loss": 0.2424, + "num_input_tokens_seen": 1746024, + "step": 9155 + }, + { + "epoch": 4.760914760914761, + "grad_norm": 0.26624175906181335, + "learning_rate": 3.138281113533933e-05, + "loss": 0.2993, + "num_input_tokens_seen": 1747048, + "step": 9160 + }, + { + "epoch": 4.763513513513513, + "grad_norm": 0.3669930696487427, + "learning_rate": 3.136088171122274e-05, + "loss": 0.2923, + "num_input_tokens_seen": 1747976, + "step": 9165 + }, + { + "epoch": 4.766112266112266, + "grad_norm": 0.5683193802833557, + "learning_rate": 3.133894705276662e-05, + "loss": 0.2096, + "num_input_tokens_seen": 1748968, + "step": 9170 + }, + { + "epoch": 4.768711018711018, + "grad_norm": 0.30474692583084106, + "learning_rate": 3.131700717802091e-05, + "loss": 0.2429, + "num_input_tokens_seen": 1749960, + "step": 9175 + }, + { + "epoch": 4.771309771309771, + "grad_norm": 0.6376957297325134, + "learning_rate": 3.129506210503983e-05, + "loss": 0.3576, + "num_input_tokens_seen": 1750984, + "step": 9180 + }, + { + "epoch": 4.773908523908524, + "grad_norm": 0.6736136674880981, + "learning_rate": 3.127311185188187e-05, + "loss": 0.1943, + "num_input_tokens_seen": 1751944, + "step": 9185 + }, + { + "epoch": 4.776507276507276, + "grad_norm": 0.11615397036075592, + "learning_rate": 3.125115643660978e-05, + "loss": 0.2228, + "num_input_tokens_seen": 1752872, + "step": 9190 + }, + { + "epoch": 4.779106029106029, + "grad_norm": 0.11152761429548264, + "learning_rate": 3.12291958772906e-05, + "loss": 0.3187, + "num_input_tokens_seen": 1753864, + "step": 9195 + }, + { + "epoch": 4.781704781704782, + "grad_norm": 0.6353819370269775, + "learning_rate": 3.120723019199554e-05, + "loss": 0.2809, + "num_input_tokens_seen": 1754824, + "step": 9200 + }, + { + "epoch": 4.784303534303534, + "grad_norm": 0.21763098239898682, + "learning_rate": 3.118525939880007e-05, + "loss": 0.2548, + "num_input_tokens_seen": 1755784, + "step": 9205 + }, + { + "epoch": 4.786902286902287, + "grad_norm": 0.3810828626155853, + "learning_rate": 3.116328351578384e-05, + "loss": 0.2754, + "num_input_tokens_seen": 1756840, + "step": 9210 + }, + { + "epoch": 4.789501039501039, + "grad_norm": 0.5753610134124756, + "learning_rate": 3.114130256103072e-05, + "loss": 0.2984, + "num_input_tokens_seen": 1757832, + "step": 9215 + }, + { + "epoch": 4.792099792099792, + "grad_norm": 0.6506615877151489, + "learning_rate": 3.111931655262872e-05, + "loss": 0.2515, + "num_input_tokens_seen": 1758888, + "step": 9220 + }, + { + "epoch": 4.794698544698544, + "grad_norm": 0.788743257522583, + "learning_rate": 3.109732550867003e-05, + "loss": 0.2672, + "num_input_tokens_seen": 1759848, + "step": 9225 + }, + { + "epoch": 4.797297297297297, + "grad_norm": 0.43677064776420593, + "learning_rate": 3.107532944725097e-05, + "loss": 0.2551, + "num_input_tokens_seen": 1760840, + "step": 9230 + }, + { + "epoch": 4.79989604989605, + "grad_norm": 0.315082311630249, + "learning_rate": 3.1053328386472e-05, + "loss": 0.2658, + "num_input_tokens_seen": 1761800, + "step": 9235 + }, + { + "epoch": 4.802494802494802, + "grad_norm": 0.24554385244846344, + "learning_rate": 3.103132234443768e-05, + "loss": 0.2462, + "num_input_tokens_seen": 1762728, + "step": 9240 + }, + { + "epoch": 4.805093555093555, + "grad_norm": 0.3098351061344147, + "learning_rate": 3.10093113392567e-05, + "loss": 0.2494, + "num_input_tokens_seen": 1763624, + "step": 9245 + }, + { + "epoch": 4.8076923076923075, + "grad_norm": 0.2575865387916565, + "learning_rate": 3.0987295389041786e-05, + "loss": 0.2478, + "num_input_tokens_seen": 1764520, + "step": 9250 + }, + { + "epoch": 4.8102910602910605, + "grad_norm": 0.5310531258583069, + "learning_rate": 3.096527451190978e-05, + "loss": 0.2859, + "num_input_tokens_seen": 1765544, + "step": 9255 + }, + { + "epoch": 4.8128898128898125, + "grad_norm": 0.6355285048484802, + "learning_rate": 3.094324872598154e-05, + "loss": 0.3076, + "num_input_tokens_seen": 1766536, + "step": 9260 + }, + { + "epoch": 4.8154885654885655, + "grad_norm": 0.4423603415489197, + "learning_rate": 3.0921218049382e-05, + "loss": 0.2909, + "num_input_tokens_seen": 1767560, + "step": 9265 + }, + { + "epoch": 4.8180873180873185, + "grad_norm": 0.9200291633605957, + "learning_rate": 3.089918250024008e-05, + "loss": 0.2647, + "num_input_tokens_seen": 1768552, + "step": 9270 + }, + { + "epoch": 4.820686070686071, + "grad_norm": 0.5840334296226501, + "learning_rate": 3.087714209668875e-05, + "loss": 0.2287, + "num_input_tokens_seen": 1769480, + "step": 9275 + }, + { + "epoch": 4.8232848232848236, + "grad_norm": 0.43025141954421997, + "learning_rate": 3.085509685686494e-05, + "loss": 0.2676, + "num_input_tokens_seen": 1770440, + "step": 9280 + }, + { + "epoch": 4.825883575883576, + "grad_norm": 1.1537047624588013, + "learning_rate": 3.0833046798909563e-05, + "loss": 0.3291, + "num_input_tokens_seen": 1771400, + "step": 9285 + }, + { + "epoch": 4.828482328482329, + "grad_norm": 0.22389288246631622, + "learning_rate": 3.0810991940967535e-05, + "loss": 0.2533, + "num_input_tokens_seen": 1772328, + "step": 9290 + }, + { + "epoch": 4.831081081081081, + "grad_norm": 0.2012796700000763, + "learning_rate": 3.078893230118767e-05, + "loss": 0.2707, + "num_input_tokens_seen": 1773288, + "step": 9295 + }, + { + "epoch": 4.833679833679834, + "grad_norm": 0.4952752888202667, + "learning_rate": 3.076686789772276e-05, + "loss": 0.2838, + "num_input_tokens_seen": 1774216, + "step": 9300 + }, + { + "epoch": 4.836278586278587, + "grad_norm": 0.6864033937454224, + "learning_rate": 3.0744798748729494e-05, + "loss": 0.2507, + "num_input_tokens_seen": 1775144, + "step": 9305 + }, + { + "epoch": 4.838877338877339, + "grad_norm": 0.43452855944633484, + "learning_rate": 3.072272487236847e-05, + "loss": 0.2753, + "num_input_tokens_seen": 1776136, + "step": 9310 + }, + { + "epoch": 4.841476091476092, + "grad_norm": 0.26889148354530334, + "learning_rate": 3.0700646286804165e-05, + "loss": 0.2523, + "num_input_tokens_seen": 1777064, + "step": 9315 + }, + { + "epoch": 4.844074844074844, + "grad_norm": 0.5042035579681396, + "learning_rate": 3.067856301020495e-05, + "loss": 0.2907, + "num_input_tokens_seen": 1777960, + "step": 9320 + }, + { + "epoch": 4.846673596673597, + "grad_norm": 0.6142670512199402, + "learning_rate": 3.065647506074306e-05, + "loss": 0.2719, + "num_input_tokens_seen": 1778920, + "step": 9325 + }, + { + "epoch": 4.849272349272349, + "grad_norm": 0.5737236738204956, + "learning_rate": 3.0634382456594543e-05, + "loss": 0.2805, + "num_input_tokens_seen": 1779880, + "step": 9330 + }, + { + "epoch": 4.851871101871102, + "grad_norm": 0.2585936486721039, + "learning_rate": 3.061228521593931e-05, + "loss": 0.2669, + "num_input_tokens_seen": 1780840, + "step": 9335 + }, + { + "epoch": 4.854469854469855, + "grad_norm": 0.3999156355857849, + "learning_rate": 3.059018335696109e-05, + "loss": 0.2875, + "num_input_tokens_seen": 1781768, + "step": 9340 + }, + { + "epoch": 4.857068607068607, + "grad_norm": 0.6700050830841064, + "learning_rate": 3.056807689784738e-05, + "loss": 0.2533, + "num_input_tokens_seen": 1782632, + "step": 9345 + }, + { + "epoch": 4.85966735966736, + "grad_norm": 0.3412896990776062, + "learning_rate": 3.0545965856789486e-05, + "loss": 0.3062, + "num_input_tokens_seen": 1783560, + "step": 9350 + }, + { + "epoch": 4.862266112266112, + "grad_norm": 0.44469642639160156, + "learning_rate": 3.0523850251982474e-05, + "loss": 0.2455, + "num_input_tokens_seen": 1784552, + "step": 9355 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.7601420283317566, + "learning_rate": 3.0501730101625182e-05, + "loss": 0.2621, + "num_input_tokens_seen": 1785480, + "step": 9360 + }, + { + "epoch": 4.867463617463617, + "grad_norm": 0.6039171814918518, + "learning_rate": 3.0479605423920165e-05, + "loss": 0.3013, + "num_input_tokens_seen": 1786440, + "step": 9365 + }, + { + "epoch": 4.87006237006237, + "grad_norm": 0.48772019147872925, + "learning_rate": 3.0457476237073723e-05, + "loss": 0.3041, + "num_input_tokens_seen": 1787432, + "step": 9370 + }, + { + "epoch": 4.872661122661123, + "grad_norm": 0.48569750785827637, + "learning_rate": 3.043534255929586e-05, + "loss": 0.2351, + "num_input_tokens_seen": 1788424, + "step": 9375 + }, + { + "epoch": 4.875259875259875, + "grad_norm": 0.684185266494751, + "learning_rate": 3.0413204408800265e-05, + "loss": 0.254, + "num_input_tokens_seen": 1789320, + "step": 9380 + }, + { + "epoch": 4.877858627858628, + "grad_norm": 0.2592814564704895, + "learning_rate": 3.0391061803804334e-05, + "loss": 0.2822, + "num_input_tokens_seen": 1790280, + "step": 9385 + }, + { + "epoch": 4.88045738045738, + "grad_norm": 0.3609214723110199, + "learning_rate": 3.036891476252911e-05, + "loss": 0.2633, + "num_input_tokens_seen": 1791208, + "step": 9390 + }, + { + "epoch": 4.883056133056133, + "grad_norm": 0.4120696485042572, + "learning_rate": 3.0346763303199273e-05, + "loss": 0.2228, + "num_input_tokens_seen": 1792168, + "step": 9395 + }, + { + "epoch": 4.885654885654886, + "grad_norm": 0.29365742206573486, + "learning_rate": 3.0324607444043162e-05, + "loss": 0.2532, + "num_input_tokens_seen": 1793032, + "step": 9400 + }, + { + "epoch": 4.888253638253638, + "grad_norm": 0.19108402729034424, + "learning_rate": 3.0302447203292737e-05, + "loss": 0.2858, + "num_input_tokens_seen": 1793992, + "step": 9405 + }, + { + "epoch": 4.890852390852391, + "grad_norm": 0.30659937858581543, + "learning_rate": 3.0280282599183547e-05, + "loss": 0.2922, + "num_input_tokens_seen": 1794952, + "step": 9410 + }, + { + "epoch": 4.893451143451143, + "grad_norm": 0.3474118709564209, + "learning_rate": 3.025811364995474e-05, + "loss": 0.2738, + "num_input_tokens_seen": 1795944, + "step": 9415 + }, + { + "epoch": 4.896049896049896, + "grad_norm": 0.2163195013999939, + "learning_rate": 3.0235940373849042e-05, + "loss": 0.22, + "num_input_tokens_seen": 1796872, + "step": 9420 + }, + { + "epoch": 4.898648648648649, + "grad_norm": 0.16588808596134186, + "learning_rate": 3.0213762789112737e-05, + "loss": 0.3329, + "num_input_tokens_seen": 1797832, + "step": 9425 + }, + { + "epoch": 4.901247401247401, + "grad_norm": 0.2556842863559723, + "learning_rate": 3.0191580913995655e-05, + "loss": 0.2223, + "num_input_tokens_seen": 1798760, + "step": 9430 + }, + { + "epoch": 4.903846153846154, + "grad_norm": 0.5680045485496521, + "learning_rate": 3.016939476675115e-05, + "loss": 0.2556, + "num_input_tokens_seen": 1799752, + "step": 9435 + }, + { + "epoch": 4.906444906444906, + "grad_norm": 0.4948420822620392, + "learning_rate": 3.0147204365636116e-05, + "loss": 0.2249, + "num_input_tokens_seen": 1800680, + "step": 9440 + }, + { + "epoch": 4.909043659043659, + "grad_norm": 0.33528783917427063, + "learning_rate": 3.0125009728910908e-05, + "loss": 0.3235, + "num_input_tokens_seen": 1801576, + "step": 9445 + }, + { + "epoch": 4.911642411642411, + "grad_norm": 0.5206117033958435, + "learning_rate": 3.01028108748394e-05, + "loss": 0.2269, + "num_input_tokens_seen": 1802568, + "step": 9450 + }, + { + "epoch": 4.914241164241164, + "grad_norm": 0.16960595548152924, + "learning_rate": 3.0080607821688922e-05, + "loss": 0.2603, + "num_input_tokens_seen": 1803560, + "step": 9455 + }, + { + "epoch": 4.916839916839917, + "grad_norm": 0.8878976106643677, + "learning_rate": 3.005840058773025e-05, + "loss": 0.2584, + "num_input_tokens_seen": 1804552, + "step": 9460 + }, + { + "epoch": 4.919438669438669, + "grad_norm": 0.36960166692733765, + "learning_rate": 3.003618919123763e-05, + "loss": 0.2964, + "num_input_tokens_seen": 1805480, + "step": 9465 + }, + { + "epoch": 4.922037422037422, + "grad_norm": 0.746817946434021, + "learning_rate": 3.0013973650488713e-05, + "loss": 0.3023, + "num_input_tokens_seen": 1806472, + "step": 9470 + }, + { + "epoch": 4.924636174636174, + "grad_norm": 0.45479604601860046, + "learning_rate": 2.9991753983764547e-05, + "loss": 0.2225, + "num_input_tokens_seen": 1807432, + "step": 9475 + }, + { + "epoch": 4.927234927234927, + "grad_norm": 0.7246955037117004, + "learning_rate": 2.9969530209349604e-05, + "loss": 0.2942, + "num_input_tokens_seen": 1808392, + "step": 9480 + }, + { + "epoch": 4.9298336798336795, + "grad_norm": 0.2558760941028595, + "learning_rate": 2.9947302345531725e-05, + "loss": 0.2486, + "num_input_tokens_seen": 1809352, + "step": 9485 + }, + { + "epoch": 4.9324324324324325, + "grad_norm": 0.54148930311203, + "learning_rate": 2.9925070410602112e-05, + "loss": 0.2404, + "num_input_tokens_seen": 1810280, + "step": 9490 + }, + { + "epoch": 4.935031185031185, + "grad_norm": 0.3385000228881836, + "learning_rate": 2.9902834422855308e-05, + "loss": 0.2694, + "num_input_tokens_seen": 1811240, + "step": 9495 + }, + { + "epoch": 4.9376299376299375, + "grad_norm": 0.7116971015930176, + "learning_rate": 2.9880594400589213e-05, + "loss": 0.2127, + "num_input_tokens_seen": 1812232, + "step": 9500 + }, + { + "epoch": 4.9402286902286905, + "grad_norm": 0.5928667783737183, + "learning_rate": 2.9858350362105035e-05, + "loss": 0.2682, + "num_input_tokens_seen": 1813192, + "step": 9505 + }, + { + "epoch": 4.942827442827443, + "grad_norm": 0.15544363856315613, + "learning_rate": 2.983610232570728e-05, + "loss": 0.3223, + "num_input_tokens_seen": 1814120, + "step": 9510 + }, + { + "epoch": 4.9454261954261955, + "grad_norm": 0.4602639675140381, + "learning_rate": 2.9813850309703773e-05, + "loss": 0.2501, + "num_input_tokens_seen": 1815048, + "step": 9515 + }, + { + "epoch": 4.948024948024948, + "grad_norm": 0.4486646056175232, + "learning_rate": 2.9791594332405576e-05, + "loss": 0.2821, + "num_input_tokens_seen": 1815944, + "step": 9520 + }, + { + "epoch": 4.950623700623701, + "grad_norm": 0.8385176658630371, + "learning_rate": 2.9769334412127024e-05, + "loss": 0.2744, + "num_input_tokens_seen": 1816936, + "step": 9525 + }, + { + "epoch": 4.953222453222454, + "grad_norm": 0.3087857961654663, + "learning_rate": 2.974707056718571e-05, + "loss": 0.2093, + "num_input_tokens_seen": 1817832, + "step": 9530 + }, + { + "epoch": 4.955821205821206, + "grad_norm": 0.430347740650177, + "learning_rate": 2.9724802815902443e-05, + "loss": 0.2782, + "num_input_tokens_seen": 1818760, + "step": 9535 + }, + { + "epoch": 4.958419958419959, + "grad_norm": 0.5272422432899475, + "learning_rate": 2.9702531176601257e-05, + "loss": 0.2933, + "num_input_tokens_seen": 1819688, + "step": 9540 + }, + { + "epoch": 4.961018711018711, + "grad_norm": 0.6440242528915405, + "learning_rate": 2.9680255667609368e-05, + "loss": 0.2318, + "num_input_tokens_seen": 1820680, + "step": 9545 + }, + { + "epoch": 4.963617463617464, + "grad_norm": 0.06918247789144516, + "learning_rate": 2.965797630725719e-05, + "loss": 0.2605, + "num_input_tokens_seen": 1821608, + "step": 9550 + }, + { + "epoch": 4.966216216216216, + "grad_norm": 0.5283791422843933, + "learning_rate": 2.9635693113878317e-05, + "loss": 0.3081, + "num_input_tokens_seen": 1822504, + "step": 9555 + }, + { + "epoch": 4.968814968814969, + "grad_norm": 0.7443902492523193, + "learning_rate": 2.961340610580946e-05, + "loss": 0.3479, + "num_input_tokens_seen": 1823432, + "step": 9560 + }, + { + "epoch": 4.971413721413722, + "grad_norm": 0.28910142183303833, + "learning_rate": 2.959111530139051e-05, + "loss": 0.251, + "num_input_tokens_seen": 1824424, + "step": 9565 + }, + { + "epoch": 4.974012474012474, + "grad_norm": 0.4707660675048828, + "learning_rate": 2.9568820718964464e-05, + "loss": 0.2609, + "num_input_tokens_seen": 1825352, + "step": 9570 + }, + { + "epoch": 4.976611226611227, + "grad_norm": 0.32397890090942383, + "learning_rate": 2.9546522376877416e-05, + "loss": 0.2701, + "num_input_tokens_seen": 1826280, + "step": 9575 + }, + { + "epoch": 4.979209979209979, + "grad_norm": 0.42722994089126587, + "learning_rate": 2.952422029347858e-05, + "loss": 0.2721, + "num_input_tokens_seen": 1827240, + "step": 9580 + }, + { + "epoch": 4.981808731808732, + "grad_norm": 0.28642362356185913, + "learning_rate": 2.9501914487120226e-05, + "loss": 0.302, + "num_input_tokens_seen": 1828168, + "step": 9585 + }, + { + "epoch": 4.984407484407484, + "grad_norm": 0.5844094753265381, + "learning_rate": 2.9479604976157705e-05, + "loss": 0.2817, + "num_input_tokens_seen": 1829160, + "step": 9590 + }, + { + "epoch": 4.987006237006237, + "grad_norm": 0.49745678901672363, + "learning_rate": 2.9457291778949396e-05, + "loss": 0.2683, + "num_input_tokens_seen": 1830152, + "step": 9595 + }, + { + "epoch": 4.98960498960499, + "grad_norm": 0.4985657334327698, + "learning_rate": 2.943497491385674e-05, + "loss": 0.2509, + "num_input_tokens_seen": 1831112, + "step": 9600 + }, + { + "epoch": 4.992203742203742, + "grad_norm": 0.16422833502292633, + "learning_rate": 2.9412654399244173e-05, + "loss": 0.2257, + "num_input_tokens_seen": 1832040, + "step": 9605 + }, + { + "epoch": 4.994802494802495, + "grad_norm": 0.45195215940475464, + "learning_rate": 2.939033025347913e-05, + "loss": 0.3487, + "num_input_tokens_seen": 1832968, + "step": 9610 + }, + { + "epoch": 4.997401247401247, + "grad_norm": 0.40260452032089233, + "learning_rate": 2.936800249493206e-05, + "loss": 0.2642, + "num_input_tokens_seen": 1833896, + "step": 9615 + }, + { + "epoch": 5.0, + "grad_norm": 0.6826242208480835, + "learning_rate": 2.9345671141976373e-05, + "loss": 0.247, + "num_input_tokens_seen": 1834816, + "step": 9620 + }, + { + "epoch": 5.0, + "eval_loss": 0.2539876699447632, + "eval_runtime": 7.937, + "eval_samples_per_second": 107.849, + "eval_steps_per_second": 26.962, + "num_input_tokens_seen": 1834816, + "step": 9620 + }, + { + "epoch": 5.002598752598753, + "grad_norm": 0.7938888669013977, + "learning_rate": 2.9323336212988413e-05, + "loss": 0.2574, + "num_input_tokens_seen": 1835776, + "step": 9625 + }, + { + "epoch": 5.005197505197505, + "grad_norm": 0.3782363831996918, + "learning_rate": 2.93009977263475e-05, + "loss": 0.2388, + "num_input_tokens_seen": 1836672, + "step": 9630 + }, + { + "epoch": 5.007796257796258, + "grad_norm": 0.8305359482765198, + "learning_rate": 2.9278655700435876e-05, + "loss": 0.2996, + "num_input_tokens_seen": 1837600, + "step": 9635 + }, + { + "epoch": 5.01039501039501, + "grad_norm": 0.5652069449424744, + "learning_rate": 2.925631015363868e-05, + "loss": 0.2422, + "num_input_tokens_seen": 1838592, + "step": 9640 + }, + { + "epoch": 5.012993762993763, + "grad_norm": 0.5401731133460999, + "learning_rate": 2.9233961104343954e-05, + "loss": 0.2404, + "num_input_tokens_seen": 1839584, + "step": 9645 + }, + { + "epoch": 5.015592515592515, + "grad_norm": 0.557310163974762, + "learning_rate": 2.9211608570942638e-05, + "loss": 0.2307, + "num_input_tokens_seen": 1840544, + "step": 9650 + }, + { + "epoch": 5.018191268191268, + "grad_norm": 0.06949073076248169, + "learning_rate": 2.918925257182851e-05, + "loss": 0.1914, + "num_input_tokens_seen": 1841440, + "step": 9655 + }, + { + "epoch": 5.020790020790021, + "grad_norm": 0.6955159902572632, + "learning_rate": 2.9166893125398225e-05, + "loss": 0.3615, + "num_input_tokens_seen": 1842368, + "step": 9660 + }, + { + "epoch": 5.023388773388773, + "grad_norm": 0.5905665159225464, + "learning_rate": 2.9144530250051265e-05, + "loss": 0.2351, + "num_input_tokens_seen": 1843360, + "step": 9665 + }, + { + "epoch": 5.025987525987526, + "grad_norm": 0.5384669899940491, + "learning_rate": 2.9122163964189946e-05, + "loss": 0.3172, + "num_input_tokens_seen": 1844256, + "step": 9670 + }, + { + "epoch": 5.028586278586278, + "grad_norm": 0.23534703254699707, + "learning_rate": 2.909979428621935e-05, + "loss": 0.2542, + "num_input_tokens_seen": 1845216, + "step": 9675 + }, + { + "epoch": 5.031185031185031, + "grad_norm": 0.34158003330230713, + "learning_rate": 2.9077421234547402e-05, + "loss": 0.3072, + "num_input_tokens_seen": 1846208, + "step": 9680 + }, + { + "epoch": 5.033783783783784, + "grad_norm": 0.7231248021125793, + "learning_rate": 2.905504482758479e-05, + "loss": 0.2701, + "num_input_tokens_seen": 1847136, + "step": 9685 + }, + { + "epoch": 5.036382536382536, + "grad_norm": 0.3831927180290222, + "learning_rate": 2.9032665083744926e-05, + "loss": 0.2629, + "num_input_tokens_seen": 1848160, + "step": 9690 + }, + { + "epoch": 5.038981288981289, + "grad_norm": 0.32751280069351196, + "learning_rate": 2.9010282021444008e-05, + "loss": 0.3024, + "num_input_tokens_seen": 1849152, + "step": 9695 + }, + { + "epoch": 5.041580041580041, + "grad_norm": 0.2682577073574066, + "learning_rate": 2.898789565910096e-05, + "loss": 0.2959, + "num_input_tokens_seen": 1850144, + "step": 9700 + }, + { + "epoch": 5.044178794178794, + "grad_norm": 0.14653058350086212, + "learning_rate": 2.89655060151374e-05, + "loss": 0.2384, + "num_input_tokens_seen": 1851136, + "step": 9705 + }, + { + "epoch": 5.046777546777546, + "grad_norm": 0.8391284942626953, + "learning_rate": 2.894311310797767e-05, + "loss": 0.3137, + "num_input_tokens_seen": 1852288, + "step": 9710 + }, + { + "epoch": 5.049376299376299, + "grad_norm": 0.4079807996749878, + "learning_rate": 2.892071695604878e-05, + "loss": 0.3068, + "num_input_tokens_seen": 1853216, + "step": 9715 + }, + { + "epoch": 5.051975051975052, + "grad_norm": 0.2509877681732178, + "learning_rate": 2.8898317577780425e-05, + "loss": 0.2541, + "num_input_tokens_seen": 1854176, + "step": 9720 + }, + { + "epoch": 5.0545738045738045, + "grad_norm": 0.3231804668903351, + "learning_rate": 2.8875914991604948e-05, + "loss": 0.2678, + "num_input_tokens_seen": 1855168, + "step": 9725 + }, + { + "epoch": 5.057172557172557, + "grad_norm": 0.28328898549079895, + "learning_rate": 2.8853509215957323e-05, + "loss": 0.2862, + "num_input_tokens_seen": 1856128, + "step": 9730 + }, + { + "epoch": 5.0597713097713095, + "grad_norm": 0.5293604731559753, + "learning_rate": 2.8831100269275168e-05, + "loss": 0.2583, + "num_input_tokens_seen": 1857056, + "step": 9735 + }, + { + "epoch": 5.0623700623700625, + "grad_norm": 0.6642389893531799, + "learning_rate": 2.8808688169998694e-05, + "loss": 0.2554, + "num_input_tokens_seen": 1858048, + "step": 9740 + }, + { + "epoch": 5.064968814968815, + "grad_norm": 0.47702786326408386, + "learning_rate": 2.878627293657071e-05, + "loss": 0.2607, + "num_input_tokens_seen": 1858976, + "step": 9745 + }, + { + "epoch": 5.0675675675675675, + "grad_norm": 0.5394213795661926, + "learning_rate": 2.8763854587436605e-05, + "loss": 0.2652, + "num_input_tokens_seen": 1859936, + "step": 9750 + }, + { + "epoch": 5.0701663201663205, + "grad_norm": 0.8154252171516418, + "learning_rate": 2.8741433141044334e-05, + "loss": 0.2411, + "num_input_tokens_seen": 1860864, + "step": 9755 + }, + { + "epoch": 5.072765072765073, + "grad_norm": 0.2124316543340683, + "learning_rate": 2.87190086158444e-05, + "loss": 0.3284, + "num_input_tokens_seen": 1861888, + "step": 9760 + }, + { + "epoch": 5.075363825363826, + "grad_norm": 0.34593498706817627, + "learning_rate": 2.8696581030289838e-05, + "loss": 0.2424, + "num_input_tokens_seen": 1862816, + "step": 9765 + }, + { + "epoch": 5.077962577962578, + "grad_norm": 0.297306627035141, + "learning_rate": 2.8674150402836202e-05, + "loss": 0.2728, + "num_input_tokens_seen": 1863808, + "step": 9770 + }, + { + "epoch": 5.080561330561331, + "grad_norm": 0.5679472088813782, + "learning_rate": 2.8651716751941555e-05, + "loss": 0.2317, + "num_input_tokens_seen": 1864768, + "step": 9775 + }, + { + "epoch": 5.083160083160083, + "grad_norm": 0.728058934211731, + "learning_rate": 2.862928009606643e-05, + "loss": 0.291, + "num_input_tokens_seen": 1865760, + "step": 9780 + }, + { + "epoch": 5.085758835758836, + "grad_norm": 0.4725840389728546, + "learning_rate": 2.8606840453673867e-05, + "loss": 0.2889, + "num_input_tokens_seen": 1866752, + "step": 9785 + }, + { + "epoch": 5.088357588357589, + "grad_norm": 0.20534370839595795, + "learning_rate": 2.8584397843229317e-05, + "loss": 0.2625, + "num_input_tokens_seen": 1867616, + "step": 9790 + }, + { + "epoch": 5.090956340956341, + "grad_norm": 0.2789906859397888, + "learning_rate": 2.856195228320071e-05, + "loss": 0.2552, + "num_input_tokens_seen": 1868544, + "step": 9795 + }, + { + "epoch": 5.093555093555094, + "grad_norm": 0.3980441391468048, + "learning_rate": 2.8539503792058393e-05, + "loss": 0.2714, + "num_input_tokens_seen": 1869472, + "step": 9800 + }, + { + "epoch": 5.096153846153846, + "grad_norm": 0.42348405718803406, + "learning_rate": 2.8517052388275116e-05, + "loss": 0.2476, + "num_input_tokens_seen": 1870432, + "step": 9805 + }, + { + "epoch": 5.098752598752599, + "grad_norm": 0.3811087906360626, + "learning_rate": 2.8494598090326043e-05, + "loss": 0.2278, + "num_input_tokens_seen": 1871360, + "step": 9810 + }, + { + "epoch": 5.101351351351352, + "grad_norm": 0.40889716148376465, + "learning_rate": 2.8472140916688706e-05, + "loss": 0.2181, + "num_input_tokens_seen": 1872256, + "step": 9815 + }, + { + "epoch": 5.103950103950104, + "grad_norm": 0.5209667682647705, + "learning_rate": 2.8449680885843e-05, + "loss": 0.2877, + "num_input_tokens_seen": 1873184, + "step": 9820 + }, + { + "epoch": 5.106548856548857, + "grad_norm": 0.5652549266815186, + "learning_rate": 2.8427218016271185e-05, + "loss": 0.2416, + "num_input_tokens_seen": 1874208, + "step": 9825 + }, + { + "epoch": 5.109147609147609, + "grad_norm": 0.4178586006164551, + "learning_rate": 2.8404752326457856e-05, + "loss": 0.253, + "num_input_tokens_seen": 1875200, + "step": 9830 + }, + { + "epoch": 5.111746361746362, + "grad_norm": 0.4654427766799927, + "learning_rate": 2.8382283834889904e-05, + "loss": 0.2559, + "num_input_tokens_seen": 1876128, + "step": 9835 + }, + { + "epoch": 5.114345114345114, + "grad_norm": 0.26376214623451233, + "learning_rate": 2.8359812560056564e-05, + "loss": 0.2913, + "num_input_tokens_seen": 1877088, + "step": 9840 + }, + { + "epoch": 5.116943866943867, + "grad_norm": 0.4621710479259491, + "learning_rate": 2.8337338520449336e-05, + "loss": 0.2276, + "num_input_tokens_seen": 1878112, + "step": 9845 + }, + { + "epoch": 5.11954261954262, + "grad_norm": 0.7371581196784973, + "learning_rate": 2.8314861734561997e-05, + "loss": 0.2834, + "num_input_tokens_seen": 1879104, + "step": 9850 + }, + { + "epoch": 5.122141372141372, + "grad_norm": 0.6011269688606262, + "learning_rate": 2.829238222089059e-05, + "loss": 0.261, + "num_input_tokens_seen": 1880096, + "step": 9855 + }, + { + "epoch": 5.124740124740125, + "grad_norm": 0.4767458438873291, + "learning_rate": 2.82698999979334e-05, + "loss": 0.2553, + "num_input_tokens_seen": 1881056, + "step": 9860 + }, + { + "epoch": 5.127338877338877, + "grad_norm": 0.4321274161338806, + "learning_rate": 2.8247415084190953e-05, + "loss": 0.2777, + "num_input_tokens_seen": 1881984, + "step": 9865 + }, + { + "epoch": 5.12993762993763, + "grad_norm": 0.9224740266799927, + "learning_rate": 2.8224927498165964e-05, + "loss": 0.2923, + "num_input_tokens_seen": 1882944, + "step": 9870 + }, + { + "epoch": 5.132536382536382, + "grad_norm": 0.42835426330566406, + "learning_rate": 2.820243725836337e-05, + "loss": 0.2228, + "num_input_tokens_seen": 1883904, + "step": 9875 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 0.3641653060913086, + "learning_rate": 2.8179944383290274e-05, + "loss": 0.2977, + "num_input_tokens_seen": 1884864, + "step": 9880 + }, + { + "epoch": 5.137733887733888, + "grad_norm": 0.23097369074821472, + "learning_rate": 2.8157448891455963e-05, + "loss": 0.2868, + "num_input_tokens_seen": 1885824, + "step": 9885 + }, + { + "epoch": 5.14033264033264, + "grad_norm": 0.1933079957962036, + "learning_rate": 2.813495080137186e-05, + "loss": 0.2684, + "num_input_tokens_seen": 1886784, + "step": 9890 + }, + { + "epoch": 5.142931392931393, + "grad_norm": 0.3684043884277344, + "learning_rate": 2.8112450131551564e-05, + "loss": 0.2408, + "num_input_tokens_seen": 1887744, + "step": 9895 + }, + { + "epoch": 5.145530145530145, + "grad_norm": 0.2150520384311676, + "learning_rate": 2.808994690051075e-05, + "loss": 0.2565, + "num_input_tokens_seen": 1888704, + "step": 9900 + }, + { + "epoch": 5.148128898128898, + "grad_norm": 0.8467469215393066, + "learning_rate": 2.806744112676722e-05, + "loss": 0.3224, + "num_input_tokens_seen": 1889792, + "step": 9905 + }, + { + "epoch": 5.150727650727651, + "grad_norm": 0.34300005435943604, + "learning_rate": 2.804493282884087e-05, + "loss": 0.2341, + "num_input_tokens_seen": 1890784, + "step": 9910 + }, + { + "epoch": 5.153326403326403, + "grad_norm": 0.29641368985176086, + "learning_rate": 2.8022422025253682e-05, + "loss": 0.2879, + "num_input_tokens_seen": 1891712, + "step": 9915 + }, + { + "epoch": 5.155925155925156, + "grad_norm": 0.14234159886837006, + "learning_rate": 2.7999908734529673e-05, + "loss": 0.262, + "num_input_tokens_seen": 1892608, + "step": 9920 + }, + { + "epoch": 5.158523908523908, + "grad_norm": 0.34189608693122864, + "learning_rate": 2.7977392975194937e-05, + "loss": 0.2949, + "num_input_tokens_seen": 1893504, + "step": 9925 + }, + { + "epoch": 5.161122661122661, + "grad_norm": 0.3301973342895508, + "learning_rate": 2.7954874765777583e-05, + "loss": 0.2472, + "num_input_tokens_seen": 1894496, + "step": 9930 + }, + { + "epoch": 5.163721413721413, + "grad_norm": 0.8090334534645081, + "learning_rate": 2.793235412480774e-05, + "loss": 0.2926, + "num_input_tokens_seen": 1895456, + "step": 9935 + }, + { + "epoch": 5.166320166320166, + "grad_norm": 0.27867990732192993, + "learning_rate": 2.790983107081753e-05, + "loss": 0.2454, + "num_input_tokens_seen": 1896448, + "step": 9940 + }, + { + "epoch": 5.168918918918919, + "grad_norm": 0.16127963364124298, + "learning_rate": 2.7887305622341087e-05, + "loss": 0.2387, + "num_input_tokens_seen": 1897408, + "step": 9945 + }, + { + "epoch": 5.171517671517671, + "grad_norm": 0.5437464118003845, + "learning_rate": 2.786477779791447e-05, + "loss": 0.34, + "num_input_tokens_seen": 1898400, + "step": 9950 + }, + { + "epoch": 5.174116424116424, + "grad_norm": 0.451581209897995, + "learning_rate": 2.7842247616075734e-05, + "loss": 0.3169, + "num_input_tokens_seen": 1899424, + "step": 9955 + }, + { + "epoch": 5.1767151767151764, + "grad_norm": 0.40154677629470825, + "learning_rate": 2.7819715095364863e-05, + "loss": 0.2669, + "num_input_tokens_seen": 1900416, + "step": 9960 + }, + { + "epoch": 5.179313929313929, + "grad_norm": 0.7054794430732727, + "learning_rate": 2.779718025432375e-05, + "loss": 0.2977, + "num_input_tokens_seen": 1901344, + "step": 9965 + }, + { + "epoch": 5.1819126819126815, + "grad_norm": 0.332154780626297, + "learning_rate": 2.777464311149622e-05, + "loss": 0.2329, + "num_input_tokens_seen": 1902304, + "step": 9970 + }, + { + "epoch": 5.1845114345114345, + "grad_norm": 0.8069952130317688, + "learning_rate": 2.775210368542797e-05, + "loss": 0.2803, + "num_input_tokens_seen": 1903264, + "step": 9975 + }, + { + "epoch": 5.1871101871101875, + "grad_norm": 0.6512637138366699, + "learning_rate": 2.77295619946666e-05, + "loss": 0.2436, + "num_input_tokens_seen": 1904192, + "step": 9980 + }, + { + "epoch": 5.1897089397089395, + "grad_norm": 0.6060284376144409, + "learning_rate": 2.770701805776155e-05, + "loss": 0.2623, + "num_input_tokens_seen": 1905152, + "step": 9985 + }, + { + "epoch": 5.1923076923076925, + "grad_norm": 0.29258984327316284, + "learning_rate": 2.7684471893264124e-05, + "loss": 0.2412, + "num_input_tokens_seen": 1906016, + "step": 9990 + }, + { + "epoch": 5.194906444906445, + "grad_norm": 0.3100377321243286, + "learning_rate": 2.7661923519727463e-05, + "loss": 0.2522, + "num_input_tokens_seen": 1906976, + "step": 9995 + }, + { + "epoch": 5.197505197505198, + "grad_norm": 0.48051759600639343, + "learning_rate": 2.76393729557065e-05, + "loss": 0.2469, + "num_input_tokens_seen": 1907808, + "step": 10000 + }, + { + "epoch": 5.20010395010395, + "grad_norm": 0.48483848571777344, + "learning_rate": 2.7616820219757993e-05, + "loss": 0.2291, + "num_input_tokens_seen": 1908736, + "step": 10005 + }, + { + "epoch": 5.202702702702703, + "grad_norm": 0.34169793128967285, + "learning_rate": 2.7594265330440494e-05, + "loss": 0.2668, + "num_input_tokens_seen": 1909696, + "step": 10010 + }, + { + "epoch": 5.205301455301456, + "grad_norm": 0.4183018207550049, + "learning_rate": 2.7571708306314298e-05, + "loss": 0.3187, + "num_input_tokens_seen": 1910656, + "step": 10015 + }, + { + "epoch": 5.207900207900208, + "grad_norm": 0.25197669863700867, + "learning_rate": 2.754914916594148e-05, + "loss": 0.2592, + "num_input_tokens_seen": 1911584, + "step": 10020 + }, + { + "epoch": 5.210498960498961, + "grad_norm": 0.8913759589195251, + "learning_rate": 2.7526587927885857e-05, + "loss": 0.3741, + "num_input_tokens_seen": 1912512, + "step": 10025 + }, + { + "epoch": 5.213097713097713, + "grad_norm": 0.3800581395626068, + "learning_rate": 2.7504024610712963e-05, + "loss": 0.2522, + "num_input_tokens_seen": 1913440, + "step": 10030 + }, + { + "epoch": 5.215696465696466, + "grad_norm": 0.3668855130672455, + "learning_rate": 2.7481459232990038e-05, + "loss": 0.2698, + "num_input_tokens_seen": 1914336, + "step": 10035 + }, + { + "epoch": 5.218295218295219, + "grad_norm": 0.26087266206741333, + "learning_rate": 2.7458891813286024e-05, + "loss": 0.2329, + "num_input_tokens_seen": 1915264, + "step": 10040 + }, + { + "epoch": 5.220893970893971, + "grad_norm": 0.42438259720802307, + "learning_rate": 2.7436322370171562e-05, + "loss": 0.3053, + "num_input_tokens_seen": 1916192, + "step": 10045 + }, + { + "epoch": 5.223492723492724, + "grad_norm": 0.40331465005874634, + "learning_rate": 2.7413750922218917e-05, + "loss": 0.2908, + "num_input_tokens_seen": 1917184, + "step": 10050 + }, + { + "epoch": 5.226091476091476, + "grad_norm": 0.22147265076637268, + "learning_rate": 2.739117748800204e-05, + "loss": 0.2728, + "num_input_tokens_seen": 1918112, + "step": 10055 + }, + { + "epoch": 5.228690228690229, + "grad_norm": 0.397192120552063, + "learning_rate": 2.7368602086096494e-05, + "loss": 0.271, + "num_input_tokens_seen": 1919072, + "step": 10060 + }, + { + "epoch": 5.231288981288981, + "grad_norm": 0.619055449962616, + "learning_rate": 2.7346024735079486e-05, + "loss": 0.232, + "num_input_tokens_seen": 1919936, + "step": 10065 + }, + { + "epoch": 5.233887733887734, + "grad_norm": 0.3230665624141693, + "learning_rate": 2.7323445453529795e-05, + "loss": 0.2887, + "num_input_tokens_seen": 1920928, + "step": 10070 + }, + { + "epoch": 5.236486486486487, + "grad_norm": 0.529472827911377, + "learning_rate": 2.730086426002782e-05, + "loss": 0.2639, + "num_input_tokens_seen": 1921824, + "step": 10075 + }, + { + "epoch": 5.239085239085239, + "grad_norm": 0.26376795768737793, + "learning_rate": 2.7278281173155507e-05, + "loss": 0.3054, + "num_input_tokens_seen": 1922720, + "step": 10080 + }, + { + "epoch": 5.241683991683992, + "grad_norm": 0.38713744282722473, + "learning_rate": 2.7255696211496375e-05, + "loss": 0.268, + "num_input_tokens_seen": 1923680, + "step": 10085 + }, + { + "epoch": 5.244282744282744, + "grad_norm": 0.44681787490844727, + "learning_rate": 2.7233109393635482e-05, + "loss": 0.2505, + "num_input_tokens_seen": 1924640, + "step": 10090 + }, + { + "epoch": 5.246881496881497, + "grad_norm": 0.35928434133529663, + "learning_rate": 2.7210520738159423e-05, + "loss": 0.2622, + "num_input_tokens_seen": 1925568, + "step": 10095 + }, + { + "epoch": 5.24948024948025, + "grad_norm": 0.1799210160970688, + "learning_rate": 2.718793026365628e-05, + "loss": 0.2592, + "num_input_tokens_seen": 1926496, + "step": 10100 + }, + { + "epoch": 5.252079002079002, + "grad_norm": 0.38611674308776855, + "learning_rate": 2.716533798871565e-05, + "loss": 0.2621, + "num_input_tokens_seen": 1927488, + "step": 10105 + }, + { + "epoch": 5.254677754677755, + "grad_norm": 0.33028680086135864, + "learning_rate": 2.7142743931928628e-05, + "loss": 0.2345, + "num_input_tokens_seen": 1928480, + "step": 10110 + }, + { + "epoch": 5.257276507276507, + "grad_norm": 0.2383003681898117, + "learning_rate": 2.7120148111887732e-05, + "loss": 0.2409, + "num_input_tokens_seen": 1929344, + "step": 10115 + }, + { + "epoch": 5.25987525987526, + "grad_norm": 0.4700615108013153, + "learning_rate": 2.7097550547186973e-05, + "loss": 0.2025, + "num_input_tokens_seen": 1930304, + "step": 10120 + }, + { + "epoch": 5.262474012474012, + "grad_norm": 0.579394519329071, + "learning_rate": 2.7074951256421776e-05, + "loss": 0.27, + "num_input_tokens_seen": 1931200, + "step": 10125 + }, + { + "epoch": 5.265072765072765, + "grad_norm": 0.5808722972869873, + "learning_rate": 2.7052350258188987e-05, + "loss": 0.293, + "num_input_tokens_seen": 1932128, + "step": 10130 + }, + { + "epoch": 5.267671517671518, + "grad_norm": 0.5339131951332092, + "learning_rate": 2.7029747571086857e-05, + "loss": 0.2264, + "num_input_tokens_seen": 1933088, + "step": 10135 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 0.3658030033111572, + "learning_rate": 2.700714321371504e-05, + "loss": 0.2589, + "num_input_tokens_seen": 1934080, + "step": 10140 + }, + { + "epoch": 5.272869022869023, + "grad_norm": 0.47355735301971436, + "learning_rate": 2.6984537204674548e-05, + "loss": 0.3113, + "num_input_tokens_seen": 1934976, + "step": 10145 + }, + { + "epoch": 5.275467775467775, + "grad_norm": 0.17446744441986084, + "learning_rate": 2.6961929562567767e-05, + "loss": 0.2164, + "num_input_tokens_seen": 1935968, + "step": 10150 + }, + { + "epoch": 5.278066528066528, + "grad_norm": 0.40762999653816223, + "learning_rate": 2.693932030599841e-05, + "loss": 0.3295, + "num_input_tokens_seen": 1936992, + "step": 10155 + }, + { + "epoch": 5.28066528066528, + "grad_norm": 0.6897422075271606, + "learning_rate": 2.691670945357154e-05, + "loss": 0.2889, + "num_input_tokens_seen": 1937920, + "step": 10160 + }, + { + "epoch": 5.283264033264033, + "grad_norm": 0.8089295625686646, + "learning_rate": 2.6894097023893504e-05, + "loss": 0.2694, + "num_input_tokens_seen": 1938880, + "step": 10165 + }, + { + "epoch": 5.285862785862786, + "grad_norm": 0.25785356760025024, + "learning_rate": 2.6871483035571977e-05, + "loss": 0.2663, + "num_input_tokens_seen": 1939808, + "step": 10170 + }, + { + "epoch": 5.288461538461538, + "grad_norm": 0.24975447356700897, + "learning_rate": 2.68488675072159e-05, + "loss": 0.2785, + "num_input_tokens_seen": 1940768, + "step": 10175 + }, + { + "epoch": 5.291060291060291, + "grad_norm": 0.3516393005847931, + "learning_rate": 2.6826250457435475e-05, + "loss": 0.2763, + "num_input_tokens_seen": 1941728, + "step": 10180 + }, + { + "epoch": 5.293659043659043, + "grad_norm": 0.8078452944755554, + "learning_rate": 2.6803631904842174e-05, + "loss": 0.3047, + "num_input_tokens_seen": 1942688, + "step": 10185 + }, + { + "epoch": 5.296257796257796, + "grad_norm": 0.37490248680114746, + "learning_rate": 2.67810118680487e-05, + "loss": 0.2788, + "num_input_tokens_seen": 1943680, + "step": 10190 + }, + { + "epoch": 5.298856548856548, + "grad_norm": 0.4332335293292999, + "learning_rate": 2.675839036566897e-05, + "loss": 0.2869, + "num_input_tokens_seen": 1944672, + "step": 10195 + }, + { + "epoch": 5.301455301455301, + "grad_norm": 0.7661706805229187, + "learning_rate": 2.673576741631811e-05, + "loss": 0.2625, + "num_input_tokens_seen": 1945696, + "step": 10200 + }, + { + "epoch": 5.304054054054054, + "grad_norm": 0.45960792899131775, + "learning_rate": 2.671314303861244e-05, + "loss": 0.2982, + "num_input_tokens_seen": 1946688, + "step": 10205 + }, + { + "epoch": 5.3066528066528065, + "grad_norm": 0.6291543245315552, + "learning_rate": 2.6690517251169455e-05, + "loss": 0.2646, + "num_input_tokens_seen": 1947744, + "step": 10210 + }, + { + "epoch": 5.3092515592515594, + "grad_norm": 0.47896963357925415, + "learning_rate": 2.6667890072607805e-05, + "loss": 0.2963, + "num_input_tokens_seen": 1948736, + "step": 10215 + }, + { + "epoch": 5.3118503118503115, + "grad_norm": 0.22665895521640778, + "learning_rate": 2.6645261521547294e-05, + "loss": 0.2686, + "num_input_tokens_seen": 1949696, + "step": 10220 + }, + { + "epoch": 5.3144490644490645, + "grad_norm": 0.4136798679828644, + "learning_rate": 2.6622631616608845e-05, + "loss": 0.2461, + "num_input_tokens_seen": 1950624, + "step": 10225 + }, + { + "epoch": 5.317047817047817, + "grad_norm": 0.2290763556957245, + "learning_rate": 2.6600000376414496e-05, + "loss": 0.23, + "num_input_tokens_seen": 1951520, + "step": 10230 + }, + { + "epoch": 5.31964656964657, + "grad_norm": 0.6676218509674072, + "learning_rate": 2.65773678195874e-05, + "loss": 0.2043, + "num_input_tokens_seen": 1952384, + "step": 10235 + }, + { + "epoch": 5.3222453222453225, + "grad_norm": 0.6911576986312866, + "learning_rate": 2.6554733964751776e-05, + "loss": 0.2591, + "num_input_tokens_seen": 1953376, + "step": 10240 + }, + { + "epoch": 5.324844074844075, + "grad_norm": 0.5326479077339172, + "learning_rate": 2.653209883053291e-05, + "loss": 0.313, + "num_input_tokens_seen": 1954272, + "step": 10245 + }, + { + "epoch": 5.327442827442828, + "grad_norm": 0.50472092628479, + "learning_rate": 2.6509462435557152e-05, + "loss": 0.3347, + "num_input_tokens_seen": 1955264, + "step": 10250 + }, + { + "epoch": 5.33004158004158, + "grad_norm": 0.575984537601471, + "learning_rate": 2.6486824798451892e-05, + "loss": 0.297, + "num_input_tokens_seen": 1956224, + "step": 10255 + }, + { + "epoch": 5.332640332640333, + "grad_norm": 0.7674715518951416, + "learning_rate": 2.646418593784552e-05, + "loss": 0.2259, + "num_input_tokens_seen": 1957120, + "step": 10260 + }, + { + "epoch": 5.335239085239086, + "grad_norm": 0.43594369292259216, + "learning_rate": 2.6441545872367453e-05, + "loss": 0.2788, + "num_input_tokens_seen": 1958112, + "step": 10265 + }, + { + "epoch": 5.337837837837838, + "grad_norm": 0.5888736248016357, + "learning_rate": 2.6418904620648094e-05, + "loss": 0.2874, + "num_input_tokens_seen": 1959072, + "step": 10270 + }, + { + "epoch": 5.340436590436591, + "grad_norm": 0.47382020950317383, + "learning_rate": 2.6396262201318823e-05, + "loss": 0.2575, + "num_input_tokens_seen": 1960000, + "step": 10275 + }, + { + "epoch": 5.343035343035343, + "grad_norm": 0.7016987800598145, + "learning_rate": 2.637361863301198e-05, + "loss": 0.2702, + "num_input_tokens_seen": 1960992, + "step": 10280 + }, + { + "epoch": 5.345634095634096, + "grad_norm": 0.6882029175758362, + "learning_rate": 2.6350973934360857e-05, + "loss": 0.2477, + "num_input_tokens_seen": 1961792, + "step": 10285 + }, + { + "epoch": 5.348232848232848, + "grad_norm": 0.6828579306602478, + "learning_rate": 2.6328328123999664e-05, + "loss": 0.2631, + "num_input_tokens_seen": 1962720, + "step": 10290 + }, + { + "epoch": 5.350831600831601, + "grad_norm": 0.31319189071655273, + "learning_rate": 2.6305681220563526e-05, + "loss": 0.239, + "num_input_tokens_seen": 1963648, + "step": 10295 + }, + { + "epoch": 5.353430353430354, + "grad_norm": 0.5755786299705505, + "learning_rate": 2.6283033242688478e-05, + "loss": 0.2437, + "num_input_tokens_seen": 1964576, + "step": 10300 + }, + { + "epoch": 5.356029106029106, + "grad_norm": 0.24511100351810455, + "learning_rate": 2.626038420901144e-05, + "loss": 0.3287, + "num_input_tokens_seen": 1965472, + "step": 10305 + }, + { + "epoch": 5.358627858627859, + "grad_norm": 0.14392057061195374, + "learning_rate": 2.6237734138170177e-05, + "loss": 0.2408, + "num_input_tokens_seen": 1966368, + "step": 10310 + }, + { + "epoch": 5.361226611226611, + "grad_norm": 0.46924716234207153, + "learning_rate": 2.6215083048803348e-05, + "loss": 0.1988, + "num_input_tokens_seen": 1967264, + "step": 10315 + }, + { + "epoch": 5.363825363825364, + "grad_norm": 0.3217489421367645, + "learning_rate": 2.6192430959550407e-05, + "loss": 0.2909, + "num_input_tokens_seen": 1968224, + "step": 10320 + }, + { + "epoch": 5.366424116424117, + "grad_norm": 0.2615918815135956, + "learning_rate": 2.616977788905166e-05, + "loss": 0.2295, + "num_input_tokens_seen": 1969184, + "step": 10325 + }, + { + "epoch": 5.369022869022869, + "grad_norm": 0.2853372097015381, + "learning_rate": 2.614712385594822e-05, + "loss": 0.2466, + "num_input_tokens_seen": 1970208, + "step": 10330 + }, + { + "epoch": 5.371621621621622, + "grad_norm": 0.5614522099494934, + "learning_rate": 2.6124468878881968e-05, + "loss": 0.3076, + "num_input_tokens_seen": 1971168, + "step": 10335 + }, + { + "epoch": 5.374220374220374, + "grad_norm": 0.5547161102294922, + "learning_rate": 2.61018129764956e-05, + "loss": 0.2308, + "num_input_tokens_seen": 1972096, + "step": 10340 + }, + { + "epoch": 5.376819126819127, + "grad_norm": 0.15231645107269287, + "learning_rate": 2.6079156167432524e-05, + "loss": 0.2115, + "num_input_tokens_seen": 1973088, + "step": 10345 + }, + { + "epoch": 5.379417879417879, + "grad_norm": 0.6312798857688904, + "learning_rate": 2.6056498470336936e-05, + "loss": 0.2237, + "num_input_tokens_seen": 1974016, + "step": 10350 + }, + { + "epoch": 5.382016632016632, + "grad_norm": 0.4913633167743683, + "learning_rate": 2.6033839903853745e-05, + "loss": 0.2444, + "num_input_tokens_seen": 1974944, + "step": 10355 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 0.467853844165802, + "learning_rate": 2.6011180486628585e-05, + "loss": 0.2168, + "num_input_tokens_seen": 1975872, + "step": 10360 + }, + { + "epoch": 5.387214137214137, + "grad_norm": 0.43751510977745056, + "learning_rate": 2.5988520237307774e-05, + "loss": 0.2651, + "num_input_tokens_seen": 1976736, + "step": 10365 + }, + { + "epoch": 5.38981288981289, + "grad_norm": 0.554132878780365, + "learning_rate": 2.596585917453833e-05, + "loss": 0.2719, + "num_input_tokens_seen": 1977632, + "step": 10370 + }, + { + "epoch": 5.392411642411642, + "grad_norm": 0.23070941865444183, + "learning_rate": 2.5943197316967933e-05, + "loss": 0.2896, + "num_input_tokens_seen": 1978624, + "step": 10375 + }, + { + "epoch": 5.395010395010395, + "grad_norm": 0.8450644612312317, + "learning_rate": 2.5920534683244914e-05, + "loss": 0.286, + "num_input_tokens_seen": 1979584, + "step": 10380 + }, + { + "epoch": 5.397609147609147, + "grad_norm": 0.5293058753013611, + "learning_rate": 2.5897871292018256e-05, + "loss": 0.2119, + "num_input_tokens_seen": 1980480, + "step": 10385 + }, + { + "epoch": 5.4002079002079, + "grad_norm": 0.09686457365751266, + "learning_rate": 2.5875207161937553e-05, + "loss": 0.212, + "num_input_tokens_seen": 1981408, + "step": 10390 + }, + { + "epoch": 5.402806652806653, + "grad_norm": 0.2186676561832428, + "learning_rate": 2.5852542311653005e-05, + "loss": 0.2808, + "num_input_tokens_seen": 1982368, + "step": 10395 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.5161146521568298, + "learning_rate": 2.5829876759815414e-05, + "loss": 0.2293, + "num_input_tokens_seen": 1983296, + "step": 10400 + }, + { + "epoch": 5.408004158004158, + "grad_norm": 0.5759539008140564, + "learning_rate": 2.5807210525076158e-05, + "loss": 0.2474, + "num_input_tokens_seen": 1984160, + "step": 10405 + }, + { + "epoch": 5.41060291060291, + "grad_norm": 0.4438892900943756, + "learning_rate": 2.5784543626087172e-05, + "loss": 0.2474, + "num_input_tokens_seen": 1985088, + "step": 10410 + }, + { + "epoch": 5.413201663201663, + "grad_norm": 0.6300112009048462, + "learning_rate": 2.576187608150094e-05, + "loss": 0.29, + "num_input_tokens_seen": 1986048, + "step": 10415 + }, + { + "epoch": 5.415800415800415, + "grad_norm": 0.6211110949516296, + "learning_rate": 2.5739207909970485e-05, + "loss": 0.3351, + "num_input_tokens_seen": 1986944, + "step": 10420 + }, + { + "epoch": 5.418399168399168, + "grad_norm": 0.6868324875831604, + "learning_rate": 2.5716539130149326e-05, + "loss": 0.2374, + "num_input_tokens_seen": 1987904, + "step": 10425 + }, + { + "epoch": 5.420997920997921, + "grad_norm": 0.46908456087112427, + "learning_rate": 2.56938697606915e-05, + "loss": 0.2795, + "num_input_tokens_seen": 1988928, + "step": 10430 + }, + { + "epoch": 5.423596673596673, + "grad_norm": 0.4523158669471741, + "learning_rate": 2.5671199820251534e-05, + "loss": 0.2941, + "num_input_tokens_seen": 1989888, + "step": 10435 + }, + { + "epoch": 5.426195426195426, + "grad_norm": 0.6162075400352478, + "learning_rate": 2.56485293274844e-05, + "loss": 0.2606, + "num_input_tokens_seen": 1990816, + "step": 10440 + }, + { + "epoch": 5.4287941787941785, + "grad_norm": 0.22353142499923706, + "learning_rate": 2.5625858301045535e-05, + "loss": 0.2229, + "num_input_tokens_seen": 1991776, + "step": 10445 + }, + { + "epoch": 5.4313929313929314, + "grad_norm": 0.6252297759056091, + "learning_rate": 2.5603186759590837e-05, + "loss": 0.285, + "num_input_tokens_seen": 1992768, + "step": 10450 + }, + { + "epoch": 5.4339916839916835, + "grad_norm": 0.7953232526779175, + "learning_rate": 2.558051472177661e-05, + "loss": 0.2917, + "num_input_tokens_seen": 1993728, + "step": 10455 + }, + { + "epoch": 5.4365904365904365, + "grad_norm": 0.37410908937454224, + "learning_rate": 2.5557842206259552e-05, + "loss": 0.2866, + "num_input_tokens_seen": 1994688, + "step": 10460 + }, + { + "epoch": 5.4391891891891895, + "grad_norm": 0.42629292607307434, + "learning_rate": 2.5535169231696777e-05, + "loss": 0.2321, + "num_input_tokens_seen": 1995648, + "step": 10465 + }, + { + "epoch": 5.441787941787942, + "grad_norm": 0.547283947467804, + "learning_rate": 2.5512495816745773e-05, + "loss": 0.2944, + "num_input_tokens_seen": 1996608, + "step": 10470 + }, + { + "epoch": 5.4443866943866945, + "grad_norm": 0.678135097026825, + "learning_rate": 2.5489821980064383e-05, + "loss": 0.2582, + "num_input_tokens_seen": 1997632, + "step": 10475 + }, + { + "epoch": 5.446985446985447, + "grad_norm": 0.17865225672721863, + "learning_rate": 2.546714774031079e-05, + "loss": 0.2518, + "num_input_tokens_seen": 1998560, + "step": 10480 + }, + { + "epoch": 5.4495841995842, + "grad_norm": 0.5331645011901855, + "learning_rate": 2.5444473116143534e-05, + "loss": 0.2281, + "num_input_tokens_seen": 1999520, + "step": 10485 + }, + { + "epoch": 5.452182952182953, + "grad_norm": 0.29969048500061035, + "learning_rate": 2.5421798126221447e-05, + "loss": 0.3003, + "num_input_tokens_seen": 2000448, + "step": 10490 + }, + { + "epoch": 5.454781704781705, + "grad_norm": 0.2715916335582733, + "learning_rate": 2.5399122789203672e-05, + "loss": 0.2203, + "num_input_tokens_seen": 2001408, + "step": 10495 + }, + { + "epoch": 5.457380457380458, + "grad_norm": 0.1415930539369583, + "learning_rate": 2.537644712374965e-05, + "loss": 0.2882, + "num_input_tokens_seen": 2002368, + "step": 10500 + }, + { + "epoch": 5.45997920997921, + "grad_norm": 0.49190598726272583, + "learning_rate": 2.5353771148519057e-05, + "loss": 0.2698, + "num_input_tokens_seen": 2003360, + "step": 10505 + }, + { + "epoch": 5.462577962577963, + "grad_norm": 0.7090172171592712, + "learning_rate": 2.5331094882171857e-05, + "loss": 0.213, + "num_input_tokens_seen": 2004320, + "step": 10510 + }, + { + "epoch": 5.465176715176715, + "grad_norm": 0.4820115864276886, + "learning_rate": 2.5308418343368247e-05, + "loss": 0.2728, + "num_input_tokens_seen": 2005280, + "step": 10515 + }, + { + "epoch": 5.467775467775468, + "grad_norm": 0.8879924416542053, + "learning_rate": 2.528574155076864e-05, + "loss": 0.308, + "num_input_tokens_seen": 2006176, + "step": 10520 + }, + { + "epoch": 5.470374220374221, + "grad_norm": 0.18939773738384247, + "learning_rate": 2.5263064523033653e-05, + "loss": 0.2203, + "num_input_tokens_seen": 2007104, + "step": 10525 + }, + { + "epoch": 5.472972972972973, + "grad_norm": 0.2525785565376282, + "learning_rate": 2.524038727882411e-05, + "loss": 0.2558, + "num_input_tokens_seen": 2008064, + "step": 10530 + }, + { + "epoch": 5.475571725571726, + "grad_norm": 0.4734834134578705, + "learning_rate": 2.521770983680102e-05, + "loss": 0.2353, + "num_input_tokens_seen": 2009088, + "step": 10535 + }, + { + "epoch": 5.478170478170478, + "grad_norm": 0.5846004486083984, + "learning_rate": 2.5195032215625524e-05, + "loss": 0.2521, + "num_input_tokens_seen": 2010016, + "step": 10540 + }, + { + "epoch": 5.480769230769231, + "grad_norm": 0.4790395498275757, + "learning_rate": 2.5172354433958944e-05, + "loss": 0.2492, + "num_input_tokens_seen": 2010976, + "step": 10545 + }, + { + "epoch": 5.483367983367984, + "grad_norm": 0.5607297420501709, + "learning_rate": 2.5149676510462717e-05, + "loss": 0.2782, + "num_input_tokens_seen": 2011840, + "step": 10550 + }, + { + "epoch": 5.485966735966736, + "grad_norm": 0.32085302472114563, + "learning_rate": 2.5126998463798396e-05, + "loss": 0.2249, + "num_input_tokens_seen": 2012832, + "step": 10555 + }, + { + "epoch": 5.488565488565489, + "grad_norm": 0.5531225800514221, + "learning_rate": 2.5104320312627634e-05, + "loss": 0.3043, + "num_input_tokens_seen": 2013760, + "step": 10560 + }, + { + "epoch": 5.491164241164241, + "grad_norm": 0.48570093512535095, + "learning_rate": 2.5081642075612177e-05, + "loss": 0.2856, + "num_input_tokens_seen": 2014688, + "step": 10565 + }, + { + "epoch": 5.493762993762994, + "grad_norm": 0.31736257672309875, + "learning_rate": 2.5058963771413844e-05, + "loss": 0.1834, + "num_input_tokens_seen": 2015648, + "step": 10570 + }, + { + "epoch": 5.496361746361746, + "grad_norm": 0.6303454041481018, + "learning_rate": 2.5036285418694507e-05, + "loss": 0.2279, + "num_input_tokens_seen": 2016608, + "step": 10575 + }, + { + "epoch": 5.498960498960499, + "grad_norm": 0.46419382095336914, + "learning_rate": 2.5013607036116065e-05, + "loss": 0.2706, + "num_input_tokens_seen": 2017632, + "step": 10580 + }, + { + "epoch": 5.5, + "eval_loss": 0.2503683567047119, + "eval_runtime": 8.0624, + "eval_samples_per_second": 106.172, + "eval_steps_per_second": 26.543, + "num_input_tokens_seen": 2018016, + "step": 10582 + }, + { + "epoch": 5.501559251559252, + "grad_norm": 0.5265673995018005, + "learning_rate": 2.4990928642340468e-05, + "loss": 0.2473, + "num_input_tokens_seen": 2018592, + "step": 10585 + }, + { + "epoch": 5.504158004158004, + "grad_norm": 0.26973241567611694, + "learning_rate": 2.4968250256029636e-05, + "loss": 0.3093, + "num_input_tokens_seen": 2019616, + "step": 10590 + }, + { + "epoch": 5.506756756756757, + "grad_norm": 0.3416658937931061, + "learning_rate": 2.4945571895845523e-05, + "loss": 0.2639, + "num_input_tokens_seen": 2020576, + "step": 10595 + }, + { + "epoch": 5.509355509355509, + "grad_norm": 0.6309340000152588, + "learning_rate": 2.4922893580450038e-05, + "loss": 0.2499, + "num_input_tokens_seen": 2021600, + "step": 10600 + }, + { + "epoch": 5.511954261954262, + "grad_norm": 0.5381622910499573, + "learning_rate": 2.4900215328505063e-05, + "loss": 0.3168, + "num_input_tokens_seen": 2022528, + "step": 10605 + }, + { + "epoch": 5.514553014553014, + "grad_norm": 0.7273787260055542, + "learning_rate": 2.4877537158672427e-05, + "loss": 0.2861, + "num_input_tokens_seen": 2023456, + "step": 10610 + }, + { + "epoch": 5.517151767151767, + "grad_norm": 0.5074452757835388, + "learning_rate": 2.485485908961388e-05, + "loss": 0.3093, + "num_input_tokens_seen": 2024416, + "step": 10615 + }, + { + "epoch": 5.51975051975052, + "grad_norm": 0.7163800597190857, + "learning_rate": 2.48321811399911e-05, + "loss": 0.2976, + "num_input_tokens_seen": 2025408, + "step": 10620 + }, + { + "epoch": 5.522349272349272, + "grad_norm": 0.35409918427467346, + "learning_rate": 2.480950332846567e-05, + "loss": 0.2665, + "num_input_tokens_seen": 2026464, + "step": 10625 + }, + { + "epoch": 5.524948024948025, + "grad_norm": 0.336329847574234, + "learning_rate": 2.4786825673699052e-05, + "loss": 0.2666, + "num_input_tokens_seen": 2027392, + "step": 10630 + }, + { + "epoch": 5.527546777546777, + "grad_norm": 0.4494556486606598, + "learning_rate": 2.476414819435258e-05, + "loss": 0.2702, + "num_input_tokens_seen": 2028320, + "step": 10635 + }, + { + "epoch": 5.53014553014553, + "grad_norm": 0.3479055166244507, + "learning_rate": 2.4741470909087457e-05, + "loss": 0.2783, + "num_input_tokens_seen": 2029280, + "step": 10640 + }, + { + "epoch": 5.532744282744282, + "grad_norm": 0.2638600170612335, + "learning_rate": 2.471879383656469e-05, + "loss": 0.2432, + "num_input_tokens_seen": 2030208, + "step": 10645 + }, + { + "epoch": 5.535343035343035, + "grad_norm": 0.35642626881599426, + "learning_rate": 2.4696116995445147e-05, + "loss": 0.281, + "num_input_tokens_seen": 2031264, + "step": 10650 + }, + { + "epoch": 5.537941787941788, + "grad_norm": 0.5651821494102478, + "learning_rate": 2.4673440404389493e-05, + "loss": 0.2261, + "num_input_tokens_seen": 2032224, + "step": 10655 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 0.5694889426231384, + "learning_rate": 2.465076408205818e-05, + "loss": 0.1973, + "num_input_tokens_seen": 2033120, + "step": 10660 + }, + { + "epoch": 5.543139293139293, + "grad_norm": 0.4572083353996277, + "learning_rate": 2.4628088047111464e-05, + "loss": 0.2328, + "num_input_tokens_seen": 2034016, + "step": 10665 + }, + { + "epoch": 5.545738045738045, + "grad_norm": 0.48062992095947266, + "learning_rate": 2.4605412318209332e-05, + "loss": 0.2657, + "num_input_tokens_seen": 2034976, + "step": 10670 + }, + { + "epoch": 5.548336798336798, + "grad_norm": 0.20615443587303162, + "learning_rate": 2.458273691401156e-05, + "loss": 0.3178, + "num_input_tokens_seen": 2035904, + "step": 10675 + }, + { + "epoch": 5.5509355509355505, + "grad_norm": 0.5392249822616577, + "learning_rate": 2.4560061853177594e-05, + "loss": 0.2077, + "num_input_tokens_seen": 2036864, + "step": 10680 + }, + { + "epoch": 5.553534303534303, + "grad_norm": 0.38639694452285767, + "learning_rate": 2.4537387154366653e-05, + "loss": 0.2278, + "num_input_tokens_seen": 2037856, + "step": 10685 + }, + { + "epoch": 5.556133056133056, + "grad_norm": 0.40719136595726013, + "learning_rate": 2.4514712836237638e-05, + "loss": 0.2247, + "num_input_tokens_seen": 2038816, + "step": 10690 + }, + { + "epoch": 5.5587318087318085, + "grad_norm": 0.9188973903656006, + "learning_rate": 2.4492038917449137e-05, + "loss": 0.3178, + "num_input_tokens_seen": 2039872, + "step": 10695 + }, + { + "epoch": 5.5613305613305615, + "grad_norm": 0.5198147296905518, + "learning_rate": 2.446936541665941e-05, + "loss": 0.2495, + "num_input_tokens_seen": 2040864, + "step": 10700 + }, + { + "epoch": 5.563929313929314, + "grad_norm": 0.44606369733810425, + "learning_rate": 2.4446692352526387e-05, + "loss": 0.2809, + "num_input_tokens_seen": 2041824, + "step": 10705 + }, + { + "epoch": 5.5665280665280665, + "grad_norm": 0.5995461344718933, + "learning_rate": 2.4424019743707607e-05, + "loss": 0.2354, + "num_input_tokens_seen": 2042784, + "step": 10710 + }, + { + "epoch": 5.5691268191268195, + "grad_norm": 0.5553898811340332, + "learning_rate": 2.4401347608860257e-05, + "loss": 0.2784, + "num_input_tokens_seen": 2043808, + "step": 10715 + }, + { + "epoch": 5.571725571725572, + "grad_norm": 0.5509878396987915, + "learning_rate": 2.4378675966641134e-05, + "loss": 0.2145, + "num_input_tokens_seen": 2044768, + "step": 10720 + }, + { + "epoch": 5.574324324324325, + "grad_norm": 0.8378271460533142, + "learning_rate": 2.4356004835706625e-05, + "loss": 0.2458, + "num_input_tokens_seen": 2045696, + "step": 10725 + }, + { + "epoch": 5.576923076923077, + "grad_norm": 0.4963880777359009, + "learning_rate": 2.4333334234712697e-05, + "loss": 0.2091, + "num_input_tokens_seen": 2046656, + "step": 10730 + }, + { + "epoch": 5.57952182952183, + "grad_norm": 0.8046156764030457, + "learning_rate": 2.4310664182314873e-05, + "loss": 0.3069, + "num_input_tokens_seen": 2047616, + "step": 10735 + }, + { + "epoch": 5.582120582120583, + "grad_norm": 0.11870864033699036, + "learning_rate": 2.4287994697168247e-05, + "loss": 0.2583, + "num_input_tokens_seen": 2048544, + "step": 10740 + }, + { + "epoch": 5.584719334719335, + "grad_norm": 0.2628503739833832, + "learning_rate": 2.426532579792742e-05, + "loss": 0.2842, + "num_input_tokens_seen": 2049536, + "step": 10745 + }, + { + "epoch": 5.587318087318088, + "grad_norm": 0.46483108401298523, + "learning_rate": 2.4242657503246523e-05, + "loss": 0.2412, + "num_input_tokens_seen": 2050528, + "step": 10750 + }, + { + "epoch": 5.58991683991684, + "grad_norm": 0.28265181183815, + "learning_rate": 2.4219989831779187e-05, + "loss": 0.3047, + "num_input_tokens_seen": 2051488, + "step": 10755 + }, + { + "epoch": 5.592515592515593, + "grad_norm": 0.542441189289093, + "learning_rate": 2.4197322802178534e-05, + "loss": 0.2506, + "num_input_tokens_seen": 2052416, + "step": 10760 + }, + { + "epoch": 5.595114345114345, + "grad_norm": 0.6489547491073608, + "learning_rate": 2.417465643309716e-05, + "loss": 0.2543, + "num_input_tokens_seen": 2053312, + "step": 10765 + }, + { + "epoch": 5.597713097713098, + "grad_norm": 0.612777054309845, + "learning_rate": 2.415199074318712e-05, + "loss": 0.2635, + "num_input_tokens_seen": 2054208, + "step": 10770 + }, + { + "epoch": 5.600311850311851, + "grad_norm": 0.6412058472633362, + "learning_rate": 2.412932575109988e-05, + "loss": 0.2334, + "num_input_tokens_seen": 2055168, + "step": 10775 + }, + { + "epoch": 5.602910602910603, + "grad_norm": 0.2306017279624939, + "learning_rate": 2.410666147548637e-05, + "loss": 0.2711, + "num_input_tokens_seen": 2056128, + "step": 10780 + }, + { + "epoch": 5.605509355509356, + "grad_norm": 0.4575825035572052, + "learning_rate": 2.408399793499691e-05, + "loss": 0.2851, + "num_input_tokens_seen": 2057024, + "step": 10785 + }, + { + "epoch": 5.608108108108108, + "grad_norm": 0.679241955280304, + "learning_rate": 2.4061335148281224e-05, + "loss": 0.264, + "num_input_tokens_seen": 2058048, + "step": 10790 + }, + { + "epoch": 5.610706860706861, + "grad_norm": 0.15984942018985748, + "learning_rate": 2.403867313398841e-05, + "loss": 0.3042, + "num_input_tokens_seen": 2059008, + "step": 10795 + }, + { + "epoch": 5.613305613305613, + "grad_norm": 0.24839149415493011, + "learning_rate": 2.401601191076694e-05, + "loss": 0.2822, + "num_input_tokens_seen": 2059968, + "step": 10800 + }, + { + "epoch": 5.615904365904366, + "grad_norm": 0.3842454254627228, + "learning_rate": 2.399335149726463e-05, + "loss": 0.2486, + "num_input_tokens_seen": 2060960, + "step": 10805 + }, + { + "epoch": 5.618503118503119, + "grad_norm": 0.7647603750228882, + "learning_rate": 2.3970691912128608e-05, + "loss": 0.2797, + "num_input_tokens_seen": 2061920, + "step": 10810 + }, + { + "epoch": 5.621101871101871, + "grad_norm": 0.3778878152370453, + "learning_rate": 2.394803317400535e-05, + "loss": 0.2356, + "num_input_tokens_seen": 2062816, + "step": 10815 + }, + { + "epoch": 5.623700623700624, + "grad_norm": 0.2502991557121277, + "learning_rate": 2.3925375301540627e-05, + "loss": 0.2391, + "num_input_tokens_seen": 2063744, + "step": 10820 + }, + { + "epoch": 5.626299376299376, + "grad_norm": 0.46172645688056946, + "learning_rate": 2.390271831337949e-05, + "loss": 0.204, + "num_input_tokens_seen": 2064640, + "step": 10825 + }, + { + "epoch": 5.628898128898129, + "grad_norm": 0.2537570297718048, + "learning_rate": 2.3880062228166276e-05, + "loss": 0.2845, + "num_input_tokens_seen": 2065536, + "step": 10830 + }, + { + "epoch": 5.631496881496881, + "grad_norm": 0.20018669962882996, + "learning_rate": 2.3857407064544567e-05, + "loss": 0.2259, + "num_input_tokens_seen": 2066432, + "step": 10835 + }, + { + "epoch": 5.634095634095634, + "grad_norm": 0.20573347806930542, + "learning_rate": 2.3834752841157188e-05, + "loss": 0.2897, + "num_input_tokens_seen": 2067328, + "step": 10840 + }, + { + "epoch": 5.636694386694387, + "grad_norm": 0.25868505239486694, + "learning_rate": 2.381209957664619e-05, + "loss": 0.2521, + "num_input_tokens_seen": 2068416, + "step": 10845 + }, + { + "epoch": 5.639293139293139, + "grad_norm": 0.6069724559783936, + "learning_rate": 2.3789447289652838e-05, + "loss": 0.2564, + "num_input_tokens_seen": 2069312, + "step": 10850 + }, + { + "epoch": 5.641891891891892, + "grad_norm": 0.2242106944322586, + "learning_rate": 2.37667959988176e-05, + "loss": 0.2511, + "num_input_tokens_seen": 2070240, + "step": 10855 + }, + { + "epoch": 5.644490644490644, + "grad_norm": 0.7654104232788086, + "learning_rate": 2.374414572278011e-05, + "loss": 0.3137, + "num_input_tokens_seen": 2071200, + "step": 10860 + }, + { + "epoch": 5.647089397089397, + "grad_norm": 0.7347099781036377, + "learning_rate": 2.372149648017917e-05, + "loss": 0.2502, + "num_input_tokens_seen": 2072192, + "step": 10865 + }, + { + "epoch": 5.649688149688149, + "grad_norm": 0.26216498017311096, + "learning_rate": 2.3698848289652747e-05, + "loss": 0.2719, + "num_input_tokens_seen": 2073120, + "step": 10870 + }, + { + "epoch": 5.652286902286902, + "grad_norm": 0.5763298273086548, + "learning_rate": 2.3676201169837917e-05, + "loss": 0.3089, + "num_input_tokens_seen": 2074144, + "step": 10875 + }, + { + "epoch": 5.654885654885655, + "grad_norm": 0.3128834664821625, + "learning_rate": 2.365355513937089e-05, + "loss": 0.2866, + "num_input_tokens_seen": 2075040, + "step": 10880 + }, + { + "epoch": 5.657484407484407, + "grad_norm": 0.2743260860443115, + "learning_rate": 2.3630910216886982e-05, + "loss": 0.2916, + "num_input_tokens_seen": 2076064, + "step": 10885 + }, + { + "epoch": 5.66008316008316, + "grad_norm": 0.5811701416969299, + "learning_rate": 2.3608266421020592e-05, + "loss": 0.2769, + "num_input_tokens_seen": 2077024, + "step": 10890 + }, + { + "epoch": 5.662681912681912, + "grad_norm": 0.290126234292984, + "learning_rate": 2.358562377040519e-05, + "loss": 0.2604, + "num_input_tokens_seen": 2077952, + "step": 10895 + }, + { + "epoch": 5.665280665280665, + "grad_norm": 0.4915136694908142, + "learning_rate": 2.356298228367331e-05, + "loss": 0.2539, + "num_input_tokens_seen": 2078944, + "step": 10900 + }, + { + "epoch": 5.667879417879417, + "grad_norm": 0.268018513917923, + "learning_rate": 2.354034197945653e-05, + "loss": 0.2903, + "num_input_tokens_seen": 2079904, + "step": 10905 + }, + { + "epoch": 5.67047817047817, + "grad_norm": 0.7351523041725159, + "learning_rate": 2.351770287638543e-05, + "loss": 0.2352, + "num_input_tokens_seen": 2080896, + "step": 10910 + }, + { + "epoch": 5.673076923076923, + "grad_norm": 0.8738045692443848, + "learning_rate": 2.3495064993089637e-05, + "loss": 0.2919, + "num_input_tokens_seen": 2081888, + "step": 10915 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 0.39474496245384216, + "learning_rate": 2.3472428348197754e-05, + "loss": 0.2568, + "num_input_tokens_seen": 2082784, + "step": 10920 + }, + { + "epoch": 5.678274428274428, + "grad_norm": 0.0726313665509224, + "learning_rate": 2.344979296033737e-05, + "loss": 0.2325, + "num_input_tokens_seen": 2083744, + "step": 10925 + }, + { + "epoch": 5.6808731808731805, + "grad_norm": 0.27377748489379883, + "learning_rate": 2.3427158848135035e-05, + "loss": 0.3012, + "num_input_tokens_seen": 2084640, + "step": 10930 + }, + { + "epoch": 5.6834719334719335, + "grad_norm": 0.5511705279350281, + "learning_rate": 2.340452603021627e-05, + "loss": 0.2084, + "num_input_tokens_seen": 2085632, + "step": 10935 + }, + { + "epoch": 5.686070686070686, + "grad_norm": 0.4831693172454834, + "learning_rate": 2.338189452520549e-05, + "loss": 0.2768, + "num_input_tokens_seen": 2086560, + "step": 10940 + }, + { + "epoch": 5.6886694386694385, + "grad_norm": 0.43529626727104187, + "learning_rate": 2.335926435172606e-05, + "loss": 0.2692, + "num_input_tokens_seen": 2087616, + "step": 10945 + }, + { + "epoch": 5.6912681912681915, + "grad_norm": 0.7305035591125488, + "learning_rate": 2.333663552840025e-05, + "loss": 0.253, + "num_input_tokens_seen": 2088544, + "step": 10950 + }, + { + "epoch": 5.693866943866944, + "grad_norm": 0.9263004064559937, + "learning_rate": 2.3314008073849207e-05, + "loss": 0.305, + "num_input_tokens_seen": 2089472, + "step": 10955 + }, + { + "epoch": 5.696465696465697, + "grad_norm": 0.12667357921600342, + "learning_rate": 2.329138200669296e-05, + "loss": 0.2647, + "num_input_tokens_seen": 2090464, + "step": 10960 + }, + { + "epoch": 5.6990644490644495, + "grad_norm": 0.4754107594490051, + "learning_rate": 2.3268757345550383e-05, + "loss": 0.2779, + "num_input_tokens_seen": 2091424, + "step": 10965 + }, + { + "epoch": 5.701663201663202, + "grad_norm": 0.2474718540906906, + "learning_rate": 2.3246134109039226e-05, + "loss": 0.2278, + "num_input_tokens_seen": 2092352, + "step": 10970 + }, + { + "epoch": 5.704261954261955, + "grad_norm": 0.6315350532531738, + "learning_rate": 2.3223512315776022e-05, + "loss": 0.2159, + "num_input_tokens_seen": 2093248, + "step": 10975 + }, + { + "epoch": 5.706860706860707, + "grad_norm": 0.5534874200820923, + "learning_rate": 2.320089198437614e-05, + "loss": 0.3417, + "num_input_tokens_seen": 2094240, + "step": 10980 + }, + { + "epoch": 5.70945945945946, + "grad_norm": 0.2817750871181488, + "learning_rate": 2.3178273133453748e-05, + "loss": 0.2461, + "num_input_tokens_seen": 2095168, + "step": 10985 + }, + { + "epoch": 5.712058212058212, + "grad_norm": 0.4528232216835022, + "learning_rate": 2.3155655781621793e-05, + "loss": 0.2599, + "num_input_tokens_seen": 2096160, + "step": 10990 + }, + { + "epoch": 5.714656964656965, + "grad_norm": 0.1833333820104599, + "learning_rate": 2.3133039947491987e-05, + "loss": 0.3052, + "num_input_tokens_seen": 2097088, + "step": 10995 + }, + { + "epoch": 5.717255717255718, + "grad_norm": 0.6780665516853333, + "learning_rate": 2.3110425649674796e-05, + "loss": 0.2262, + "num_input_tokens_seen": 2098080, + "step": 11000 + }, + { + "epoch": 5.71985446985447, + "grad_norm": 0.6678966283798218, + "learning_rate": 2.3087812906779408e-05, + "loss": 0.2546, + "num_input_tokens_seen": 2099008, + "step": 11005 + }, + { + "epoch": 5.722453222453223, + "grad_norm": 0.780660092830658, + "learning_rate": 2.3065201737413748e-05, + "loss": 0.2727, + "num_input_tokens_seen": 2100000, + "step": 11010 + }, + { + "epoch": 5.725051975051975, + "grad_norm": 0.35556942224502563, + "learning_rate": 2.3042592160184444e-05, + "loss": 0.2837, + "num_input_tokens_seen": 2100928, + "step": 11015 + }, + { + "epoch": 5.727650727650728, + "grad_norm": 0.5394973754882812, + "learning_rate": 2.3019984193696804e-05, + "loss": 0.283, + "num_input_tokens_seen": 2101856, + "step": 11020 + }, + { + "epoch": 5.73024948024948, + "grad_norm": 0.2765316963195801, + "learning_rate": 2.2997377856554822e-05, + "loss": 0.2317, + "num_input_tokens_seen": 2102784, + "step": 11025 + }, + { + "epoch": 5.732848232848233, + "grad_norm": 0.6379207372665405, + "learning_rate": 2.2974773167361146e-05, + "loss": 0.2616, + "num_input_tokens_seen": 2103680, + "step": 11030 + }, + { + "epoch": 5.735446985446986, + "grad_norm": 0.2794868052005768, + "learning_rate": 2.295217014471707e-05, + "loss": 0.2865, + "num_input_tokens_seen": 2104640, + "step": 11035 + }, + { + "epoch": 5.738045738045738, + "grad_norm": 0.37620094418525696, + "learning_rate": 2.2929568807222508e-05, + "loss": 0.2532, + "num_input_tokens_seen": 2105568, + "step": 11040 + }, + { + "epoch": 5.740644490644491, + "grad_norm": 0.4997257888317108, + "learning_rate": 2.2906969173475995e-05, + "loss": 0.2572, + "num_input_tokens_seen": 2106496, + "step": 11045 + }, + { + "epoch": 5.743243243243243, + "grad_norm": 0.3888058364391327, + "learning_rate": 2.2884371262074665e-05, + "loss": 0.2646, + "num_input_tokens_seen": 2107456, + "step": 11050 + }, + { + "epoch": 5.745841995841996, + "grad_norm": 0.4997672438621521, + "learning_rate": 2.2861775091614233e-05, + "loss": 0.2766, + "num_input_tokens_seen": 2108384, + "step": 11055 + }, + { + "epoch": 5.748440748440748, + "grad_norm": 0.2948383390903473, + "learning_rate": 2.2839180680688983e-05, + "loss": 0.2262, + "num_input_tokens_seen": 2109376, + "step": 11060 + }, + { + "epoch": 5.751039501039501, + "grad_norm": 0.20159879326820374, + "learning_rate": 2.2816588047891753e-05, + "loss": 0.242, + "num_input_tokens_seen": 2110272, + "step": 11065 + }, + { + "epoch": 5.753638253638254, + "grad_norm": 0.29176443815231323, + "learning_rate": 2.27939972118139e-05, + "loss": 0.2477, + "num_input_tokens_seen": 2111200, + "step": 11070 + }, + { + "epoch": 5.756237006237006, + "grad_norm": 0.5278854370117188, + "learning_rate": 2.2771408191045322e-05, + "loss": 0.2153, + "num_input_tokens_seen": 2112160, + "step": 11075 + }, + { + "epoch": 5.758835758835759, + "grad_norm": 0.5179525017738342, + "learning_rate": 2.274882100417442e-05, + "loss": 0.178, + "num_input_tokens_seen": 2113056, + "step": 11080 + }, + { + "epoch": 5.761434511434511, + "grad_norm": 0.5270639061927795, + "learning_rate": 2.2726235669788083e-05, + "loss": 0.2876, + "num_input_tokens_seen": 2114016, + "step": 11085 + }, + { + "epoch": 5.764033264033264, + "grad_norm": 0.8954312801361084, + "learning_rate": 2.2703652206471667e-05, + "loss": 0.2289, + "num_input_tokens_seen": 2114944, + "step": 11090 + }, + { + "epoch": 5.766632016632016, + "grad_norm": 0.24581021070480347, + "learning_rate": 2.2681070632809014e-05, + "loss": 0.305, + "num_input_tokens_seen": 2115904, + "step": 11095 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 0.40760889649391174, + "learning_rate": 2.26584909673824e-05, + "loss": 0.2397, + "num_input_tokens_seen": 2116864, + "step": 11100 + }, + { + "epoch": 5.771829521829522, + "grad_norm": 0.39881840348243713, + "learning_rate": 2.2635913228772496e-05, + "loss": 0.2074, + "num_input_tokens_seen": 2117824, + "step": 11105 + }, + { + "epoch": 5.774428274428274, + "grad_norm": 0.9204950332641602, + "learning_rate": 2.2613337435558433e-05, + "loss": 0.2152, + "num_input_tokens_seen": 2118816, + "step": 11110 + }, + { + "epoch": 5.777027027027027, + "grad_norm": 0.5254481434822083, + "learning_rate": 2.2590763606317723e-05, + "loss": 0.2951, + "num_input_tokens_seen": 2119744, + "step": 11115 + }, + { + "epoch": 5.779625779625779, + "grad_norm": 0.35799649357795715, + "learning_rate": 2.2568191759626263e-05, + "loss": 0.2728, + "num_input_tokens_seen": 2120672, + "step": 11120 + }, + { + "epoch": 5.782224532224532, + "grad_norm": 0.470278799533844, + "learning_rate": 2.254562191405832e-05, + "loss": 0.1856, + "num_input_tokens_seen": 2121696, + "step": 11125 + }, + { + "epoch": 5.784823284823284, + "grad_norm": 0.3128708004951477, + "learning_rate": 2.252305408818652e-05, + "loss": 0.2804, + "num_input_tokens_seen": 2122624, + "step": 11130 + }, + { + "epoch": 5.787422037422037, + "grad_norm": 0.6210241913795471, + "learning_rate": 2.250048830058181e-05, + "loss": 0.2759, + "num_input_tokens_seen": 2123584, + "step": 11135 + }, + { + "epoch": 5.79002079002079, + "grad_norm": 0.3521151840686798, + "learning_rate": 2.2477924569813473e-05, + "loss": 0.2391, + "num_input_tokens_seen": 2124448, + "step": 11140 + }, + { + "epoch": 5.792619542619542, + "grad_norm": 0.5399062037467957, + "learning_rate": 2.2455362914449094e-05, + "loss": 0.3184, + "num_input_tokens_seen": 2125408, + "step": 11145 + }, + { + "epoch": 5.795218295218295, + "grad_norm": 0.33013561367988586, + "learning_rate": 2.243280335305456e-05, + "loss": 0.2485, + "num_input_tokens_seen": 2126400, + "step": 11150 + }, + { + "epoch": 5.797817047817047, + "grad_norm": 0.2155887633562088, + "learning_rate": 2.2410245904194018e-05, + "loss": 0.2919, + "num_input_tokens_seen": 2127360, + "step": 11155 + }, + { + "epoch": 5.8004158004158, + "grad_norm": 0.6237737536430359, + "learning_rate": 2.2387690586429893e-05, + "loss": 0.2133, + "num_input_tokens_seen": 2128288, + "step": 11160 + }, + { + "epoch": 5.803014553014553, + "grad_norm": 0.1780012995004654, + "learning_rate": 2.2365137418322855e-05, + "loss": 0.29, + "num_input_tokens_seen": 2129248, + "step": 11165 + }, + { + "epoch": 5.8056133056133055, + "grad_norm": 0.374859094619751, + "learning_rate": 2.234258641843179e-05, + "loss": 0.2533, + "num_input_tokens_seen": 2130240, + "step": 11170 + }, + { + "epoch": 5.808212058212058, + "grad_norm": 0.5145270228385925, + "learning_rate": 2.2320037605313808e-05, + "loss": 0.2212, + "num_input_tokens_seen": 2131200, + "step": 11175 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.8164387941360474, + "learning_rate": 2.2297490997524224e-05, + "loss": 0.2284, + "num_input_tokens_seen": 2132192, + "step": 11180 + }, + { + "epoch": 5.8134095634095635, + "grad_norm": 0.5526375770568848, + "learning_rate": 2.2274946613616537e-05, + "loss": 0.2783, + "num_input_tokens_seen": 2133120, + "step": 11185 + }, + { + "epoch": 5.8160083160083165, + "grad_norm": 0.4104156494140625, + "learning_rate": 2.2252404472142414e-05, + "loss": 0.2837, + "num_input_tokens_seen": 2134080, + "step": 11190 + }, + { + "epoch": 5.8186070686070686, + "grad_norm": 0.6052871942520142, + "learning_rate": 2.2229864591651684e-05, + "loss": 0.2995, + "num_input_tokens_seen": 2135072, + "step": 11195 + }, + { + "epoch": 5.8212058212058215, + "grad_norm": 0.4698418080806732, + "learning_rate": 2.220732699069229e-05, + "loss": 0.2625, + "num_input_tokens_seen": 2135968, + "step": 11200 + }, + { + "epoch": 5.823804573804574, + "grad_norm": 0.16088397800922394, + "learning_rate": 2.2184791687810327e-05, + "loss": 0.2414, + "num_input_tokens_seen": 2136960, + "step": 11205 + }, + { + "epoch": 5.826403326403327, + "grad_norm": 0.33115169405937195, + "learning_rate": 2.216225870154999e-05, + "loss": 0.3172, + "num_input_tokens_seen": 2137856, + "step": 11210 + }, + { + "epoch": 5.829002079002079, + "grad_norm": 0.35244491696357727, + "learning_rate": 2.213972805045356e-05, + "loss": 0.2546, + "num_input_tokens_seen": 2138784, + "step": 11215 + }, + { + "epoch": 5.831600831600832, + "grad_norm": 0.28598296642303467, + "learning_rate": 2.2117199753061414e-05, + "loss": 0.2589, + "num_input_tokens_seen": 2139712, + "step": 11220 + }, + { + "epoch": 5.834199584199585, + "grad_norm": 0.3251672089099884, + "learning_rate": 2.209467382791198e-05, + "loss": 0.2858, + "num_input_tokens_seen": 2140704, + "step": 11225 + }, + { + "epoch": 5.836798336798337, + "grad_norm": 0.26966315507888794, + "learning_rate": 2.2072150293541743e-05, + "loss": 0.269, + "num_input_tokens_seen": 2141696, + "step": 11230 + }, + { + "epoch": 5.83939708939709, + "grad_norm": 0.3258943259716034, + "learning_rate": 2.2049629168485193e-05, + "loss": 0.3041, + "num_input_tokens_seen": 2142624, + "step": 11235 + }, + { + "epoch": 5.841995841995842, + "grad_norm": 0.3753716051578522, + "learning_rate": 2.2027110471274863e-05, + "loss": 0.268, + "num_input_tokens_seen": 2143552, + "step": 11240 + }, + { + "epoch": 5.844594594594595, + "grad_norm": 0.23844976723194122, + "learning_rate": 2.200459422044129e-05, + "loss": 0.2709, + "num_input_tokens_seen": 2144512, + "step": 11245 + }, + { + "epoch": 5.847193347193347, + "grad_norm": 0.27746573090553284, + "learning_rate": 2.198208043451299e-05, + "loss": 0.2927, + "num_input_tokens_seen": 2145408, + "step": 11250 + }, + { + "epoch": 5.8497920997921, + "grad_norm": 0.18766234815120697, + "learning_rate": 2.1959569132016445e-05, + "loss": 0.2633, + "num_input_tokens_seen": 2146304, + "step": 11255 + }, + { + "epoch": 5.852390852390853, + "grad_norm": 0.27507978677749634, + "learning_rate": 2.193706033147611e-05, + "loss": 0.2762, + "num_input_tokens_seen": 2147264, + "step": 11260 + }, + { + "epoch": 5.854989604989605, + "grad_norm": 0.3834633529186249, + "learning_rate": 2.1914554051414354e-05, + "loss": 0.2671, + "num_input_tokens_seen": 2148160, + "step": 11265 + }, + { + "epoch": 5.857588357588358, + "grad_norm": 0.5757665634155273, + "learning_rate": 2.1892050310351503e-05, + "loss": 0.2605, + "num_input_tokens_seen": 2149056, + "step": 11270 + }, + { + "epoch": 5.86018711018711, + "grad_norm": 0.6149711608886719, + "learning_rate": 2.1869549126805774e-05, + "loss": 0.268, + "num_input_tokens_seen": 2150048, + "step": 11275 + }, + { + "epoch": 5.862785862785863, + "grad_norm": 0.14719565212726593, + "learning_rate": 2.1847050519293284e-05, + "loss": 0.2793, + "num_input_tokens_seen": 2151040, + "step": 11280 + }, + { + "epoch": 5.865384615384615, + "grad_norm": 0.3770942986011505, + "learning_rate": 2.182455450632803e-05, + "loss": 0.272, + "num_input_tokens_seen": 2152000, + "step": 11285 + }, + { + "epoch": 5.867983367983368, + "grad_norm": 0.6398370862007141, + "learning_rate": 2.1802061106421883e-05, + "loss": 0.2847, + "num_input_tokens_seen": 2152896, + "step": 11290 + }, + { + "epoch": 5.870582120582121, + "grad_norm": 0.4161762297153473, + "learning_rate": 2.177957033808455e-05, + "loss": 0.2629, + "num_input_tokens_seen": 2153856, + "step": 11295 + }, + { + "epoch": 5.873180873180873, + "grad_norm": 0.22141008079051971, + "learning_rate": 2.1757082219823572e-05, + "loss": 0.209, + "num_input_tokens_seen": 2154784, + "step": 11300 + }, + { + "epoch": 5.875779625779626, + "grad_norm": 0.5749073624610901, + "learning_rate": 2.1734596770144324e-05, + "loss": 0.2443, + "num_input_tokens_seen": 2155680, + "step": 11305 + }, + { + "epoch": 5.878378378378378, + "grad_norm": 0.4894883930683136, + "learning_rate": 2.171211400754997e-05, + "loss": 0.2794, + "num_input_tokens_seen": 2156608, + "step": 11310 + }, + { + "epoch": 5.880977130977131, + "grad_norm": 0.23383097350597382, + "learning_rate": 2.1689633950541475e-05, + "loss": 0.2772, + "num_input_tokens_seen": 2157568, + "step": 11315 + }, + { + "epoch": 5.883575883575883, + "grad_norm": 0.5546057224273682, + "learning_rate": 2.1667156617617568e-05, + "loss": 0.2244, + "num_input_tokens_seen": 2158464, + "step": 11320 + }, + { + "epoch": 5.886174636174636, + "grad_norm": 0.24843432009220123, + "learning_rate": 2.164468202727474e-05, + "loss": 0.2412, + "num_input_tokens_seen": 2159424, + "step": 11325 + }, + { + "epoch": 5.888773388773389, + "grad_norm": 0.5441539883613586, + "learning_rate": 2.1622210198007238e-05, + "loss": 0.3087, + "num_input_tokens_seen": 2160416, + "step": 11330 + }, + { + "epoch": 5.891372141372141, + "grad_norm": 0.6054286360740662, + "learning_rate": 2.1599741148306997e-05, + "loss": 0.2452, + "num_input_tokens_seen": 2161440, + "step": 11335 + }, + { + "epoch": 5.893970893970894, + "grad_norm": 0.1720559149980545, + "learning_rate": 2.1577274896663714e-05, + "loss": 0.2316, + "num_input_tokens_seen": 2162336, + "step": 11340 + }, + { + "epoch": 5.896569646569646, + "grad_norm": 0.5085504055023193, + "learning_rate": 2.155481146156475e-05, + "loss": 0.2565, + "num_input_tokens_seen": 2163296, + "step": 11345 + }, + { + "epoch": 5.899168399168399, + "grad_norm": 0.2533365488052368, + "learning_rate": 2.1532350861495168e-05, + "loss": 0.2914, + "num_input_tokens_seen": 2164256, + "step": 11350 + }, + { + "epoch": 5.901767151767151, + "grad_norm": 0.5498880743980408, + "learning_rate": 2.1509893114937688e-05, + "loss": 0.2519, + "num_input_tokens_seen": 2165248, + "step": 11355 + }, + { + "epoch": 5.904365904365904, + "grad_norm": 0.377157986164093, + "learning_rate": 2.148743824037269e-05, + "loss": 0.2889, + "num_input_tokens_seen": 2166336, + "step": 11360 + }, + { + "epoch": 5.906964656964657, + "grad_norm": 0.6642311215400696, + "learning_rate": 2.1464986256278167e-05, + "loss": 0.2488, + "num_input_tokens_seen": 2167264, + "step": 11365 + }, + { + "epoch": 5.909563409563409, + "grad_norm": 0.5835360884666443, + "learning_rate": 2.1442537181129757e-05, + "loss": 0.2204, + "num_input_tokens_seen": 2168160, + "step": 11370 + }, + { + "epoch": 5.912162162162162, + "grad_norm": 0.4681536853313446, + "learning_rate": 2.1420091033400705e-05, + "loss": 0.3104, + "num_input_tokens_seen": 2169120, + "step": 11375 + }, + { + "epoch": 5.914760914760915, + "grad_norm": 0.3479747474193573, + "learning_rate": 2.139764783156183e-05, + "loss": 0.2705, + "num_input_tokens_seen": 2170048, + "step": 11380 + }, + { + "epoch": 5.917359667359667, + "grad_norm": 0.5079150199890137, + "learning_rate": 2.1375207594081547e-05, + "loss": 0.2757, + "num_input_tokens_seen": 2170944, + "step": 11385 + }, + { + "epoch": 5.91995841995842, + "grad_norm": 0.47558364272117615, + "learning_rate": 2.135277033942582e-05, + "loss": 0.3001, + "num_input_tokens_seen": 2171936, + "step": 11390 + }, + { + "epoch": 5.922557172557172, + "grad_norm": 0.6917049288749695, + "learning_rate": 2.1330336086058154e-05, + "loss": 0.3152, + "num_input_tokens_seen": 2172928, + "step": 11395 + }, + { + "epoch": 5.925155925155925, + "grad_norm": 0.3505042791366577, + "learning_rate": 2.1307904852439593e-05, + "loss": 0.2697, + "num_input_tokens_seen": 2173952, + "step": 11400 + }, + { + "epoch": 5.9277546777546775, + "grad_norm": 0.2929537296295166, + "learning_rate": 2.128547665702869e-05, + "loss": 0.2719, + "num_input_tokens_seen": 2174848, + "step": 11405 + }, + { + "epoch": 5.93035343035343, + "grad_norm": 0.21516355872154236, + "learning_rate": 2.126305151828151e-05, + "loss": 0.2492, + "num_input_tokens_seen": 2175776, + "step": 11410 + }, + { + "epoch": 5.932952182952183, + "grad_norm": 0.8078261613845825, + "learning_rate": 2.1240629454651583e-05, + "loss": 0.2449, + "num_input_tokens_seen": 2176800, + "step": 11415 + }, + { + "epoch": 5.9355509355509355, + "grad_norm": 0.3066753149032593, + "learning_rate": 2.1218210484589924e-05, + "loss": 0.2532, + "num_input_tokens_seen": 2177728, + "step": 11420 + }, + { + "epoch": 5.9381496881496885, + "grad_norm": 0.27220502495765686, + "learning_rate": 2.1195794626545007e-05, + "loss": 0.2174, + "num_input_tokens_seen": 2178656, + "step": 11425 + }, + { + "epoch": 5.9407484407484406, + "grad_norm": 0.34336817264556885, + "learning_rate": 2.117338189896272e-05, + "loss": 0.1969, + "num_input_tokens_seen": 2179744, + "step": 11430 + }, + { + "epoch": 5.9433471933471935, + "grad_norm": 0.3140237033367157, + "learning_rate": 2.1150972320286398e-05, + "loss": 0.2437, + "num_input_tokens_seen": 2180672, + "step": 11435 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 0.6606806516647339, + "learning_rate": 2.1128565908956775e-05, + "loss": 0.2192, + "num_input_tokens_seen": 2181632, + "step": 11440 + }, + { + "epoch": 5.948544698544699, + "grad_norm": 0.5487345457077026, + "learning_rate": 2.1106162683411983e-05, + "loss": 0.2662, + "num_input_tokens_seen": 2182656, + "step": 11445 + }, + { + "epoch": 5.951143451143452, + "grad_norm": 0.6095933318138123, + "learning_rate": 2.108376266208753e-05, + "loss": 0.3114, + "num_input_tokens_seen": 2183648, + "step": 11450 + }, + { + "epoch": 5.953742203742204, + "grad_norm": 0.46215370297431946, + "learning_rate": 2.106136586341629e-05, + "loss": 0.2036, + "num_input_tokens_seen": 2184640, + "step": 11455 + }, + { + "epoch": 5.956340956340957, + "grad_norm": 0.3792390823364258, + "learning_rate": 2.1038972305828486e-05, + "loss": 0.2113, + "num_input_tokens_seen": 2185568, + "step": 11460 + }, + { + "epoch": 5.958939708939709, + "grad_norm": 0.48057684302330017, + "learning_rate": 2.1016582007751658e-05, + "loss": 0.2945, + "num_input_tokens_seen": 2186496, + "step": 11465 + }, + { + "epoch": 5.961538461538462, + "grad_norm": 0.5294195413589478, + "learning_rate": 2.099419498761069e-05, + "loss": 0.2515, + "num_input_tokens_seen": 2187488, + "step": 11470 + }, + { + "epoch": 5.964137214137214, + "grad_norm": 0.16083940863609314, + "learning_rate": 2.0971811263827746e-05, + "loss": 0.2969, + "num_input_tokens_seen": 2188416, + "step": 11475 + }, + { + "epoch": 5.966735966735967, + "grad_norm": 0.48345229029655457, + "learning_rate": 2.0949430854822288e-05, + "loss": 0.2884, + "num_input_tokens_seen": 2189376, + "step": 11480 + }, + { + "epoch": 5.96933471933472, + "grad_norm": 0.30932700634002686, + "learning_rate": 2.092705377901105e-05, + "loss": 0.2589, + "num_input_tokens_seen": 2190432, + "step": 11485 + }, + { + "epoch": 5.971933471933472, + "grad_norm": 0.386326402425766, + "learning_rate": 2.090468005480804e-05, + "loss": 0.3118, + "num_input_tokens_seen": 2191360, + "step": 11490 + }, + { + "epoch": 5.974532224532225, + "grad_norm": 0.5959592461585999, + "learning_rate": 2.0882309700624457e-05, + "loss": 0.2777, + "num_input_tokens_seen": 2192288, + "step": 11495 + }, + { + "epoch": 5.977130977130977, + "grad_norm": 0.33859506249427795, + "learning_rate": 2.0859942734868778e-05, + "loss": 0.2631, + "num_input_tokens_seen": 2193312, + "step": 11500 + }, + { + "epoch": 5.97972972972973, + "grad_norm": 0.2077333778142929, + "learning_rate": 2.0837579175946674e-05, + "loss": 0.2812, + "num_input_tokens_seen": 2194272, + "step": 11505 + }, + { + "epoch": 5.982328482328482, + "grad_norm": 0.3791508674621582, + "learning_rate": 2.0815219042261003e-05, + "loss": 0.2666, + "num_input_tokens_seen": 2195200, + "step": 11510 + }, + { + "epoch": 5.984927234927235, + "grad_norm": 0.7815367579460144, + "learning_rate": 2.0792862352211822e-05, + "loss": 0.2574, + "num_input_tokens_seen": 2196160, + "step": 11515 + }, + { + "epoch": 5.987525987525988, + "grad_norm": 0.27312275767326355, + "learning_rate": 2.077050912419634e-05, + "loss": 0.2706, + "num_input_tokens_seen": 2197120, + "step": 11520 + }, + { + "epoch": 5.99012474012474, + "grad_norm": 0.39380931854248047, + "learning_rate": 2.074815937660894e-05, + "loss": 0.2706, + "num_input_tokens_seen": 2198112, + "step": 11525 + }, + { + "epoch": 5.992723492723493, + "grad_norm": 0.27328813076019287, + "learning_rate": 2.0725813127841103e-05, + "loss": 0.2539, + "num_input_tokens_seen": 2199040, + "step": 11530 + }, + { + "epoch": 5.995322245322245, + "grad_norm": 0.32413938641548157, + "learning_rate": 2.0703470396281454e-05, + "loss": 0.2867, + "num_input_tokens_seen": 2199904, + "step": 11535 + }, + { + "epoch": 5.997920997920998, + "grad_norm": 0.290528804063797, + "learning_rate": 2.068113120031573e-05, + "loss": 0.2512, + "num_input_tokens_seen": 2200832, + "step": 11540 + }, + { + "epoch": 6.0, + "eval_loss": 0.25053250789642334, + "eval_runtime": 7.978, + "eval_samples_per_second": 107.295, + "eval_steps_per_second": 26.824, + "num_input_tokens_seen": 2201584, + "step": 11544 + }, + { + "epoch": 6.000519750519751, + "grad_norm": 0.6085042953491211, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.2042, + "num_input_tokens_seen": 2201744, + "step": 11545 + }, + { + "epoch": 6.003118503118503, + "grad_norm": 0.24860122799873352, + "learning_rate": 2.0636463488694392e-05, + "loss": 0.2299, + "num_input_tokens_seen": 2202672, + "step": 11550 + }, + { + "epoch": 6.005717255717256, + "grad_norm": 0.2617388367652893, + "learning_rate": 2.0614135009795633e-05, + "loss": 0.2053, + "num_input_tokens_seen": 2203632, + "step": 11555 + }, + { + "epoch": 6.008316008316008, + "grad_norm": 0.2560470998287201, + "learning_rate": 2.059181014000446e-05, + "loss": 0.2093, + "num_input_tokens_seen": 2204592, + "step": 11560 + }, + { + "epoch": 6.010914760914761, + "grad_norm": 1.0033950805664062, + "learning_rate": 2.0569488897691898e-05, + "loss": 0.2258, + "num_input_tokens_seen": 2205552, + "step": 11565 + }, + { + "epoch": 6.013513513513513, + "grad_norm": 0.39375266432762146, + "learning_rate": 2.0547171301226007e-05, + "loss": 0.335, + "num_input_tokens_seen": 2206512, + "step": 11570 + }, + { + "epoch": 6.016112266112266, + "grad_norm": 0.387870728969574, + "learning_rate": 2.052485736897182e-05, + "loss": 0.2923, + "num_input_tokens_seen": 2207440, + "step": 11575 + }, + { + "epoch": 6.018711018711019, + "grad_norm": 0.40304073691368103, + "learning_rate": 2.050254711929137e-05, + "loss": 0.2224, + "num_input_tokens_seen": 2208400, + "step": 11580 + }, + { + "epoch": 6.021309771309771, + "grad_norm": 0.4105667471885681, + "learning_rate": 2.048024057054366e-05, + "loss": 0.242, + "num_input_tokens_seen": 2209360, + "step": 11585 + }, + { + "epoch": 6.023908523908524, + "grad_norm": 0.6146612763404846, + "learning_rate": 2.0457937741084644e-05, + "loss": 0.3039, + "num_input_tokens_seen": 2210256, + "step": 11590 + }, + { + "epoch": 6.026507276507276, + "grad_norm": 0.31568896770477295, + "learning_rate": 2.0435638649267205e-05, + "loss": 0.3051, + "num_input_tokens_seen": 2211248, + "step": 11595 + }, + { + "epoch": 6.029106029106029, + "grad_norm": 0.21735043823719025, + "learning_rate": 2.0413343313441165e-05, + "loss": 0.245, + "num_input_tokens_seen": 2212208, + "step": 11600 + }, + { + "epoch": 6.031704781704781, + "grad_norm": 0.548270583152771, + "learning_rate": 2.0391051751953256e-05, + "loss": 0.2709, + "num_input_tokens_seen": 2213232, + "step": 11605 + }, + { + "epoch": 6.034303534303534, + "grad_norm": 0.2152281254529953, + "learning_rate": 2.0368763983147092e-05, + "loss": 0.3411, + "num_input_tokens_seen": 2214128, + "step": 11610 + }, + { + "epoch": 6.036902286902287, + "grad_norm": 0.47063785791397095, + "learning_rate": 2.034648002536318e-05, + "loss": 0.2458, + "num_input_tokens_seen": 2215056, + "step": 11615 + }, + { + "epoch": 6.039501039501039, + "grad_norm": 0.6626997590065002, + "learning_rate": 2.0324199896938883e-05, + "loss": 0.2303, + "num_input_tokens_seen": 2216016, + "step": 11620 + }, + { + "epoch": 6.042099792099792, + "grad_norm": 0.588590681552887, + "learning_rate": 2.0301923616208404e-05, + "loss": 0.2261, + "num_input_tokens_seen": 2216944, + "step": 11625 + }, + { + "epoch": 6.044698544698544, + "grad_norm": 0.44495663046836853, + "learning_rate": 2.0279651201502793e-05, + "loss": 0.267, + "num_input_tokens_seen": 2217872, + "step": 11630 + }, + { + "epoch": 6.047297297297297, + "grad_norm": 0.5020132660865784, + "learning_rate": 2.0257382671149914e-05, + "loss": 0.2679, + "num_input_tokens_seen": 2218864, + "step": 11635 + }, + { + "epoch": 6.04989604989605, + "grad_norm": 0.5105405449867249, + "learning_rate": 2.023511804347444e-05, + "loss": 0.2141, + "num_input_tokens_seen": 2219760, + "step": 11640 + }, + { + "epoch": 6.052494802494802, + "grad_norm": 1.036781907081604, + "learning_rate": 2.0212857336797823e-05, + "loss": 0.2894, + "num_input_tokens_seen": 2220816, + "step": 11645 + }, + { + "epoch": 6.055093555093555, + "grad_norm": 0.4637434184551239, + "learning_rate": 2.01906005694383e-05, + "loss": 0.2323, + "num_input_tokens_seen": 2221840, + "step": 11650 + }, + { + "epoch": 6.0576923076923075, + "grad_norm": 0.4725373685359955, + "learning_rate": 2.016834775971087e-05, + "loss": 0.1739, + "num_input_tokens_seen": 2222800, + "step": 11655 + }, + { + "epoch": 6.0602910602910605, + "grad_norm": 0.3079119324684143, + "learning_rate": 2.014609892592724e-05, + "loss": 0.2426, + "num_input_tokens_seen": 2223760, + "step": 11660 + }, + { + "epoch": 6.0628898128898125, + "grad_norm": 0.23410429060459137, + "learning_rate": 2.012385408639588e-05, + "loss": 0.2173, + "num_input_tokens_seen": 2224656, + "step": 11665 + }, + { + "epoch": 6.0654885654885655, + "grad_norm": 0.3601711690425873, + "learning_rate": 2.0101613259421963e-05, + "loss": 0.2551, + "num_input_tokens_seen": 2225648, + "step": 11670 + }, + { + "epoch": 6.0680873180873185, + "grad_norm": 0.2540207505226135, + "learning_rate": 2.0079376463307368e-05, + "loss": 0.3264, + "num_input_tokens_seen": 2226576, + "step": 11675 + }, + { + "epoch": 6.070686070686071, + "grad_norm": 0.47469767928123474, + "learning_rate": 2.005714371635064e-05, + "loss": 0.2092, + "num_input_tokens_seen": 2227472, + "step": 11680 + }, + { + "epoch": 6.0732848232848236, + "grad_norm": 0.45131441950798035, + "learning_rate": 2.003491503684701e-05, + "loss": 0.211, + "num_input_tokens_seen": 2228368, + "step": 11685 + }, + { + "epoch": 6.075883575883576, + "grad_norm": 0.2318720817565918, + "learning_rate": 2.0012690443088344e-05, + "loss": 0.2709, + "num_input_tokens_seen": 2229296, + "step": 11690 + }, + { + "epoch": 6.078482328482329, + "grad_norm": 0.5033720135688782, + "learning_rate": 1.999046995336316e-05, + "loss": 0.3296, + "num_input_tokens_seen": 2230224, + "step": 11695 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 0.48632892966270447, + "learning_rate": 1.9968253585956598e-05, + "loss": 0.255, + "num_input_tokens_seen": 2231152, + "step": 11700 + }, + { + "epoch": 6.083679833679834, + "grad_norm": 0.6411331295967102, + "learning_rate": 1.9946041359150393e-05, + "loss": 0.2317, + "num_input_tokens_seen": 2232144, + "step": 11705 + }, + { + "epoch": 6.086278586278587, + "grad_norm": 0.2702583074569702, + "learning_rate": 1.992383329122289e-05, + "loss": 0.271, + "num_input_tokens_seen": 2233136, + "step": 11710 + }, + { + "epoch": 6.088877338877339, + "grad_norm": 0.7974379062652588, + "learning_rate": 1.9901629400448997e-05, + "loss": 0.2864, + "num_input_tokens_seen": 2234096, + "step": 11715 + }, + { + "epoch": 6.091476091476092, + "grad_norm": 0.7660951018333435, + "learning_rate": 1.9879429705100204e-05, + "loss": 0.2678, + "num_input_tokens_seen": 2234960, + "step": 11720 + }, + { + "epoch": 6.094074844074844, + "grad_norm": 0.562554657459259, + "learning_rate": 1.9857234223444516e-05, + "loss": 0.2444, + "num_input_tokens_seen": 2235856, + "step": 11725 + }, + { + "epoch": 6.096673596673597, + "grad_norm": 0.44360050559043884, + "learning_rate": 1.98350429737465e-05, + "loss": 0.2683, + "num_input_tokens_seen": 2236752, + "step": 11730 + }, + { + "epoch": 6.099272349272349, + "grad_norm": 0.4943171441555023, + "learning_rate": 1.9812855974267225e-05, + "loss": 0.2845, + "num_input_tokens_seen": 2237712, + "step": 11735 + }, + { + "epoch": 6.101871101871102, + "grad_norm": 0.5821974873542786, + "learning_rate": 1.979067324326428e-05, + "loss": 0.3146, + "num_input_tokens_seen": 2238704, + "step": 11740 + }, + { + "epoch": 6.104469854469855, + "grad_norm": 0.8068905472755432, + "learning_rate": 1.9768494798991714e-05, + "loss": 0.2651, + "num_input_tokens_seen": 2239728, + "step": 11745 + }, + { + "epoch": 6.107068607068607, + "grad_norm": 0.45626574754714966, + "learning_rate": 1.974632065970008e-05, + "loss": 0.2865, + "num_input_tokens_seen": 2240816, + "step": 11750 + }, + { + "epoch": 6.10966735966736, + "grad_norm": 0.307933509349823, + "learning_rate": 1.9724150843636375e-05, + "loss": 0.2618, + "num_input_tokens_seen": 2241776, + "step": 11755 + }, + { + "epoch": 6.112266112266112, + "grad_norm": 0.6520818471908569, + "learning_rate": 1.9701985369044013e-05, + "loss": 0.2367, + "num_input_tokens_seen": 2242768, + "step": 11760 + }, + { + "epoch": 6.114864864864865, + "grad_norm": 0.2515491843223572, + "learning_rate": 1.9679824254162864e-05, + "loss": 0.2502, + "num_input_tokens_seen": 2243792, + "step": 11765 + }, + { + "epoch": 6.117463617463618, + "grad_norm": 0.43348148465156555, + "learning_rate": 1.965766751722922e-05, + "loss": 0.3058, + "num_input_tokens_seen": 2244816, + "step": 11770 + }, + { + "epoch": 6.12006237006237, + "grad_norm": 0.7409997582435608, + "learning_rate": 1.9635515176475747e-05, + "loss": 0.244, + "num_input_tokens_seen": 2245872, + "step": 11775 + }, + { + "epoch": 6.122661122661123, + "grad_norm": 0.7907001376152039, + "learning_rate": 1.96133672501315e-05, + "loss": 0.2317, + "num_input_tokens_seen": 2246800, + "step": 11780 + }, + { + "epoch": 6.125259875259875, + "grad_norm": 0.48305052518844604, + "learning_rate": 1.9591223756421916e-05, + "loss": 0.2024, + "num_input_tokens_seen": 2247792, + "step": 11785 + }, + { + "epoch": 6.127858627858628, + "grad_norm": 0.24116460978984833, + "learning_rate": 1.9569084713568752e-05, + "loss": 0.2732, + "num_input_tokens_seen": 2248752, + "step": 11790 + }, + { + "epoch": 6.13045738045738, + "grad_norm": 0.5510217547416687, + "learning_rate": 1.954695013979013e-05, + "loss": 0.2535, + "num_input_tokens_seen": 2249744, + "step": 11795 + }, + { + "epoch": 6.133056133056133, + "grad_norm": 0.4684186279773712, + "learning_rate": 1.9524820053300485e-05, + "loss": 0.187, + "num_input_tokens_seen": 2250608, + "step": 11800 + }, + { + "epoch": 6.135654885654886, + "grad_norm": 0.290677547454834, + "learning_rate": 1.950269447231056e-05, + "loss": 0.252, + "num_input_tokens_seen": 2251536, + "step": 11805 + }, + { + "epoch": 6.138253638253638, + "grad_norm": 0.3675389289855957, + "learning_rate": 1.9480573415027395e-05, + "loss": 0.3304, + "num_input_tokens_seen": 2252528, + "step": 11810 + }, + { + "epoch": 6.140852390852391, + "grad_norm": 0.5738985538482666, + "learning_rate": 1.9458456899654303e-05, + "loss": 0.2181, + "num_input_tokens_seen": 2253520, + "step": 11815 + }, + { + "epoch": 6.143451143451143, + "grad_norm": 0.2189769297838211, + "learning_rate": 1.943634494439086e-05, + "loss": 0.3272, + "num_input_tokens_seen": 2254480, + "step": 11820 + }, + { + "epoch": 6.146049896049896, + "grad_norm": 0.48961886763572693, + "learning_rate": 1.9414237567432886e-05, + "loss": 0.2903, + "num_input_tokens_seen": 2255408, + "step": 11825 + }, + { + "epoch": 6.148648648648648, + "grad_norm": 0.3956512212753296, + "learning_rate": 1.939213478697244e-05, + "loss": 0.2376, + "num_input_tokens_seen": 2256400, + "step": 11830 + }, + { + "epoch": 6.151247401247401, + "grad_norm": 0.2470903843641281, + "learning_rate": 1.9370036621197793e-05, + "loss": 0.2457, + "num_input_tokens_seen": 2257232, + "step": 11835 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 0.7660176753997803, + "learning_rate": 1.9347943088293423e-05, + "loss": 0.2464, + "num_input_tokens_seen": 2258192, + "step": 11840 + }, + { + "epoch": 6.156444906444906, + "grad_norm": 0.2419728934764862, + "learning_rate": 1.9325854206439996e-05, + "loss": 0.2645, + "num_input_tokens_seen": 2259280, + "step": 11845 + }, + { + "epoch": 6.159043659043659, + "grad_norm": 0.5934350490570068, + "learning_rate": 1.9303769993814353e-05, + "loss": 0.2438, + "num_input_tokens_seen": 2260208, + "step": 11850 + }, + { + "epoch": 6.161642411642411, + "grad_norm": 0.5761550664901733, + "learning_rate": 1.9281690468589473e-05, + "loss": 0.2623, + "num_input_tokens_seen": 2261168, + "step": 11855 + }, + { + "epoch": 6.164241164241164, + "grad_norm": 0.2610374093055725, + "learning_rate": 1.9259615648934505e-05, + "loss": 0.2791, + "num_input_tokens_seen": 2262096, + "step": 11860 + }, + { + "epoch": 6.166839916839917, + "grad_norm": 0.4123693108558655, + "learning_rate": 1.923754555301471e-05, + "loss": 0.2777, + "num_input_tokens_seen": 2263088, + "step": 11865 + }, + { + "epoch": 6.169438669438669, + "grad_norm": 0.5606864094734192, + "learning_rate": 1.9215480198991466e-05, + "loss": 0.2765, + "num_input_tokens_seen": 2264016, + "step": 11870 + }, + { + "epoch": 6.172037422037422, + "grad_norm": 0.18072210252285004, + "learning_rate": 1.9193419605022248e-05, + "loss": 0.2451, + "num_input_tokens_seen": 2265040, + "step": 11875 + }, + { + "epoch": 6.174636174636174, + "grad_norm": 0.16165952384471893, + "learning_rate": 1.9171363789260614e-05, + "loss": 0.2542, + "num_input_tokens_seen": 2265968, + "step": 11880 + }, + { + "epoch": 6.177234927234927, + "grad_norm": 0.28618013858795166, + "learning_rate": 1.914931276985621e-05, + "loss": 0.2279, + "num_input_tokens_seen": 2266928, + "step": 11885 + }, + { + "epoch": 6.1798336798336795, + "grad_norm": 0.5307198166847229, + "learning_rate": 1.9127266564954678e-05, + "loss": 0.2452, + "num_input_tokens_seen": 2267792, + "step": 11890 + }, + { + "epoch": 6.1824324324324325, + "grad_norm": 0.4987882971763611, + "learning_rate": 1.910522519269776e-05, + "loss": 0.2621, + "num_input_tokens_seen": 2268816, + "step": 11895 + }, + { + "epoch": 6.185031185031185, + "grad_norm": 0.6451588273048401, + "learning_rate": 1.9083188671223196e-05, + "loss": 0.2875, + "num_input_tokens_seen": 2269712, + "step": 11900 + }, + { + "epoch": 6.1876299376299375, + "grad_norm": 0.2683323621749878, + "learning_rate": 1.906115701866473e-05, + "loss": 0.2948, + "num_input_tokens_seen": 2270672, + "step": 11905 + }, + { + "epoch": 6.1902286902286905, + "grad_norm": 0.32352814078330994, + "learning_rate": 1.903913025315211e-05, + "loss": 0.3021, + "num_input_tokens_seen": 2271696, + "step": 11910 + }, + { + "epoch": 6.192827442827443, + "grad_norm": 0.5122302174568176, + "learning_rate": 1.9017108392811065e-05, + "loss": 0.2444, + "num_input_tokens_seen": 2272656, + "step": 11915 + }, + { + "epoch": 6.1954261954261955, + "grad_norm": 0.1304008960723877, + "learning_rate": 1.8995091455763254e-05, + "loss": 0.2816, + "num_input_tokens_seen": 2273584, + "step": 11920 + }, + { + "epoch": 6.198024948024948, + "grad_norm": 0.2379635125398636, + "learning_rate": 1.8973079460126334e-05, + "loss": 0.2758, + "num_input_tokens_seen": 2274512, + "step": 11925 + }, + { + "epoch": 6.200623700623701, + "grad_norm": 0.7577180862426758, + "learning_rate": 1.895107242401386e-05, + "loss": 0.2954, + "num_input_tokens_seen": 2275504, + "step": 11930 + }, + { + "epoch": 6.203222453222454, + "grad_norm": 0.39200687408447266, + "learning_rate": 1.8929070365535323e-05, + "loss": 0.2457, + "num_input_tokens_seen": 2276528, + "step": 11935 + }, + { + "epoch": 6.205821205821206, + "grad_norm": 0.31603941321372986, + "learning_rate": 1.8907073302796115e-05, + "loss": 0.2632, + "num_input_tokens_seen": 2277488, + "step": 11940 + }, + { + "epoch": 6.208419958419959, + "grad_norm": 0.2534750998020172, + "learning_rate": 1.8885081253897504e-05, + "loss": 0.2401, + "num_input_tokens_seen": 2278448, + "step": 11945 + }, + { + "epoch": 6.211018711018711, + "grad_norm": 0.4222598671913147, + "learning_rate": 1.886309423693667e-05, + "loss": 0.2864, + "num_input_tokens_seen": 2279408, + "step": 11950 + }, + { + "epoch": 6.213617463617464, + "grad_norm": 0.6650441288948059, + "learning_rate": 1.8841112270006596e-05, + "loss": 0.2912, + "num_input_tokens_seen": 2280464, + "step": 11955 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 0.43700337409973145, + "learning_rate": 1.881913537119615e-05, + "loss": 0.227, + "num_input_tokens_seen": 2281584, + "step": 11960 + }, + { + "epoch": 6.218814968814969, + "grad_norm": 0.5805399417877197, + "learning_rate": 1.8797163558590018e-05, + "loss": 0.2461, + "num_input_tokens_seen": 2282608, + "step": 11965 + }, + { + "epoch": 6.221413721413722, + "grad_norm": 0.3302517533302307, + "learning_rate": 1.8775196850268703e-05, + "loss": 0.2354, + "num_input_tokens_seen": 2283600, + "step": 11970 + }, + { + "epoch": 6.224012474012474, + "grad_norm": 0.6404489278793335, + "learning_rate": 1.8753235264308504e-05, + "loss": 0.2642, + "num_input_tokens_seen": 2284496, + "step": 11975 + }, + { + "epoch": 6.226611226611227, + "grad_norm": 0.7325204014778137, + "learning_rate": 1.8731278818781506e-05, + "loss": 0.2513, + "num_input_tokens_seen": 2285360, + "step": 11980 + }, + { + "epoch": 6.229209979209979, + "grad_norm": 0.5883045792579651, + "learning_rate": 1.8709327531755562e-05, + "loss": 0.3107, + "num_input_tokens_seen": 2286384, + "step": 11985 + }, + { + "epoch": 6.231808731808732, + "grad_norm": 0.6326375603675842, + "learning_rate": 1.8687381421294287e-05, + "loss": 0.2001, + "num_input_tokens_seen": 2287312, + "step": 11990 + }, + { + "epoch": 6.234407484407485, + "grad_norm": 0.8824358582496643, + "learning_rate": 1.8665440505457027e-05, + "loss": 0.2918, + "num_input_tokens_seen": 2288272, + "step": 11995 + }, + { + "epoch": 6.237006237006237, + "grad_norm": 0.2508125603199005, + "learning_rate": 1.864350480229886e-05, + "loss": 0.2692, + "num_input_tokens_seen": 2289200, + "step": 12000 + }, + { + "epoch": 6.23960498960499, + "grad_norm": 0.4984094202518463, + "learning_rate": 1.8621574329870575e-05, + "loss": 0.2702, + "num_input_tokens_seen": 2290192, + "step": 12005 + }, + { + "epoch": 6.242203742203742, + "grad_norm": 0.19818717241287231, + "learning_rate": 1.859964910621865e-05, + "loss": 0.2816, + "num_input_tokens_seen": 2291088, + "step": 12010 + }, + { + "epoch": 6.244802494802495, + "grad_norm": 0.41090127825737, + "learning_rate": 1.8577729149385257e-05, + "loss": 0.2408, + "num_input_tokens_seen": 2292144, + "step": 12015 + }, + { + "epoch": 6.247401247401247, + "grad_norm": 0.6550759673118591, + "learning_rate": 1.8555814477408214e-05, + "loss": 0.2179, + "num_input_tokens_seen": 2293104, + "step": 12020 + }, + { + "epoch": 6.25, + "grad_norm": 0.515549898147583, + "learning_rate": 1.8533905108321005e-05, + "loss": 0.3136, + "num_input_tokens_seen": 2294096, + "step": 12025 + }, + { + "epoch": 6.252598752598753, + "grad_norm": 0.42127054929733276, + "learning_rate": 1.8512001060152744e-05, + "loss": 0.3207, + "num_input_tokens_seen": 2295056, + "step": 12030 + }, + { + "epoch": 6.255197505197505, + "grad_norm": 0.32585883140563965, + "learning_rate": 1.8490102350928172e-05, + "loss": 0.2939, + "num_input_tokens_seen": 2295984, + "step": 12035 + }, + { + "epoch": 6.257796257796258, + "grad_norm": 0.38163334131240845, + "learning_rate": 1.8468208998667636e-05, + "loss": 0.2859, + "num_input_tokens_seen": 2296912, + "step": 12040 + }, + { + "epoch": 6.26039501039501, + "grad_norm": 0.31120938062667847, + "learning_rate": 1.8446321021387078e-05, + "loss": 0.2562, + "num_input_tokens_seen": 2297840, + "step": 12045 + }, + { + "epoch": 6.262993762993763, + "grad_norm": 0.4145154058933258, + "learning_rate": 1.842443843709799e-05, + "loss": 0.2777, + "num_input_tokens_seen": 2298864, + "step": 12050 + }, + { + "epoch": 6.265592515592515, + "grad_norm": 0.5213630199432373, + "learning_rate": 1.840256126380746e-05, + "loss": 0.2335, + "num_input_tokens_seen": 2299792, + "step": 12055 + }, + { + "epoch": 6.268191268191268, + "grad_norm": 0.5337929129600525, + "learning_rate": 1.8380689519518112e-05, + "loss": 0.2414, + "num_input_tokens_seen": 2300720, + "step": 12060 + }, + { + "epoch": 6.270790020790021, + "grad_norm": 0.2684619128704071, + "learning_rate": 1.8358823222228097e-05, + "loss": 0.3279, + "num_input_tokens_seen": 2301744, + "step": 12065 + }, + { + "epoch": 6.273388773388773, + "grad_norm": 0.5271618366241455, + "learning_rate": 1.8336962389931085e-05, + "loss": 0.2837, + "num_input_tokens_seen": 2302672, + "step": 12070 + }, + { + "epoch": 6.275987525987526, + "grad_norm": 0.16052906215190887, + "learning_rate": 1.8315107040616263e-05, + "loss": 0.2991, + "num_input_tokens_seen": 2303632, + "step": 12075 + }, + { + "epoch": 6.278586278586278, + "grad_norm": 0.7256244421005249, + "learning_rate": 1.8293257192268296e-05, + "loss": 0.2412, + "num_input_tokens_seen": 2304592, + "step": 12080 + }, + { + "epoch": 6.281185031185031, + "grad_norm": 0.4099905490875244, + "learning_rate": 1.8271412862867305e-05, + "loss": 0.2853, + "num_input_tokens_seen": 2305520, + "step": 12085 + }, + { + "epoch": 6.283783783783784, + "grad_norm": 0.33971625566482544, + "learning_rate": 1.8249574070388893e-05, + "loss": 0.2738, + "num_input_tokens_seen": 2306480, + "step": 12090 + }, + { + "epoch": 6.286382536382536, + "grad_norm": 0.6518958210945129, + "learning_rate": 1.82277408328041e-05, + "loss": 0.2564, + "num_input_tokens_seen": 2307408, + "step": 12095 + }, + { + "epoch": 6.288981288981289, + "grad_norm": 0.7348439693450928, + "learning_rate": 1.820591316807939e-05, + "loss": 0.298, + "num_input_tokens_seen": 2308368, + "step": 12100 + }, + { + "epoch": 6.291580041580041, + "grad_norm": 0.2540704011917114, + "learning_rate": 1.818409109417666e-05, + "loss": 0.2608, + "num_input_tokens_seen": 2309264, + "step": 12105 + }, + { + "epoch": 6.294178794178794, + "grad_norm": 0.16027520596981049, + "learning_rate": 1.816227462905318e-05, + "loss": 0.2583, + "num_input_tokens_seen": 2310192, + "step": 12110 + }, + { + "epoch": 6.296777546777546, + "grad_norm": 0.5470715165138245, + "learning_rate": 1.8140463790661606e-05, + "loss": 0.2506, + "num_input_tokens_seen": 2311152, + "step": 12115 + }, + { + "epoch": 6.299376299376299, + "grad_norm": 0.23867540061473846, + "learning_rate": 1.811865859694999e-05, + "loss": 0.2638, + "num_input_tokens_seen": 2312080, + "step": 12120 + }, + { + "epoch": 6.301975051975052, + "grad_norm": 0.3554694354534149, + "learning_rate": 1.8096859065861722e-05, + "loss": 0.2487, + "num_input_tokens_seen": 2313008, + "step": 12125 + }, + { + "epoch": 6.3045738045738045, + "grad_norm": 0.46693313121795654, + "learning_rate": 1.8075065215335525e-05, + "loss": 0.2573, + "num_input_tokens_seen": 2314032, + "step": 12130 + }, + { + "epoch": 6.307172557172557, + "grad_norm": 0.5563350915908813, + "learning_rate": 1.8053277063305456e-05, + "loss": 0.2836, + "num_input_tokens_seen": 2314896, + "step": 12135 + }, + { + "epoch": 6.3097713097713095, + "grad_norm": 0.20817671716213226, + "learning_rate": 1.803149462770089e-05, + "loss": 0.2149, + "num_input_tokens_seen": 2315824, + "step": 12140 + }, + { + "epoch": 6.3123700623700625, + "grad_norm": 0.5240094661712646, + "learning_rate": 1.8009717926446492e-05, + "loss": 0.2744, + "num_input_tokens_seen": 2316752, + "step": 12145 + }, + { + "epoch": 6.314968814968815, + "grad_norm": 0.36313194036483765, + "learning_rate": 1.7987946977462194e-05, + "loss": 0.2683, + "num_input_tokens_seen": 2317712, + "step": 12150 + }, + { + "epoch": 6.3175675675675675, + "grad_norm": 0.5685921311378479, + "learning_rate": 1.7966181798663218e-05, + "loss": 0.2936, + "num_input_tokens_seen": 2318576, + "step": 12155 + }, + { + "epoch": 6.3201663201663205, + "grad_norm": 0.2945777475833893, + "learning_rate": 1.794442240796002e-05, + "loss": 0.2717, + "num_input_tokens_seen": 2319536, + "step": 12160 + }, + { + "epoch": 6.322765072765073, + "grad_norm": 0.2185148447751999, + "learning_rate": 1.7922668823258304e-05, + "loss": 0.2511, + "num_input_tokens_seen": 2320496, + "step": 12165 + }, + { + "epoch": 6.325363825363826, + "grad_norm": 0.5452297329902649, + "learning_rate": 1.790092106245899e-05, + "loss": 0.2233, + "num_input_tokens_seen": 2321488, + "step": 12170 + }, + { + "epoch": 6.327962577962578, + "grad_norm": 0.42427441477775574, + "learning_rate": 1.7879179143458212e-05, + "loss": 0.2534, + "num_input_tokens_seen": 2322448, + "step": 12175 + }, + { + "epoch": 6.330561330561331, + "grad_norm": 0.5529835820198059, + "learning_rate": 1.7857443084147296e-05, + "loss": 0.2499, + "num_input_tokens_seen": 2323472, + "step": 12180 + }, + { + "epoch": 6.333160083160083, + "grad_norm": 0.3004399836063385, + "learning_rate": 1.7835712902412726e-05, + "loss": 0.3404, + "num_input_tokens_seen": 2324464, + "step": 12185 + }, + { + "epoch": 6.335758835758836, + "grad_norm": 0.3070316016674042, + "learning_rate": 1.7813988616136177e-05, + "loss": 0.2204, + "num_input_tokens_seen": 2325360, + "step": 12190 + }, + { + "epoch": 6.338357588357589, + "grad_norm": 0.14908786118030548, + "learning_rate": 1.7792270243194452e-05, + "loss": 0.2446, + "num_input_tokens_seen": 2326320, + "step": 12195 + }, + { + "epoch": 6.340956340956341, + "grad_norm": 0.2901034951210022, + "learning_rate": 1.7770557801459513e-05, + "loss": 0.2621, + "num_input_tokens_seen": 2327248, + "step": 12200 + }, + { + "epoch": 6.343555093555094, + "grad_norm": 0.606558620929718, + "learning_rate": 1.774885130879842e-05, + "loss": 0.2459, + "num_input_tokens_seen": 2328240, + "step": 12205 + }, + { + "epoch": 6.346153846153846, + "grad_norm": 0.559778094291687, + "learning_rate": 1.7727150783073352e-05, + "loss": 0.2432, + "num_input_tokens_seen": 2329072, + "step": 12210 + }, + { + "epoch": 6.348752598752599, + "grad_norm": 0.4571913182735443, + "learning_rate": 1.7705456242141547e-05, + "loss": 0.2609, + "num_input_tokens_seen": 2330064, + "step": 12215 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 0.30332714319229126, + "learning_rate": 1.7683767703855354e-05, + "loss": 0.2319, + "num_input_tokens_seen": 2330992, + "step": 12220 + }, + { + "epoch": 6.353950103950104, + "grad_norm": 0.37339311838150024, + "learning_rate": 1.7662085186062165e-05, + "loss": 0.2586, + "num_input_tokens_seen": 2331952, + "step": 12225 + }, + { + "epoch": 6.356548856548857, + "grad_norm": 0.4846251606941223, + "learning_rate": 1.7640408706604422e-05, + "loss": 0.3074, + "num_input_tokens_seen": 2332880, + "step": 12230 + }, + { + "epoch": 6.359147609147609, + "grad_norm": 0.3128998577594757, + "learning_rate": 1.7618738283319604e-05, + "loss": 0.2498, + "num_input_tokens_seen": 2333712, + "step": 12235 + }, + { + "epoch": 6.361746361746362, + "grad_norm": 0.8070204854011536, + "learning_rate": 1.7597073934040193e-05, + "loss": 0.2838, + "num_input_tokens_seen": 2334736, + "step": 12240 + }, + { + "epoch": 6.364345114345114, + "grad_norm": 0.33729252219200134, + "learning_rate": 1.7575415676593688e-05, + "loss": 0.2504, + "num_input_tokens_seen": 2335696, + "step": 12245 + }, + { + "epoch": 6.366943866943867, + "grad_norm": 0.16875793039798737, + "learning_rate": 1.7553763528802554e-05, + "loss": 0.2692, + "num_input_tokens_seen": 2336624, + "step": 12250 + }, + { + "epoch": 6.36954261954262, + "grad_norm": 0.6552897095680237, + "learning_rate": 1.7532117508484243e-05, + "loss": 0.2215, + "num_input_tokens_seen": 2337520, + "step": 12255 + }, + { + "epoch": 6.372141372141372, + "grad_norm": 0.3960229456424713, + "learning_rate": 1.7510477633451172e-05, + "loss": 0.2996, + "num_input_tokens_seen": 2338608, + "step": 12260 + }, + { + "epoch": 6.374740124740125, + "grad_norm": 0.2832229435443878, + "learning_rate": 1.748884392151069e-05, + "loss": 0.251, + "num_input_tokens_seen": 2339600, + "step": 12265 + }, + { + "epoch": 6.377338877338877, + "grad_norm": 0.7412986755371094, + "learning_rate": 1.746721639046507e-05, + "loss": 0.2765, + "num_input_tokens_seen": 2340560, + "step": 12270 + }, + { + "epoch": 6.37993762993763, + "grad_norm": 0.348195344209671, + "learning_rate": 1.744559505811152e-05, + "loss": 0.2669, + "num_input_tokens_seen": 2341456, + "step": 12275 + }, + { + "epoch": 6.382536382536383, + "grad_norm": 0.7025204300880432, + "learning_rate": 1.742397994224211e-05, + "loss": 0.2596, + "num_input_tokens_seen": 2342416, + "step": 12280 + }, + { + "epoch": 6.385135135135135, + "grad_norm": 0.31826144456863403, + "learning_rate": 1.740237106064383e-05, + "loss": 0.2658, + "num_input_tokens_seen": 2343344, + "step": 12285 + }, + { + "epoch": 6.387733887733888, + "grad_norm": 0.40388938784599304, + "learning_rate": 1.7380768431098527e-05, + "loss": 0.3038, + "num_input_tokens_seen": 2344336, + "step": 12290 + }, + { + "epoch": 6.39033264033264, + "grad_norm": 0.799174427986145, + "learning_rate": 1.7359172071382897e-05, + "loss": 0.2674, + "num_input_tokens_seen": 2345264, + "step": 12295 + }, + { + "epoch": 6.392931392931393, + "grad_norm": 0.699569046497345, + "learning_rate": 1.733758199926849e-05, + "loss": 0.2713, + "num_input_tokens_seen": 2346256, + "step": 12300 + }, + { + "epoch": 6.395530145530145, + "grad_norm": 0.43257758021354675, + "learning_rate": 1.731599823252167e-05, + "loss": 0.3004, + "num_input_tokens_seen": 2347248, + "step": 12305 + }, + { + "epoch": 6.398128898128898, + "grad_norm": 0.39023008942604065, + "learning_rate": 1.7294420788903627e-05, + "loss": 0.2639, + "num_input_tokens_seen": 2348240, + "step": 12310 + }, + { + "epoch": 6.400727650727651, + "grad_norm": 0.7674957513809204, + "learning_rate": 1.7272849686170314e-05, + "loss": 0.2833, + "num_input_tokens_seen": 2349200, + "step": 12315 + }, + { + "epoch": 6.403326403326403, + "grad_norm": 0.27089715003967285, + "learning_rate": 1.72512849420725e-05, + "loss": 0.2316, + "num_input_tokens_seen": 2350096, + "step": 12320 + }, + { + "epoch": 6.405925155925156, + "grad_norm": 0.8748087286949158, + "learning_rate": 1.722972657435572e-05, + "loss": 0.2729, + "num_input_tokens_seen": 2351056, + "step": 12325 + }, + { + "epoch": 6.408523908523908, + "grad_norm": 0.3377643823623657, + "learning_rate": 1.7208174600760247e-05, + "loss": 0.2929, + "num_input_tokens_seen": 2351984, + "step": 12330 + }, + { + "epoch": 6.411122661122661, + "grad_norm": 0.7161197662353516, + "learning_rate": 1.7186629039021102e-05, + "loss": 0.272, + "num_input_tokens_seen": 2352944, + "step": 12335 + }, + { + "epoch": 6.413721413721413, + "grad_norm": 0.2442370504140854, + "learning_rate": 1.7165089906868028e-05, + "loss": 0.2248, + "num_input_tokens_seen": 2353904, + "step": 12340 + }, + { + "epoch": 6.416320166320166, + "grad_norm": 0.2221386879682541, + "learning_rate": 1.714355722202546e-05, + "loss": 0.2705, + "num_input_tokens_seen": 2354896, + "step": 12345 + }, + { + "epoch": 6.418918918918919, + "grad_norm": 0.755764365196228, + "learning_rate": 1.7122031002212556e-05, + "loss": 0.2945, + "num_input_tokens_seen": 2355760, + "step": 12350 + }, + { + "epoch": 6.421517671517671, + "grad_norm": 0.1828920841217041, + "learning_rate": 1.7100511265143132e-05, + "loss": 0.237, + "num_input_tokens_seen": 2356688, + "step": 12355 + }, + { + "epoch": 6.424116424116424, + "grad_norm": 0.2388656586408615, + "learning_rate": 1.707899802852569e-05, + "loss": 0.3102, + "num_input_tokens_seen": 2357648, + "step": 12360 + }, + { + "epoch": 6.4267151767151764, + "grad_norm": 0.17702357470989227, + "learning_rate": 1.7057491310063355e-05, + "loss": 0.212, + "num_input_tokens_seen": 2358512, + "step": 12365 + }, + { + "epoch": 6.429313929313929, + "grad_norm": 0.4289292097091675, + "learning_rate": 1.703599112745392e-05, + "loss": 0.3031, + "num_input_tokens_seen": 2359440, + "step": 12370 + }, + { + "epoch": 6.4319126819126815, + "grad_norm": 0.22310739755630493, + "learning_rate": 1.701449749838978e-05, + "loss": 0.2909, + "num_input_tokens_seen": 2360336, + "step": 12375 + }, + { + "epoch": 6.4345114345114345, + "grad_norm": 0.9695992469787598, + "learning_rate": 1.699301044055793e-05, + "loss": 0.3172, + "num_input_tokens_seen": 2361296, + "step": 12380 + }, + { + "epoch": 6.4371101871101875, + "grad_norm": 0.17064037919044495, + "learning_rate": 1.6971529971639975e-05, + "loss": 0.2625, + "num_input_tokens_seen": 2362320, + "step": 12385 + }, + { + "epoch": 6.4397089397089395, + "grad_norm": 0.2851293981075287, + "learning_rate": 1.6950056109312097e-05, + "loss": 0.244, + "num_input_tokens_seen": 2363312, + "step": 12390 + }, + { + "epoch": 6.4423076923076925, + "grad_norm": 0.7029688358306885, + "learning_rate": 1.692858887124503e-05, + "loss": 0.2417, + "num_input_tokens_seen": 2364272, + "step": 12395 + }, + { + "epoch": 6.444906444906445, + "grad_norm": 0.7033951282501221, + "learning_rate": 1.6907128275104063e-05, + "loss": 0.2083, + "num_input_tokens_seen": 2365136, + "step": 12400 + }, + { + "epoch": 6.447505197505198, + "grad_norm": 0.46621647477149963, + "learning_rate": 1.6885674338549025e-05, + "loss": 0.2405, + "num_input_tokens_seen": 2366064, + "step": 12405 + }, + { + "epoch": 6.45010395010395, + "grad_norm": 0.6139922142028809, + "learning_rate": 1.686422707923425e-05, + "loss": 0.2223, + "num_input_tokens_seen": 2367024, + "step": 12410 + }, + { + "epoch": 6.452702702702703, + "grad_norm": 0.48811522126197815, + "learning_rate": 1.6842786514808593e-05, + "loss": 0.2422, + "num_input_tokens_seen": 2367952, + "step": 12415 + }, + { + "epoch": 6.455301455301456, + "grad_norm": 0.4067057967185974, + "learning_rate": 1.6821352662915388e-05, + "loss": 0.1864, + "num_input_tokens_seen": 2368880, + "step": 12420 + }, + { + "epoch": 6.457900207900208, + "grad_norm": 0.3720586895942688, + "learning_rate": 1.6799925541192454e-05, + "loss": 0.2967, + "num_input_tokens_seen": 2369808, + "step": 12425 + }, + { + "epoch": 6.460498960498961, + "grad_norm": 0.4630081355571747, + "learning_rate": 1.677850516727207e-05, + "loss": 0.2504, + "num_input_tokens_seen": 2370736, + "step": 12430 + }, + { + "epoch": 6.463097713097713, + "grad_norm": 0.5824365615844727, + "learning_rate": 1.6757091558780955e-05, + "loss": 0.2974, + "num_input_tokens_seen": 2371664, + "step": 12435 + }, + { + "epoch": 6.465696465696466, + "grad_norm": 0.5544726252555847, + "learning_rate": 1.6735684733340278e-05, + "loss": 0.3235, + "num_input_tokens_seen": 2372624, + "step": 12440 + }, + { + "epoch": 6.468295218295219, + "grad_norm": 0.2777087390422821, + "learning_rate": 1.6714284708565598e-05, + "loss": 0.3165, + "num_input_tokens_seen": 2373584, + "step": 12445 + }, + { + "epoch": 6.470893970893971, + "grad_norm": 0.3154396712779999, + "learning_rate": 1.6692891502066903e-05, + "loss": 0.3409, + "num_input_tokens_seen": 2374576, + "step": 12450 + }, + { + "epoch": 6.473492723492724, + "grad_norm": 0.4095641076564789, + "learning_rate": 1.667150513144856e-05, + "loss": 0.2986, + "num_input_tokens_seen": 2375568, + "step": 12455 + }, + { + "epoch": 6.476091476091476, + "grad_norm": 0.6107293963432312, + "learning_rate": 1.6650125614309314e-05, + "loss": 0.2983, + "num_input_tokens_seen": 2376560, + "step": 12460 + }, + { + "epoch": 6.478690228690229, + "grad_norm": 0.7893036007881165, + "learning_rate": 1.6628752968242272e-05, + "loss": 0.2762, + "num_input_tokens_seen": 2377552, + "step": 12465 + }, + { + "epoch": 6.481288981288981, + "grad_norm": 0.3479389250278473, + "learning_rate": 1.6607387210834887e-05, + "loss": 0.2797, + "num_input_tokens_seen": 2378544, + "step": 12470 + }, + { + "epoch": 6.483887733887734, + "grad_norm": 0.8621401190757751, + "learning_rate": 1.6586028359668922e-05, + "loss": 0.2826, + "num_input_tokens_seen": 2379504, + "step": 12475 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 0.32184529304504395, + "learning_rate": 1.6564676432320485e-05, + "loss": 0.2815, + "num_input_tokens_seen": 2380496, + "step": 12480 + }, + { + "epoch": 6.489085239085239, + "grad_norm": 0.41441676020622253, + "learning_rate": 1.6543331446359976e-05, + "loss": 0.2845, + "num_input_tokens_seen": 2381328, + "step": 12485 + }, + { + "epoch": 6.491683991683992, + "grad_norm": 0.7768648862838745, + "learning_rate": 1.652199341935209e-05, + "loss": 0.2718, + "num_input_tokens_seen": 2382288, + "step": 12490 + }, + { + "epoch": 6.494282744282744, + "grad_norm": 0.7621197700500488, + "learning_rate": 1.6500662368855776e-05, + "loss": 0.2523, + "num_input_tokens_seen": 2383184, + "step": 12495 + }, + { + "epoch": 6.496881496881497, + "grad_norm": 0.338191419839859, + "learning_rate": 1.6479338312424258e-05, + "loss": 0.2548, + "num_input_tokens_seen": 2384080, + "step": 12500 + }, + { + "epoch": 6.49948024948025, + "grad_norm": 0.6008409261703491, + "learning_rate": 1.6458021267605018e-05, + "loss": 0.2484, + "num_input_tokens_seen": 2385008, + "step": 12505 + }, + { + "epoch": 6.5, + "eval_loss": 0.24993197619915009, + "eval_runtime": 7.9069, + "eval_samples_per_second": 108.26, + "eval_steps_per_second": 27.065, + "num_input_tokens_seen": 2385200, + "step": 12506 + }, + { + "epoch": 6.502079002079002, + "grad_norm": 0.7298408150672913, + "learning_rate": 1.643671125193973e-05, + "loss": 0.2677, + "num_input_tokens_seen": 2386000, + "step": 12510 + }, + { + "epoch": 6.504677754677755, + "grad_norm": 0.6041808724403381, + "learning_rate": 1.6415408282964313e-05, + "loss": 0.2574, + "num_input_tokens_seen": 2386960, + "step": 12515 + }, + { + "epoch": 6.507276507276507, + "grad_norm": 0.13694576919078827, + "learning_rate": 1.6394112378208877e-05, + "loss": 0.2167, + "num_input_tokens_seen": 2387952, + "step": 12520 + }, + { + "epoch": 6.50987525987526, + "grad_norm": 0.5352184772491455, + "learning_rate": 1.6372823555197726e-05, + "loss": 0.249, + "num_input_tokens_seen": 2388880, + "step": 12525 + }, + { + "epoch": 6.512474012474012, + "grad_norm": 0.13553518056869507, + "learning_rate": 1.635154183144933e-05, + "loss": 0.2403, + "num_input_tokens_seen": 2389840, + "step": 12530 + }, + { + "epoch": 6.515072765072765, + "grad_norm": 0.2598569393157959, + "learning_rate": 1.6330267224476326e-05, + "loss": 0.2344, + "num_input_tokens_seen": 2390864, + "step": 12535 + }, + { + "epoch": 6.517671517671518, + "grad_norm": 0.5924814343452454, + "learning_rate": 1.630899975178547e-05, + "loss": 0.3433, + "num_input_tokens_seen": 2391856, + "step": 12540 + }, + { + "epoch": 6.52027027027027, + "grad_norm": 0.19286495447158813, + "learning_rate": 1.628773943087768e-05, + "loss": 0.1824, + "num_input_tokens_seen": 2392784, + "step": 12545 + }, + { + "epoch": 6.522869022869023, + "grad_norm": 0.4875711500644684, + "learning_rate": 1.6266486279247968e-05, + "loss": 0.2279, + "num_input_tokens_seen": 2393808, + "step": 12550 + }, + { + "epoch": 6.525467775467775, + "grad_norm": 0.33765512704849243, + "learning_rate": 1.6245240314385458e-05, + "loss": 0.1847, + "num_input_tokens_seen": 2394864, + "step": 12555 + }, + { + "epoch": 6.528066528066528, + "grad_norm": 0.5492702126502991, + "learning_rate": 1.6224001553773345e-05, + "loss": 0.2744, + "num_input_tokens_seen": 2395760, + "step": 12560 + }, + { + "epoch": 6.53066528066528, + "grad_norm": 0.4759461581707001, + "learning_rate": 1.6202770014888906e-05, + "loss": 0.2776, + "num_input_tokens_seen": 2396752, + "step": 12565 + }, + { + "epoch": 6.533264033264033, + "grad_norm": 0.5720187425613403, + "learning_rate": 1.6181545715203488e-05, + "loss": 0.2548, + "num_input_tokens_seen": 2397744, + "step": 12570 + }, + { + "epoch": 6.535862785862786, + "grad_norm": 0.25001639127731323, + "learning_rate": 1.6160328672182445e-05, + "loss": 0.2342, + "num_input_tokens_seen": 2398704, + "step": 12575 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 0.2485172003507614, + "learning_rate": 1.61391189032852e-05, + "loss": 0.2254, + "num_input_tokens_seen": 2399600, + "step": 12580 + }, + { + "epoch": 6.541060291060291, + "grad_norm": 0.6127662062644958, + "learning_rate": 1.611791642596516e-05, + "loss": 0.2259, + "num_input_tokens_seen": 2400560, + "step": 12585 + }, + { + "epoch": 6.543659043659043, + "grad_norm": 0.2960638701915741, + "learning_rate": 1.609672125766975e-05, + "loss": 0.2848, + "num_input_tokens_seen": 2401424, + "step": 12590 + }, + { + "epoch": 6.546257796257796, + "grad_norm": 0.2005082666873932, + "learning_rate": 1.6075533415840372e-05, + "loss": 0.2805, + "num_input_tokens_seen": 2402416, + "step": 12595 + }, + { + "epoch": 6.548856548856548, + "grad_norm": 0.30063512921333313, + "learning_rate": 1.60543529179124e-05, + "loss": 0.2208, + "num_input_tokens_seen": 2403312, + "step": 12600 + }, + { + "epoch": 6.551455301455301, + "grad_norm": 0.570164680480957, + "learning_rate": 1.6033179781315178e-05, + "loss": 0.317, + "num_input_tokens_seen": 2404304, + "step": 12605 + }, + { + "epoch": 6.554054054054054, + "grad_norm": 0.2566395401954651, + "learning_rate": 1.6012014023471954e-05, + "loss": 0.2436, + "num_input_tokens_seen": 2405200, + "step": 12610 + }, + { + "epoch": 6.5566528066528065, + "grad_norm": 0.41997191309928894, + "learning_rate": 1.5990855661799947e-05, + "loss": 0.2964, + "num_input_tokens_seen": 2406160, + "step": 12615 + }, + { + "epoch": 6.5592515592515594, + "grad_norm": 0.7333978414535522, + "learning_rate": 1.5969704713710275e-05, + "loss": 0.2719, + "num_input_tokens_seen": 2407088, + "step": 12620 + }, + { + "epoch": 6.5618503118503115, + "grad_norm": 0.3951340317726135, + "learning_rate": 1.594856119660794e-05, + "loss": 0.2773, + "num_input_tokens_seen": 2408048, + "step": 12625 + }, + { + "epoch": 6.5644490644490645, + "grad_norm": 0.35083723068237305, + "learning_rate": 1.5927425127891856e-05, + "loss": 0.2448, + "num_input_tokens_seen": 2409072, + "step": 12630 + }, + { + "epoch": 6.567047817047817, + "grad_norm": 0.3858892321586609, + "learning_rate": 1.59062965249548e-05, + "loss": 0.233, + "num_input_tokens_seen": 2410000, + "step": 12635 + }, + { + "epoch": 6.56964656964657, + "grad_norm": 0.5075581669807434, + "learning_rate": 1.588517540518338e-05, + "loss": 0.297, + "num_input_tokens_seen": 2410960, + "step": 12640 + }, + { + "epoch": 6.5722453222453225, + "grad_norm": 0.5830019116401672, + "learning_rate": 1.5864061785958076e-05, + "loss": 0.1977, + "num_input_tokens_seen": 2411920, + "step": 12645 + }, + { + "epoch": 6.574844074844075, + "grad_norm": 0.4190521538257599, + "learning_rate": 1.584295568465318e-05, + "loss": 0.3144, + "num_input_tokens_seen": 2412912, + "step": 12650 + }, + { + "epoch": 6.577442827442828, + "grad_norm": 0.46960213780403137, + "learning_rate": 1.582185711863681e-05, + "loss": 0.2666, + "num_input_tokens_seen": 2413904, + "step": 12655 + }, + { + "epoch": 6.58004158004158, + "grad_norm": 0.5828394889831543, + "learning_rate": 1.5800766105270877e-05, + "loss": 0.2103, + "num_input_tokens_seen": 2414864, + "step": 12660 + }, + { + "epoch": 6.582640332640333, + "grad_norm": 0.4000866115093231, + "learning_rate": 1.5779682661911072e-05, + "loss": 0.3208, + "num_input_tokens_seen": 2415760, + "step": 12665 + }, + { + "epoch": 6.585239085239085, + "grad_norm": 0.6144781112670898, + "learning_rate": 1.5758606805906867e-05, + "loss": 0.245, + "num_input_tokens_seen": 2416752, + "step": 12670 + }, + { + "epoch": 6.587837837837838, + "grad_norm": 1.0559616088867188, + "learning_rate": 1.5737538554601473e-05, + "loss": 0.3371, + "num_input_tokens_seen": 2417712, + "step": 12675 + }, + { + "epoch": 6.590436590436591, + "grad_norm": 0.6279468536376953, + "learning_rate": 1.571647792533186e-05, + "loss": 0.2654, + "num_input_tokens_seen": 2418672, + "step": 12680 + }, + { + "epoch": 6.593035343035343, + "grad_norm": 0.411770761013031, + "learning_rate": 1.569542493542872e-05, + "loss": 0.2559, + "num_input_tokens_seen": 2419600, + "step": 12685 + }, + { + "epoch": 6.595634095634096, + "grad_norm": 0.6128449440002441, + "learning_rate": 1.5674379602216464e-05, + "loss": 0.2359, + "num_input_tokens_seen": 2420528, + "step": 12690 + }, + { + "epoch": 6.598232848232849, + "grad_norm": 0.6752115488052368, + "learning_rate": 1.5653341943013195e-05, + "loss": 0.2291, + "num_input_tokens_seen": 2421552, + "step": 12695 + }, + { + "epoch": 6.600831600831601, + "grad_norm": 0.5830577611923218, + "learning_rate": 1.5632311975130705e-05, + "loss": 0.2283, + "num_input_tokens_seen": 2422480, + "step": 12700 + }, + { + "epoch": 6.603430353430354, + "grad_norm": 0.41544556617736816, + "learning_rate": 1.5611289715874443e-05, + "loss": 0.2728, + "num_input_tokens_seen": 2423408, + "step": 12705 + }, + { + "epoch": 6.606029106029106, + "grad_norm": 0.5062900185585022, + "learning_rate": 1.559027518254354e-05, + "loss": 0.2388, + "num_input_tokens_seen": 2424336, + "step": 12710 + }, + { + "epoch": 6.608627858627859, + "grad_norm": 0.29658496379852295, + "learning_rate": 1.5569268392430753e-05, + "loss": 0.2582, + "num_input_tokens_seen": 2425296, + "step": 12715 + }, + { + "epoch": 6.611226611226611, + "grad_norm": 0.5579611659049988, + "learning_rate": 1.554826936282247e-05, + "loss": 0.2951, + "num_input_tokens_seen": 2426192, + "step": 12720 + }, + { + "epoch": 6.613825363825364, + "grad_norm": 1.0296722650527954, + "learning_rate": 1.552727811099869e-05, + "loss": 0.3815, + "num_input_tokens_seen": 2427120, + "step": 12725 + }, + { + "epoch": 6.616424116424117, + "grad_norm": 0.19392791390419006, + "learning_rate": 1.5506294654233023e-05, + "loss": 0.2474, + "num_input_tokens_seen": 2428112, + "step": 12730 + }, + { + "epoch": 6.619022869022869, + "grad_norm": 0.6550665497779846, + "learning_rate": 1.548531900979266e-05, + "loss": 0.2587, + "num_input_tokens_seen": 2429104, + "step": 12735 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 0.4370359480381012, + "learning_rate": 1.5464351194938337e-05, + "loss": 0.2868, + "num_input_tokens_seen": 2430064, + "step": 12740 + }, + { + "epoch": 6.624220374220374, + "grad_norm": 0.6674965023994446, + "learning_rate": 1.5443391226924386e-05, + "loss": 0.2389, + "num_input_tokens_seen": 2430992, + "step": 12745 + }, + { + "epoch": 6.626819126819127, + "grad_norm": 0.45819926261901855, + "learning_rate": 1.542243912299866e-05, + "loss": 0.3061, + "num_input_tokens_seen": 2431952, + "step": 12750 + }, + { + "epoch": 6.629417879417879, + "grad_norm": 0.6867055892944336, + "learning_rate": 1.5401494900402557e-05, + "loss": 0.2512, + "num_input_tokens_seen": 2432976, + "step": 12755 + }, + { + "epoch": 6.632016632016632, + "grad_norm": 0.24567800760269165, + "learning_rate": 1.538055857637097e-05, + "loss": 0.2622, + "num_input_tokens_seen": 2433936, + "step": 12760 + }, + { + "epoch": 6.634615384615385, + "grad_norm": 0.5913975834846497, + "learning_rate": 1.5359630168132316e-05, + "loss": 0.233, + "num_input_tokens_seen": 2434864, + "step": 12765 + }, + { + "epoch": 6.637214137214137, + "grad_norm": 0.7577527165412903, + "learning_rate": 1.5338709692908456e-05, + "loss": 0.2753, + "num_input_tokens_seen": 2435728, + "step": 12770 + }, + { + "epoch": 6.63981288981289, + "grad_norm": 0.4594014286994934, + "learning_rate": 1.5317797167914767e-05, + "loss": 0.2423, + "num_input_tokens_seen": 2436688, + "step": 12775 + }, + { + "epoch": 6.642411642411642, + "grad_norm": 0.4189419448375702, + "learning_rate": 1.5296892610360064e-05, + "loss": 0.2327, + "num_input_tokens_seen": 2437744, + "step": 12780 + }, + { + "epoch": 6.645010395010395, + "grad_norm": 0.5131450891494751, + "learning_rate": 1.527599603744661e-05, + "loss": 0.2614, + "num_input_tokens_seen": 2438640, + "step": 12785 + }, + { + "epoch": 6.647609147609147, + "grad_norm": 0.46674636006355286, + "learning_rate": 1.52551074663701e-05, + "loss": 0.2928, + "num_input_tokens_seen": 2439664, + "step": 12790 + }, + { + "epoch": 6.6502079002079, + "grad_norm": 0.3743995130062103, + "learning_rate": 1.5234226914319632e-05, + "loss": 0.2503, + "num_input_tokens_seen": 2440624, + "step": 12795 + }, + { + "epoch": 6.652806652806653, + "grad_norm": 0.24247224628925323, + "learning_rate": 1.5213354398477722e-05, + "loss": 0.2933, + "num_input_tokens_seen": 2441648, + "step": 12800 + }, + { + "epoch": 6.655405405405405, + "grad_norm": 0.5346494317054749, + "learning_rate": 1.5192489936020257e-05, + "loss": 0.195, + "num_input_tokens_seen": 2442576, + "step": 12805 + }, + { + "epoch": 6.658004158004158, + "grad_norm": 0.761898934841156, + "learning_rate": 1.5171633544116509e-05, + "loss": 0.3249, + "num_input_tokens_seen": 2443440, + "step": 12810 + }, + { + "epoch": 6.66060291060291, + "grad_norm": 0.5817584991455078, + "learning_rate": 1.5150785239929102e-05, + "loss": 0.2566, + "num_input_tokens_seen": 2444368, + "step": 12815 + }, + { + "epoch": 6.663201663201663, + "grad_norm": 0.4779649078845978, + "learning_rate": 1.512994504061401e-05, + "loss": 0.2698, + "num_input_tokens_seen": 2445296, + "step": 12820 + }, + { + "epoch": 6.665800415800415, + "grad_norm": 0.8338134288787842, + "learning_rate": 1.5109112963320532e-05, + "loss": 0.2528, + "num_input_tokens_seen": 2446256, + "step": 12825 + }, + { + "epoch": 6.668399168399168, + "grad_norm": 0.5958486795425415, + "learning_rate": 1.5088289025191293e-05, + "loss": 0.2564, + "num_input_tokens_seen": 2447216, + "step": 12830 + }, + { + "epoch": 6.670997920997921, + "grad_norm": 0.43067339062690735, + "learning_rate": 1.5067473243362204e-05, + "loss": 0.262, + "num_input_tokens_seen": 2448112, + "step": 12835 + }, + { + "epoch": 6.673596673596673, + "grad_norm": 0.6147897839546204, + "learning_rate": 1.5046665634962476e-05, + "loss": 0.23, + "num_input_tokens_seen": 2449072, + "step": 12840 + }, + { + "epoch": 6.676195426195426, + "grad_norm": 0.1735864132642746, + "learning_rate": 1.5025866217114592e-05, + "loss": 0.2831, + "num_input_tokens_seen": 2450000, + "step": 12845 + }, + { + "epoch": 6.6787941787941785, + "grad_norm": 0.6100995540618896, + "learning_rate": 1.50050750069343e-05, + "loss": 0.2497, + "num_input_tokens_seen": 2451056, + "step": 12850 + }, + { + "epoch": 6.6813929313929314, + "grad_norm": 0.768913745880127, + "learning_rate": 1.4984292021530578e-05, + "loss": 0.2295, + "num_input_tokens_seen": 2451984, + "step": 12855 + }, + { + "epoch": 6.6839916839916835, + "grad_norm": 0.34024778008461, + "learning_rate": 1.4963517278005656e-05, + "loss": 0.2207, + "num_input_tokens_seen": 2452944, + "step": 12860 + }, + { + "epoch": 6.6865904365904365, + "grad_norm": 0.49388596415519714, + "learning_rate": 1.494275079345498e-05, + "loss": 0.2565, + "num_input_tokens_seen": 2453872, + "step": 12865 + }, + { + "epoch": 6.6891891891891895, + "grad_norm": 0.30383652448654175, + "learning_rate": 1.492199258496717e-05, + "loss": 0.2548, + "num_input_tokens_seen": 2454832, + "step": 12870 + }, + { + "epoch": 6.691787941787942, + "grad_norm": 0.3790271580219269, + "learning_rate": 1.4901242669624065e-05, + "loss": 0.1608, + "num_input_tokens_seen": 2455792, + "step": 12875 + }, + { + "epoch": 6.6943866943866945, + "grad_norm": 0.6093438267707825, + "learning_rate": 1.488050106450068e-05, + "loss": 0.3173, + "num_input_tokens_seen": 2456784, + "step": 12880 + }, + { + "epoch": 6.696985446985447, + "grad_norm": 0.8415329456329346, + "learning_rate": 1.4859767786665183e-05, + "loss": 0.3046, + "num_input_tokens_seen": 2457744, + "step": 12885 + }, + { + "epoch": 6.6995841995842, + "grad_norm": 0.26247638463974, + "learning_rate": 1.4839042853178886e-05, + "loss": 0.2537, + "num_input_tokens_seen": 2458704, + "step": 12890 + }, + { + "epoch": 6.702182952182953, + "grad_norm": 0.7619677782058716, + "learning_rate": 1.481832628109625e-05, + "loss": 0.3185, + "num_input_tokens_seen": 2459728, + "step": 12895 + }, + { + "epoch": 6.704781704781705, + "grad_norm": 0.606061577796936, + "learning_rate": 1.4797618087464827e-05, + "loss": 0.2789, + "num_input_tokens_seen": 2460656, + "step": 12900 + }, + { + "epoch": 6.707380457380458, + "grad_norm": 0.3488970398902893, + "learning_rate": 1.4776918289325298e-05, + "loss": 0.3096, + "num_input_tokens_seen": 2461712, + "step": 12905 + }, + { + "epoch": 6.70997920997921, + "grad_norm": 0.2807706296443939, + "learning_rate": 1.4756226903711429e-05, + "loss": 0.2429, + "num_input_tokens_seen": 2462608, + "step": 12910 + }, + { + "epoch": 6.712577962577963, + "grad_norm": 0.6593853831291199, + "learning_rate": 1.4735543947650066e-05, + "loss": 0.236, + "num_input_tokens_seen": 2463536, + "step": 12915 + }, + { + "epoch": 6.715176715176716, + "grad_norm": 0.46176856756210327, + "learning_rate": 1.4714869438161116e-05, + "loss": 0.275, + "num_input_tokens_seen": 2464464, + "step": 12920 + }, + { + "epoch": 6.717775467775468, + "grad_norm": 0.5905977487564087, + "learning_rate": 1.4694203392257536e-05, + "loss": 0.238, + "num_input_tokens_seen": 2465392, + "step": 12925 + }, + { + "epoch": 6.720374220374221, + "grad_norm": 0.18355590105056763, + "learning_rate": 1.467354582694532e-05, + "loss": 0.2633, + "num_input_tokens_seen": 2466352, + "step": 12930 + }, + { + "epoch": 6.722972972972973, + "grad_norm": 0.604714572429657, + "learning_rate": 1.4652896759223472e-05, + "loss": 0.3318, + "num_input_tokens_seen": 2467312, + "step": 12935 + }, + { + "epoch": 6.725571725571726, + "grad_norm": 0.430433064699173, + "learning_rate": 1.4632256206084016e-05, + "loss": 0.2864, + "num_input_tokens_seen": 2468304, + "step": 12940 + }, + { + "epoch": 6.728170478170478, + "grad_norm": 0.6376120448112488, + "learning_rate": 1.4611624184511968e-05, + "loss": 0.2675, + "num_input_tokens_seen": 2469296, + "step": 12945 + }, + { + "epoch": 6.730769230769231, + "grad_norm": 0.429217129945755, + "learning_rate": 1.4591000711485314e-05, + "loss": 0.2504, + "num_input_tokens_seen": 2470256, + "step": 12950 + }, + { + "epoch": 6.733367983367984, + "grad_norm": 0.34081125259399414, + "learning_rate": 1.4570385803975031e-05, + "loss": 0.2678, + "num_input_tokens_seen": 2471312, + "step": 12955 + }, + { + "epoch": 6.735966735966736, + "grad_norm": 0.13203753530979156, + "learning_rate": 1.4549779478945005e-05, + "loss": 0.2737, + "num_input_tokens_seen": 2472240, + "step": 12960 + }, + { + "epoch": 6.738565488565489, + "grad_norm": 0.5996462106704712, + "learning_rate": 1.4529181753352117e-05, + "loss": 0.2522, + "num_input_tokens_seen": 2473168, + "step": 12965 + }, + { + "epoch": 6.741164241164241, + "grad_norm": 0.31986284255981445, + "learning_rate": 1.4508592644146093e-05, + "loss": 0.2692, + "num_input_tokens_seen": 2474224, + "step": 12970 + }, + { + "epoch": 6.743762993762994, + "grad_norm": 0.4999290406703949, + "learning_rate": 1.448801216826965e-05, + "loss": 0.2513, + "num_input_tokens_seen": 2475216, + "step": 12975 + }, + { + "epoch": 6.746361746361746, + "grad_norm": 0.4326496422290802, + "learning_rate": 1.446744034265834e-05, + "loss": 0.2934, + "num_input_tokens_seen": 2476144, + "step": 12980 + }, + { + "epoch": 6.748960498960499, + "grad_norm": 0.519690752029419, + "learning_rate": 1.4446877184240643e-05, + "loss": 0.287, + "num_input_tokens_seen": 2477104, + "step": 12985 + }, + { + "epoch": 6.751559251559252, + "grad_norm": 0.5787778496742249, + "learning_rate": 1.4426322709937862e-05, + "loss": 0.1955, + "num_input_tokens_seen": 2478032, + "step": 12990 + }, + { + "epoch": 6.754158004158004, + "grad_norm": 0.834307074546814, + "learning_rate": 1.4405776936664203e-05, + "loss": 0.2808, + "num_input_tokens_seen": 2478928, + "step": 12995 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 0.3027605712413788, + "learning_rate": 1.4385239881326673e-05, + "loss": 0.3058, + "num_input_tokens_seen": 2479824, + "step": 13000 + }, + { + "epoch": 6.759355509355509, + "grad_norm": 0.6182340383529663, + "learning_rate": 1.4364711560825104e-05, + "loss": 0.2798, + "num_input_tokens_seen": 2480752, + "step": 13005 + }, + { + "epoch": 6.761954261954262, + "grad_norm": 0.7231260538101196, + "learning_rate": 1.4344191992052178e-05, + "loss": 0.2925, + "num_input_tokens_seen": 2481616, + "step": 13010 + }, + { + "epoch": 6.764553014553014, + "grad_norm": 0.6614211201667786, + "learning_rate": 1.4323681191893328e-05, + "loss": 0.2144, + "num_input_tokens_seen": 2482448, + "step": 13015 + }, + { + "epoch": 6.767151767151767, + "grad_norm": 0.5947413444519043, + "learning_rate": 1.4303179177226823e-05, + "loss": 0.1894, + "num_input_tokens_seen": 2483344, + "step": 13020 + }, + { + "epoch": 6.76975051975052, + "grad_norm": 0.282692551612854, + "learning_rate": 1.4282685964923642e-05, + "loss": 0.2284, + "num_input_tokens_seen": 2484336, + "step": 13025 + }, + { + "epoch": 6.772349272349272, + "grad_norm": 0.4707482159137726, + "learning_rate": 1.4262201571847584e-05, + "loss": 0.2013, + "num_input_tokens_seen": 2485264, + "step": 13030 + }, + { + "epoch": 6.774948024948025, + "grad_norm": 0.606964647769928, + "learning_rate": 1.4241726014855139e-05, + "loss": 0.296, + "num_input_tokens_seen": 2486128, + "step": 13035 + }, + { + "epoch": 6.777546777546777, + "grad_norm": 0.409159779548645, + "learning_rate": 1.4221259310795543e-05, + "loss": 0.1633, + "num_input_tokens_seen": 2487120, + "step": 13040 + }, + { + "epoch": 6.78014553014553, + "grad_norm": 0.6287622451782227, + "learning_rate": 1.4200801476510767e-05, + "loss": 0.2441, + "num_input_tokens_seen": 2488080, + "step": 13045 + }, + { + "epoch": 6.782744282744282, + "grad_norm": 0.4032878577709198, + "learning_rate": 1.418035252883545e-05, + "loss": 0.2277, + "num_input_tokens_seen": 2489040, + "step": 13050 + }, + { + "epoch": 6.785343035343035, + "grad_norm": 0.4396466612815857, + "learning_rate": 1.4159912484596949e-05, + "loss": 0.2147, + "num_input_tokens_seen": 2489904, + "step": 13055 + }, + { + "epoch": 6.787941787941788, + "grad_norm": 0.4376817047595978, + "learning_rate": 1.4139481360615275e-05, + "loss": 0.1948, + "num_input_tokens_seen": 2490832, + "step": 13060 + }, + { + "epoch": 6.79054054054054, + "grad_norm": 0.5762144923210144, + "learning_rate": 1.4119059173703089e-05, + "loss": 0.3286, + "num_input_tokens_seen": 2491760, + "step": 13065 + }, + { + "epoch": 6.793139293139293, + "grad_norm": 0.4465710520744324, + "learning_rate": 1.4098645940665737e-05, + "loss": 0.2749, + "num_input_tokens_seen": 2492688, + "step": 13070 + }, + { + "epoch": 6.795738045738045, + "grad_norm": 0.25080224871635437, + "learning_rate": 1.4078241678301146e-05, + "loss": 0.2462, + "num_input_tokens_seen": 2493648, + "step": 13075 + }, + { + "epoch": 6.798336798336798, + "grad_norm": 0.4510662257671356, + "learning_rate": 1.4057846403399912e-05, + "loss": 0.3389, + "num_input_tokens_seen": 2494544, + "step": 13080 + }, + { + "epoch": 6.8009355509355505, + "grad_norm": 0.7399756908416748, + "learning_rate": 1.4037460132745189e-05, + "loss": 0.3179, + "num_input_tokens_seen": 2495504, + "step": 13085 + }, + { + "epoch": 6.803534303534303, + "grad_norm": 0.6382899284362793, + "learning_rate": 1.4017082883112764e-05, + "loss": 0.2459, + "num_input_tokens_seen": 2496368, + "step": 13090 + }, + { + "epoch": 6.806133056133056, + "grad_norm": 0.6618250012397766, + "learning_rate": 1.3996714671270969e-05, + "loss": 0.2896, + "num_input_tokens_seen": 2497328, + "step": 13095 + }, + { + "epoch": 6.8087318087318085, + "grad_norm": 0.3024352490901947, + "learning_rate": 1.3976355513980708e-05, + "loss": 0.2671, + "num_input_tokens_seen": 2498352, + "step": 13100 + }, + { + "epoch": 6.8113305613305615, + "grad_norm": 0.27134567499160767, + "learning_rate": 1.3956005427995421e-05, + "loss": 0.2574, + "num_input_tokens_seen": 2499248, + "step": 13105 + }, + { + "epoch": 6.813929313929314, + "grad_norm": 0.41929394006729126, + "learning_rate": 1.3935664430061129e-05, + "loss": 0.277, + "num_input_tokens_seen": 2500208, + "step": 13110 + }, + { + "epoch": 6.8165280665280665, + "grad_norm": 0.6512788534164429, + "learning_rate": 1.3915332536916314e-05, + "loss": 0.242, + "num_input_tokens_seen": 2501168, + "step": 13115 + }, + { + "epoch": 6.8191268191268195, + "grad_norm": 0.6582696437835693, + "learning_rate": 1.3895009765292011e-05, + "loss": 0.269, + "num_input_tokens_seen": 2502128, + "step": 13120 + }, + { + "epoch": 6.821725571725572, + "grad_norm": 0.322623074054718, + "learning_rate": 1.3874696131911746e-05, + "loss": 0.2535, + "num_input_tokens_seen": 2503088, + "step": 13125 + }, + { + "epoch": 6.824324324324325, + "grad_norm": 0.6018292903900146, + "learning_rate": 1.3854391653491478e-05, + "loss": 0.2051, + "num_input_tokens_seen": 2504048, + "step": 13130 + }, + { + "epoch": 6.826923076923077, + "grad_norm": 0.5953723788261414, + "learning_rate": 1.3834096346739689e-05, + "loss": 0.2791, + "num_input_tokens_seen": 2505008, + "step": 13135 + }, + { + "epoch": 6.82952182952183, + "grad_norm": 0.31568780541419983, + "learning_rate": 1.3813810228357283e-05, + "loss": 0.2515, + "num_input_tokens_seen": 2505936, + "step": 13140 + }, + { + "epoch": 6.832120582120583, + "grad_norm": 0.54046630859375, + "learning_rate": 1.3793533315037616e-05, + "loss": 0.2937, + "num_input_tokens_seen": 2506832, + "step": 13145 + }, + { + "epoch": 6.834719334719335, + "grad_norm": 0.17967692017555237, + "learning_rate": 1.3773265623466458e-05, + "loss": 0.2316, + "num_input_tokens_seen": 2507824, + "step": 13150 + }, + { + "epoch": 6.837318087318088, + "grad_norm": 0.5128589868545532, + "learning_rate": 1.3753007170322008e-05, + "loss": 0.2717, + "num_input_tokens_seen": 2508752, + "step": 13155 + }, + { + "epoch": 6.83991683991684, + "grad_norm": 0.3053089678287506, + "learning_rate": 1.3732757972274845e-05, + "loss": 0.314, + "num_input_tokens_seen": 2509680, + "step": 13160 + }, + { + "epoch": 6.842515592515593, + "grad_norm": 0.5983067750930786, + "learning_rate": 1.371251804598793e-05, + "loss": 0.2308, + "num_input_tokens_seen": 2510672, + "step": 13165 + }, + { + "epoch": 6.845114345114345, + "grad_norm": 0.5117744207382202, + "learning_rate": 1.3692287408116617e-05, + "loss": 0.1961, + "num_input_tokens_seen": 2511632, + "step": 13170 + }, + { + "epoch": 6.847713097713098, + "grad_norm": 0.7792048454284668, + "learning_rate": 1.3672066075308587e-05, + "loss": 0.2738, + "num_input_tokens_seen": 2512688, + "step": 13175 + }, + { + "epoch": 6.850311850311851, + "grad_norm": 0.5392738580703735, + "learning_rate": 1.3651854064203901e-05, + "loss": 0.2294, + "num_input_tokens_seen": 2513744, + "step": 13180 + }, + { + "epoch": 6.852910602910603, + "grad_norm": 0.5207778215408325, + "learning_rate": 1.3631651391434902e-05, + "loss": 0.2644, + "num_input_tokens_seen": 2514672, + "step": 13185 + }, + { + "epoch": 6.855509355509356, + "grad_norm": 0.4744807183742523, + "learning_rate": 1.3611458073626293e-05, + "loss": 0.3119, + "num_input_tokens_seen": 2515632, + "step": 13190 + }, + { + "epoch": 6.858108108108108, + "grad_norm": 0.19855350255966187, + "learning_rate": 1.359127412739506e-05, + "loss": 0.2839, + "num_input_tokens_seen": 2516528, + "step": 13195 + }, + { + "epoch": 6.860706860706861, + "grad_norm": 0.614963710308075, + "learning_rate": 1.3571099569350456e-05, + "loss": 0.2441, + "num_input_tokens_seen": 2517488, + "step": 13200 + }, + { + "epoch": 6.863305613305613, + "grad_norm": 0.2518925368785858, + "learning_rate": 1.3550934416094058e-05, + "loss": 0.2087, + "num_input_tokens_seen": 2518480, + "step": 13205 + }, + { + "epoch": 6.865904365904366, + "grad_norm": 0.5742847323417664, + "learning_rate": 1.3530778684219648e-05, + "loss": 0.2133, + "num_input_tokens_seen": 2519440, + "step": 13210 + }, + { + "epoch": 6.868503118503119, + "grad_norm": 0.4958479702472687, + "learning_rate": 1.3510632390313307e-05, + "loss": 0.2189, + "num_input_tokens_seen": 2520432, + "step": 13215 + }, + { + "epoch": 6.871101871101871, + "grad_norm": 0.24970105290412903, + "learning_rate": 1.3490495550953303e-05, + "loss": 0.2152, + "num_input_tokens_seen": 2521360, + "step": 13220 + }, + { + "epoch": 6.873700623700624, + "grad_norm": 0.2322247326374054, + "learning_rate": 1.347036818271018e-05, + "loss": 0.2735, + "num_input_tokens_seen": 2522288, + "step": 13225 + }, + { + "epoch": 6.876299376299376, + "grad_norm": 0.23913812637329102, + "learning_rate": 1.345025030214661e-05, + "loss": 0.2718, + "num_input_tokens_seen": 2523248, + "step": 13230 + }, + { + "epoch": 6.878898128898129, + "grad_norm": 0.5117342472076416, + "learning_rate": 1.3430141925817532e-05, + "loss": 0.3339, + "num_input_tokens_seen": 2524176, + "step": 13235 + }, + { + "epoch": 6.881496881496881, + "grad_norm": 0.5902634859085083, + "learning_rate": 1.3410043070270017e-05, + "loss": 0.248, + "num_input_tokens_seen": 2525040, + "step": 13240 + }, + { + "epoch": 6.884095634095634, + "grad_norm": 0.5389296412467957, + "learning_rate": 1.3389953752043327e-05, + "loss": 0.2494, + "num_input_tokens_seen": 2526000, + "step": 13245 + }, + { + "epoch": 6.886694386694387, + "grad_norm": 0.5388863682746887, + "learning_rate": 1.3369873987668873e-05, + "loss": 0.1957, + "num_input_tokens_seen": 2526864, + "step": 13250 + }, + { + "epoch": 6.889293139293139, + "grad_norm": 0.30111241340637207, + "learning_rate": 1.3349803793670196e-05, + "loss": 0.2598, + "num_input_tokens_seen": 2527824, + "step": 13255 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 0.5472490191459656, + "learning_rate": 1.332974318656296e-05, + "loss": 0.1841, + "num_input_tokens_seen": 2528816, + "step": 13260 + }, + { + "epoch": 6.894490644490644, + "grad_norm": 0.45200586318969727, + "learning_rate": 1.3309692182854932e-05, + "loss": 0.1962, + "num_input_tokens_seen": 2529744, + "step": 13265 + }, + { + "epoch": 6.897089397089397, + "grad_norm": 0.8232107758522034, + "learning_rate": 1.328965079904601e-05, + "loss": 0.3124, + "num_input_tokens_seen": 2530640, + "step": 13270 + }, + { + "epoch": 6.899688149688149, + "grad_norm": 0.33931082487106323, + "learning_rate": 1.3269619051628135e-05, + "loss": 0.2916, + "num_input_tokens_seen": 2531632, + "step": 13275 + }, + { + "epoch": 6.902286902286902, + "grad_norm": 0.5422768592834473, + "learning_rate": 1.3249596957085353e-05, + "loss": 0.2741, + "num_input_tokens_seen": 2532560, + "step": 13280 + }, + { + "epoch": 6.904885654885655, + "grad_norm": 0.5408825874328613, + "learning_rate": 1.322958453189374e-05, + "loss": 0.269, + "num_input_tokens_seen": 2533488, + "step": 13285 + }, + { + "epoch": 6.907484407484407, + "grad_norm": 0.3333829641342163, + "learning_rate": 1.3209581792521437e-05, + "loss": 0.2407, + "num_input_tokens_seen": 2534448, + "step": 13290 + }, + { + "epoch": 6.91008316008316, + "grad_norm": 0.7687291502952576, + "learning_rate": 1.3189588755428598e-05, + "loss": 0.3507, + "num_input_tokens_seen": 2535440, + "step": 13295 + }, + { + "epoch": 6.912681912681912, + "grad_norm": 0.13630379736423492, + "learning_rate": 1.3169605437067387e-05, + "loss": 0.2601, + "num_input_tokens_seen": 2536432, + "step": 13300 + }, + { + "epoch": 6.915280665280665, + "grad_norm": 0.4095359444618225, + "learning_rate": 1.3149631853882005e-05, + "loss": 0.2248, + "num_input_tokens_seen": 2537424, + "step": 13305 + }, + { + "epoch": 6.917879417879417, + "grad_norm": 0.21338407695293427, + "learning_rate": 1.3129668022308598e-05, + "loss": 0.2558, + "num_input_tokens_seen": 2538352, + "step": 13310 + }, + { + "epoch": 6.92047817047817, + "grad_norm": 0.2788509726524353, + "learning_rate": 1.3109713958775327e-05, + "loss": 0.2267, + "num_input_tokens_seen": 2539312, + "step": 13315 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 0.42697790265083313, + "learning_rate": 1.3089769679702288e-05, + "loss": 0.2414, + "num_input_tokens_seen": 2540272, + "step": 13320 + }, + { + "epoch": 6.925675675675675, + "grad_norm": 0.5211015939712524, + "learning_rate": 1.3069835201501526e-05, + "loss": 0.2704, + "num_input_tokens_seen": 2541232, + "step": 13325 + }, + { + "epoch": 6.928274428274428, + "grad_norm": 0.5129971504211426, + "learning_rate": 1.3049910540577046e-05, + "loss": 0.2336, + "num_input_tokens_seen": 2542256, + "step": 13330 + }, + { + "epoch": 6.9308731808731805, + "grad_norm": 0.520443320274353, + "learning_rate": 1.3029995713324738e-05, + "loss": 0.3193, + "num_input_tokens_seen": 2543216, + "step": 13335 + }, + { + "epoch": 6.9334719334719335, + "grad_norm": 0.48967018723487854, + "learning_rate": 1.3010090736132442e-05, + "loss": 0.3146, + "num_input_tokens_seen": 2544112, + "step": 13340 + }, + { + "epoch": 6.936070686070686, + "grad_norm": 0.1414259523153305, + "learning_rate": 1.2990195625379847e-05, + "loss": 0.244, + "num_input_tokens_seen": 2545040, + "step": 13345 + }, + { + "epoch": 6.9386694386694385, + "grad_norm": 0.27762851119041443, + "learning_rate": 1.2970310397438564e-05, + "loss": 0.2015, + "num_input_tokens_seen": 2546032, + "step": 13350 + }, + { + "epoch": 6.9412681912681915, + "grad_norm": 0.729150116443634, + "learning_rate": 1.2950435068672046e-05, + "loss": 0.291, + "num_input_tokens_seen": 2547024, + "step": 13355 + }, + { + "epoch": 6.943866943866944, + "grad_norm": 0.351684033870697, + "learning_rate": 1.2930569655435609e-05, + "loss": 0.2985, + "num_input_tokens_seen": 2547984, + "step": 13360 + }, + { + "epoch": 6.946465696465697, + "grad_norm": 0.24237602949142456, + "learning_rate": 1.2910714174076394e-05, + "loss": 0.2662, + "num_input_tokens_seen": 2548880, + "step": 13365 + }, + { + "epoch": 6.9490644490644495, + "grad_norm": 0.414602130651474, + "learning_rate": 1.2890868640933395e-05, + "loss": 0.2781, + "num_input_tokens_seen": 2549840, + "step": 13370 + }, + { + "epoch": 6.951663201663202, + "grad_norm": 0.420116126537323, + "learning_rate": 1.2871033072337413e-05, + "loss": 0.3012, + "num_input_tokens_seen": 2550832, + "step": 13375 + }, + { + "epoch": 6.954261954261955, + "grad_norm": 0.7470518946647644, + "learning_rate": 1.2851207484611033e-05, + "loss": 0.2506, + "num_input_tokens_seen": 2551760, + "step": 13380 + }, + { + "epoch": 6.956860706860707, + "grad_norm": 0.23536665737628937, + "learning_rate": 1.2831391894068647e-05, + "loss": 0.2949, + "num_input_tokens_seen": 2552752, + "step": 13385 + }, + { + "epoch": 6.95945945945946, + "grad_norm": 0.2741304934024811, + "learning_rate": 1.281158631701641e-05, + "loss": 0.2993, + "num_input_tokens_seen": 2553680, + "step": 13390 + }, + { + "epoch": 6.962058212058212, + "grad_norm": 0.8996334075927734, + "learning_rate": 1.2791790769752232e-05, + "loss": 0.2807, + "num_input_tokens_seen": 2554576, + "step": 13395 + }, + { + "epoch": 6.964656964656965, + "grad_norm": 0.8357927203178406, + "learning_rate": 1.2772005268565768e-05, + "loss": 0.2643, + "num_input_tokens_seen": 2555536, + "step": 13400 + }, + { + "epoch": 6.967255717255718, + "grad_norm": 0.6355381608009338, + "learning_rate": 1.2752229829738429e-05, + "loss": 0.2914, + "num_input_tokens_seen": 2556496, + "step": 13405 + }, + { + "epoch": 6.96985446985447, + "grad_norm": 0.7574785947799683, + "learning_rate": 1.2732464469543314e-05, + "loss": 0.278, + "num_input_tokens_seen": 2557456, + "step": 13410 + }, + { + "epoch": 6.972453222453223, + "grad_norm": 0.6493260264396667, + "learning_rate": 1.2712709204245269e-05, + "loss": 0.2915, + "num_input_tokens_seen": 2558416, + "step": 13415 + }, + { + "epoch": 6.975051975051975, + "grad_norm": 0.7340941429138184, + "learning_rate": 1.2692964050100791e-05, + "loss": 0.2456, + "num_input_tokens_seen": 2559312, + "step": 13420 + }, + { + "epoch": 6.977650727650728, + "grad_norm": 0.41267651319503784, + "learning_rate": 1.2673229023358065e-05, + "loss": 0.2715, + "num_input_tokens_seen": 2560272, + "step": 13425 + }, + { + "epoch": 6.98024948024948, + "grad_norm": 0.2656625509262085, + "learning_rate": 1.2653504140256978e-05, + "loss": 0.2596, + "num_input_tokens_seen": 2561136, + "step": 13430 + }, + { + "epoch": 6.982848232848233, + "grad_norm": 0.38752928376197815, + "learning_rate": 1.2633789417029014e-05, + "loss": 0.2646, + "num_input_tokens_seen": 2562096, + "step": 13435 + }, + { + "epoch": 6.985446985446986, + "grad_norm": 0.6620343327522278, + "learning_rate": 1.2614084869897353e-05, + "loss": 0.2455, + "num_input_tokens_seen": 2563056, + "step": 13440 + }, + { + "epoch": 6.988045738045738, + "grad_norm": 0.43419766426086426, + "learning_rate": 1.2594390515076748e-05, + "loss": 0.2945, + "num_input_tokens_seen": 2563920, + "step": 13445 + }, + { + "epoch": 6.990644490644491, + "grad_norm": 0.18599824607372284, + "learning_rate": 1.2574706368773615e-05, + "loss": 0.2455, + "num_input_tokens_seen": 2564944, + "step": 13450 + }, + { + "epoch": 6.993243243243243, + "grad_norm": 0.662268340587616, + "learning_rate": 1.2555032447185932e-05, + "loss": 0.2972, + "num_input_tokens_seen": 2565872, + "step": 13455 + }, + { + "epoch": 6.995841995841996, + "grad_norm": 0.38364073634147644, + "learning_rate": 1.253536876650327e-05, + "loss": 0.2756, + "num_input_tokens_seen": 2566832, + "step": 13460 + }, + { + "epoch": 6.998440748440748, + "grad_norm": 0.633747398853302, + "learning_rate": 1.2515715342906795e-05, + "loss": 0.2463, + "num_input_tokens_seen": 2567728, + "step": 13465 + }, + { + "epoch": 7.0, + "eval_loss": 0.250967800617218, + "eval_runtime": 7.9097, + "eval_samples_per_second": 108.221, + "eval_steps_per_second": 27.055, + "num_input_tokens_seen": 2568288, + "step": 13468 + }, + { + "epoch": 7.001039501039501, + "grad_norm": 0.29121413826942444, + "learning_rate": 1.2496072192569197e-05, + "loss": 0.2712, + "num_input_tokens_seen": 2568672, + "step": 13470 + }, + { + "epoch": 7.003638253638254, + "grad_norm": 0.3744790256023407, + "learning_rate": 1.2476439331654754e-05, + "loss": 0.2255, + "num_input_tokens_seen": 2569632, + "step": 13475 + }, + { + "epoch": 7.006237006237006, + "grad_norm": 0.5911356806755066, + "learning_rate": 1.2456816776319233e-05, + "loss": 0.2143, + "num_input_tokens_seen": 2570592, + "step": 13480 + }, + { + "epoch": 7.008835758835759, + "grad_norm": 0.5630500316619873, + "learning_rate": 1.2437204542709974e-05, + "loss": 0.2171, + "num_input_tokens_seen": 2571488, + "step": 13485 + }, + { + "epoch": 7.011434511434511, + "grad_norm": 0.7554962038993835, + "learning_rate": 1.2417602646965749e-05, + "loss": 0.3219, + "num_input_tokens_seen": 2572480, + "step": 13490 + }, + { + "epoch": 7.014033264033264, + "grad_norm": 0.4744302034378052, + "learning_rate": 1.2398011105216883e-05, + "loss": 0.2568, + "num_input_tokens_seen": 2573376, + "step": 13495 + }, + { + "epoch": 7.016632016632016, + "grad_norm": 0.12748946249485016, + "learning_rate": 1.2378429933585179e-05, + "loss": 0.1579, + "num_input_tokens_seen": 2574336, + "step": 13500 + }, + { + "epoch": 7.019230769230769, + "grad_norm": 0.4990732669830322, + "learning_rate": 1.2358859148183868e-05, + "loss": 0.3272, + "num_input_tokens_seen": 2575296, + "step": 13505 + }, + { + "epoch": 7.021829521829522, + "grad_norm": 0.5517512559890747, + "learning_rate": 1.2339298765117677e-05, + "loss": 0.3149, + "num_input_tokens_seen": 2576192, + "step": 13510 + }, + { + "epoch": 7.024428274428274, + "grad_norm": 0.7850328683853149, + "learning_rate": 1.231974880048273e-05, + "loss": 0.2483, + "num_input_tokens_seen": 2577184, + "step": 13515 + }, + { + "epoch": 7.027027027027027, + "grad_norm": 0.5030069947242737, + "learning_rate": 1.2300209270366636e-05, + "loss": 0.1542, + "num_input_tokens_seen": 2578080, + "step": 13520 + }, + { + "epoch": 7.029625779625779, + "grad_norm": 0.2773001492023468, + "learning_rate": 1.228068019084834e-05, + "loss": 0.2773, + "num_input_tokens_seen": 2579040, + "step": 13525 + }, + { + "epoch": 7.032224532224532, + "grad_norm": 0.5223790407180786, + "learning_rate": 1.2261161577998257e-05, + "loss": 0.2755, + "num_input_tokens_seen": 2580000, + "step": 13530 + }, + { + "epoch": 7.034823284823285, + "grad_norm": 0.3269633650779724, + "learning_rate": 1.2241653447878146e-05, + "loss": 0.312, + "num_input_tokens_seen": 2580960, + "step": 13535 + }, + { + "epoch": 7.037422037422037, + "grad_norm": 0.49501994252204895, + "learning_rate": 1.2222155816541167e-05, + "loss": 0.265, + "num_input_tokens_seen": 2581920, + "step": 13540 + }, + { + "epoch": 7.04002079002079, + "grad_norm": 0.2569766938686371, + "learning_rate": 1.220266870003182e-05, + "loss": 0.2652, + "num_input_tokens_seen": 2582848, + "step": 13545 + }, + { + "epoch": 7.042619542619542, + "grad_norm": 0.5627973079681396, + "learning_rate": 1.2183192114385969e-05, + "loss": 0.2235, + "num_input_tokens_seen": 2583840, + "step": 13550 + }, + { + "epoch": 7.045218295218295, + "grad_norm": 0.22427086532115936, + "learning_rate": 1.2163726075630804e-05, + "loss": 0.273, + "num_input_tokens_seen": 2584768, + "step": 13555 + }, + { + "epoch": 7.047817047817047, + "grad_norm": 0.555225133895874, + "learning_rate": 1.2144270599784824e-05, + "loss": 0.2292, + "num_input_tokens_seen": 2585760, + "step": 13560 + }, + { + "epoch": 7.0504158004158, + "grad_norm": 0.10338834673166275, + "learning_rate": 1.2124825702857865e-05, + "loss": 0.2626, + "num_input_tokens_seen": 2586688, + "step": 13565 + }, + { + "epoch": 7.053014553014553, + "grad_norm": 0.36592140793800354, + "learning_rate": 1.210539140085102e-05, + "loss": 0.2837, + "num_input_tokens_seen": 2587648, + "step": 13570 + }, + { + "epoch": 7.0556133056133055, + "grad_norm": 0.22669006884098053, + "learning_rate": 1.2085967709756712e-05, + "loss": 0.2864, + "num_input_tokens_seen": 2588608, + "step": 13575 + }, + { + "epoch": 7.058212058212058, + "grad_norm": 1.0435200929641724, + "learning_rate": 1.2066554645558578e-05, + "loss": 0.2873, + "num_input_tokens_seen": 2589568, + "step": 13580 + }, + { + "epoch": 7.0608108108108105, + "grad_norm": 0.688499927520752, + "learning_rate": 1.2047152224231558e-05, + "loss": 0.2743, + "num_input_tokens_seen": 2590528, + "step": 13585 + }, + { + "epoch": 7.0634095634095635, + "grad_norm": 0.7949774861335754, + "learning_rate": 1.2027760461741804e-05, + "loss": 0.2715, + "num_input_tokens_seen": 2591520, + "step": 13590 + }, + { + "epoch": 7.066008316008316, + "grad_norm": 0.7176355123519897, + "learning_rate": 1.2008379374046696e-05, + "loss": 0.2242, + "num_input_tokens_seen": 2592384, + "step": 13595 + }, + { + "epoch": 7.0686070686070686, + "grad_norm": 0.28821152448654175, + "learning_rate": 1.198900897709486e-05, + "loss": 0.2547, + "num_input_tokens_seen": 2593344, + "step": 13600 + }, + { + "epoch": 7.0712058212058215, + "grad_norm": 0.4474780261516571, + "learning_rate": 1.1969649286826082e-05, + "loss": 0.2538, + "num_input_tokens_seen": 2594304, + "step": 13605 + }, + { + "epoch": 7.073804573804574, + "grad_norm": 0.46153151988983154, + "learning_rate": 1.195030031917138e-05, + "loss": 0.2672, + "num_input_tokens_seen": 2595232, + "step": 13610 + }, + { + "epoch": 7.076403326403327, + "grad_norm": 0.35570111870765686, + "learning_rate": 1.1930962090052918e-05, + "loss": 0.2655, + "num_input_tokens_seen": 2596128, + "step": 13615 + }, + { + "epoch": 7.079002079002079, + "grad_norm": 0.4865088164806366, + "learning_rate": 1.191163461538403e-05, + "loss": 0.1673, + "num_input_tokens_seen": 2597056, + "step": 13620 + }, + { + "epoch": 7.081600831600832, + "grad_norm": 0.7414199709892273, + "learning_rate": 1.1892317911069212e-05, + "loss": 0.1971, + "num_input_tokens_seen": 2598016, + "step": 13625 + }, + { + "epoch": 7.084199584199585, + "grad_norm": 0.27913904190063477, + "learning_rate": 1.1873011993004076e-05, + "loss": 0.288, + "num_input_tokens_seen": 2598976, + "step": 13630 + }, + { + "epoch": 7.086798336798337, + "grad_norm": 0.23184342682361603, + "learning_rate": 1.1853716877075392e-05, + "loss": 0.2893, + "num_input_tokens_seen": 2599936, + "step": 13635 + }, + { + "epoch": 7.08939708939709, + "grad_norm": 0.4829941391944885, + "learning_rate": 1.1834432579160996e-05, + "loss": 0.3176, + "num_input_tokens_seen": 2600896, + "step": 13640 + }, + { + "epoch": 7.091995841995842, + "grad_norm": 0.2394745796918869, + "learning_rate": 1.1815159115129865e-05, + "loss": 0.24, + "num_input_tokens_seen": 2601792, + "step": 13645 + }, + { + "epoch": 7.094594594594595, + "grad_norm": 0.4834596812725067, + "learning_rate": 1.1795896500842036e-05, + "loss": 0.3167, + "num_input_tokens_seen": 2602656, + "step": 13650 + }, + { + "epoch": 7.097193347193347, + "grad_norm": 0.2685593068599701, + "learning_rate": 1.1776644752148617e-05, + "loss": 0.3133, + "num_input_tokens_seen": 2603584, + "step": 13655 + }, + { + "epoch": 7.0997920997921, + "grad_norm": 0.4665239155292511, + "learning_rate": 1.175740388489178e-05, + "loss": 0.2703, + "num_input_tokens_seen": 2604512, + "step": 13660 + }, + { + "epoch": 7.102390852390853, + "grad_norm": 0.15655194222927094, + "learning_rate": 1.1738173914904754e-05, + "loss": 0.2655, + "num_input_tokens_seen": 2605472, + "step": 13665 + }, + { + "epoch": 7.104989604989605, + "grad_norm": 0.2792645990848541, + "learning_rate": 1.1718954858011777e-05, + "loss": 0.2398, + "num_input_tokens_seen": 2606464, + "step": 13670 + }, + { + "epoch": 7.107588357588358, + "grad_norm": 0.33537355065345764, + "learning_rate": 1.169974673002813e-05, + "loss": 0.2504, + "num_input_tokens_seen": 2607456, + "step": 13675 + }, + { + "epoch": 7.11018711018711, + "grad_norm": 0.513008177280426, + "learning_rate": 1.1680549546760108e-05, + "loss": 0.2448, + "num_input_tokens_seen": 2608448, + "step": 13680 + }, + { + "epoch": 7.112785862785863, + "grad_norm": 0.48004665970802307, + "learning_rate": 1.1661363324004943e-05, + "loss": 0.2946, + "num_input_tokens_seen": 2609376, + "step": 13685 + }, + { + "epoch": 7.115384615384615, + "grad_norm": 0.9724025726318359, + "learning_rate": 1.164218807755092e-05, + "loss": 0.2718, + "num_input_tokens_seen": 2610272, + "step": 13690 + }, + { + "epoch": 7.117983367983368, + "grad_norm": 0.22264830768108368, + "learning_rate": 1.1623023823177235e-05, + "loss": 0.2289, + "num_input_tokens_seen": 2611200, + "step": 13695 + }, + { + "epoch": 7.120582120582121, + "grad_norm": 0.3015475869178772, + "learning_rate": 1.1603870576654083e-05, + "loss": 0.2554, + "num_input_tokens_seen": 2612128, + "step": 13700 + }, + { + "epoch": 7.123180873180873, + "grad_norm": 0.29531732201576233, + "learning_rate": 1.1584728353742563e-05, + "loss": 0.2711, + "num_input_tokens_seen": 2613120, + "step": 13705 + }, + { + "epoch": 7.125779625779626, + "grad_norm": 0.20097607374191284, + "learning_rate": 1.1565597170194737e-05, + "loss": 0.271, + "num_input_tokens_seen": 2614112, + "step": 13710 + }, + { + "epoch": 7.128378378378378, + "grad_norm": 0.5531322360038757, + "learning_rate": 1.1546477041753553e-05, + "loss": 0.2093, + "num_input_tokens_seen": 2615040, + "step": 13715 + }, + { + "epoch": 7.130977130977131, + "grad_norm": 0.24337182939052582, + "learning_rate": 1.1527367984152872e-05, + "loss": 0.2067, + "num_input_tokens_seen": 2615968, + "step": 13720 + }, + { + "epoch": 7.133575883575883, + "grad_norm": 0.5274387001991272, + "learning_rate": 1.1508270013117465e-05, + "loss": 0.2712, + "num_input_tokens_seen": 2616896, + "step": 13725 + }, + { + "epoch": 7.136174636174636, + "grad_norm": 0.5354726314544678, + "learning_rate": 1.1489183144362936e-05, + "loss": 0.3055, + "num_input_tokens_seen": 2617824, + "step": 13730 + }, + { + "epoch": 7.138773388773389, + "grad_norm": 0.5101941227912903, + "learning_rate": 1.1470107393595805e-05, + "loss": 0.2907, + "num_input_tokens_seen": 2618816, + "step": 13735 + }, + { + "epoch": 7.141372141372141, + "grad_norm": 0.1784830242395401, + "learning_rate": 1.1451042776513396e-05, + "loss": 0.1872, + "num_input_tokens_seen": 2619776, + "step": 13740 + }, + { + "epoch": 7.143970893970894, + "grad_norm": 0.57330721616745, + "learning_rate": 1.1431989308803911e-05, + "loss": 0.2631, + "num_input_tokens_seen": 2620672, + "step": 13745 + }, + { + "epoch": 7.146569646569646, + "grad_norm": 0.6857166290283203, + "learning_rate": 1.141294700614635e-05, + "loss": 0.2221, + "num_input_tokens_seen": 2621728, + "step": 13750 + }, + { + "epoch": 7.149168399168399, + "grad_norm": 0.47492676973342896, + "learning_rate": 1.1393915884210523e-05, + "loss": 0.2327, + "num_input_tokens_seen": 2622720, + "step": 13755 + }, + { + "epoch": 7.151767151767152, + "grad_norm": 0.5189419984817505, + "learning_rate": 1.1374895958657073e-05, + "loss": 0.2057, + "num_input_tokens_seen": 2623648, + "step": 13760 + }, + { + "epoch": 7.154365904365904, + "grad_norm": 0.566721498966217, + "learning_rate": 1.1355887245137383e-05, + "loss": 0.2503, + "num_input_tokens_seen": 2624640, + "step": 13765 + }, + { + "epoch": 7.156964656964657, + "grad_norm": 0.3475683629512787, + "learning_rate": 1.1336889759293656e-05, + "loss": 0.2343, + "num_input_tokens_seen": 2625600, + "step": 13770 + }, + { + "epoch": 7.159563409563409, + "grad_norm": 0.24872730672359467, + "learning_rate": 1.1317903516758813e-05, + "loss": 0.2091, + "num_input_tokens_seen": 2626560, + "step": 13775 + }, + { + "epoch": 7.162162162162162, + "grad_norm": 0.5111477375030518, + "learning_rate": 1.1298928533156572e-05, + "loss": 0.2675, + "num_input_tokens_seen": 2627520, + "step": 13780 + }, + { + "epoch": 7.164760914760914, + "grad_norm": 0.2246561050415039, + "learning_rate": 1.1279964824101321e-05, + "loss": 0.2736, + "num_input_tokens_seen": 2628416, + "step": 13785 + }, + { + "epoch": 7.167359667359667, + "grad_norm": 0.482371985912323, + "learning_rate": 1.1261012405198231e-05, + "loss": 0.2932, + "num_input_tokens_seen": 2629376, + "step": 13790 + }, + { + "epoch": 7.16995841995842, + "grad_norm": 1.1080574989318848, + "learning_rate": 1.1242071292043144e-05, + "loss": 0.3701, + "num_input_tokens_seen": 2630336, + "step": 13795 + }, + { + "epoch": 7.172557172557172, + "grad_norm": 0.7168050408363342, + "learning_rate": 1.122314150022262e-05, + "loss": 0.3044, + "num_input_tokens_seen": 2631232, + "step": 13800 + }, + { + "epoch": 7.175155925155925, + "grad_norm": 0.15901440382003784, + "learning_rate": 1.1204223045313903e-05, + "loss": 0.2434, + "num_input_tokens_seen": 2632192, + "step": 13805 + }, + { + "epoch": 7.1777546777546775, + "grad_norm": 0.1920352280139923, + "learning_rate": 1.1185315942884883e-05, + "loss": 0.2163, + "num_input_tokens_seen": 2633088, + "step": 13810 + }, + { + "epoch": 7.18035343035343, + "grad_norm": 0.5989684462547302, + "learning_rate": 1.1166420208494155e-05, + "loss": 0.2302, + "num_input_tokens_seen": 2633984, + "step": 13815 + }, + { + "epoch": 7.182952182952183, + "grad_norm": 0.6869581341743469, + "learning_rate": 1.1147535857690889e-05, + "loss": 0.2438, + "num_input_tokens_seen": 2635008, + "step": 13820 + }, + { + "epoch": 7.1855509355509355, + "grad_norm": 0.6324297189712524, + "learning_rate": 1.1128662906014958e-05, + "loss": 0.2809, + "num_input_tokens_seen": 2635936, + "step": 13825 + }, + { + "epoch": 7.1881496881496885, + "grad_norm": 0.5092098116874695, + "learning_rate": 1.1109801368996806e-05, + "loss": 0.3025, + "num_input_tokens_seen": 2636864, + "step": 13830 + }, + { + "epoch": 7.1907484407484406, + "grad_norm": 0.7362660765647888, + "learning_rate": 1.109095126215752e-05, + "loss": 0.3192, + "num_input_tokens_seen": 2637792, + "step": 13835 + }, + { + "epoch": 7.1933471933471935, + "grad_norm": 0.5703468918800354, + "learning_rate": 1.1072112601008746e-05, + "loss": 0.212, + "num_input_tokens_seen": 2638656, + "step": 13840 + }, + { + "epoch": 7.195945945945946, + "grad_norm": 0.6362816691398621, + "learning_rate": 1.1053285401052749e-05, + "loss": 0.286, + "num_input_tokens_seen": 2639584, + "step": 13845 + }, + { + "epoch": 7.198544698544699, + "grad_norm": 0.15467604994773865, + "learning_rate": 1.1034469677782333e-05, + "loss": 0.2188, + "num_input_tokens_seen": 2640448, + "step": 13850 + }, + { + "epoch": 7.201143451143452, + "grad_norm": 0.523328959941864, + "learning_rate": 1.1015665446680859e-05, + "loss": 0.2762, + "num_input_tokens_seen": 2641344, + "step": 13855 + }, + { + "epoch": 7.203742203742204, + "grad_norm": 0.446111261844635, + "learning_rate": 1.0996872723222256e-05, + "loss": 0.2678, + "num_input_tokens_seen": 2642304, + "step": 13860 + }, + { + "epoch": 7.206340956340957, + "grad_norm": 0.8104705810546875, + "learning_rate": 1.0978091522870954e-05, + "loss": 0.2547, + "num_input_tokens_seen": 2643232, + "step": 13865 + }, + { + "epoch": 7.208939708939709, + "grad_norm": 0.16189736127853394, + "learning_rate": 1.0959321861081928e-05, + "loss": 0.2376, + "num_input_tokens_seen": 2644224, + "step": 13870 + }, + { + "epoch": 7.211538461538462, + "grad_norm": 0.5778164267539978, + "learning_rate": 1.0940563753300626e-05, + "loss": 0.214, + "num_input_tokens_seen": 2645152, + "step": 13875 + }, + { + "epoch": 7.214137214137214, + "grad_norm": 0.8122738599777222, + "learning_rate": 1.0921817214963026e-05, + "loss": 0.2848, + "num_input_tokens_seen": 2646144, + "step": 13880 + }, + { + "epoch": 7.216735966735967, + "grad_norm": 0.965513288974762, + "learning_rate": 1.0903082261495559e-05, + "loss": 0.2607, + "num_input_tokens_seen": 2647104, + "step": 13885 + }, + { + "epoch": 7.21933471933472, + "grad_norm": 0.4891218841075897, + "learning_rate": 1.0884358908315116e-05, + "loss": 0.3194, + "num_input_tokens_seen": 2648000, + "step": 13890 + }, + { + "epoch": 7.221933471933472, + "grad_norm": 0.3284003734588623, + "learning_rate": 1.0865647170829075e-05, + "loss": 0.2134, + "num_input_tokens_seen": 2648960, + "step": 13895 + }, + { + "epoch": 7.224532224532225, + "grad_norm": 0.14527484774589539, + "learning_rate": 1.084694706443522e-05, + "loss": 0.2502, + "num_input_tokens_seen": 2649856, + "step": 13900 + }, + { + "epoch": 7.227130977130977, + "grad_norm": 0.5365539193153381, + "learning_rate": 1.0828258604521798e-05, + "loss": 0.3123, + "num_input_tokens_seen": 2650752, + "step": 13905 + }, + { + "epoch": 7.22972972972973, + "grad_norm": 0.8454024791717529, + "learning_rate": 1.0809581806467447e-05, + "loss": 0.2827, + "num_input_tokens_seen": 2651776, + "step": 13910 + }, + { + "epoch": 7.232328482328482, + "grad_norm": 0.4215852618217468, + "learning_rate": 1.0790916685641211e-05, + "loss": 0.2405, + "num_input_tokens_seen": 2652672, + "step": 13915 + }, + { + "epoch": 7.234927234927235, + "grad_norm": 0.45110398530960083, + "learning_rate": 1.0772263257402526e-05, + "loss": 0.2254, + "num_input_tokens_seen": 2653568, + "step": 13920 + }, + { + "epoch": 7.237525987525988, + "grad_norm": 0.2665134072303772, + "learning_rate": 1.0753621537101216e-05, + "loss": 0.2742, + "num_input_tokens_seen": 2654528, + "step": 13925 + }, + { + "epoch": 7.24012474012474, + "grad_norm": 0.5227984189987183, + "learning_rate": 1.0734991540077474e-05, + "loss": 0.2356, + "num_input_tokens_seen": 2655488, + "step": 13930 + }, + { + "epoch": 7.242723492723493, + "grad_norm": 0.2613558769226074, + "learning_rate": 1.071637328166182e-05, + "loss": 0.2779, + "num_input_tokens_seen": 2656480, + "step": 13935 + }, + { + "epoch": 7.245322245322245, + "grad_norm": 0.5286238193511963, + "learning_rate": 1.0697766777175146e-05, + "loss": 0.1895, + "num_input_tokens_seen": 2657440, + "step": 13940 + }, + { + "epoch": 7.247920997920998, + "grad_norm": 0.536004900932312, + "learning_rate": 1.0679172041928654e-05, + "loss": 0.2277, + "num_input_tokens_seen": 2658400, + "step": 13945 + }, + { + "epoch": 7.25051975051975, + "grad_norm": 0.5360105037689209, + "learning_rate": 1.0660589091223855e-05, + "loss": 0.2483, + "num_input_tokens_seen": 2659392, + "step": 13950 + }, + { + "epoch": 7.253118503118503, + "grad_norm": 0.29925084114074707, + "learning_rate": 1.064201794035257e-05, + "loss": 0.1873, + "num_input_tokens_seen": 2660320, + "step": 13955 + }, + { + "epoch": 7.255717255717256, + "grad_norm": 0.4331764280796051, + "learning_rate": 1.0623458604596923e-05, + "loss": 0.2972, + "num_input_tokens_seen": 2661248, + "step": 13960 + }, + { + "epoch": 7.258316008316008, + "grad_norm": 0.25433996319770813, + "learning_rate": 1.0604911099229289e-05, + "loss": 0.2692, + "num_input_tokens_seen": 2662208, + "step": 13965 + }, + { + "epoch": 7.260914760914761, + "grad_norm": 0.18407195806503296, + "learning_rate": 1.0586375439512341e-05, + "loss": 0.3287, + "num_input_tokens_seen": 2663136, + "step": 13970 + }, + { + "epoch": 7.263513513513513, + "grad_norm": 0.5456838011741638, + "learning_rate": 1.0567851640698978e-05, + "loss": 0.3048, + "num_input_tokens_seen": 2664128, + "step": 13975 + }, + { + "epoch": 7.266112266112266, + "grad_norm": 0.546923816204071, + "learning_rate": 1.0549339718032336e-05, + "loss": 0.2741, + "num_input_tokens_seen": 2665088, + "step": 13980 + }, + { + "epoch": 7.268711018711019, + "grad_norm": 0.6112350821495056, + "learning_rate": 1.0530839686745805e-05, + "loss": 0.2655, + "num_input_tokens_seen": 2665984, + "step": 13985 + }, + { + "epoch": 7.271309771309771, + "grad_norm": 0.4855708181858063, + "learning_rate": 1.0512351562062958e-05, + "loss": 0.2315, + "num_input_tokens_seen": 2666912, + "step": 13990 + }, + { + "epoch": 7.273908523908524, + "grad_norm": 0.6615074872970581, + "learning_rate": 1.0493875359197599e-05, + "loss": 0.3171, + "num_input_tokens_seen": 2667872, + "step": 13995 + }, + { + "epoch": 7.276507276507276, + "grad_norm": 0.6814348697662354, + "learning_rate": 1.0475411093353698e-05, + "loss": 0.2341, + "num_input_tokens_seen": 2668832, + "step": 14000 + }, + { + "epoch": 7.279106029106029, + "grad_norm": 0.5918706059455872, + "learning_rate": 1.0456958779725426e-05, + "loss": 0.186, + "num_input_tokens_seen": 2669792, + "step": 14005 + }, + { + "epoch": 7.281704781704781, + "grad_norm": 0.2526942193508148, + "learning_rate": 1.0438518433497094e-05, + "loss": 0.2102, + "num_input_tokens_seen": 2670720, + "step": 14010 + }, + { + "epoch": 7.284303534303534, + "grad_norm": 0.5397124886512756, + "learning_rate": 1.0420090069843167e-05, + "loss": 0.2791, + "num_input_tokens_seen": 2671616, + "step": 14015 + }, + { + "epoch": 7.286902286902287, + "grad_norm": 0.5105023384094238, + "learning_rate": 1.0401673703928278e-05, + "loss": 0.2547, + "num_input_tokens_seen": 2672608, + "step": 14020 + }, + { + "epoch": 7.289501039501039, + "grad_norm": 0.2757166028022766, + "learning_rate": 1.0383269350907152e-05, + "loss": 0.3247, + "num_input_tokens_seen": 2673536, + "step": 14025 + }, + { + "epoch": 7.292099792099792, + "grad_norm": 0.48254674673080444, + "learning_rate": 1.0364877025924658e-05, + "loss": 0.2959, + "num_input_tokens_seen": 2674496, + "step": 14030 + }, + { + "epoch": 7.294698544698544, + "grad_norm": 0.5300012230873108, + "learning_rate": 1.0346496744115736e-05, + "loss": 0.2904, + "num_input_tokens_seen": 2675392, + "step": 14035 + }, + { + "epoch": 7.297297297297297, + "grad_norm": 0.647004246711731, + "learning_rate": 1.032812852060546e-05, + "loss": 0.2964, + "num_input_tokens_seen": 2676288, + "step": 14040 + }, + { + "epoch": 7.29989604989605, + "grad_norm": 0.27710989117622375, + "learning_rate": 1.0309772370508922e-05, + "loss": 0.2588, + "num_input_tokens_seen": 2677248, + "step": 14045 + }, + { + "epoch": 7.302494802494802, + "grad_norm": 0.4014013707637787, + "learning_rate": 1.0291428308931325e-05, + "loss": 0.3132, + "num_input_tokens_seen": 2678208, + "step": 14050 + }, + { + "epoch": 7.305093555093555, + "grad_norm": 0.2422640025615692, + "learning_rate": 1.027309635096792e-05, + "loss": 0.2581, + "num_input_tokens_seen": 2679136, + "step": 14055 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 0.15279555320739746, + "learning_rate": 1.0254776511703976e-05, + "loss": 0.2396, + "num_input_tokens_seen": 2680032, + "step": 14060 + }, + { + "epoch": 7.3102910602910605, + "grad_norm": 0.5861887335777283, + "learning_rate": 1.023646880621481e-05, + "loss": 0.2805, + "num_input_tokens_seen": 2681056, + "step": 14065 + }, + { + "epoch": 7.3128898128898125, + "grad_norm": 0.3075284957885742, + "learning_rate": 1.0218173249565741e-05, + "loss": 0.2476, + "num_input_tokens_seen": 2682016, + "step": 14070 + }, + { + "epoch": 7.3154885654885655, + "grad_norm": 0.3302763104438782, + "learning_rate": 1.0199889856812112e-05, + "loss": 0.2619, + "num_input_tokens_seen": 2683008, + "step": 14075 + }, + { + "epoch": 7.3180873180873185, + "grad_norm": 0.6038004755973816, + "learning_rate": 1.018161864299921e-05, + "loss": 0.2682, + "num_input_tokens_seen": 2683936, + "step": 14080 + }, + { + "epoch": 7.320686070686071, + "grad_norm": 0.4965473711490631, + "learning_rate": 1.0163359623162357e-05, + "loss": 0.1983, + "num_input_tokens_seen": 2684928, + "step": 14085 + }, + { + "epoch": 7.3232848232848236, + "grad_norm": 0.5294256210327148, + "learning_rate": 1.0145112812326799e-05, + "loss": 0.2101, + "num_input_tokens_seen": 2685856, + "step": 14090 + }, + { + "epoch": 7.325883575883576, + "grad_norm": 0.4988727867603302, + "learning_rate": 1.0126878225507761e-05, + "loss": 0.2827, + "num_input_tokens_seen": 2686784, + "step": 14095 + }, + { + "epoch": 7.328482328482329, + "grad_norm": 0.26366233825683594, + "learning_rate": 1.0108655877710386e-05, + "loss": 0.2288, + "num_input_tokens_seen": 2687776, + "step": 14100 + }, + { + "epoch": 7.331081081081081, + "grad_norm": 0.319663405418396, + "learning_rate": 1.0090445783929774e-05, + "loss": 0.2455, + "num_input_tokens_seen": 2688672, + "step": 14105 + }, + { + "epoch": 7.333679833679834, + "grad_norm": 0.5028585195541382, + "learning_rate": 1.0072247959150919e-05, + "loss": 0.2214, + "num_input_tokens_seen": 2689696, + "step": 14110 + }, + { + "epoch": 7.336278586278587, + "grad_norm": 0.47158822417259216, + "learning_rate": 1.0054062418348714e-05, + "loss": 0.2361, + "num_input_tokens_seen": 2690624, + "step": 14115 + }, + { + "epoch": 7.338877338877339, + "grad_norm": 0.4208287000656128, + "learning_rate": 1.0035889176487973e-05, + "loss": 0.2013, + "num_input_tokens_seen": 2691584, + "step": 14120 + }, + { + "epoch": 7.341476091476092, + "grad_norm": 0.27884045243263245, + "learning_rate": 1.001772824852335e-05, + "loss": 0.2468, + "num_input_tokens_seen": 2692576, + "step": 14125 + }, + { + "epoch": 7.344074844074844, + "grad_norm": 0.597190797328949, + "learning_rate": 9.999579649399408e-06, + "loss": 0.2748, + "num_input_tokens_seen": 2693568, + "step": 14130 + }, + { + "epoch": 7.346673596673597, + "grad_norm": 0.4900455176830292, + "learning_rate": 9.981443394050525e-06, + "loss": 0.348, + "num_input_tokens_seen": 2694528, + "step": 14135 + }, + { + "epoch": 7.349272349272349, + "grad_norm": 0.508175253868103, + "learning_rate": 9.963319497400957e-06, + "loss": 0.2201, + "num_input_tokens_seen": 2695520, + "step": 14140 + }, + { + "epoch": 7.351871101871102, + "grad_norm": 0.5494135022163391, + "learning_rate": 9.945207974364768e-06, + "loss": 0.2289, + "num_input_tokens_seen": 2696448, + "step": 14145 + }, + { + "epoch": 7.354469854469855, + "grad_norm": 0.3335646390914917, + "learning_rate": 9.92710883984583e-06, + "loss": 0.3086, + "num_input_tokens_seen": 2697504, + "step": 14150 + }, + { + "epoch": 7.357068607068607, + "grad_norm": 0.8535512685775757, + "learning_rate": 9.909022108737856e-06, + "loss": 0.269, + "num_input_tokens_seen": 2698464, + "step": 14155 + }, + { + "epoch": 7.35966735966736, + "grad_norm": 0.14061585068702698, + "learning_rate": 9.890947795924313e-06, + "loss": 0.2266, + "num_input_tokens_seen": 2699392, + "step": 14160 + }, + { + "epoch": 7.362266112266112, + "grad_norm": 0.20226357877254486, + "learning_rate": 9.872885916278488e-06, + "loss": 0.2945, + "num_input_tokens_seen": 2700320, + "step": 14165 + }, + { + "epoch": 7.364864864864865, + "grad_norm": 0.5283157825469971, + "learning_rate": 9.854836484663404e-06, + "loss": 0.2882, + "num_input_tokens_seen": 2701280, + "step": 14170 + }, + { + "epoch": 7.367463617463617, + "grad_norm": 0.5314437747001648, + "learning_rate": 9.836799515931847e-06, + "loss": 0.2357, + "num_input_tokens_seen": 2702304, + "step": 14175 + }, + { + "epoch": 7.37006237006237, + "grad_norm": 1.0417439937591553, + "learning_rate": 9.818775024926369e-06, + "loss": 0.2692, + "num_input_tokens_seen": 2703360, + "step": 14180 + }, + { + "epoch": 7.372661122661123, + "grad_norm": 0.47171449661254883, + "learning_rate": 9.80076302647922e-06, + "loss": 0.2337, + "num_input_tokens_seen": 2704320, + "step": 14185 + }, + { + "epoch": 7.375259875259875, + "grad_norm": 0.24723322689533234, + "learning_rate": 9.782763535412409e-06, + "loss": 0.2892, + "num_input_tokens_seen": 2705280, + "step": 14190 + }, + { + "epoch": 7.377858627858628, + "grad_norm": 0.5769591927528381, + "learning_rate": 9.764776566537615e-06, + "loss": 0.1958, + "num_input_tokens_seen": 2706144, + "step": 14195 + }, + { + "epoch": 7.38045738045738, + "grad_norm": 0.5689525008201599, + "learning_rate": 9.746802134656245e-06, + "loss": 0.2525, + "num_input_tokens_seen": 2707168, + "step": 14200 + }, + { + "epoch": 7.383056133056133, + "grad_norm": 1.0256725549697876, + "learning_rate": 9.728840254559366e-06, + "loss": 0.3021, + "num_input_tokens_seen": 2708160, + "step": 14205 + }, + { + "epoch": 7.385654885654886, + "grad_norm": 0.5822081565856934, + "learning_rate": 9.710890941027722e-06, + "loss": 0.2925, + "num_input_tokens_seen": 2709216, + "step": 14210 + }, + { + "epoch": 7.388253638253638, + "grad_norm": 0.2985183298587799, + "learning_rate": 9.692954208831714e-06, + "loss": 0.3128, + "num_input_tokens_seen": 2710176, + "step": 14215 + }, + { + "epoch": 7.390852390852391, + "grad_norm": 0.6186666488647461, + "learning_rate": 9.67503007273141e-06, + "loss": 0.2743, + "num_input_tokens_seen": 2711040, + "step": 14220 + }, + { + "epoch": 7.393451143451143, + "grad_norm": 0.7077853679656982, + "learning_rate": 9.65711854747648e-06, + "loss": 0.2468, + "num_input_tokens_seen": 2711968, + "step": 14225 + }, + { + "epoch": 7.396049896049896, + "grad_norm": 0.4802187383174896, + "learning_rate": 9.639219647806239e-06, + "loss": 0.2756, + "num_input_tokens_seen": 2712928, + "step": 14230 + }, + { + "epoch": 7.398648648648648, + "grad_norm": 0.41590017080307007, + "learning_rate": 9.621333388449619e-06, + "loss": 0.2405, + "num_input_tokens_seen": 2713856, + "step": 14235 + }, + { + "epoch": 7.401247401247401, + "grad_norm": 0.6558035016059875, + "learning_rate": 9.60345978412513e-06, + "loss": 0.2644, + "num_input_tokens_seen": 2714816, + "step": 14240 + }, + { + "epoch": 7.403846153846154, + "grad_norm": 0.7530800700187683, + "learning_rate": 9.585598849540874e-06, + "loss": 0.2435, + "num_input_tokens_seen": 2715808, + "step": 14245 + }, + { + "epoch": 7.406444906444906, + "grad_norm": 0.26563695073127747, + "learning_rate": 9.567750599394524e-06, + "loss": 0.2929, + "num_input_tokens_seen": 2716896, + "step": 14250 + }, + { + "epoch": 7.409043659043659, + "grad_norm": 0.323307067155838, + "learning_rate": 9.549915048373334e-06, + "loss": 0.262, + "num_input_tokens_seen": 2717792, + "step": 14255 + }, + { + "epoch": 7.411642411642411, + "grad_norm": 0.1812925934791565, + "learning_rate": 9.532092211154082e-06, + "loss": 0.2519, + "num_input_tokens_seen": 2718720, + "step": 14260 + }, + { + "epoch": 7.414241164241164, + "grad_norm": 0.7130565643310547, + "learning_rate": 9.51428210240311e-06, + "loss": 0.3017, + "num_input_tokens_seen": 2719776, + "step": 14265 + }, + { + "epoch": 7.416839916839917, + "grad_norm": 0.6552014946937561, + "learning_rate": 9.496484736776267e-06, + "loss": 0.2897, + "num_input_tokens_seen": 2720704, + "step": 14270 + }, + { + "epoch": 7.419438669438669, + "grad_norm": 0.6138710975646973, + "learning_rate": 9.47870012891891e-06, + "loss": 0.2558, + "num_input_tokens_seen": 2721664, + "step": 14275 + }, + { + "epoch": 7.422037422037422, + "grad_norm": 0.3483041524887085, + "learning_rate": 9.46092829346593e-06, + "loss": 0.2167, + "num_input_tokens_seen": 2722720, + "step": 14280 + }, + { + "epoch": 7.424636174636174, + "grad_norm": 0.8081187605857849, + "learning_rate": 9.443169245041664e-06, + "loss": 0.2523, + "num_input_tokens_seen": 2723776, + "step": 14285 + }, + { + "epoch": 7.427234927234927, + "grad_norm": 0.14765658974647522, + "learning_rate": 9.425422998259966e-06, + "loss": 0.2472, + "num_input_tokens_seen": 2724640, + "step": 14290 + }, + { + "epoch": 7.4298336798336795, + "grad_norm": 0.5210628509521484, + "learning_rate": 9.407689567724129e-06, + "loss": 0.2686, + "num_input_tokens_seen": 2725600, + "step": 14295 + }, + { + "epoch": 7.4324324324324325, + "grad_norm": 0.15248380601406097, + "learning_rate": 9.389968968026919e-06, + "loss": 0.2212, + "num_input_tokens_seen": 2726560, + "step": 14300 + }, + { + "epoch": 7.435031185031185, + "grad_norm": 0.16912174224853516, + "learning_rate": 9.372261213750528e-06, + "loss": 0.2743, + "num_input_tokens_seen": 2727520, + "step": 14305 + }, + { + "epoch": 7.4376299376299375, + "grad_norm": 0.3773612082004547, + "learning_rate": 9.354566319466573e-06, + "loss": 0.2588, + "num_input_tokens_seen": 2728544, + "step": 14310 + }, + { + "epoch": 7.4402286902286905, + "grad_norm": 0.43433719873428345, + "learning_rate": 9.33688429973612e-06, + "loss": 0.2274, + "num_input_tokens_seen": 2729408, + "step": 14315 + }, + { + "epoch": 7.442827442827443, + "grad_norm": 0.43097880482673645, + "learning_rate": 9.3192151691096e-06, + "loss": 0.282, + "num_input_tokens_seen": 2730368, + "step": 14320 + }, + { + "epoch": 7.4454261954261955, + "grad_norm": 0.46828797459602356, + "learning_rate": 9.301558942126872e-06, + "loss": 0.2627, + "num_input_tokens_seen": 2731296, + "step": 14325 + }, + { + "epoch": 7.448024948024948, + "grad_norm": 0.4648362696170807, + "learning_rate": 9.28391563331715e-06, + "loss": 0.2879, + "num_input_tokens_seen": 2732160, + "step": 14330 + }, + { + "epoch": 7.450623700623701, + "grad_norm": 0.5661013722419739, + "learning_rate": 9.266285257199051e-06, + "loss": 0.2242, + "num_input_tokens_seen": 2733120, + "step": 14335 + }, + { + "epoch": 7.453222453222454, + "grad_norm": 0.44374367594718933, + "learning_rate": 9.248667828280493e-06, + "loss": 0.2122, + "num_input_tokens_seen": 2734048, + "step": 14340 + }, + { + "epoch": 7.455821205821206, + "grad_norm": 0.4740055501461029, + "learning_rate": 9.231063361058806e-06, + "loss": 0.2892, + "num_input_tokens_seen": 2735040, + "step": 14345 + }, + { + "epoch": 7.458419958419959, + "grad_norm": 0.10726796090602875, + "learning_rate": 9.213471870020601e-06, + "loss": 0.2258, + "num_input_tokens_seen": 2735968, + "step": 14350 + }, + { + "epoch": 7.461018711018711, + "grad_norm": 0.08521643280982971, + "learning_rate": 9.195893369641841e-06, + "loss": 0.2452, + "num_input_tokens_seen": 2736928, + "step": 14355 + }, + { + "epoch": 7.463617463617464, + "grad_norm": 0.6412904858589172, + "learning_rate": 9.178327874387807e-06, + "loss": 0.2659, + "num_input_tokens_seen": 2737888, + "step": 14360 + }, + { + "epoch": 7.466216216216216, + "grad_norm": 0.21919231116771698, + "learning_rate": 9.160775398713037e-06, + "loss": 0.2634, + "num_input_tokens_seen": 2738880, + "step": 14365 + }, + { + "epoch": 7.468814968814969, + "grad_norm": 0.5282589793205261, + "learning_rate": 9.143235957061407e-06, + "loss": 0.2184, + "num_input_tokens_seen": 2739840, + "step": 14370 + }, + { + "epoch": 7.471413721413722, + "grad_norm": 0.20567086338996887, + "learning_rate": 9.12570956386601e-06, + "loss": 0.2479, + "num_input_tokens_seen": 2740832, + "step": 14375 + }, + { + "epoch": 7.474012474012474, + "grad_norm": 0.5538673996925354, + "learning_rate": 9.108196233549256e-06, + "loss": 0.2205, + "num_input_tokens_seen": 2741728, + "step": 14380 + }, + { + "epoch": 7.476611226611227, + "grad_norm": 0.45367151498794556, + "learning_rate": 9.090695980522765e-06, + "loss": 0.3694, + "num_input_tokens_seen": 2742656, + "step": 14385 + }, + { + "epoch": 7.479209979209979, + "grad_norm": 0.5914590358734131, + "learning_rate": 9.073208819187429e-06, + "loss": 0.3041, + "num_input_tokens_seen": 2743680, + "step": 14390 + }, + { + "epoch": 7.481808731808732, + "grad_norm": 0.6434351801872253, + "learning_rate": 9.055734763933335e-06, + "loss": 0.2918, + "num_input_tokens_seen": 2744768, + "step": 14395 + }, + { + "epoch": 7.484407484407485, + "grad_norm": 0.43350544571876526, + "learning_rate": 9.038273829139816e-06, + "loss": 0.3064, + "num_input_tokens_seen": 2745792, + "step": 14400 + }, + { + "epoch": 7.487006237006237, + "grad_norm": 0.4218592941761017, + "learning_rate": 9.020826029175384e-06, + "loss": 0.2534, + "num_input_tokens_seen": 2746752, + "step": 14405 + }, + { + "epoch": 7.48960498960499, + "grad_norm": 0.3164967894554138, + "learning_rate": 9.00339137839774e-06, + "loss": 0.2808, + "num_input_tokens_seen": 2747776, + "step": 14410 + }, + { + "epoch": 7.492203742203742, + "grad_norm": 0.7335985898971558, + "learning_rate": 8.985969891153801e-06, + "loss": 0.2561, + "num_input_tokens_seen": 2748768, + "step": 14415 + }, + { + "epoch": 7.494802494802495, + "grad_norm": 0.4494043290615082, + "learning_rate": 8.968561581779602e-06, + "loss": 0.2752, + "num_input_tokens_seen": 2749760, + "step": 14420 + }, + { + "epoch": 7.497401247401247, + "grad_norm": 0.67536860704422, + "learning_rate": 8.95116646460038e-06, + "loss": 0.2354, + "num_input_tokens_seen": 2750688, + "step": 14425 + }, + { + "epoch": 7.5, + "grad_norm": 0.5181062817573547, + "learning_rate": 8.933784553930478e-06, + "loss": 0.2712, + "num_input_tokens_seen": 2751584, + "step": 14430 + }, + { + "epoch": 7.5, + "eval_loss": 0.2527921795845032, + "eval_runtime": 7.9507, + "eval_samples_per_second": 107.663, + "eval_steps_per_second": 26.916, + "num_input_tokens_seen": 2751584, + "step": 14430 + }, + { + "epoch": 7.502598752598753, + "grad_norm": 0.35381069779396057, + "learning_rate": 8.9164158640734e-06, + "loss": 0.2835, + "num_input_tokens_seen": 2752512, + "step": 14435 + }, + { + "epoch": 7.505197505197505, + "grad_norm": 0.41863635182380676, + "learning_rate": 8.899060409321755e-06, + "loss": 0.2829, + "num_input_tokens_seen": 2753440, + "step": 14440 + }, + { + "epoch": 7.507796257796258, + "grad_norm": 0.6599665284156799, + "learning_rate": 8.881718203957254e-06, + "loss": 0.2174, + "num_input_tokens_seen": 2754400, + "step": 14445 + }, + { + "epoch": 7.51039501039501, + "grad_norm": 0.13941404223442078, + "learning_rate": 8.864389262250732e-06, + "loss": 0.2889, + "num_input_tokens_seen": 2755360, + "step": 14450 + }, + { + "epoch": 7.512993762993763, + "grad_norm": 0.23385325074195862, + "learning_rate": 8.847073598462082e-06, + "loss": 0.2559, + "num_input_tokens_seen": 2756320, + "step": 14455 + }, + { + "epoch": 7.515592515592516, + "grad_norm": 0.5908382534980774, + "learning_rate": 8.829771226840294e-06, + "loss": 0.2211, + "num_input_tokens_seen": 2757280, + "step": 14460 + }, + { + "epoch": 7.518191268191268, + "grad_norm": 0.5762922167778015, + "learning_rate": 8.8124821616234e-06, + "loss": 0.2372, + "num_input_tokens_seen": 2758304, + "step": 14465 + }, + { + "epoch": 7.520790020790021, + "grad_norm": 0.3286007344722748, + "learning_rate": 8.79520641703849e-06, + "loss": 0.252, + "num_input_tokens_seen": 2759264, + "step": 14470 + }, + { + "epoch": 7.523388773388773, + "grad_norm": 0.5192843079566956, + "learning_rate": 8.777944007301686e-06, + "loss": 0.2875, + "num_input_tokens_seen": 2760288, + "step": 14475 + }, + { + "epoch": 7.525987525987526, + "grad_norm": 0.3141108751296997, + "learning_rate": 8.760694946618151e-06, + "loss": 0.2992, + "num_input_tokens_seen": 2761248, + "step": 14480 + }, + { + "epoch": 7.528586278586278, + "grad_norm": 0.1834563910961151, + "learning_rate": 8.74345924918206e-06, + "loss": 0.2667, + "num_input_tokens_seen": 2762208, + "step": 14485 + }, + { + "epoch": 7.531185031185031, + "grad_norm": 0.6514828205108643, + "learning_rate": 8.726236929176576e-06, + "loss": 0.2118, + "num_input_tokens_seen": 2763104, + "step": 14490 + }, + { + "epoch": 7.533783783783784, + "grad_norm": 0.35649800300598145, + "learning_rate": 8.70902800077388e-06, + "loss": 0.2688, + "num_input_tokens_seen": 2764032, + "step": 14495 + }, + { + "epoch": 7.536382536382536, + "grad_norm": 0.36438843607902527, + "learning_rate": 8.69183247813511e-06, + "loss": 0.2685, + "num_input_tokens_seen": 2764992, + "step": 14500 + }, + { + "epoch": 7.538981288981289, + "grad_norm": 1.06179940700531, + "learning_rate": 8.67465037541038e-06, + "loss": 0.2713, + "num_input_tokens_seen": 2765984, + "step": 14505 + }, + { + "epoch": 7.541580041580041, + "grad_norm": 0.8098747730255127, + "learning_rate": 8.657481706738749e-06, + "loss": 0.2345, + "num_input_tokens_seen": 2766944, + "step": 14510 + }, + { + "epoch": 7.544178794178794, + "grad_norm": 0.27579134702682495, + "learning_rate": 8.640326486248254e-06, + "loss": 0.2394, + "num_input_tokens_seen": 2767968, + "step": 14515 + }, + { + "epoch": 7.546777546777546, + "grad_norm": 0.4976500868797302, + "learning_rate": 8.623184728055828e-06, + "loss": 0.2548, + "num_input_tokens_seen": 2768896, + "step": 14520 + }, + { + "epoch": 7.549376299376299, + "grad_norm": 0.5289362072944641, + "learning_rate": 8.60605644626736e-06, + "loss": 0.2322, + "num_input_tokens_seen": 2769856, + "step": 14525 + }, + { + "epoch": 7.551975051975052, + "grad_norm": 0.6962937712669373, + "learning_rate": 8.588941654977622e-06, + "loss": 0.2577, + "num_input_tokens_seen": 2770848, + "step": 14530 + }, + { + "epoch": 7.5545738045738045, + "grad_norm": 0.46438759565353394, + "learning_rate": 8.571840368270287e-06, + "loss": 0.2419, + "num_input_tokens_seen": 2771840, + "step": 14535 + }, + { + "epoch": 7.557172557172557, + "grad_norm": 0.6509832739830017, + "learning_rate": 8.554752600217941e-06, + "loss": 0.2236, + "num_input_tokens_seen": 2772768, + "step": 14540 + }, + { + "epoch": 7.5597713097713095, + "grad_norm": 0.19377896189689636, + "learning_rate": 8.537678364882013e-06, + "loss": 0.2591, + "num_input_tokens_seen": 2773792, + "step": 14545 + }, + { + "epoch": 7.5623700623700625, + "grad_norm": 0.523945689201355, + "learning_rate": 8.52061767631282e-06, + "loss": 0.2366, + "num_input_tokens_seen": 2774688, + "step": 14550 + }, + { + "epoch": 7.564968814968815, + "grad_norm": 0.5893678665161133, + "learning_rate": 8.503570548549511e-06, + "loss": 0.2423, + "num_input_tokens_seen": 2775648, + "step": 14555 + }, + { + "epoch": 7.5675675675675675, + "grad_norm": 0.34912896156311035, + "learning_rate": 8.486536995620103e-06, + "loss": 0.2391, + "num_input_tokens_seen": 2776544, + "step": 14560 + }, + { + "epoch": 7.5701663201663205, + "grad_norm": 0.15650367736816406, + "learning_rate": 8.469517031541419e-06, + "loss": 0.2828, + "num_input_tokens_seen": 2777504, + "step": 14565 + }, + { + "epoch": 7.572765072765073, + "grad_norm": 0.5498448610305786, + "learning_rate": 8.452510670319094e-06, + "loss": 0.2718, + "num_input_tokens_seen": 2778496, + "step": 14570 + }, + { + "epoch": 7.575363825363826, + "grad_norm": 0.25440001487731934, + "learning_rate": 8.435517925947606e-06, + "loss": 0.2787, + "num_input_tokens_seen": 2779456, + "step": 14575 + }, + { + "epoch": 7.577962577962578, + "grad_norm": 0.45510804653167725, + "learning_rate": 8.418538812410182e-06, + "loss": 0.2751, + "num_input_tokens_seen": 2780416, + "step": 14580 + }, + { + "epoch": 7.580561330561331, + "grad_norm": 0.545515775680542, + "learning_rate": 8.40157334367887e-06, + "loss": 0.2735, + "num_input_tokens_seen": 2781376, + "step": 14585 + }, + { + "epoch": 7.583160083160083, + "grad_norm": 0.38539958000183105, + "learning_rate": 8.384621533714462e-06, + "loss": 0.2393, + "num_input_tokens_seen": 2782400, + "step": 14590 + }, + { + "epoch": 7.585758835758836, + "grad_norm": 0.14502368867397308, + "learning_rate": 8.367683396466547e-06, + "loss": 0.2723, + "num_input_tokens_seen": 2783296, + "step": 14595 + }, + { + "epoch": 7.588357588357589, + "grad_norm": 0.36461788415908813, + "learning_rate": 8.350758945873401e-06, + "loss": 0.2701, + "num_input_tokens_seen": 2784256, + "step": 14600 + }, + { + "epoch": 7.590956340956341, + "grad_norm": 0.13259409368038177, + "learning_rate": 8.333848195862093e-06, + "loss": 0.1938, + "num_input_tokens_seen": 2785216, + "step": 14605 + }, + { + "epoch": 7.593555093555094, + "grad_norm": 0.30025550723075867, + "learning_rate": 8.31695116034841e-06, + "loss": 0.2774, + "num_input_tokens_seen": 2786208, + "step": 14610 + }, + { + "epoch": 7.596153846153846, + "grad_norm": 0.61702561378479, + "learning_rate": 8.300067853236823e-06, + "loss": 0.2422, + "num_input_tokens_seen": 2787200, + "step": 14615 + }, + { + "epoch": 7.598752598752599, + "grad_norm": 0.47157007455825806, + "learning_rate": 8.283198288420543e-06, + "loss": 0.1983, + "num_input_tokens_seen": 2788064, + "step": 14620 + }, + { + "epoch": 7.601351351351351, + "grad_norm": 0.6109801530838013, + "learning_rate": 8.26634247978144e-06, + "loss": 0.1778, + "num_input_tokens_seen": 2788928, + "step": 14625 + }, + { + "epoch": 7.603950103950104, + "grad_norm": 0.2698463499546051, + "learning_rate": 8.249500441190104e-06, + "loss": 0.2346, + "num_input_tokens_seen": 2789856, + "step": 14630 + }, + { + "epoch": 7.606548856548857, + "grad_norm": 0.8377765417098999, + "learning_rate": 8.232672186505733e-06, + "loss": 0.2403, + "num_input_tokens_seen": 2790720, + "step": 14635 + }, + { + "epoch": 7.609147609147609, + "grad_norm": 0.29322391748428345, + "learning_rate": 8.215857729576245e-06, + "loss": 0.2159, + "num_input_tokens_seen": 2791712, + "step": 14640 + }, + { + "epoch": 7.611746361746362, + "grad_norm": 0.8188920617103577, + "learning_rate": 8.199057084238165e-06, + "loss": 0.3434, + "num_input_tokens_seen": 2792672, + "step": 14645 + }, + { + "epoch": 7.614345114345114, + "grad_norm": 0.5677478313446045, + "learning_rate": 8.182270264316674e-06, + "loss": 0.2642, + "num_input_tokens_seen": 2793600, + "step": 14650 + }, + { + "epoch": 7.616943866943867, + "grad_norm": 0.4944738447666168, + "learning_rate": 8.165497283625554e-06, + "loss": 0.197, + "num_input_tokens_seen": 2794464, + "step": 14655 + }, + { + "epoch": 7.61954261954262, + "grad_norm": 0.597891092300415, + "learning_rate": 8.14873815596722e-06, + "loss": 0.2707, + "num_input_tokens_seen": 2795456, + "step": 14660 + }, + { + "epoch": 7.622141372141372, + "grad_norm": 0.35452184081077576, + "learning_rate": 8.131992895132693e-06, + "loss": 0.2471, + "num_input_tokens_seen": 2796384, + "step": 14665 + }, + { + "epoch": 7.624740124740125, + "grad_norm": 0.503668487071991, + "learning_rate": 8.11526151490154e-06, + "loss": 0.2932, + "num_input_tokens_seen": 2797280, + "step": 14670 + }, + { + "epoch": 7.627338877338877, + "grad_norm": 0.5252199769020081, + "learning_rate": 8.098544029041955e-06, + "loss": 0.2693, + "num_input_tokens_seen": 2798240, + "step": 14675 + }, + { + "epoch": 7.62993762993763, + "grad_norm": 0.4936702251434326, + "learning_rate": 8.081840451310666e-06, + "loss": 0.214, + "num_input_tokens_seen": 2799168, + "step": 14680 + }, + { + "epoch": 7.632536382536383, + "grad_norm": 0.25110360980033875, + "learning_rate": 8.065150795452983e-06, + "loss": 0.2851, + "num_input_tokens_seen": 2800128, + "step": 14685 + }, + { + "epoch": 7.635135135135135, + "grad_norm": 0.25441744923591614, + "learning_rate": 8.048475075202727e-06, + "loss": 0.2745, + "num_input_tokens_seen": 2801024, + "step": 14690 + }, + { + "epoch": 7.637733887733888, + "grad_norm": 0.6252768039703369, + "learning_rate": 8.031813304282287e-06, + "loss": 0.2552, + "num_input_tokens_seen": 2801952, + "step": 14695 + }, + { + "epoch": 7.64033264033264, + "grad_norm": 0.7559264898300171, + "learning_rate": 8.015165496402549e-06, + "loss": 0.2694, + "num_input_tokens_seen": 2802944, + "step": 14700 + }, + { + "epoch": 7.642931392931393, + "grad_norm": 0.3222495913505554, + "learning_rate": 7.998531665262907e-06, + "loss": 0.2291, + "num_input_tokens_seen": 2803936, + "step": 14705 + }, + { + "epoch": 7.645530145530145, + "grad_norm": 0.38148489594459534, + "learning_rate": 7.981911824551274e-06, + "loss": 0.2912, + "num_input_tokens_seen": 2804832, + "step": 14710 + }, + { + "epoch": 7.648128898128898, + "grad_norm": 0.5480355620384216, + "learning_rate": 7.965305987944027e-06, + "loss": 0.2409, + "num_input_tokens_seen": 2805856, + "step": 14715 + }, + { + "epoch": 7.650727650727651, + "grad_norm": 0.40972772240638733, + "learning_rate": 7.948714169106048e-06, + "loss": 0.3077, + "num_input_tokens_seen": 2806880, + "step": 14720 + }, + { + "epoch": 7.653326403326403, + "grad_norm": 0.5451943278312683, + "learning_rate": 7.932136381690644e-06, + "loss": 0.2859, + "num_input_tokens_seen": 2807840, + "step": 14725 + }, + { + "epoch": 7.655925155925156, + "grad_norm": 0.21562369167804718, + "learning_rate": 7.91557263933962e-06, + "loss": 0.2562, + "num_input_tokens_seen": 2808832, + "step": 14730 + }, + { + "epoch": 7.658523908523908, + "grad_norm": 0.18361343443393707, + "learning_rate": 7.899022955683188e-06, + "loss": 0.2739, + "num_input_tokens_seen": 2809824, + "step": 14735 + }, + { + "epoch": 7.661122661122661, + "grad_norm": 0.21176238358020782, + "learning_rate": 7.88248734434e-06, + "loss": 0.2466, + "num_input_tokens_seen": 2810848, + "step": 14740 + }, + { + "epoch": 7.663721413721413, + "grad_norm": 0.20151501893997192, + "learning_rate": 7.865965818917149e-06, + "loss": 0.2474, + "num_input_tokens_seen": 2811744, + "step": 14745 + }, + { + "epoch": 7.666320166320166, + "grad_norm": 0.16004450619220734, + "learning_rate": 7.849458393010103e-06, + "loss": 0.2788, + "num_input_tokens_seen": 2812672, + "step": 14750 + }, + { + "epoch": 7.668918918918919, + "grad_norm": 0.2737521529197693, + "learning_rate": 7.832965080202762e-06, + "loss": 0.2806, + "num_input_tokens_seen": 2813632, + "step": 14755 + }, + { + "epoch": 7.671517671517671, + "grad_norm": 0.36251989006996155, + "learning_rate": 7.816485894067382e-06, + "loss": 0.3398, + "num_input_tokens_seen": 2814592, + "step": 14760 + }, + { + "epoch": 7.674116424116424, + "grad_norm": 0.6586429476737976, + "learning_rate": 7.800020848164615e-06, + "loss": 0.2689, + "num_input_tokens_seen": 2815520, + "step": 14765 + }, + { + "epoch": 7.6767151767151764, + "grad_norm": 0.30106234550476074, + "learning_rate": 7.78356995604346e-06, + "loss": 0.2801, + "num_input_tokens_seen": 2816544, + "step": 14770 + }, + { + "epoch": 7.679313929313929, + "grad_norm": 0.33270734548568726, + "learning_rate": 7.767133231241288e-06, + "loss": 0.2786, + "num_input_tokens_seen": 2817472, + "step": 14775 + }, + { + "epoch": 7.6819126819126815, + "grad_norm": 0.39173492789268494, + "learning_rate": 7.750710687283793e-06, + "loss": 0.2838, + "num_input_tokens_seen": 2818464, + "step": 14780 + }, + { + "epoch": 7.6845114345114345, + "grad_norm": 0.6625440716743469, + "learning_rate": 7.734302337685018e-06, + "loss": 0.2455, + "num_input_tokens_seen": 2819424, + "step": 14785 + }, + { + "epoch": 7.6871101871101875, + "grad_norm": 0.2875805199146271, + "learning_rate": 7.717908195947316e-06, + "loss": 0.2384, + "num_input_tokens_seen": 2820480, + "step": 14790 + }, + { + "epoch": 7.6897089397089395, + "grad_norm": 0.6747420430183411, + "learning_rate": 7.701528275561348e-06, + "loss": 0.2522, + "num_input_tokens_seen": 2821408, + "step": 14795 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.777950644493103, + "learning_rate": 7.68516259000607e-06, + "loss": 0.2948, + "num_input_tokens_seen": 2822368, + "step": 14800 + }, + { + "epoch": 7.694906444906445, + "grad_norm": 0.5080218315124512, + "learning_rate": 7.668811152748723e-06, + "loss": 0.2983, + "num_input_tokens_seen": 2823328, + "step": 14805 + }, + { + "epoch": 7.697505197505198, + "grad_norm": 0.21668533980846405, + "learning_rate": 7.652473977244837e-06, + "loss": 0.2479, + "num_input_tokens_seen": 2824256, + "step": 14810 + }, + { + "epoch": 7.70010395010395, + "grad_norm": 0.635835587978363, + "learning_rate": 7.636151076938185e-06, + "loss": 0.2084, + "num_input_tokens_seen": 2825280, + "step": 14815 + }, + { + "epoch": 7.702702702702703, + "grad_norm": 0.28207874298095703, + "learning_rate": 7.619842465260824e-06, + "loss": 0.2401, + "num_input_tokens_seen": 2826304, + "step": 14820 + }, + { + "epoch": 7.705301455301456, + "grad_norm": 0.5089048147201538, + "learning_rate": 7.6035481556330195e-06, + "loss": 0.2612, + "num_input_tokens_seen": 2827328, + "step": 14825 + }, + { + "epoch": 7.707900207900208, + "grad_norm": 0.5003844499588013, + "learning_rate": 7.587268161463274e-06, + "loss": 0.2641, + "num_input_tokens_seen": 2828288, + "step": 14830 + }, + { + "epoch": 7.710498960498961, + "grad_norm": 0.4894239604473114, + "learning_rate": 7.571002496148338e-06, + "loss": 0.2754, + "num_input_tokens_seen": 2829248, + "step": 14835 + }, + { + "epoch": 7.713097713097713, + "grad_norm": 0.5576859712600708, + "learning_rate": 7.554751173073133e-06, + "loss": 0.2501, + "num_input_tokens_seen": 2830176, + "step": 14840 + }, + { + "epoch": 7.715696465696466, + "grad_norm": 0.6011063456535339, + "learning_rate": 7.538514205610808e-06, + "loss": 0.2866, + "num_input_tokens_seen": 2831104, + "step": 14845 + }, + { + "epoch": 7.718295218295218, + "grad_norm": 0.20870646834373474, + "learning_rate": 7.522291607122678e-06, + "loss": 0.3092, + "num_input_tokens_seen": 2832000, + "step": 14850 + }, + { + "epoch": 7.720893970893971, + "grad_norm": 0.35533982515335083, + "learning_rate": 7.506083390958252e-06, + "loss": 0.2239, + "num_input_tokens_seen": 2832928, + "step": 14855 + }, + { + "epoch": 7.723492723492724, + "grad_norm": 0.41472309827804565, + "learning_rate": 7.489889570455191e-06, + "loss": 0.2549, + "num_input_tokens_seen": 2833824, + "step": 14860 + }, + { + "epoch": 7.726091476091476, + "grad_norm": 0.6331830620765686, + "learning_rate": 7.473710158939307e-06, + "loss": 0.2368, + "num_input_tokens_seen": 2834752, + "step": 14865 + }, + { + "epoch": 7.728690228690229, + "grad_norm": 0.6632162928581238, + "learning_rate": 7.45754516972457e-06, + "loss": 0.2692, + "num_input_tokens_seen": 2835712, + "step": 14870 + }, + { + "epoch": 7.731288981288982, + "grad_norm": 0.38315272331237793, + "learning_rate": 7.441394616113062e-06, + "loss": 0.2711, + "num_input_tokens_seen": 2836704, + "step": 14875 + }, + { + "epoch": 7.733887733887734, + "grad_norm": 0.09725698083639145, + "learning_rate": 7.425258511395014e-06, + "loss": 0.2433, + "num_input_tokens_seen": 2837568, + "step": 14880 + }, + { + "epoch": 7.736486486486487, + "grad_norm": 0.3892571032047272, + "learning_rate": 7.409136868848735e-06, + "loss": 0.2894, + "num_input_tokens_seen": 2838560, + "step": 14885 + }, + { + "epoch": 7.739085239085239, + "grad_norm": 0.46217650175094604, + "learning_rate": 7.393029701740667e-06, + "loss": 0.2801, + "num_input_tokens_seen": 2839456, + "step": 14890 + }, + { + "epoch": 7.741683991683992, + "grad_norm": 0.5038431286811829, + "learning_rate": 7.376937023325298e-06, + "loss": 0.2587, + "num_input_tokens_seen": 2840448, + "step": 14895 + }, + { + "epoch": 7.744282744282744, + "grad_norm": 0.26335379481315613, + "learning_rate": 7.360858846845234e-06, + "loss": 0.2948, + "num_input_tokens_seen": 2841344, + "step": 14900 + }, + { + "epoch": 7.746881496881497, + "grad_norm": 0.4442214369773865, + "learning_rate": 7.344795185531117e-06, + "loss": 0.2559, + "num_input_tokens_seen": 2842336, + "step": 14905 + }, + { + "epoch": 7.74948024948025, + "grad_norm": 0.2261035293340683, + "learning_rate": 7.328746052601665e-06, + "loss": 0.2393, + "num_input_tokens_seen": 2843264, + "step": 14910 + }, + { + "epoch": 7.752079002079002, + "grad_norm": 0.19564774632453918, + "learning_rate": 7.312711461263647e-06, + "loss": 0.2823, + "num_input_tokens_seen": 2844192, + "step": 14915 + }, + { + "epoch": 7.754677754677755, + "grad_norm": 0.4701833724975586, + "learning_rate": 7.296691424711826e-06, + "loss": 0.2701, + "num_input_tokens_seen": 2845120, + "step": 14920 + }, + { + "epoch": 7.757276507276507, + "grad_norm": 0.20869185030460358, + "learning_rate": 7.280685956129049e-06, + "loss": 0.262, + "num_input_tokens_seen": 2846048, + "step": 14925 + }, + { + "epoch": 7.75987525987526, + "grad_norm": 0.7581008076667786, + "learning_rate": 7.2646950686861056e-06, + "loss": 0.2759, + "num_input_tokens_seen": 2847008, + "step": 14930 + }, + { + "epoch": 7.762474012474012, + "grad_norm": 0.26744526624679565, + "learning_rate": 7.248718775541841e-06, + "loss": 0.3129, + "num_input_tokens_seen": 2848000, + "step": 14935 + }, + { + "epoch": 7.765072765072765, + "grad_norm": 0.7092774510383606, + "learning_rate": 7.232757089843062e-06, + "loss": 0.2148, + "num_input_tokens_seen": 2848896, + "step": 14940 + }, + { + "epoch": 7.767671517671518, + "grad_norm": 0.5896897912025452, + "learning_rate": 7.216810024724574e-06, + "loss": 0.2219, + "num_input_tokens_seen": 2849856, + "step": 14945 + }, + { + "epoch": 7.77027027027027, + "grad_norm": 0.3893957734107971, + "learning_rate": 7.20087759330913e-06, + "loss": 0.2686, + "num_input_tokens_seen": 2850848, + "step": 14950 + }, + { + "epoch": 7.772869022869023, + "grad_norm": 0.46071335673332214, + "learning_rate": 7.1849598087074645e-06, + "loss": 0.2617, + "num_input_tokens_seen": 2851776, + "step": 14955 + }, + { + "epoch": 7.775467775467775, + "grad_norm": 0.5871466994285583, + "learning_rate": 7.169056684018244e-06, + "loss": 0.2708, + "num_input_tokens_seen": 2852736, + "step": 14960 + }, + { + "epoch": 7.778066528066528, + "grad_norm": 0.5161524415016174, + "learning_rate": 7.153168232328067e-06, + "loss": 0.226, + "num_input_tokens_seen": 2853760, + "step": 14965 + }, + { + "epoch": 7.78066528066528, + "grad_norm": 0.5112266540527344, + "learning_rate": 7.137294466711475e-06, + "loss": 0.2356, + "num_input_tokens_seen": 2854656, + "step": 14970 + }, + { + "epoch": 7.783264033264033, + "grad_norm": 0.4820730686187744, + "learning_rate": 7.121435400230905e-06, + "loss": 0.2461, + "num_input_tokens_seen": 2855648, + "step": 14975 + }, + { + "epoch": 7.785862785862786, + "grad_norm": 0.4824641942977905, + "learning_rate": 7.105591045936722e-06, + "loss": 0.3119, + "num_input_tokens_seen": 2856608, + "step": 14980 + }, + { + "epoch": 7.788461538461538, + "grad_norm": 0.504660427570343, + "learning_rate": 7.089761416867153e-06, + "loss": 0.3316, + "num_input_tokens_seen": 2857632, + "step": 14985 + }, + { + "epoch": 7.791060291060291, + "grad_norm": 0.4556129276752472, + "learning_rate": 7.073946526048342e-06, + "loss": 0.2458, + "num_input_tokens_seen": 2858592, + "step": 14990 + }, + { + "epoch": 7.793659043659043, + "grad_norm": 0.415467768907547, + "learning_rate": 7.05814638649428e-06, + "loss": 0.2859, + "num_input_tokens_seen": 2859520, + "step": 14995 + }, + { + "epoch": 7.796257796257796, + "grad_norm": 0.4779427945613861, + "learning_rate": 7.042361011206819e-06, + "loss": 0.2789, + "num_input_tokens_seen": 2860448, + "step": 15000 + }, + { + "epoch": 7.798856548856548, + "grad_norm": 0.4538855254650116, + "learning_rate": 7.026590413175685e-06, + "loss": 0.2942, + "num_input_tokens_seen": 2861408, + "step": 15005 + }, + { + "epoch": 7.801455301455301, + "grad_norm": 0.7663313150405884, + "learning_rate": 7.010834605378414e-06, + "loss": 0.2862, + "num_input_tokens_seen": 2862400, + "step": 15010 + }, + { + "epoch": 7.804054054054054, + "grad_norm": 0.514127790927887, + "learning_rate": 6.995093600780403e-06, + "loss": 0.2601, + "num_input_tokens_seen": 2863328, + "step": 15015 + }, + { + "epoch": 7.8066528066528065, + "grad_norm": 0.29473546147346497, + "learning_rate": 6.979367412334839e-06, + "loss": 0.2895, + "num_input_tokens_seen": 2864256, + "step": 15020 + }, + { + "epoch": 7.8092515592515594, + "grad_norm": 0.6799107193946838, + "learning_rate": 6.963656052982731e-06, + "loss": 0.2803, + "num_input_tokens_seen": 2865184, + "step": 15025 + }, + { + "epoch": 7.8118503118503115, + "grad_norm": 0.3191409409046173, + "learning_rate": 6.947959535652873e-06, + "loss": 0.2597, + "num_input_tokens_seen": 2866080, + "step": 15030 + }, + { + "epoch": 7.8144490644490645, + "grad_norm": 0.32397565245628357, + "learning_rate": 6.932277873261864e-06, + "loss": 0.2619, + "num_input_tokens_seen": 2866976, + "step": 15035 + }, + { + "epoch": 7.817047817047817, + "grad_norm": 0.4002811908721924, + "learning_rate": 6.916611078714077e-06, + "loss": 0.2309, + "num_input_tokens_seen": 2867936, + "step": 15040 + }, + { + "epoch": 7.81964656964657, + "grad_norm": 0.6390411257743835, + "learning_rate": 6.9009591649016295e-06, + "loss": 0.2222, + "num_input_tokens_seen": 2868992, + "step": 15045 + }, + { + "epoch": 7.8222453222453225, + "grad_norm": 0.644018828868866, + "learning_rate": 6.88532214470442e-06, + "loss": 0.2336, + "num_input_tokens_seen": 2870016, + "step": 15050 + }, + { + "epoch": 7.824844074844075, + "grad_norm": 0.19940631091594696, + "learning_rate": 6.86970003099007e-06, + "loss": 0.3022, + "num_input_tokens_seen": 2870976, + "step": 15055 + }, + { + "epoch": 7.827442827442828, + "grad_norm": 0.5769034028053284, + "learning_rate": 6.854092836613948e-06, + "loss": 0.2754, + "num_input_tokens_seen": 2871904, + "step": 15060 + }, + { + "epoch": 7.83004158004158, + "grad_norm": 0.7389149069786072, + "learning_rate": 6.838500574419129e-06, + "loss": 0.2721, + "num_input_tokens_seen": 2872832, + "step": 15065 + }, + { + "epoch": 7.832640332640333, + "grad_norm": 1.0798791646957397, + "learning_rate": 6.822923257236427e-06, + "loss": 0.3014, + "num_input_tokens_seen": 2873888, + "step": 15070 + }, + { + "epoch": 7.835239085239085, + "grad_norm": 0.41794800758361816, + "learning_rate": 6.80736089788433e-06, + "loss": 0.2505, + "num_input_tokens_seen": 2874880, + "step": 15075 + }, + { + "epoch": 7.837837837837838, + "grad_norm": 0.7155308723449707, + "learning_rate": 6.7918135091690454e-06, + "loss": 0.3225, + "num_input_tokens_seen": 2875840, + "step": 15080 + }, + { + "epoch": 7.840436590436591, + "grad_norm": 0.37320181727409363, + "learning_rate": 6.776281103884427e-06, + "loss": 0.2855, + "num_input_tokens_seen": 2876800, + "step": 15085 + }, + { + "epoch": 7.843035343035343, + "grad_norm": 0.6752042770385742, + "learning_rate": 6.7607636948120364e-06, + "loss": 0.3001, + "num_input_tokens_seen": 2877792, + "step": 15090 + }, + { + "epoch": 7.845634095634096, + "grad_norm": 0.7174725532531738, + "learning_rate": 6.745261294721067e-06, + "loss": 0.2944, + "num_input_tokens_seen": 2878688, + "step": 15095 + }, + { + "epoch": 7.848232848232849, + "grad_norm": 0.38072386384010315, + "learning_rate": 6.729773916368365e-06, + "loss": 0.2798, + "num_input_tokens_seen": 2879648, + "step": 15100 + }, + { + "epoch": 7.850831600831601, + "grad_norm": 0.3081047236919403, + "learning_rate": 6.714301572498435e-06, + "loss": 0.2489, + "num_input_tokens_seen": 2880608, + "step": 15105 + }, + { + "epoch": 7.853430353430354, + "grad_norm": 0.3265131115913391, + "learning_rate": 6.6988442758433805e-06, + "loss": 0.2824, + "num_input_tokens_seen": 2881600, + "step": 15110 + }, + { + "epoch": 7.856029106029106, + "grad_norm": 0.7511910796165466, + "learning_rate": 6.683402039122949e-06, + "loss": 0.2836, + "num_input_tokens_seen": 2882528, + "step": 15115 + }, + { + "epoch": 7.858627858627859, + "grad_norm": 0.7417609095573425, + "learning_rate": 6.667974875044483e-06, + "loss": 0.2587, + "num_input_tokens_seen": 2883456, + "step": 15120 + }, + { + "epoch": 7.861226611226611, + "grad_norm": 0.2396116554737091, + "learning_rate": 6.652562796302913e-06, + "loss": 0.2442, + "num_input_tokens_seen": 2884416, + "step": 15125 + }, + { + "epoch": 7.863825363825364, + "grad_norm": 0.4341225326061249, + "learning_rate": 6.637165815580782e-06, + "loss": 0.2486, + "num_input_tokens_seen": 2885472, + "step": 15130 + }, + { + "epoch": 7.866424116424117, + "grad_norm": 0.24757955968379974, + "learning_rate": 6.621783945548174e-06, + "loss": 0.2853, + "num_input_tokens_seen": 2886464, + "step": 15135 + }, + { + "epoch": 7.869022869022869, + "grad_norm": 0.6860865950584412, + "learning_rate": 6.6064171988627775e-06, + "loss": 0.2997, + "num_input_tokens_seen": 2887456, + "step": 15140 + }, + { + "epoch": 7.871621621621622, + "grad_norm": 0.49091997742652893, + "learning_rate": 6.591065588169795e-06, + "loss": 0.2331, + "num_input_tokens_seen": 2888416, + "step": 15145 + }, + { + "epoch": 7.874220374220374, + "grad_norm": 0.6250935196876526, + "learning_rate": 6.5757291261020145e-06, + "loss": 0.2253, + "num_input_tokens_seen": 2889408, + "step": 15150 + }, + { + "epoch": 7.876819126819127, + "grad_norm": 0.49437156319618225, + "learning_rate": 6.5604078252797265e-06, + "loss": 0.2516, + "num_input_tokens_seen": 2890336, + "step": 15155 + }, + { + "epoch": 7.879417879417879, + "grad_norm": 0.12438622117042542, + "learning_rate": 6.545101698310755e-06, + "loss": 0.2685, + "num_input_tokens_seen": 2891296, + "step": 15160 + }, + { + "epoch": 7.882016632016632, + "grad_norm": 0.13977548480033875, + "learning_rate": 6.5298107577904474e-06, + "loss": 0.2819, + "num_input_tokens_seen": 2892256, + "step": 15165 + }, + { + "epoch": 7.884615384615385, + "grad_norm": 0.5591821670532227, + "learning_rate": 6.514535016301637e-06, + "loss": 0.1886, + "num_input_tokens_seen": 2893216, + "step": 15170 + }, + { + "epoch": 7.887214137214137, + "grad_norm": 0.5503240823745728, + "learning_rate": 6.499274486414672e-06, + "loss": 0.1882, + "num_input_tokens_seen": 2894176, + "step": 15175 + }, + { + "epoch": 7.88981288981289, + "grad_norm": 0.5270857214927673, + "learning_rate": 6.484029180687357e-06, + "loss": 0.2771, + "num_input_tokens_seen": 2895200, + "step": 15180 + }, + { + "epoch": 7.892411642411642, + "grad_norm": 0.5490984320640564, + "learning_rate": 6.468799111665003e-06, + "loss": 0.2783, + "num_input_tokens_seen": 2896128, + "step": 15185 + }, + { + "epoch": 7.895010395010395, + "grad_norm": 0.7944501638412476, + "learning_rate": 6.4535842918803326e-06, + "loss": 0.2756, + "num_input_tokens_seen": 2897056, + "step": 15190 + }, + { + "epoch": 7.897609147609147, + "grad_norm": 0.5098227262496948, + "learning_rate": 6.4383847338535725e-06, + "loss": 0.296, + "num_input_tokens_seen": 2897952, + "step": 15195 + }, + { + "epoch": 7.9002079002079, + "grad_norm": 0.5482873320579529, + "learning_rate": 6.423200450092351e-06, + "loss": 0.2264, + "num_input_tokens_seen": 2898816, + "step": 15200 + }, + { + "epoch": 7.902806652806653, + "grad_norm": 0.2863842844963074, + "learning_rate": 6.4080314530917565e-06, + "loss": 0.2433, + "num_input_tokens_seen": 2899776, + "step": 15205 + }, + { + "epoch": 7.905405405405405, + "grad_norm": 0.41950640082359314, + "learning_rate": 6.392877755334276e-06, + "loss": 0.2785, + "num_input_tokens_seen": 2900704, + "step": 15210 + }, + { + "epoch": 7.908004158004158, + "grad_norm": 0.270827978849411, + "learning_rate": 6.377739369289815e-06, + "loss": 0.3196, + "num_input_tokens_seen": 2901696, + "step": 15215 + }, + { + "epoch": 7.91060291060291, + "grad_norm": 0.24944299459457397, + "learning_rate": 6.362616307415703e-06, + "loss": 0.2598, + "num_input_tokens_seen": 2902592, + "step": 15220 + }, + { + "epoch": 7.913201663201663, + "grad_norm": 0.7265914678573608, + "learning_rate": 6.3475085821566e-06, + "loss": 0.2849, + "num_input_tokens_seen": 2903616, + "step": 15225 + }, + { + "epoch": 7.915800415800415, + "grad_norm": 0.4946223795413971, + "learning_rate": 6.332416205944611e-06, + "loss": 0.2585, + "num_input_tokens_seen": 2904640, + "step": 15230 + }, + { + "epoch": 7.918399168399168, + "grad_norm": 0.37088343501091003, + "learning_rate": 6.3173391911991595e-06, + "loss": 0.2557, + "num_input_tokens_seen": 2905568, + "step": 15235 + }, + { + "epoch": 7.920997920997921, + "grad_norm": 0.0986846387386322, + "learning_rate": 6.3022775503270656e-06, + "loss": 0.2521, + "num_input_tokens_seen": 2906432, + "step": 15240 + }, + { + "epoch": 7.923596673596673, + "grad_norm": 0.568642795085907, + "learning_rate": 6.28723129572247e-06, + "loss": 0.2544, + "num_input_tokens_seen": 2907456, + "step": 15245 + }, + { + "epoch": 7.926195426195426, + "grad_norm": 0.28921017050743103, + "learning_rate": 6.272200439766882e-06, + "loss": 0.2881, + "num_input_tokens_seen": 2908416, + "step": 15250 + }, + { + "epoch": 7.9287941787941785, + "grad_norm": 0.7123751640319824, + "learning_rate": 6.257184994829108e-06, + "loss": 0.2515, + "num_input_tokens_seen": 2909344, + "step": 15255 + }, + { + "epoch": 7.9313929313929314, + "grad_norm": 0.39124318957328796, + "learning_rate": 6.242184973265283e-06, + "loss": 0.2796, + "num_input_tokens_seen": 2910240, + "step": 15260 + }, + { + "epoch": 7.9339916839916835, + "grad_norm": 0.470842182636261, + "learning_rate": 6.227200387418869e-06, + "loss": 0.293, + "num_input_tokens_seen": 2911104, + "step": 15265 + }, + { + "epoch": 7.9365904365904365, + "grad_norm": 0.28928032517433167, + "learning_rate": 6.212231249620595e-06, + "loss": 0.2261, + "num_input_tokens_seen": 2912032, + "step": 15270 + }, + { + "epoch": 7.9391891891891895, + "grad_norm": 0.1848672479391098, + "learning_rate": 6.197277572188509e-06, + "loss": 0.2474, + "num_input_tokens_seen": 2913024, + "step": 15275 + }, + { + "epoch": 7.941787941787942, + "grad_norm": 0.21227547526359558, + "learning_rate": 6.182339367427906e-06, + "loss": 0.234, + "num_input_tokens_seen": 2913952, + "step": 15280 + }, + { + "epoch": 7.9443866943866945, + "grad_norm": 0.19979526102542877, + "learning_rate": 6.16741664763138e-06, + "loss": 0.2781, + "num_input_tokens_seen": 2914848, + "step": 15285 + }, + { + "epoch": 7.946985446985447, + "grad_norm": 0.5583847165107727, + "learning_rate": 6.152509425078759e-06, + "loss": 0.3006, + "num_input_tokens_seen": 2915744, + "step": 15290 + }, + { + "epoch": 7.9495841995842, + "grad_norm": 0.4338890314102173, + "learning_rate": 6.137617712037116e-06, + "loss": 0.2878, + "num_input_tokens_seen": 2916672, + "step": 15295 + }, + { + "epoch": 7.952182952182953, + "grad_norm": 0.44224441051483154, + "learning_rate": 6.122741520760791e-06, + "loss": 0.2482, + "num_input_tokens_seen": 2917568, + "step": 15300 + }, + { + "epoch": 7.954781704781705, + "grad_norm": 0.6029319763183594, + "learning_rate": 6.1078808634913165e-06, + "loss": 0.2161, + "num_input_tokens_seen": 2918528, + "step": 15305 + }, + { + "epoch": 7.957380457380458, + "grad_norm": 0.3928508460521698, + "learning_rate": 6.093035752457468e-06, + "loss": 0.263, + "num_input_tokens_seen": 2919392, + "step": 15310 + }, + { + "epoch": 7.95997920997921, + "grad_norm": 0.31403931975364685, + "learning_rate": 6.078206199875211e-06, + "loss": 0.2337, + "num_input_tokens_seen": 2920416, + "step": 15315 + }, + { + "epoch": 7.962577962577963, + "grad_norm": 0.6030830144882202, + "learning_rate": 6.063392217947714e-06, + "loss": 0.2466, + "num_input_tokens_seen": 2921344, + "step": 15320 + }, + { + "epoch": 7.965176715176716, + "grad_norm": 0.47216469049453735, + "learning_rate": 6.048593818865328e-06, + "loss": 0.2764, + "num_input_tokens_seen": 2922368, + "step": 15325 + }, + { + "epoch": 7.967775467775468, + "grad_norm": 0.1991174817085266, + "learning_rate": 6.033811014805599e-06, + "loss": 0.3203, + "num_input_tokens_seen": 2923264, + "step": 15330 + }, + { + "epoch": 7.970374220374221, + "grad_norm": 0.6305701732635498, + "learning_rate": 6.019043817933212e-06, + "loss": 0.2743, + "num_input_tokens_seen": 2924224, + "step": 15335 + }, + { + "epoch": 7.972972972972973, + "grad_norm": 0.5524091720581055, + "learning_rate": 6.004292240400031e-06, + "loss": 0.2804, + "num_input_tokens_seen": 2925184, + "step": 15340 + }, + { + "epoch": 7.975571725571726, + "grad_norm": 0.657218873500824, + "learning_rate": 5.989556294345067e-06, + "loss": 0.2883, + "num_input_tokens_seen": 2926112, + "step": 15345 + }, + { + "epoch": 7.978170478170478, + "grad_norm": 0.7132387757301331, + "learning_rate": 5.9748359918944504e-06, + "loss": 0.2622, + "num_input_tokens_seen": 2927008, + "step": 15350 + }, + { + "epoch": 7.980769230769231, + "grad_norm": 0.7620096802711487, + "learning_rate": 5.960131345161454e-06, + "loss": 0.2919, + "num_input_tokens_seen": 2927968, + "step": 15355 + }, + { + "epoch": 7.983367983367984, + "grad_norm": 0.3652995824813843, + "learning_rate": 5.945442366246448e-06, + "loss": 0.2623, + "num_input_tokens_seen": 2928992, + "step": 15360 + }, + { + "epoch": 7.985966735966736, + "grad_norm": 0.38424253463745117, + "learning_rate": 5.930769067236944e-06, + "loss": 0.2701, + "num_input_tokens_seen": 2929984, + "step": 15365 + }, + { + "epoch": 7.988565488565489, + "grad_norm": 0.2759951055049896, + "learning_rate": 5.916111460207516e-06, + "loss": 0.2632, + "num_input_tokens_seen": 2930976, + "step": 15370 + }, + { + "epoch": 7.991164241164241, + "grad_norm": 0.689002513885498, + "learning_rate": 5.901469557219849e-06, + "loss": 0.2368, + "num_input_tokens_seen": 2931904, + "step": 15375 + }, + { + "epoch": 7.993762993762994, + "grad_norm": 0.48424333333969116, + "learning_rate": 5.886843370322692e-06, + "loss": 0.2914, + "num_input_tokens_seen": 2932896, + "step": 15380 + }, + { + "epoch": 7.996361746361746, + "grad_norm": 0.4400303065776825, + "learning_rate": 5.872232911551859e-06, + "loss": 0.2605, + "num_input_tokens_seen": 2933792, + "step": 15385 + }, + { + "epoch": 7.998960498960499, + "grad_norm": 0.6524741649627686, + "learning_rate": 5.85763819293024e-06, + "loss": 0.2327, + "num_input_tokens_seen": 2934720, + "step": 15390 + }, + { + "epoch": 8.0, + "eval_loss": 0.2523098289966583, + "eval_runtime": 7.9245, + "eval_samples_per_second": 108.019, + "eval_steps_per_second": 27.005, + "num_input_tokens_seen": 2935056, + "step": 15392 + }, + { + "epoch": 8.001559251559252, + "grad_norm": 0.5569748878479004, + "learning_rate": 5.843059226467745e-06, + "loss": 0.2868, + "num_input_tokens_seen": 2935600, + "step": 15395 + }, + { + "epoch": 8.004158004158004, + "grad_norm": 0.37323540449142456, + "learning_rate": 5.828496024161353e-06, + "loss": 0.2974, + "num_input_tokens_seen": 2936528, + "step": 15400 + }, + { + "epoch": 8.006756756756756, + "grad_norm": 0.7094139456748962, + "learning_rate": 5.81394859799504e-06, + "loss": 0.2635, + "num_input_tokens_seen": 2937520, + "step": 15405 + }, + { + "epoch": 8.00935550935551, + "grad_norm": 0.62590092420578, + "learning_rate": 5.799416959939827e-06, + "loss": 0.2481, + "num_input_tokens_seen": 2938416, + "step": 15410 + }, + { + "epoch": 8.011954261954262, + "grad_norm": 0.22699092328548431, + "learning_rate": 5.784901121953723e-06, + "loss": 0.2664, + "num_input_tokens_seen": 2939376, + "step": 15415 + }, + { + "epoch": 8.014553014553014, + "grad_norm": 0.40768998861312866, + "learning_rate": 5.770401095981739e-06, + "loss": 0.2618, + "num_input_tokens_seen": 2940336, + "step": 15420 + }, + { + "epoch": 8.017151767151768, + "grad_norm": 0.3085450530052185, + "learning_rate": 5.755916893955887e-06, + "loss": 0.258, + "num_input_tokens_seen": 2941264, + "step": 15425 + }, + { + "epoch": 8.01975051975052, + "grad_norm": 0.3188365399837494, + "learning_rate": 5.741448527795137e-06, + "loss": 0.2342, + "num_input_tokens_seen": 2942192, + "step": 15430 + }, + { + "epoch": 8.022349272349272, + "grad_norm": 0.8372742533683777, + "learning_rate": 5.726996009405455e-06, + "loss": 0.2594, + "num_input_tokens_seen": 2943216, + "step": 15435 + }, + { + "epoch": 8.024948024948024, + "grad_norm": 0.18493768572807312, + "learning_rate": 5.712559350679733e-06, + "loss": 0.2458, + "num_input_tokens_seen": 2944144, + "step": 15440 + }, + { + "epoch": 8.027546777546778, + "grad_norm": 0.34105169773101807, + "learning_rate": 5.698138563497854e-06, + "loss": 0.2644, + "num_input_tokens_seen": 2945072, + "step": 15445 + }, + { + "epoch": 8.03014553014553, + "grad_norm": 0.7569416165351868, + "learning_rate": 5.683733659726581e-06, + "loss": 0.249, + "num_input_tokens_seen": 2945968, + "step": 15450 + }, + { + "epoch": 8.032744282744282, + "grad_norm": 0.4448297321796417, + "learning_rate": 5.669344651219663e-06, + "loss": 0.2355, + "num_input_tokens_seen": 2946928, + "step": 15455 + }, + { + "epoch": 8.035343035343036, + "grad_norm": 0.22005689144134521, + "learning_rate": 5.654971549817748e-06, + "loss": 0.2363, + "num_input_tokens_seen": 2947888, + "step": 15460 + }, + { + "epoch": 8.037941787941788, + "grad_norm": 0.7650142312049866, + "learning_rate": 5.640614367348385e-06, + "loss": 0.3183, + "num_input_tokens_seen": 2948816, + "step": 15465 + }, + { + "epoch": 8.04054054054054, + "grad_norm": 0.15671765804290771, + "learning_rate": 5.626273115626038e-06, + "loss": 0.2718, + "num_input_tokens_seen": 2949680, + "step": 15470 + }, + { + "epoch": 8.043139293139292, + "grad_norm": 0.4584433138370514, + "learning_rate": 5.61194780645205e-06, + "loss": 0.2345, + "num_input_tokens_seen": 2950704, + "step": 15475 + }, + { + "epoch": 8.045738045738046, + "grad_norm": 0.3180578649044037, + "learning_rate": 5.597638451614665e-06, + "loss": 0.302, + "num_input_tokens_seen": 2951664, + "step": 15480 + }, + { + "epoch": 8.048336798336798, + "grad_norm": 0.3857976496219635, + "learning_rate": 5.583345062888956e-06, + "loss": 0.2629, + "num_input_tokens_seen": 2952720, + "step": 15485 + }, + { + "epoch": 8.05093555093555, + "grad_norm": 0.5516009330749512, + "learning_rate": 5.569067652036911e-06, + "loss": 0.2512, + "num_input_tokens_seen": 2953680, + "step": 15490 + }, + { + "epoch": 8.053534303534304, + "grad_norm": 0.49103158712387085, + "learning_rate": 5.5548062308073246e-06, + "loss": 0.2679, + "num_input_tokens_seen": 2954640, + "step": 15495 + }, + { + "epoch": 8.056133056133056, + "grad_norm": 0.5668928623199463, + "learning_rate": 5.540560810935871e-06, + "loss": 0.1779, + "num_input_tokens_seen": 2955632, + "step": 15500 + }, + { + "epoch": 8.058731808731808, + "grad_norm": 0.18732748925685883, + "learning_rate": 5.526331404145021e-06, + "loss": 0.2788, + "num_input_tokens_seen": 2956496, + "step": 15505 + }, + { + "epoch": 8.06133056133056, + "grad_norm": 0.4372001886367798, + "learning_rate": 5.512118022144105e-06, + "loss": 0.2658, + "num_input_tokens_seen": 2957424, + "step": 15510 + }, + { + "epoch": 8.063929313929314, + "grad_norm": 0.5445491671562195, + "learning_rate": 5.497920676629234e-06, + "loss": 0.2505, + "num_input_tokens_seen": 2958384, + "step": 15515 + }, + { + "epoch": 8.066528066528067, + "grad_norm": 0.5212180614471436, + "learning_rate": 5.483739379283337e-06, + "loss": 0.325, + "num_input_tokens_seen": 2959280, + "step": 15520 + }, + { + "epoch": 8.069126819126819, + "grad_norm": 0.4292585551738739, + "learning_rate": 5.469574141776146e-06, + "loss": 0.3006, + "num_input_tokens_seen": 2960240, + "step": 15525 + }, + { + "epoch": 8.071725571725572, + "grad_norm": 0.10543115437030792, + "learning_rate": 5.455424975764156e-06, + "loss": 0.2391, + "num_input_tokens_seen": 2961104, + "step": 15530 + }, + { + "epoch": 8.074324324324325, + "grad_norm": 0.5821784734725952, + "learning_rate": 5.4412918928906625e-06, + "loss": 0.2112, + "num_input_tokens_seen": 2962000, + "step": 15535 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 0.7877317667007446, + "learning_rate": 5.4271749047856975e-06, + "loss": 0.2582, + "num_input_tokens_seen": 2962928, + "step": 15540 + }, + { + "epoch": 8.079521829521829, + "grad_norm": 0.3098394572734833, + "learning_rate": 5.413074023066081e-06, + "loss": 0.2648, + "num_input_tokens_seen": 2963856, + "step": 15545 + }, + { + "epoch": 8.082120582120583, + "grad_norm": 0.28785496950149536, + "learning_rate": 5.398989259335352e-06, + "loss": 0.3191, + "num_input_tokens_seen": 2964752, + "step": 15550 + }, + { + "epoch": 8.084719334719335, + "grad_norm": 0.635875403881073, + "learning_rate": 5.38492062518379e-06, + "loss": 0.2649, + "num_input_tokens_seen": 2965712, + "step": 15555 + }, + { + "epoch": 8.087318087318087, + "grad_norm": 0.6418886780738831, + "learning_rate": 5.37086813218842e-06, + "loss": 0.2538, + "num_input_tokens_seen": 2966640, + "step": 15560 + }, + { + "epoch": 8.08991683991684, + "grad_norm": 0.43300116062164307, + "learning_rate": 5.356831791912958e-06, + "loss": 0.2971, + "num_input_tokens_seen": 2967568, + "step": 15565 + }, + { + "epoch": 8.092515592515593, + "grad_norm": 0.49571579694747925, + "learning_rate": 5.342811615907861e-06, + "loss": 0.2498, + "num_input_tokens_seen": 2968592, + "step": 15570 + }, + { + "epoch": 8.095114345114345, + "grad_norm": 0.22561442852020264, + "learning_rate": 5.328807615710246e-06, + "loss": 0.2699, + "num_input_tokens_seen": 2969552, + "step": 15575 + }, + { + "epoch": 8.097713097713097, + "grad_norm": 0.1638268083333969, + "learning_rate": 5.3148198028439565e-06, + "loss": 0.2456, + "num_input_tokens_seen": 2970544, + "step": 15580 + }, + { + "epoch": 8.10031185031185, + "grad_norm": 0.2177913784980774, + "learning_rate": 5.300848188819491e-06, + "loss": 0.2349, + "num_input_tokens_seen": 2971504, + "step": 15585 + }, + { + "epoch": 8.102910602910603, + "grad_norm": 0.4637247920036316, + "learning_rate": 5.286892785134012e-06, + "loss": 0.2906, + "num_input_tokens_seen": 2972432, + "step": 15590 + }, + { + "epoch": 8.105509355509355, + "grad_norm": 0.27925845980644226, + "learning_rate": 5.272953603271375e-06, + "loss": 0.2041, + "num_input_tokens_seen": 2973360, + "step": 15595 + }, + { + "epoch": 8.108108108108109, + "grad_norm": 0.45718029141426086, + "learning_rate": 5.259030654702052e-06, + "loss": 0.2892, + "num_input_tokens_seen": 2974288, + "step": 15600 + }, + { + "epoch": 8.11070686070686, + "grad_norm": 0.45616376399993896, + "learning_rate": 5.2451239508831824e-06, + "loss": 0.3228, + "num_input_tokens_seen": 2975280, + "step": 15605 + }, + { + "epoch": 8.113305613305613, + "grad_norm": 0.4774545729160309, + "learning_rate": 5.231233503258523e-06, + "loss": 0.2442, + "num_input_tokens_seen": 2976240, + "step": 15610 + }, + { + "epoch": 8.115904365904365, + "grad_norm": 0.6402755975723267, + "learning_rate": 5.217359323258459e-06, + "loss": 0.2373, + "num_input_tokens_seen": 2977264, + "step": 15615 + }, + { + "epoch": 8.118503118503119, + "grad_norm": 0.17406295239925385, + "learning_rate": 5.203501422299975e-06, + "loss": 0.2855, + "num_input_tokens_seen": 2978256, + "step": 15620 + }, + { + "epoch": 8.121101871101871, + "grad_norm": 0.2643541693687439, + "learning_rate": 5.1896598117866925e-06, + "loss": 0.2269, + "num_input_tokens_seen": 2979216, + "step": 15625 + }, + { + "epoch": 8.123700623700623, + "grad_norm": 0.32956069707870483, + "learning_rate": 5.1758345031087895e-06, + "loss": 0.2648, + "num_input_tokens_seen": 2980144, + "step": 15630 + }, + { + "epoch": 8.126299376299377, + "grad_norm": 0.3235411047935486, + "learning_rate": 5.162025507643057e-06, + "loss": 0.2517, + "num_input_tokens_seen": 2981136, + "step": 15635 + }, + { + "epoch": 8.128898128898129, + "grad_norm": 0.15921273827552795, + "learning_rate": 5.148232836752856e-06, + "loss": 0.2378, + "num_input_tokens_seen": 2982064, + "step": 15640 + }, + { + "epoch": 8.131496881496881, + "grad_norm": 0.4461462199687958, + "learning_rate": 5.134456501788104e-06, + "loss": 0.3048, + "num_input_tokens_seen": 2983056, + "step": 15645 + }, + { + "epoch": 8.134095634095635, + "grad_norm": 0.2165348380804062, + "learning_rate": 5.1206965140852825e-06, + "loss": 0.2345, + "num_input_tokens_seen": 2983984, + "step": 15650 + }, + { + "epoch": 8.136694386694387, + "grad_norm": 0.1980523318052292, + "learning_rate": 5.106952884967417e-06, + "loss": 0.1969, + "num_input_tokens_seen": 2984976, + "step": 15655 + }, + { + "epoch": 8.13929313929314, + "grad_norm": 0.5598882436752319, + "learning_rate": 5.093225625744083e-06, + "loss": 0.2954, + "num_input_tokens_seen": 2985936, + "step": 15660 + }, + { + "epoch": 8.141891891891891, + "grad_norm": 0.47924819588661194, + "learning_rate": 5.079514747711367e-06, + "loss": 0.2877, + "num_input_tokens_seen": 2986896, + "step": 15665 + }, + { + "epoch": 8.144490644490645, + "grad_norm": 0.5364359021186829, + "learning_rate": 5.065820262151899e-06, + "loss": 0.2907, + "num_input_tokens_seen": 2987920, + "step": 15670 + }, + { + "epoch": 8.147089397089397, + "grad_norm": 0.5255890488624573, + "learning_rate": 5.052142180334799e-06, + "loss": 0.2627, + "num_input_tokens_seen": 2988880, + "step": 15675 + }, + { + "epoch": 8.14968814968815, + "grad_norm": 0.44960731267929077, + "learning_rate": 5.038480513515689e-06, + "loss": 0.2508, + "num_input_tokens_seen": 2989872, + "step": 15680 + }, + { + "epoch": 8.152286902286903, + "grad_norm": 0.5159833431243896, + "learning_rate": 5.024835272936704e-06, + "loss": 0.3024, + "num_input_tokens_seen": 2990832, + "step": 15685 + }, + { + "epoch": 8.154885654885655, + "grad_norm": 0.10455876588821411, + "learning_rate": 5.011206469826435e-06, + "loss": 0.2924, + "num_input_tokens_seen": 2991696, + "step": 15690 + }, + { + "epoch": 8.157484407484407, + "grad_norm": 0.7094444632530212, + "learning_rate": 4.9975941153999725e-06, + "loss": 0.2458, + "num_input_tokens_seen": 2992752, + "step": 15695 + }, + { + "epoch": 8.16008316008316, + "grad_norm": 0.14427602291107178, + "learning_rate": 4.983998220858846e-06, + "loss": 0.2857, + "num_input_tokens_seen": 2993648, + "step": 15700 + }, + { + "epoch": 8.162681912681913, + "grad_norm": 0.38643375039100647, + "learning_rate": 4.9704187973910635e-06, + "loss": 0.2902, + "num_input_tokens_seen": 2994608, + "step": 15705 + }, + { + "epoch": 8.165280665280665, + "grad_norm": 0.3407570421695709, + "learning_rate": 4.956855856171067e-06, + "loss": 0.2655, + "num_input_tokens_seen": 2995536, + "step": 15710 + }, + { + "epoch": 8.167879417879417, + "grad_norm": 0.389006108045578, + "learning_rate": 4.9433094083597256e-06, + "loss": 0.3017, + "num_input_tokens_seen": 2996496, + "step": 15715 + }, + { + "epoch": 8.170478170478171, + "grad_norm": 0.6510845422744751, + "learning_rate": 4.929779465104365e-06, + "loss": 0.2461, + "num_input_tokens_seen": 2997456, + "step": 15720 + }, + { + "epoch": 8.173076923076923, + "grad_norm": 0.3417345881462097, + "learning_rate": 4.916266037538691e-06, + "loss": 0.268, + "num_input_tokens_seen": 2998416, + "step": 15725 + }, + { + "epoch": 8.175675675675675, + "grad_norm": 0.24903379380702972, + "learning_rate": 4.902769136782859e-06, + "loss": 0.2873, + "num_input_tokens_seen": 2999344, + "step": 15730 + }, + { + "epoch": 8.178274428274428, + "grad_norm": 0.6554333567619324, + "learning_rate": 4.88928877394339e-06, + "loss": 0.2477, + "num_input_tokens_seen": 3000304, + "step": 15735 + }, + { + "epoch": 8.180873180873181, + "grad_norm": 0.7010923624038696, + "learning_rate": 4.875824960113231e-06, + "loss": 0.2296, + "num_input_tokens_seen": 3001296, + "step": 15740 + }, + { + "epoch": 8.183471933471933, + "grad_norm": 0.7680396437644958, + "learning_rate": 4.862377706371665e-06, + "loss": 0.2588, + "num_input_tokens_seen": 3002256, + "step": 15745 + }, + { + "epoch": 8.186070686070686, + "grad_norm": 0.4903126060962677, + "learning_rate": 4.848947023784389e-06, + "loss": 0.2448, + "num_input_tokens_seen": 3003216, + "step": 15750 + }, + { + "epoch": 8.18866943866944, + "grad_norm": 0.4538021683692932, + "learning_rate": 4.835532923403441e-06, + "loss": 0.2526, + "num_input_tokens_seen": 3004208, + "step": 15755 + }, + { + "epoch": 8.191268191268192, + "grad_norm": 0.4440026879310608, + "learning_rate": 4.822135416267223e-06, + "loss": 0.2425, + "num_input_tokens_seen": 3005168, + "step": 15760 + }, + { + "epoch": 8.193866943866944, + "grad_norm": 0.2557007372379303, + "learning_rate": 4.80875451340049e-06, + "loss": 0.2635, + "num_input_tokens_seen": 3006128, + "step": 15765 + }, + { + "epoch": 8.196465696465696, + "grad_norm": 0.2653263509273529, + "learning_rate": 4.795390225814308e-06, + "loss": 0.2977, + "num_input_tokens_seen": 3007152, + "step": 15770 + }, + { + "epoch": 8.19906444906445, + "grad_norm": 0.7127352952957153, + "learning_rate": 4.782042564506109e-06, + "loss": 0.288, + "num_input_tokens_seen": 3008016, + "step": 15775 + }, + { + "epoch": 8.201663201663202, + "grad_norm": 0.16830670833587646, + "learning_rate": 4.768711540459591e-06, + "loss": 0.2503, + "num_input_tokens_seen": 3008912, + "step": 15780 + }, + { + "epoch": 8.204261954261954, + "grad_norm": 0.6217049360275269, + "learning_rate": 4.755397164644812e-06, + "loss": 0.2804, + "num_input_tokens_seen": 3009840, + "step": 15785 + }, + { + "epoch": 8.206860706860708, + "grad_norm": 0.599090039730072, + "learning_rate": 4.742099448018097e-06, + "loss": 0.278, + "num_input_tokens_seen": 3010800, + "step": 15790 + }, + { + "epoch": 8.20945945945946, + "grad_norm": 0.6381352543830872, + "learning_rate": 4.728818401522084e-06, + "loss": 0.1917, + "num_input_tokens_seen": 3011792, + "step": 15795 + }, + { + "epoch": 8.212058212058212, + "grad_norm": 0.24065056443214417, + "learning_rate": 4.715554036085673e-06, + "loss": 0.2728, + "num_input_tokens_seen": 3012688, + "step": 15800 + }, + { + "epoch": 8.214656964656964, + "grad_norm": 0.5731284618377686, + "learning_rate": 4.702306362624062e-06, + "loss": 0.2359, + "num_input_tokens_seen": 3013680, + "step": 15805 + }, + { + "epoch": 8.217255717255718, + "grad_norm": 0.33388087153434753, + "learning_rate": 4.6890753920386885e-06, + "loss": 0.2736, + "num_input_tokens_seen": 3014608, + "step": 15810 + }, + { + "epoch": 8.21985446985447, + "grad_norm": 0.8336775898933411, + "learning_rate": 4.675861135217252e-06, + "loss": 0.2588, + "num_input_tokens_seen": 3015600, + "step": 15815 + }, + { + "epoch": 8.222453222453222, + "grad_norm": 0.1315883994102478, + "learning_rate": 4.662663603033715e-06, + "loss": 0.2313, + "num_input_tokens_seen": 3016496, + "step": 15820 + }, + { + "epoch": 8.225051975051976, + "grad_norm": 0.5920493006706238, + "learning_rate": 4.649482806348249e-06, + "loss": 0.2189, + "num_input_tokens_seen": 3017520, + "step": 15825 + }, + { + "epoch": 8.227650727650728, + "grad_norm": 0.6787465214729309, + "learning_rate": 4.636318756007285e-06, + "loss": 0.2459, + "num_input_tokens_seen": 3018576, + "step": 15830 + }, + { + "epoch": 8.23024948024948, + "grad_norm": 0.13381525874137878, + "learning_rate": 4.6231714628434425e-06, + "loss": 0.2542, + "num_input_tokens_seen": 3019504, + "step": 15835 + }, + { + "epoch": 8.232848232848234, + "grad_norm": 0.8122750520706177, + "learning_rate": 4.610040937675583e-06, + "loss": 0.2463, + "num_input_tokens_seen": 3020400, + "step": 15840 + }, + { + "epoch": 8.235446985446986, + "grad_norm": 0.5253936052322388, + "learning_rate": 4.596927191308744e-06, + "loss": 0.2167, + "num_input_tokens_seen": 3021296, + "step": 15845 + }, + { + "epoch": 8.238045738045738, + "grad_norm": 0.5442778468132019, + "learning_rate": 4.583830234534161e-06, + "loss": 0.3005, + "num_input_tokens_seen": 3022224, + "step": 15850 + }, + { + "epoch": 8.24064449064449, + "grad_norm": 0.3467840552330017, + "learning_rate": 4.5707500781292715e-06, + "loss": 0.2292, + "num_input_tokens_seen": 3023184, + "step": 15855 + }, + { + "epoch": 8.243243243243244, + "grad_norm": 0.5270783305168152, + "learning_rate": 4.557686732857661e-06, + "loss": 0.2623, + "num_input_tokens_seen": 3024144, + "step": 15860 + }, + { + "epoch": 8.245841995841996, + "grad_norm": 0.5048145651817322, + "learning_rate": 4.544640209469103e-06, + "loss": 0.2765, + "num_input_tokens_seen": 3025104, + "step": 15865 + }, + { + "epoch": 8.248440748440748, + "grad_norm": 0.11519847810268402, + "learning_rate": 4.531610518699514e-06, + "loss": 0.2649, + "num_input_tokens_seen": 3026000, + "step": 15870 + }, + { + "epoch": 8.2510395010395, + "grad_norm": 0.13909363746643066, + "learning_rate": 4.51859767127098e-06, + "loss": 0.2819, + "num_input_tokens_seen": 3026896, + "step": 15875 + }, + { + "epoch": 8.253638253638254, + "grad_norm": 0.5250791907310486, + "learning_rate": 4.505601677891688e-06, + "loss": 0.2602, + "num_input_tokens_seen": 3027824, + "step": 15880 + }, + { + "epoch": 8.256237006237006, + "grad_norm": 0.6325995922088623, + "learning_rate": 4.492622549255992e-06, + "loss": 0.2444, + "num_input_tokens_seen": 3028784, + "step": 15885 + }, + { + "epoch": 8.258835758835758, + "grad_norm": 0.38820329308509827, + "learning_rate": 4.4796602960443604e-06, + "loss": 0.2105, + "num_input_tokens_seen": 3029680, + "step": 15890 + }, + { + "epoch": 8.261434511434512, + "grad_norm": 0.5327358245849609, + "learning_rate": 4.46671492892336e-06, + "loss": 0.2477, + "num_input_tokens_seen": 3030608, + "step": 15895 + }, + { + "epoch": 8.264033264033264, + "grad_norm": 0.19051086902618408, + "learning_rate": 4.4537864585456834e-06, + "loss": 0.2051, + "num_input_tokens_seen": 3031536, + "step": 15900 + }, + { + "epoch": 8.266632016632016, + "grad_norm": 0.5828871130943298, + "learning_rate": 4.4408748955501015e-06, + "loss": 0.3508, + "num_input_tokens_seen": 3032496, + "step": 15905 + }, + { + "epoch": 8.26923076923077, + "grad_norm": 0.23105968534946442, + "learning_rate": 4.427980250561478e-06, + "loss": 0.3316, + "num_input_tokens_seen": 3033488, + "step": 15910 + }, + { + "epoch": 8.271829521829522, + "grad_norm": 0.5256811380386353, + "learning_rate": 4.415102534190749e-06, + "loss": 0.3046, + "num_input_tokens_seen": 3034384, + "step": 15915 + }, + { + "epoch": 8.274428274428274, + "grad_norm": 0.7987467050552368, + "learning_rate": 4.4022417570349415e-06, + "loss": 0.2644, + "num_input_tokens_seen": 3035248, + "step": 15920 + }, + { + "epoch": 8.277027027027026, + "grad_norm": 0.47213882207870483, + "learning_rate": 4.389397929677113e-06, + "loss": 0.2895, + "num_input_tokens_seen": 3036208, + "step": 15925 + }, + { + "epoch": 8.27962577962578, + "grad_norm": 0.4504607021808624, + "learning_rate": 4.376571062686405e-06, + "loss": 0.2402, + "num_input_tokens_seen": 3037136, + "step": 15930 + }, + { + "epoch": 8.282224532224532, + "grad_norm": 0.31831324100494385, + "learning_rate": 4.3637611666179686e-06, + "loss": 0.2548, + "num_input_tokens_seen": 3038032, + "step": 15935 + }, + { + "epoch": 8.284823284823284, + "grad_norm": 0.22506378591060638, + "learning_rate": 4.350968252013021e-06, + "loss": 0.2637, + "num_input_tokens_seen": 3038928, + "step": 15940 + }, + { + "epoch": 8.287422037422038, + "grad_norm": 0.5771199464797974, + "learning_rate": 4.3381923293987855e-06, + "loss": 0.1823, + "num_input_tokens_seen": 3039920, + "step": 15945 + }, + { + "epoch": 8.29002079002079, + "grad_norm": 0.5208243131637573, + "learning_rate": 4.325433409288498e-06, + "loss": 0.2691, + "num_input_tokens_seen": 3040912, + "step": 15950 + }, + { + "epoch": 8.292619542619542, + "grad_norm": 0.6822947859764099, + "learning_rate": 4.3126915021814346e-06, + "loss": 0.274, + "num_input_tokens_seen": 3041840, + "step": 15955 + }, + { + "epoch": 8.295218295218294, + "grad_norm": 0.2669938802719116, + "learning_rate": 4.2999666185628315e-06, + "loss": 0.2683, + "num_input_tokens_seen": 3042832, + "step": 15960 + }, + { + "epoch": 8.297817047817048, + "grad_norm": 0.5089616775512695, + "learning_rate": 4.2872587689039484e-06, + "loss": 0.2574, + "num_input_tokens_seen": 3043696, + "step": 15965 + }, + { + "epoch": 8.3004158004158, + "grad_norm": 0.25497788190841675, + "learning_rate": 4.27456796366201e-06, + "loss": 0.2694, + "num_input_tokens_seen": 3044560, + "step": 15970 + }, + { + "epoch": 8.303014553014552, + "grad_norm": 0.45204174518585205, + "learning_rate": 4.261894213280215e-06, + "loss": 0.2072, + "num_input_tokens_seen": 3045552, + "step": 15975 + }, + { + "epoch": 8.305613305613306, + "grad_norm": 0.24874247610569, + "learning_rate": 4.249237528187741e-06, + "loss": 0.1941, + "num_input_tokens_seen": 3046544, + "step": 15980 + }, + { + "epoch": 8.308212058212058, + "grad_norm": 0.5510877370834351, + "learning_rate": 4.236597918799709e-06, + "loss": 0.2655, + "num_input_tokens_seen": 3047440, + "step": 15985 + }, + { + "epoch": 8.31081081081081, + "grad_norm": 0.4071790277957916, + "learning_rate": 4.223975395517199e-06, + "loss": 0.2825, + "num_input_tokens_seen": 3048400, + "step": 15990 + }, + { + "epoch": 8.313409563409563, + "grad_norm": 0.5381672382354736, + "learning_rate": 4.211369968727216e-06, + "loss": 0.2899, + "num_input_tokens_seen": 3049392, + "step": 15995 + }, + { + "epoch": 8.316008316008316, + "grad_norm": 0.2435770332813263, + "learning_rate": 4.1987816488027186e-06, + "loss": 0.2273, + "num_input_tokens_seen": 3050480, + "step": 16000 + }, + { + "epoch": 8.318607068607069, + "grad_norm": 0.5454203486442566, + "learning_rate": 4.1862104461025704e-06, + "loss": 0.2504, + "num_input_tokens_seen": 3051472, + "step": 16005 + }, + { + "epoch": 8.32120582120582, + "grad_norm": 0.20379923284053802, + "learning_rate": 4.173656370971549e-06, + "loss": 0.2681, + "num_input_tokens_seen": 3052432, + "step": 16010 + }, + { + "epoch": 8.323804573804575, + "grad_norm": 0.7981832027435303, + "learning_rate": 4.161119433740351e-06, + "loss": 0.2837, + "num_input_tokens_seen": 3053392, + "step": 16015 + }, + { + "epoch": 8.326403326403327, + "grad_norm": 0.3011397421360016, + "learning_rate": 4.1485996447255595e-06, + "loss": 0.2529, + "num_input_tokens_seen": 3054352, + "step": 16020 + }, + { + "epoch": 8.329002079002079, + "grad_norm": 0.4353864789009094, + "learning_rate": 4.136097014229653e-06, + "loss": 0.2835, + "num_input_tokens_seen": 3055248, + "step": 16025 + }, + { + "epoch": 8.33160083160083, + "grad_norm": 0.5686289668083191, + "learning_rate": 4.1236115525409815e-06, + "loss": 0.2655, + "num_input_tokens_seen": 3056144, + "step": 16030 + }, + { + "epoch": 8.334199584199585, + "grad_norm": 0.795365571975708, + "learning_rate": 4.111143269933787e-06, + "loss": 0.267, + "num_input_tokens_seen": 3057104, + "step": 16035 + }, + { + "epoch": 8.336798336798337, + "grad_norm": 0.4876331388950348, + "learning_rate": 4.098692176668137e-06, + "loss": 0.2835, + "num_input_tokens_seen": 3058096, + "step": 16040 + }, + { + "epoch": 8.339397089397089, + "grad_norm": 0.6634555459022522, + "learning_rate": 4.086258282989996e-06, + "loss": 0.239, + "num_input_tokens_seen": 3058992, + "step": 16045 + }, + { + "epoch": 8.341995841995843, + "grad_norm": 0.3754941523075104, + "learning_rate": 4.073841599131145e-06, + "loss": 0.2218, + "num_input_tokens_seen": 3059920, + "step": 16050 + }, + { + "epoch": 8.344594594594595, + "grad_norm": 0.6149423718452454, + "learning_rate": 4.061442135309224e-06, + "loss": 0.2896, + "num_input_tokens_seen": 3060784, + "step": 16055 + }, + { + "epoch": 8.347193347193347, + "grad_norm": 0.7615821957588196, + "learning_rate": 4.049059901727681e-06, + "loss": 0.31, + "num_input_tokens_seen": 3061776, + "step": 16060 + }, + { + "epoch": 8.3497920997921, + "grad_norm": 0.21303026378154755, + "learning_rate": 4.036694908575808e-06, + "loss": 0.2995, + "num_input_tokens_seen": 3062736, + "step": 16065 + }, + { + "epoch": 8.352390852390853, + "grad_norm": 0.3000330328941345, + "learning_rate": 4.024347166028708e-06, + "loss": 0.2657, + "num_input_tokens_seen": 3063728, + "step": 16070 + }, + { + "epoch": 8.354989604989605, + "grad_norm": 0.3409865200519562, + "learning_rate": 4.012016684247258e-06, + "loss": 0.2872, + "num_input_tokens_seen": 3064688, + "step": 16075 + }, + { + "epoch": 8.357588357588357, + "grad_norm": 0.2997090816497803, + "learning_rate": 3.999703473378169e-06, + "loss": 0.2732, + "num_input_tokens_seen": 3065648, + "step": 16080 + }, + { + "epoch": 8.36018711018711, + "grad_norm": 0.30393123626708984, + "learning_rate": 3.987407543553911e-06, + "loss": 0.2609, + "num_input_tokens_seen": 3066576, + "step": 16085 + }, + { + "epoch": 8.362785862785863, + "grad_norm": 0.13827063143253326, + "learning_rate": 3.9751289048927635e-06, + "loss": 0.2567, + "num_input_tokens_seen": 3067504, + "step": 16090 + }, + { + "epoch": 8.365384615384615, + "grad_norm": 0.49067848920822144, + "learning_rate": 3.962867567498746e-06, + "loss": 0.2748, + "num_input_tokens_seen": 3068432, + "step": 16095 + }, + { + "epoch": 8.367983367983369, + "grad_norm": 0.40860632061958313, + "learning_rate": 3.950623541461665e-06, + "loss": 0.2341, + "num_input_tokens_seen": 3069456, + "step": 16100 + }, + { + "epoch": 8.370582120582121, + "grad_norm": 0.34276264905929565, + "learning_rate": 3.938396836857067e-06, + "loss": 0.2528, + "num_input_tokens_seen": 3070352, + "step": 16105 + }, + { + "epoch": 8.373180873180873, + "grad_norm": 0.5650733709335327, + "learning_rate": 3.926187463746242e-06, + "loss": 0.2227, + "num_input_tokens_seen": 3071280, + "step": 16110 + }, + { + "epoch": 8.375779625779625, + "grad_norm": 0.7225736975669861, + "learning_rate": 3.913995432176243e-06, + "loss": 0.2925, + "num_input_tokens_seen": 3072240, + "step": 16115 + }, + { + "epoch": 8.378378378378379, + "grad_norm": 0.4574034810066223, + "learning_rate": 3.901820752179816e-06, + "loss": 0.2456, + "num_input_tokens_seen": 3073232, + "step": 16120 + }, + { + "epoch": 8.380977130977131, + "grad_norm": 0.33224451541900635, + "learning_rate": 3.889663433775465e-06, + "loss": 0.2956, + "num_input_tokens_seen": 3074224, + "step": 16125 + }, + { + "epoch": 8.383575883575883, + "grad_norm": 0.5797611474990845, + "learning_rate": 3.877523486967377e-06, + "loss": 0.2303, + "num_input_tokens_seen": 3075184, + "step": 16130 + }, + { + "epoch": 8.386174636174637, + "grad_norm": 0.2649322748184204, + "learning_rate": 3.865400921745466e-06, + "loss": 0.2476, + "num_input_tokens_seen": 3076112, + "step": 16135 + }, + { + "epoch": 8.388773388773389, + "grad_norm": 0.5886318683624268, + "learning_rate": 3.853295748085331e-06, + "loss": 0.2643, + "num_input_tokens_seen": 3077008, + "step": 16140 + }, + { + "epoch": 8.391372141372141, + "grad_norm": 0.32874101400375366, + "learning_rate": 3.841207975948255e-06, + "loss": 0.2872, + "num_input_tokens_seen": 3078032, + "step": 16145 + }, + { + "epoch": 8.393970893970893, + "grad_norm": 0.6076323986053467, + "learning_rate": 3.829137615281217e-06, + "loss": 0.2379, + "num_input_tokens_seen": 3079024, + "step": 16150 + }, + { + "epoch": 8.396569646569647, + "grad_norm": 0.44295644760131836, + "learning_rate": 3.817084676016855e-06, + "loss": 0.2937, + "num_input_tokens_seen": 3079920, + "step": 16155 + }, + { + "epoch": 8.3991683991684, + "grad_norm": 0.24821361899375916, + "learning_rate": 3.8050491680734823e-06, + "loss": 0.2405, + "num_input_tokens_seen": 3080848, + "step": 16160 + }, + { + "epoch": 8.401767151767151, + "grad_norm": 0.40845930576324463, + "learning_rate": 3.793031101355057e-06, + "loss": 0.2718, + "num_input_tokens_seen": 3081744, + "step": 16165 + }, + { + "epoch": 8.404365904365905, + "grad_norm": 0.5745272040367126, + "learning_rate": 3.7810304857511914e-06, + "loss": 0.2796, + "num_input_tokens_seen": 3082672, + "step": 16170 + }, + { + "epoch": 8.406964656964657, + "grad_norm": 0.5078439712524414, + "learning_rate": 3.7690473311371267e-06, + "loss": 0.2594, + "num_input_tokens_seen": 3083600, + "step": 16175 + }, + { + "epoch": 8.40956340956341, + "grad_norm": 0.5886228680610657, + "learning_rate": 3.7570816473737584e-06, + "loss": 0.251, + "num_input_tokens_seen": 3084528, + "step": 16180 + }, + { + "epoch": 8.412162162162161, + "grad_norm": 0.388919860124588, + "learning_rate": 3.7451334443075747e-06, + "loss": 0.2424, + "num_input_tokens_seen": 3085520, + "step": 16185 + }, + { + "epoch": 8.414760914760915, + "grad_norm": 0.2552807033061981, + "learning_rate": 3.7332027317707076e-06, + "loss": 0.234, + "num_input_tokens_seen": 3086480, + "step": 16190 + }, + { + "epoch": 8.417359667359667, + "grad_norm": 0.28067493438720703, + "learning_rate": 3.7212895195808868e-06, + "loss": 0.2213, + "num_input_tokens_seen": 3087472, + "step": 16195 + }, + { + "epoch": 8.41995841995842, + "grad_norm": 0.6027266979217529, + "learning_rate": 3.7093938175414344e-06, + "loss": 0.2566, + "num_input_tokens_seen": 3088432, + "step": 16200 + }, + { + "epoch": 8.422557172557173, + "grad_norm": 0.15213647484779358, + "learning_rate": 3.697515635441262e-06, + "loss": 0.2638, + "num_input_tokens_seen": 3089456, + "step": 16205 + }, + { + "epoch": 8.425155925155925, + "grad_norm": 0.583132803440094, + "learning_rate": 3.6856549830548704e-06, + "loss": 0.2359, + "num_input_tokens_seen": 3090448, + "step": 16210 + }, + { + "epoch": 8.427754677754677, + "grad_norm": 0.4545833468437195, + "learning_rate": 3.6738118701423434e-06, + "loss": 0.2568, + "num_input_tokens_seen": 3091536, + "step": 16215 + }, + { + "epoch": 8.43035343035343, + "grad_norm": 0.5920125842094421, + "learning_rate": 3.661986306449311e-06, + "loss": 0.2251, + "num_input_tokens_seen": 3092464, + "step": 16220 + }, + { + "epoch": 8.432952182952183, + "grad_norm": 1.0350650548934937, + "learning_rate": 3.650178301706983e-06, + "loss": 0.235, + "num_input_tokens_seen": 3093456, + "step": 16225 + }, + { + "epoch": 8.435550935550935, + "grad_norm": 0.5616374015808105, + "learning_rate": 3.638387865632109e-06, + "loss": 0.2719, + "num_input_tokens_seen": 3094384, + "step": 16230 + }, + { + "epoch": 8.438149688149688, + "grad_norm": 0.32017526030540466, + "learning_rate": 3.6266150079269755e-06, + "loss": 0.2432, + "num_input_tokens_seen": 3095312, + "step": 16235 + }, + { + "epoch": 8.440748440748441, + "grad_norm": 0.23005282878875732, + "learning_rate": 3.614859738279422e-06, + "loss": 0.2234, + "num_input_tokens_seen": 3096336, + "step": 16240 + }, + { + "epoch": 8.443347193347194, + "grad_norm": 0.5742866396903992, + "learning_rate": 3.603122066362796e-06, + "loss": 0.2799, + "num_input_tokens_seen": 3097296, + "step": 16245 + }, + { + "epoch": 8.445945945945946, + "grad_norm": 0.5716853141784668, + "learning_rate": 3.5914020018359804e-06, + "loss": 0.2269, + "num_input_tokens_seen": 3098224, + "step": 16250 + }, + { + "epoch": 8.448544698544698, + "grad_norm": 0.5488483905792236, + "learning_rate": 3.579699554343352e-06, + "loss": 0.1941, + "num_input_tokens_seen": 3099216, + "step": 16255 + }, + { + "epoch": 8.451143451143452, + "grad_norm": 0.4019309878349304, + "learning_rate": 3.56801473351481e-06, + "loss": 0.2303, + "num_input_tokens_seen": 3100176, + "step": 16260 + }, + { + "epoch": 8.453742203742204, + "grad_norm": 0.32575055956840515, + "learning_rate": 3.5563475489657326e-06, + "loss": 0.2231, + "num_input_tokens_seen": 3101104, + "step": 16265 + }, + { + "epoch": 8.456340956340956, + "grad_norm": 0.47796279191970825, + "learning_rate": 3.544698010296982e-06, + "loss": 0.3092, + "num_input_tokens_seen": 3102032, + "step": 16270 + }, + { + "epoch": 8.45893970893971, + "grad_norm": 0.14881488680839539, + "learning_rate": 3.533066127094925e-06, + "loss": 0.2356, + "num_input_tokens_seen": 3102992, + "step": 16275 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 0.20028482377529144, + "learning_rate": 3.5214519089313726e-06, + "loss": 0.29, + "num_input_tokens_seen": 3103856, + "step": 16280 + }, + { + "epoch": 8.464137214137214, + "grad_norm": 0.2931399643421173, + "learning_rate": 3.509855365363615e-06, + "loss": 0.2895, + "num_input_tokens_seen": 3104816, + "step": 16285 + }, + { + "epoch": 8.466735966735968, + "grad_norm": 0.5042515397071838, + "learning_rate": 3.4982765059343864e-06, + "loss": 0.2112, + "num_input_tokens_seen": 3105776, + "step": 16290 + }, + { + "epoch": 8.46933471933472, + "grad_norm": 0.4780595898628235, + "learning_rate": 3.4867153401718865e-06, + "loss": 0.3076, + "num_input_tokens_seen": 3106768, + "step": 16295 + }, + { + "epoch": 8.471933471933472, + "grad_norm": 0.36012986302375793, + "learning_rate": 3.4751718775897392e-06, + "loss": 0.2421, + "num_input_tokens_seen": 3107760, + "step": 16300 + }, + { + "epoch": 8.474532224532224, + "grad_norm": 0.16788485646247864, + "learning_rate": 3.4636461276870038e-06, + "loss": 0.2574, + "num_input_tokens_seen": 3108720, + "step": 16305 + }, + { + "epoch": 8.477130977130978, + "grad_norm": 0.4656088948249817, + "learning_rate": 3.45213809994816e-06, + "loss": 0.1803, + "num_input_tokens_seen": 3109616, + "step": 16310 + }, + { + "epoch": 8.47972972972973, + "grad_norm": 0.5629457235336304, + "learning_rate": 3.4406478038431137e-06, + "loss": 0.215, + "num_input_tokens_seen": 3110544, + "step": 16315 + }, + { + "epoch": 8.482328482328482, + "grad_norm": 0.47860413789749146, + "learning_rate": 3.429175248827182e-06, + "loss": 0.3556, + "num_input_tokens_seen": 3111504, + "step": 16320 + }, + { + "epoch": 8.484927234927236, + "grad_norm": 0.5228756666183472, + "learning_rate": 3.4177204443410688e-06, + "loss": 0.2263, + "num_input_tokens_seen": 3112432, + "step": 16325 + }, + { + "epoch": 8.487525987525988, + "grad_norm": 0.38257288932800293, + "learning_rate": 3.406283399810889e-06, + "loss": 0.2912, + "num_input_tokens_seen": 3113456, + "step": 16330 + }, + { + "epoch": 8.49012474012474, + "grad_norm": 0.3059269189834595, + "learning_rate": 3.3948641246481142e-06, + "loss": 0.1888, + "num_input_tokens_seen": 3114384, + "step": 16335 + }, + { + "epoch": 8.492723492723492, + "grad_norm": 0.5218487977981567, + "learning_rate": 3.3834626282496285e-06, + "loss": 0.2605, + "num_input_tokens_seen": 3115344, + "step": 16340 + }, + { + "epoch": 8.495322245322246, + "grad_norm": 0.4828503131866455, + "learning_rate": 3.3720789199976567e-06, + "loss": 0.3132, + "num_input_tokens_seen": 3116304, + "step": 16345 + }, + { + "epoch": 8.497920997920998, + "grad_norm": 0.2823927104473114, + "learning_rate": 3.360713009259811e-06, + "loss": 0.2754, + "num_input_tokens_seen": 3117264, + "step": 16350 + }, + { + "epoch": 8.5, + "eval_loss": 0.24837632477283478, + "eval_runtime": 7.9384, + "eval_samples_per_second": 107.83, + "eval_steps_per_second": 26.957, + "num_input_tokens_seen": 3118000, + "step": 16354 + }, + { + "epoch": 8.50051975051975, + "grad_norm": 0.5524745583534241, + "learning_rate": 3.3493649053890326e-06, + "loss": 0.2338, + "num_input_tokens_seen": 3118224, + "step": 16355 + }, + { + "epoch": 8.503118503118504, + "grad_norm": 0.5190361142158508, + "learning_rate": 3.338034617723637e-06, + "loss": 0.3082, + "num_input_tokens_seen": 3119152, + "step": 16360 + }, + { + "epoch": 8.505717255717256, + "grad_norm": 0.17977343499660492, + "learning_rate": 3.3267221555872584e-06, + "loss": 0.3186, + "num_input_tokens_seen": 3120112, + "step": 16365 + }, + { + "epoch": 8.508316008316008, + "grad_norm": 0.3885452151298523, + "learning_rate": 3.3154275282888585e-06, + "loss": 0.2533, + "num_input_tokens_seen": 3121104, + "step": 16370 + }, + { + "epoch": 8.51091476091476, + "grad_norm": 0.7959849834442139, + "learning_rate": 3.30415074512275e-06, + "loss": 0.3423, + "num_input_tokens_seen": 3122096, + "step": 16375 + }, + { + "epoch": 8.513513513513514, + "grad_norm": 0.5415710806846619, + "learning_rate": 3.292891815368526e-06, + "loss": 0.2604, + "num_input_tokens_seen": 3122992, + "step": 16380 + }, + { + "epoch": 8.516112266112266, + "grad_norm": 0.651553750038147, + "learning_rate": 3.2816507482911264e-06, + "loss": 0.2413, + "num_input_tokens_seen": 3123952, + "step": 16385 + }, + { + "epoch": 8.518711018711018, + "grad_norm": 0.4069029688835144, + "learning_rate": 3.2704275531407565e-06, + "loss": 0.3106, + "num_input_tokens_seen": 3124944, + "step": 16390 + }, + { + "epoch": 8.521309771309772, + "grad_norm": 0.39799395203590393, + "learning_rate": 3.25922223915294e-06, + "loss": 0.2819, + "num_input_tokens_seen": 3125936, + "step": 16395 + }, + { + "epoch": 8.523908523908524, + "grad_norm": 0.6356663107872009, + "learning_rate": 3.248034815548473e-06, + "loss": 0.2863, + "num_input_tokens_seen": 3126896, + "step": 16400 + }, + { + "epoch": 8.526507276507276, + "grad_norm": 0.37915244698524475, + "learning_rate": 3.2368652915334307e-06, + "loss": 0.2491, + "num_input_tokens_seen": 3127856, + "step": 16405 + }, + { + "epoch": 8.529106029106028, + "grad_norm": 0.34859123826026917, + "learning_rate": 3.225713676299169e-06, + "loss": 0.2389, + "num_input_tokens_seen": 3128784, + "step": 16410 + }, + { + "epoch": 8.531704781704782, + "grad_norm": 0.5391877889633179, + "learning_rate": 3.2145799790222893e-06, + "loss": 0.2604, + "num_input_tokens_seen": 3129744, + "step": 16415 + }, + { + "epoch": 8.534303534303534, + "grad_norm": 0.39306220412254333, + "learning_rate": 3.2034642088646704e-06, + "loss": 0.252, + "num_input_tokens_seen": 3130768, + "step": 16420 + }, + { + "epoch": 8.536902286902286, + "grad_norm": 0.2361217886209488, + "learning_rate": 3.1923663749734182e-06, + "loss": 0.2524, + "num_input_tokens_seen": 3131760, + "step": 16425 + }, + { + "epoch": 8.53950103950104, + "grad_norm": 0.3774906396865845, + "learning_rate": 3.1812864864808973e-06, + "loss": 0.2744, + "num_input_tokens_seen": 3132752, + "step": 16430 + }, + { + "epoch": 8.542099792099792, + "grad_norm": 0.14569427073001862, + "learning_rate": 3.1702245525046803e-06, + "loss": 0.292, + "num_input_tokens_seen": 3133712, + "step": 16435 + }, + { + "epoch": 8.544698544698544, + "grad_norm": 0.3027002215385437, + "learning_rate": 3.159180582147589e-06, + "loss": 0.223, + "num_input_tokens_seen": 3134640, + "step": 16440 + }, + { + "epoch": 8.547297297297296, + "grad_norm": 0.147825226187706, + "learning_rate": 3.1481545844976617e-06, + "loss": 0.2479, + "num_input_tokens_seen": 3135600, + "step": 16445 + }, + { + "epoch": 8.54989604989605, + "grad_norm": 0.6223174929618835, + "learning_rate": 3.137146568628127e-06, + "loss": 0.2523, + "num_input_tokens_seen": 3136560, + "step": 16450 + }, + { + "epoch": 8.552494802494802, + "grad_norm": 0.5797799825668335, + "learning_rate": 3.126156543597439e-06, + "loss": 0.2928, + "num_input_tokens_seen": 3137520, + "step": 16455 + }, + { + "epoch": 8.555093555093555, + "grad_norm": 0.5921339988708496, + "learning_rate": 3.115184518449232e-06, + "loss": 0.2457, + "num_input_tokens_seen": 3138448, + "step": 16460 + }, + { + "epoch": 8.557692307692308, + "grad_norm": 0.38485756516456604, + "learning_rate": 3.104230502212338e-06, + "loss": 0.2088, + "num_input_tokens_seen": 3139376, + "step": 16465 + }, + { + "epoch": 8.56029106029106, + "grad_norm": 0.773973286151886, + "learning_rate": 3.0932945039007536e-06, + "loss": 0.2529, + "num_input_tokens_seen": 3140304, + "step": 16470 + }, + { + "epoch": 8.562889812889813, + "grad_norm": 0.40343061089515686, + "learning_rate": 3.0823765325136754e-06, + "loss": 0.2531, + "num_input_tokens_seen": 3141168, + "step": 16475 + }, + { + "epoch": 8.565488565488565, + "grad_norm": 0.43484699726104736, + "learning_rate": 3.0714765970354414e-06, + "loss": 0.266, + "num_input_tokens_seen": 3142128, + "step": 16480 + }, + { + "epoch": 8.568087318087318, + "grad_norm": 0.748672604560852, + "learning_rate": 3.06059470643556e-06, + "loss": 0.2395, + "num_input_tokens_seen": 3143088, + "step": 16485 + }, + { + "epoch": 8.57068607068607, + "grad_norm": 0.088621124625206, + "learning_rate": 3.049730869668685e-06, + "loss": 0.2743, + "num_input_tokens_seen": 3144048, + "step": 16490 + }, + { + "epoch": 8.573284823284823, + "grad_norm": 0.5819865465164185, + "learning_rate": 3.0388850956746233e-06, + "loss": 0.2177, + "num_input_tokens_seen": 3144912, + "step": 16495 + }, + { + "epoch": 8.575883575883577, + "grad_norm": 0.5872961282730103, + "learning_rate": 3.028057393378306e-06, + "loss": 0.1906, + "num_input_tokens_seen": 3145776, + "step": 16500 + }, + { + "epoch": 8.578482328482329, + "grad_norm": 0.5410410761833191, + "learning_rate": 3.0172477716897934e-06, + "loss": 0.2681, + "num_input_tokens_seen": 3146736, + "step": 16505 + }, + { + "epoch": 8.58108108108108, + "grad_norm": 0.27369657158851624, + "learning_rate": 3.0064562395042824e-06, + "loss": 0.2594, + "num_input_tokens_seen": 3147728, + "step": 16510 + }, + { + "epoch": 8.583679833679835, + "grad_norm": 0.5843499302864075, + "learning_rate": 2.995682805702063e-06, + "loss": 0.2612, + "num_input_tokens_seen": 3148752, + "step": 16515 + }, + { + "epoch": 8.586278586278587, + "grad_norm": 0.549202024936676, + "learning_rate": 2.9849274791485554e-06, + "loss": 0.2253, + "num_input_tokens_seen": 3149744, + "step": 16520 + }, + { + "epoch": 8.588877338877339, + "grad_norm": 0.553983747959137, + "learning_rate": 2.9741902686942575e-06, + "loss": 0.277, + "num_input_tokens_seen": 3150704, + "step": 16525 + }, + { + "epoch": 8.59147609147609, + "grad_norm": 0.5068913102149963, + "learning_rate": 2.963471183174768e-06, + "loss": 0.297, + "num_input_tokens_seen": 3151600, + "step": 16530 + }, + { + "epoch": 8.594074844074845, + "grad_norm": 0.5170083045959473, + "learning_rate": 2.9527702314107814e-06, + "loss": 0.277, + "num_input_tokens_seen": 3152592, + "step": 16535 + }, + { + "epoch": 8.596673596673597, + "grad_norm": 0.5214045643806458, + "learning_rate": 2.942087422208051e-06, + "loss": 0.1853, + "num_input_tokens_seen": 3153584, + "step": 16540 + }, + { + "epoch": 8.599272349272349, + "grad_norm": 0.30232781171798706, + "learning_rate": 2.9314227643574166e-06, + "loss": 0.1931, + "num_input_tokens_seen": 3154512, + "step": 16545 + }, + { + "epoch": 8.601871101871101, + "grad_norm": 0.8255061507225037, + "learning_rate": 2.920776266634767e-06, + "loss": 0.2677, + "num_input_tokens_seen": 3155536, + "step": 16550 + }, + { + "epoch": 8.604469854469855, + "grad_norm": 0.5892074108123779, + "learning_rate": 2.9101479378010637e-06, + "loss": 0.2654, + "num_input_tokens_seen": 3156528, + "step": 16555 + }, + { + "epoch": 8.607068607068607, + "grad_norm": 0.07065124809741974, + "learning_rate": 2.899537786602305e-06, + "loss": 0.2084, + "num_input_tokens_seen": 3157488, + "step": 16560 + }, + { + "epoch": 8.609667359667359, + "grad_norm": 0.8013378381729126, + "learning_rate": 2.8889458217695287e-06, + "loss": 0.2744, + "num_input_tokens_seen": 3158448, + "step": 16565 + }, + { + "epoch": 8.612266112266113, + "grad_norm": 0.4151361286640167, + "learning_rate": 2.878372052018824e-06, + "loss": 0.2014, + "num_input_tokens_seen": 3159312, + "step": 16570 + }, + { + "epoch": 8.614864864864865, + "grad_norm": 0.3049032986164093, + "learning_rate": 2.8678164860512834e-06, + "loss": 0.2411, + "num_input_tokens_seen": 3160272, + "step": 16575 + }, + { + "epoch": 8.617463617463617, + "grad_norm": 0.8126928210258484, + "learning_rate": 2.8572791325530425e-06, + "loss": 0.2995, + "num_input_tokens_seen": 3161232, + "step": 16580 + }, + { + "epoch": 8.62006237006237, + "grad_norm": 0.45172733068466187, + "learning_rate": 2.8467600001952336e-06, + "loss": 0.2863, + "num_input_tokens_seen": 3162192, + "step": 16585 + }, + { + "epoch": 8.622661122661123, + "grad_norm": 0.5358821749687195, + "learning_rate": 2.8362590976340114e-06, + "loss": 0.2259, + "num_input_tokens_seen": 3163152, + "step": 16590 + }, + { + "epoch": 8.625259875259875, + "grad_norm": 0.5077499747276306, + "learning_rate": 2.825776433510499e-06, + "loss": 0.2535, + "num_input_tokens_seen": 3164048, + "step": 16595 + }, + { + "epoch": 8.627858627858627, + "grad_norm": 0.3475364148616791, + "learning_rate": 2.8153120164508493e-06, + "loss": 0.2471, + "num_input_tokens_seen": 3165008, + "step": 16600 + }, + { + "epoch": 8.630457380457381, + "grad_norm": 0.5189000368118286, + "learning_rate": 2.8048658550661714e-06, + "loss": 0.3082, + "num_input_tokens_seen": 3165904, + "step": 16605 + }, + { + "epoch": 8.633056133056133, + "grad_norm": 0.48546627163887024, + "learning_rate": 2.7944379579525697e-06, + "loss": 0.1851, + "num_input_tokens_seen": 3166864, + "step": 16610 + }, + { + "epoch": 8.635654885654885, + "grad_norm": 0.5247379541397095, + "learning_rate": 2.784028333691105e-06, + "loss": 0.2574, + "num_input_tokens_seen": 3167888, + "step": 16615 + }, + { + "epoch": 8.638253638253639, + "grad_norm": 0.15828797221183777, + "learning_rate": 2.773636990847808e-06, + "loss": 0.3059, + "num_input_tokens_seen": 3168784, + "step": 16620 + }, + { + "epoch": 8.640852390852391, + "grad_norm": 0.3445800542831421, + "learning_rate": 2.763263937973681e-06, + "loss": 0.2477, + "num_input_tokens_seen": 3169712, + "step": 16625 + }, + { + "epoch": 8.643451143451143, + "grad_norm": 0.38023367524147034, + "learning_rate": 2.752909183604635e-06, + "loss": 0.2153, + "num_input_tokens_seen": 3170704, + "step": 16630 + }, + { + "epoch": 8.646049896049895, + "grad_norm": 0.28781116008758545, + "learning_rate": 2.74257273626157e-06, + "loss": 0.2872, + "num_input_tokens_seen": 3171568, + "step": 16635 + }, + { + "epoch": 8.64864864864865, + "grad_norm": 0.31516793370246887, + "learning_rate": 2.7322546044502824e-06, + "loss": 0.2467, + "num_input_tokens_seen": 3172496, + "step": 16640 + }, + { + "epoch": 8.651247401247401, + "grad_norm": 0.5640187859535217, + "learning_rate": 2.72195479666153e-06, + "loss": 0.2722, + "num_input_tokens_seen": 3173456, + "step": 16645 + }, + { + "epoch": 8.653846153846153, + "grad_norm": 0.6017543077468872, + "learning_rate": 2.711673321370961e-06, + "loss": 0.256, + "num_input_tokens_seen": 3174448, + "step": 16650 + }, + { + "epoch": 8.656444906444907, + "grad_norm": 0.10516424477100372, + "learning_rate": 2.701410187039169e-06, + "loss": 0.2395, + "num_input_tokens_seen": 3175472, + "step": 16655 + }, + { + "epoch": 8.65904365904366, + "grad_norm": 0.20163676142692566, + "learning_rate": 2.691165402111628e-06, + "loss": 0.2699, + "num_input_tokens_seen": 3176432, + "step": 16660 + }, + { + "epoch": 8.661642411642411, + "grad_norm": 0.6342714428901672, + "learning_rate": 2.6809389750187208e-06, + "loss": 0.264, + "num_input_tokens_seen": 3177360, + "step": 16665 + }, + { + "epoch": 8.664241164241163, + "grad_norm": 0.30005401372909546, + "learning_rate": 2.670730914175737e-06, + "loss": 0.3357, + "num_input_tokens_seen": 3178320, + "step": 16670 + }, + { + "epoch": 8.666839916839917, + "grad_norm": 0.5461612343788147, + "learning_rate": 2.6605412279828267e-06, + "loss": 0.233, + "num_input_tokens_seen": 3179248, + "step": 16675 + }, + { + "epoch": 8.66943866943867, + "grad_norm": 0.5636135339736938, + "learning_rate": 2.6503699248250523e-06, + "loss": 0.2122, + "num_input_tokens_seen": 3180240, + "step": 16680 + }, + { + "epoch": 8.672037422037421, + "grad_norm": 0.23138518631458282, + "learning_rate": 2.6402170130723132e-06, + "loss": 0.262, + "num_input_tokens_seen": 3181232, + "step": 16685 + }, + { + "epoch": 8.674636174636175, + "grad_norm": 0.13066020607948303, + "learning_rate": 2.630082501079409e-06, + "loss": 0.2798, + "num_input_tokens_seen": 3182192, + "step": 16690 + }, + { + "epoch": 8.677234927234927, + "grad_norm": 0.47491660714149475, + "learning_rate": 2.619966397185972e-06, + "loss": 0.2341, + "num_input_tokens_seen": 3183088, + "step": 16695 + }, + { + "epoch": 8.67983367983368, + "grad_norm": 0.6499217748641968, + "learning_rate": 2.6098687097164955e-06, + "loss": 0.2648, + "num_input_tokens_seen": 3183984, + "step": 16700 + }, + { + "epoch": 8.682432432432432, + "grad_norm": 0.7729581594467163, + "learning_rate": 2.5997894469803247e-06, + "loss": 0.2969, + "num_input_tokens_seen": 3184912, + "step": 16705 + }, + { + "epoch": 8.685031185031185, + "grad_norm": 0.5573490858078003, + "learning_rate": 2.5897286172716307e-06, + "loss": 0.1862, + "num_input_tokens_seen": 3185904, + "step": 16710 + }, + { + "epoch": 8.687629937629938, + "grad_norm": 0.2550241947174072, + "learning_rate": 2.5796862288694324e-06, + "loss": 0.2503, + "num_input_tokens_seen": 3186864, + "step": 16715 + }, + { + "epoch": 8.69022869022869, + "grad_norm": 0.4708923399448395, + "learning_rate": 2.56966229003755e-06, + "loss": 0.3422, + "num_input_tokens_seen": 3187792, + "step": 16720 + }, + { + "epoch": 8.692827442827443, + "grad_norm": 0.3087703585624695, + "learning_rate": 2.5596568090246548e-06, + "loss": 0.3189, + "num_input_tokens_seen": 3188720, + "step": 16725 + }, + { + "epoch": 8.695426195426196, + "grad_norm": 0.27556103467941284, + "learning_rate": 2.5496697940641854e-06, + "loss": 0.2519, + "num_input_tokens_seen": 3189648, + "step": 16730 + }, + { + "epoch": 8.698024948024948, + "grad_norm": 0.6161733269691467, + "learning_rate": 2.53970125337443e-06, + "loss": 0.2307, + "num_input_tokens_seen": 3190640, + "step": 16735 + }, + { + "epoch": 8.700623700623701, + "grad_norm": 0.6339653730392456, + "learning_rate": 2.5297511951584417e-06, + "loss": 0.2428, + "num_input_tokens_seen": 3191664, + "step": 16740 + }, + { + "epoch": 8.703222453222454, + "grad_norm": 0.4798462986946106, + "learning_rate": 2.5198196276040782e-06, + "loss": 0.2672, + "num_input_tokens_seen": 3192656, + "step": 16745 + }, + { + "epoch": 8.705821205821206, + "grad_norm": 0.5494415163993835, + "learning_rate": 2.509906558883987e-06, + "loss": 0.248, + "num_input_tokens_seen": 3193616, + "step": 16750 + }, + { + "epoch": 8.708419958419958, + "grad_norm": 0.4396020472049713, + "learning_rate": 2.5000119971555826e-06, + "loss": 0.3036, + "num_input_tokens_seen": 3194544, + "step": 16755 + }, + { + "epoch": 8.711018711018712, + "grad_norm": 0.24212297797203064, + "learning_rate": 2.4901359505610482e-06, + "loss": 0.2712, + "num_input_tokens_seen": 3195472, + "step": 16760 + }, + { + "epoch": 8.713617463617464, + "grad_norm": 0.5867029428482056, + "learning_rate": 2.480278427227334e-06, + "loss": 0.1964, + "num_input_tokens_seen": 3196368, + "step": 16765 + }, + { + "epoch": 8.716216216216216, + "grad_norm": 0.8095393776893616, + "learning_rate": 2.4704394352661586e-06, + "loss": 0.3156, + "num_input_tokens_seen": 3197296, + "step": 16770 + }, + { + "epoch": 8.71881496881497, + "grad_norm": 0.5957677364349365, + "learning_rate": 2.460618982773974e-06, + "loss": 0.1868, + "num_input_tokens_seen": 3198224, + "step": 16775 + }, + { + "epoch": 8.721413721413722, + "grad_norm": 0.3447374701499939, + "learning_rate": 2.4508170778319904e-06, + "loss": 0.3184, + "num_input_tokens_seen": 3199120, + "step": 16780 + }, + { + "epoch": 8.724012474012474, + "grad_norm": 0.1640871912240982, + "learning_rate": 2.4410337285061424e-06, + "loss": 0.2787, + "num_input_tokens_seen": 3200080, + "step": 16785 + }, + { + "epoch": 8.726611226611226, + "grad_norm": 0.6247601509094238, + "learning_rate": 2.431268942847112e-06, + "loss": 0.2388, + "num_input_tokens_seen": 3200976, + "step": 16790 + }, + { + "epoch": 8.72920997920998, + "grad_norm": 0.3212721049785614, + "learning_rate": 2.4215227288902883e-06, + "loss": 0.2504, + "num_input_tokens_seen": 3201904, + "step": 16795 + }, + { + "epoch": 8.731808731808732, + "grad_norm": 0.6261093616485596, + "learning_rate": 2.4117950946557807e-06, + "loss": 0.2204, + "num_input_tokens_seen": 3202864, + "step": 16800 + }, + { + "epoch": 8.734407484407484, + "grad_norm": 0.6053287386894226, + "learning_rate": 2.402086048148422e-06, + "loss": 0.2137, + "num_input_tokens_seen": 3203760, + "step": 16805 + }, + { + "epoch": 8.737006237006238, + "grad_norm": 0.5328572988510132, + "learning_rate": 2.3923955973577327e-06, + "loss": 0.2639, + "num_input_tokens_seen": 3204752, + "step": 16810 + }, + { + "epoch": 8.73960498960499, + "grad_norm": 0.760982871055603, + "learning_rate": 2.382723750257948e-06, + "loss": 0.2846, + "num_input_tokens_seen": 3205712, + "step": 16815 + }, + { + "epoch": 8.742203742203742, + "grad_norm": 0.5554988384246826, + "learning_rate": 2.373070514807979e-06, + "loss": 0.2311, + "num_input_tokens_seen": 3206640, + "step": 16820 + }, + { + "epoch": 8.744802494802494, + "grad_norm": 0.38906940817832947, + "learning_rate": 2.3634358989514273e-06, + "loss": 0.2363, + "num_input_tokens_seen": 3207600, + "step": 16825 + }, + { + "epoch": 8.747401247401248, + "grad_norm": 0.335017591714859, + "learning_rate": 2.3538199106165754e-06, + "loss": 0.2863, + "num_input_tokens_seen": 3208592, + "step": 16830 + }, + { + "epoch": 8.75, + "grad_norm": 0.15835772454738617, + "learning_rate": 2.3442225577163717e-06, + "loss": 0.2736, + "num_input_tokens_seen": 3209552, + "step": 16835 + }, + { + "epoch": 8.752598752598752, + "grad_norm": 0.22048242390155792, + "learning_rate": 2.3346438481484407e-06, + "loss": 0.2593, + "num_input_tokens_seen": 3210512, + "step": 16840 + }, + { + "epoch": 8.755197505197506, + "grad_norm": 0.6029435992240906, + "learning_rate": 2.3250837897950433e-06, + "loss": 0.2689, + "num_input_tokens_seen": 3211376, + "step": 16845 + }, + { + "epoch": 8.757796257796258, + "grad_norm": 0.21623289585113525, + "learning_rate": 2.3155423905231207e-06, + "loss": 0.2351, + "num_input_tokens_seen": 3212368, + "step": 16850 + }, + { + "epoch": 8.76039501039501, + "grad_norm": 0.5878241062164307, + "learning_rate": 2.3060196581842385e-06, + "loss": 0.2577, + "num_input_tokens_seen": 3213296, + "step": 16855 + }, + { + "epoch": 8.762993762993762, + "grad_norm": 0.27771955728530884, + "learning_rate": 2.29651560061461e-06, + "loss": 0.2575, + "num_input_tokens_seen": 3214256, + "step": 16860 + }, + { + "epoch": 8.765592515592516, + "grad_norm": 0.22684532403945923, + "learning_rate": 2.28703022563507e-06, + "loss": 0.2841, + "num_input_tokens_seen": 3215280, + "step": 16865 + }, + { + "epoch": 8.768191268191268, + "grad_norm": 0.2500496208667755, + "learning_rate": 2.2775635410510975e-06, + "loss": 0.3247, + "num_input_tokens_seen": 3216304, + "step": 16870 + }, + { + "epoch": 8.77079002079002, + "grad_norm": 0.5921105146408081, + "learning_rate": 2.2681155546527886e-06, + "loss": 0.2411, + "num_input_tokens_seen": 3217200, + "step": 16875 + }, + { + "epoch": 8.773388773388774, + "grad_norm": 0.3667896091938019, + "learning_rate": 2.258686274214833e-06, + "loss": 0.2575, + "num_input_tokens_seen": 3218128, + "step": 16880 + }, + { + "epoch": 8.775987525987526, + "grad_norm": 0.595609724521637, + "learning_rate": 2.2492757074965594e-06, + "loss": 0.2899, + "num_input_tokens_seen": 3219056, + "step": 16885 + }, + { + "epoch": 8.778586278586278, + "grad_norm": 0.45058244466781616, + "learning_rate": 2.2398838622418568e-06, + "loss": 0.261, + "num_input_tokens_seen": 3220016, + "step": 16890 + }, + { + "epoch": 8.78118503118503, + "grad_norm": 0.1623951494693756, + "learning_rate": 2.230510746179243e-06, + "loss": 0.2388, + "num_input_tokens_seen": 3220912, + "step": 16895 + }, + { + "epoch": 8.783783783783784, + "grad_norm": 0.7854175567626953, + "learning_rate": 2.2211563670218067e-06, + "loss": 0.2815, + "num_input_tokens_seen": 3222000, + "step": 16900 + }, + { + "epoch": 8.786382536382536, + "grad_norm": 0.27167099714279175, + "learning_rate": 2.2118207324672293e-06, + "loss": 0.298, + "num_input_tokens_seen": 3222992, + "step": 16905 + }, + { + "epoch": 8.788981288981288, + "grad_norm": 0.29586946964263916, + "learning_rate": 2.2025038501977486e-06, + "loss": 0.2582, + "num_input_tokens_seen": 3223920, + "step": 16910 + }, + { + "epoch": 8.791580041580042, + "grad_norm": 0.4356180429458618, + "learning_rate": 2.193205727880193e-06, + "loss": 0.2623, + "num_input_tokens_seen": 3224880, + "step": 16915 + }, + { + "epoch": 8.794178794178794, + "grad_norm": 0.45885998010635376, + "learning_rate": 2.1839263731659425e-06, + "loss": 0.3135, + "num_input_tokens_seen": 3225808, + "step": 16920 + }, + { + "epoch": 8.796777546777546, + "grad_norm": 0.16196425259113312, + "learning_rate": 2.1746657936909278e-06, + "loss": 0.283, + "num_input_tokens_seen": 3226768, + "step": 16925 + }, + { + "epoch": 8.799376299376299, + "grad_norm": 0.8498143553733826, + "learning_rate": 2.165423997075644e-06, + "loss": 0.2885, + "num_input_tokens_seen": 3227824, + "step": 16930 + }, + { + "epoch": 8.801975051975052, + "grad_norm": 0.405510812997818, + "learning_rate": 2.156200990925114e-06, + "loss": 0.2717, + "num_input_tokens_seen": 3228784, + "step": 16935 + }, + { + "epoch": 8.804573804573804, + "grad_norm": 0.6686161160469055, + "learning_rate": 2.146996782828914e-06, + "loss": 0.2919, + "num_input_tokens_seen": 3229840, + "step": 16940 + }, + { + "epoch": 8.807172557172557, + "grad_norm": 0.36889445781707764, + "learning_rate": 2.137811380361135e-06, + "loss": 0.223, + "num_input_tokens_seen": 3230864, + "step": 16945 + }, + { + "epoch": 8.80977130977131, + "grad_norm": 0.43165406584739685, + "learning_rate": 2.1286447910804086e-06, + "loss": 0.2777, + "num_input_tokens_seen": 3231792, + "step": 16950 + }, + { + "epoch": 8.812370062370062, + "grad_norm": 0.5971927642822266, + "learning_rate": 2.1194970225298786e-06, + "loss": 0.2097, + "num_input_tokens_seen": 3232656, + "step": 16955 + }, + { + "epoch": 8.814968814968815, + "grad_norm": 0.6304140090942383, + "learning_rate": 2.110368082237188e-06, + "loss": 0.261, + "num_input_tokens_seen": 3233648, + "step": 16960 + }, + { + "epoch": 8.817567567567568, + "grad_norm": 0.38031312823295593, + "learning_rate": 2.101257977714516e-06, + "loss": 0.2489, + "num_input_tokens_seen": 3234704, + "step": 16965 + }, + { + "epoch": 8.82016632016632, + "grad_norm": 0.4897674322128296, + "learning_rate": 2.09216671645851e-06, + "loss": 0.2394, + "num_input_tokens_seen": 3235696, + "step": 16970 + }, + { + "epoch": 8.822765072765073, + "grad_norm": 0.48527491092681885, + "learning_rate": 2.0830943059503367e-06, + "loss": 0.3195, + "num_input_tokens_seen": 3236720, + "step": 16975 + }, + { + "epoch": 8.825363825363825, + "grad_norm": 0.2890283167362213, + "learning_rate": 2.0740407536556318e-06, + "loss": 0.2204, + "num_input_tokens_seen": 3237680, + "step": 16980 + }, + { + "epoch": 8.827962577962579, + "grad_norm": 0.4981621503829956, + "learning_rate": 2.0650060670245303e-06, + "loss": 0.2906, + "num_input_tokens_seen": 3238672, + "step": 16985 + }, + { + "epoch": 8.83056133056133, + "grad_norm": 0.5954980254173279, + "learning_rate": 2.0559902534916213e-06, + "loss": 0.1881, + "num_input_tokens_seen": 3239632, + "step": 16990 + }, + { + "epoch": 8.833160083160083, + "grad_norm": 0.33526232838630676, + "learning_rate": 2.046993320475979e-06, + "loss": 0.3003, + "num_input_tokens_seen": 3240624, + "step": 16995 + }, + { + "epoch": 8.835758835758837, + "grad_norm": 0.46231165528297424, + "learning_rate": 2.0380152753811443e-06, + "loss": 0.2595, + "num_input_tokens_seen": 3241648, + "step": 17000 + }, + { + "epoch": 8.838357588357589, + "grad_norm": 0.4146597981452942, + "learning_rate": 2.0290561255950967e-06, + "loss": 0.2575, + "num_input_tokens_seen": 3242640, + "step": 17005 + }, + { + "epoch": 8.84095634095634, + "grad_norm": 0.7870091199874878, + "learning_rate": 2.0201158784902916e-06, + "loss": 0.2997, + "num_input_tokens_seen": 3243536, + "step": 17010 + }, + { + "epoch": 8.843555093555093, + "grad_norm": 0.2716161608695984, + "learning_rate": 2.0111945414236083e-06, + "loss": 0.2542, + "num_input_tokens_seen": 3244464, + "step": 17015 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 0.6486825346946716, + "learning_rate": 2.002292121736371e-06, + "loss": 0.2873, + "num_input_tokens_seen": 3245392, + "step": 17020 + }, + { + "epoch": 8.848752598752599, + "grad_norm": 0.10801484435796738, + "learning_rate": 1.9934086267543396e-06, + "loss": 0.2437, + "num_input_tokens_seen": 3246224, + "step": 17025 + }, + { + "epoch": 8.85135135135135, + "grad_norm": 0.4322042465209961, + "learning_rate": 1.984544063787705e-06, + "loss": 0.2765, + "num_input_tokens_seen": 3247152, + "step": 17030 + }, + { + "epoch": 8.853950103950105, + "grad_norm": 0.6657654643058777, + "learning_rate": 1.9756984401310684e-06, + "loss": 0.2626, + "num_input_tokens_seen": 3248112, + "step": 17035 + }, + { + "epoch": 8.856548856548857, + "grad_norm": 0.7385665774345398, + "learning_rate": 1.9668717630634575e-06, + "loss": 0.3061, + "num_input_tokens_seen": 3249072, + "step": 17040 + }, + { + "epoch": 8.859147609147609, + "grad_norm": 0.7692967057228088, + "learning_rate": 1.958064039848295e-06, + "loss": 0.2823, + "num_input_tokens_seen": 3250128, + "step": 17045 + }, + { + "epoch": 8.861746361746361, + "grad_norm": 0.6095483899116516, + "learning_rate": 1.9492752777334256e-06, + "loss": 0.2446, + "num_input_tokens_seen": 3251056, + "step": 17050 + }, + { + "epoch": 8.864345114345115, + "grad_norm": 0.27954986691474915, + "learning_rate": 1.9405054839510744e-06, + "loss": 0.2258, + "num_input_tokens_seen": 3252144, + "step": 17055 + }, + { + "epoch": 8.866943866943867, + "grad_norm": 0.5726961493492126, + "learning_rate": 1.931754665717858e-06, + "loss": 0.2015, + "num_input_tokens_seen": 3253040, + "step": 17060 + }, + { + "epoch": 8.869542619542619, + "grad_norm": 0.4936210811138153, + "learning_rate": 1.9230228302347942e-06, + "loss": 0.2944, + "num_input_tokens_seen": 3254000, + "step": 17065 + }, + { + "epoch": 8.872141372141373, + "grad_norm": 0.4279714524745941, + "learning_rate": 1.91430998468726e-06, + "loss": 0.249, + "num_input_tokens_seen": 3254960, + "step": 17070 + }, + { + "epoch": 8.874740124740125, + "grad_norm": 0.6152064204216003, + "learning_rate": 1.9056161362450226e-06, + "loss": 0.2519, + "num_input_tokens_seen": 3255856, + "step": 17075 + }, + { + "epoch": 8.877338877338877, + "grad_norm": 0.2252865582704544, + "learning_rate": 1.8969412920622015e-06, + "loss": 0.2568, + "num_input_tokens_seen": 3256848, + "step": 17080 + }, + { + "epoch": 8.87993762993763, + "grad_norm": 0.36837446689605713, + "learning_rate": 1.8882854592772892e-06, + "loss": 0.2209, + "num_input_tokens_seen": 3257776, + "step": 17085 + }, + { + "epoch": 8.882536382536383, + "grad_norm": 0.5758026838302612, + "learning_rate": 1.8796486450131296e-06, + "loss": 0.2569, + "num_input_tokens_seen": 3258736, + "step": 17090 + }, + { + "epoch": 8.885135135135135, + "grad_norm": 0.6353163719177246, + "learning_rate": 1.8710308563769124e-06, + "loss": 0.2587, + "num_input_tokens_seen": 3259728, + "step": 17095 + }, + { + "epoch": 8.887733887733887, + "grad_norm": 0.6009305715560913, + "learning_rate": 1.862432100460182e-06, + "loss": 0.2511, + "num_input_tokens_seen": 3260720, + "step": 17100 + }, + { + "epoch": 8.890332640332641, + "grad_norm": 0.46576768159866333, + "learning_rate": 1.8538523843388056e-06, + "loss": 0.2684, + "num_input_tokens_seen": 3261712, + "step": 17105 + }, + { + "epoch": 8.892931392931393, + "grad_norm": 0.3049167990684509, + "learning_rate": 1.845291715073e-06, + "loss": 0.2671, + "num_input_tokens_seen": 3262672, + "step": 17110 + }, + { + "epoch": 8.895530145530145, + "grad_norm": 0.24039414525032043, + "learning_rate": 1.836750099707296e-06, + "loss": 0.2697, + "num_input_tokens_seen": 3263568, + "step": 17115 + }, + { + "epoch": 8.898128898128899, + "grad_norm": 0.25430500507354736, + "learning_rate": 1.8282275452705444e-06, + "loss": 0.2036, + "num_input_tokens_seen": 3264560, + "step": 17120 + }, + { + "epoch": 8.900727650727651, + "grad_norm": 0.6293903589248657, + "learning_rate": 1.8197240587759207e-06, + "loss": 0.2249, + "num_input_tokens_seen": 3265488, + "step": 17125 + }, + { + "epoch": 8.903326403326403, + "grad_norm": 0.4605918228626251, + "learning_rate": 1.8112396472208997e-06, + "loss": 0.2778, + "num_input_tokens_seen": 3266416, + "step": 17130 + }, + { + "epoch": 8.905925155925155, + "grad_norm": 0.30586305260658264, + "learning_rate": 1.8027743175872664e-06, + "loss": 0.2154, + "num_input_tokens_seen": 3267440, + "step": 17135 + }, + { + "epoch": 8.90852390852391, + "grad_norm": 0.503738522529602, + "learning_rate": 1.7943280768410981e-06, + "loss": 0.3172, + "num_input_tokens_seen": 3268368, + "step": 17140 + }, + { + "epoch": 8.911122661122661, + "grad_norm": 0.49254319071769714, + "learning_rate": 1.7859009319327713e-06, + "loss": 0.3271, + "num_input_tokens_seen": 3269360, + "step": 17145 + }, + { + "epoch": 8.913721413721413, + "grad_norm": 0.157418891787529, + "learning_rate": 1.7774928897969418e-06, + "loss": 0.2478, + "num_input_tokens_seen": 3270288, + "step": 17150 + }, + { + "epoch": 8.916320166320165, + "grad_norm": 0.4663027822971344, + "learning_rate": 1.7691039573525442e-06, + "loss": 0.2857, + "num_input_tokens_seen": 3271248, + "step": 17155 + }, + { + "epoch": 8.91891891891892, + "grad_norm": 0.6291508078575134, + "learning_rate": 1.76073414150279e-06, + "loss": 0.2154, + "num_input_tokens_seen": 3272112, + "step": 17160 + }, + { + "epoch": 8.921517671517671, + "grad_norm": 0.5145701169967651, + "learning_rate": 1.7523834491351699e-06, + "loss": 0.2383, + "num_input_tokens_seen": 3273168, + "step": 17165 + }, + { + "epoch": 8.924116424116423, + "grad_norm": 0.2558283805847168, + "learning_rate": 1.7440518871214173e-06, + "loss": 0.2545, + "num_input_tokens_seen": 3274128, + "step": 17170 + }, + { + "epoch": 8.926715176715177, + "grad_norm": 0.5827282667160034, + "learning_rate": 1.7357394623175454e-06, + "loss": 0.2266, + "num_input_tokens_seen": 3275056, + "step": 17175 + }, + { + "epoch": 8.92931392931393, + "grad_norm": 0.40103524923324585, + "learning_rate": 1.7274461815638104e-06, + "loss": 0.2848, + "num_input_tokens_seen": 3275952, + "step": 17180 + }, + { + "epoch": 8.931912681912682, + "grad_norm": 0.24455320835113525, + "learning_rate": 1.7191720516847032e-06, + "loss": 0.2744, + "num_input_tokens_seen": 3276880, + "step": 17185 + }, + { + "epoch": 8.934511434511435, + "grad_norm": 0.84245765209198, + "learning_rate": 1.7109170794889773e-06, + "loss": 0.2732, + "num_input_tokens_seen": 3277808, + "step": 17190 + }, + { + "epoch": 8.937110187110187, + "grad_norm": 0.3777548372745514, + "learning_rate": 1.7026812717695988e-06, + "loss": 0.2988, + "num_input_tokens_seen": 3278768, + "step": 17195 + }, + { + "epoch": 8.93970893970894, + "grad_norm": 0.47599491477012634, + "learning_rate": 1.6944646353037858e-06, + "loss": 0.2484, + "num_input_tokens_seen": 3279792, + "step": 17200 + }, + { + "epoch": 8.942307692307692, + "grad_norm": 0.37561649084091187, + "learning_rate": 1.6862671768529626e-06, + "loss": 0.2646, + "num_input_tokens_seen": 3280848, + "step": 17205 + }, + { + "epoch": 8.944906444906445, + "grad_norm": 0.20683470368385315, + "learning_rate": 1.6780889031627861e-06, + "loss": 0.2667, + "num_input_tokens_seen": 3281808, + "step": 17210 + }, + { + "epoch": 8.947505197505198, + "grad_norm": 0.35596317052841187, + "learning_rate": 1.6699298209631148e-06, + "loss": 0.2254, + "num_input_tokens_seen": 3282704, + "step": 17215 + }, + { + "epoch": 8.95010395010395, + "grad_norm": 0.4125673174858093, + "learning_rate": 1.661789936968014e-06, + "loss": 0.2553, + "num_input_tokens_seen": 3283632, + "step": 17220 + }, + { + "epoch": 8.952702702702704, + "grad_norm": 0.153655007481575, + "learning_rate": 1.6536692578757646e-06, + "loss": 0.2939, + "num_input_tokens_seen": 3284592, + "step": 17225 + }, + { + "epoch": 8.955301455301456, + "grad_norm": 0.5722097158432007, + "learning_rate": 1.6455677903688293e-06, + "loss": 0.2333, + "num_input_tokens_seen": 3285488, + "step": 17230 + }, + { + "epoch": 8.957900207900208, + "grad_norm": 0.719990074634552, + "learning_rate": 1.6374855411138702e-06, + "loss": 0.2484, + "num_input_tokens_seen": 3286416, + "step": 17235 + }, + { + "epoch": 8.96049896049896, + "grad_norm": 0.49086952209472656, + "learning_rate": 1.6294225167617305e-06, + "loss": 0.2994, + "num_input_tokens_seen": 3287440, + "step": 17240 + }, + { + "epoch": 8.963097713097714, + "grad_norm": 0.5651909708976746, + "learning_rate": 1.6213787239474365e-06, + "loss": 0.3116, + "num_input_tokens_seen": 3288336, + "step": 17245 + }, + { + "epoch": 8.965696465696466, + "grad_norm": 0.31239357590675354, + "learning_rate": 1.6133541692901877e-06, + "loss": 0.2581, + "num_input_tokens_seen": 3289264, + "step": 17250 + }, + { + "epoch": 8.968295218295218, + "grad_norm": 0.2875784635543823, + "learning_rate": 1.6053488593933464e-06, + "loss": 0.2705, + "num_input_tokens_seen": 3290192, + "step": 17255 + }, + { + "epoch": 8.970893970893972, + "grad_norm": 0.42467427253723145, + "learning_rate": 1.597362800844454e-06, + "loss": 0.2828, + "num_input_tokens_seen": 3291120, + "step": 17260 + }, + { + "epoch": 8.973492723492724, + "grad_norm": 0.39187368750572205, + "learning_rate": 1.5893960002151903e-06, + "loss": 0.2905, + "num_input_tokens_seen": 3292112, + "step": 17265 + }, + { + "epoch": 8.976091476091476, + "grad_norm": 0.19226883351802826, + "learning_rate": 1.581448464061408e-06, + "loss": 0.24, + "num_input_tokens_seen": 3293040, + "step": 17270 + }, + { + "epoch": 8.978690228690228, + "grad_norm": 0.3257329761981964, + "learning_rate": 1.5735201989230868e-06, + "loss": 0.2704, + "num_input_tokens_seen": 3294032, + "step": 17275 + }, + { + "epoch": 8.981288981288982, + "grad_norm": 0.37637799978256226, + "learning_rate": 1.5656112113243721e-06, + "loss": 0.265, + "num_input_tokens_seen": 3294960, + "step": 17280 + }, + { + "epoch": 8.983887733887734, + "grad_norm": 0.7765858769416809, + "learning_rate": 1.5577215077735157e-06, + "loss": 0.2693, + "num_input_tokens_seen": 3295952, + "step": 17285 + }, + { + "epoch": 8.986486486486486, + "grad_norm": 0.6316784024238586, + "learning_rate": 1.5498510947629302e-06, + "loss": 0.2493, + "num_input_tokens_seen": 3296880, + "step": 17290 + }, + { + "epoch": 8.98908523908524, + "grad_norm": 0.45716461539268494, + "learning_rate": 1.541999978769132e-06, + "loss": 0.2374, + "num_input_tokens_seen": 3297808, + "step": 17295 + }, + { + "epoch": 8.991683991683992, + "grad_norm": 0.44943967461586, + "learning_rate": 1.5341681662527724e-06, + "loss": 0.2712, + "num_input_tokens_seen": 3298768, + "step": 17300 + }, + { + "epoch": 8.994282744282744, + "grad_norm": 0.2851673662662506, + "learning_rate": 1.5263556636586157e-06, + "loss": 0.2165, + "num_input_tokens_seen": 3299728, + "step": 17305 + }, + { + "epoch": 8.996881496881496, + "grad_norm": 0.578364372253418, + "learning_rate": 1.5185624774155333e-06, + "loss": 0.2743, + "num_input_tokens_seen": 3300656, + "step": 17310 + }, + { + "epoch": 8.99948024948025, + "grad_norm": 0.39216363430023193, + "learning_rate": 1.5107886139364952e-06, + "loss": 0.2497, + "num_input_tokens_seen": 3301616, + "step": 17315 + }, + { + "epoch": 9.0, + "eval_loss": 0.2505683898925781, + "eval_runtime": 7.934, + "eval_samples_per_second": 107.89, + "eval_steps_per_second": 26.972, + "num_input_tokens_seen": 3301760, + "step": 17316 + }, + { + "epoch": 9.002079002079002, + "grad_norm": 0.517192542552948, + "learning_rate": 1.5030340796185787e-06, + "loss": 0.2165, + "num_input_tokens_seen": 3302528, + "step": 17320 + }, + { + "epoch": 9.004677754677754, + "grad_norm": 0.25189098715782166, + "learning_rate": 1.4952988808429575e-06, + "loss": 0.2319, + "num_input_tokens_seen": 3303424, + "step": 17325 + }, + { + "epoch": 9.007276507276508, + "grad_norm": 0.48622602224349976, + "learning_rate": 1.4875830239748867e-06, + "loss": 0.2725, + "num_input_tokens_seen": 3304384, + "step": 17330 + }, + { + "epoch": 9.00987525987526, + "grad_norm": 0.4578523337841034, + "learning_rate": 1.4798865153637097e-06, + "loss": 0.2951, + "num_input_tokens_seen": 3305376, + "step": 17335 + }, + { + "epoch": 9.012474012474012, + "grad_norm": 0.6856257915496826, + "learning_rate": 1.472209361342844e-06, + "loss": 0.2366, + "num_input_tokens_seen": 3306464, + "step": 17340 + }, + { + "epoch": 9.015072765072764, + "grad_norm": 0.5915378928184509, + "learning_rate": 1.4645515682297911e-06, + "loss": 0.2637, + "num_input_tokens_seen": 3307360, + "step": 17345 + }, + { + "epoch": 9.017671517671518, + "grad_norm": 0.06956221908330917, + "learning_rate": 1.456913142326108e-06, + "loss": 0.2589, + "num_input_tokens_seen": 3308288, + "step": 17350 + }, + { + "epoch": 9.02027027027027, + "grad_norm": 0.2178877741098404, + "learning_rate": 1.4492940899174134e-06, + "loss": 0.2534, + "num_input_tokens_seen": 3309216, + "step": 17355 + }, + { + "epoch": 9.022869022869022, + "grad_norm": 0.6002789735794067, + "learning_rate": 1.441694417273401e-06, + "loss": 0.2383, + "num_input_tokens_seen": 3310208, + "step": 17360 + }, + { + "epoch": 9.025467775467776, + "grad_norm": 0.5947749614715576, + "learning_rate": 1.4341141306477957e-06, + "loss": 0.2317, + "num_input_tokens_seen": 3311168, + "step": 17365 + }, + { + "epoch": 9.028066528066528, + "grad_norm": 0.5573158264160156, + "learning_rate": 1.4265532362783884e-06, + "loss": 0.2417, + "num_input_tokens_seen": 3312096, + "step": 17370 + }, + { + "epoch": 9.03066528066528, + "grad_norm": 0.4467281103134155, + "learning_rate": 1.4190117403869968e-06, + "loss": 0.28, + "num_input_tokens_seen": 3313088, + "step": 17375 + }, + { + "epoch": 9.033264033264032, + "grad_norm": 0.795970618724823, + "learning_rate": 1.4114896491794816e-06, + "loss": 0.2785, + "num_input_tokens_seen": 3314080, + "step": 17380 + }, + { + "epoch": 9.035862785862786, + "grad_norm": 0.6154310703277588, + "learning_rate": 1.4039869688457414e-06, + "loss": 0.2119, + "num_input_tokens_seen": 3315008, + "step": 17385 + }, + { + "epoch": 9.038461538461538, + "grad_norm": 0.43275314569473267, + "learning_rate": 1.396503705559693e-06, + "loss": 0.2853, + "num_input_tokens_seen": 3315936, + "step": 17390 + }, + { + "epoch": 9.04106029106029, + "grad_norm": 0.5319951772689819, + "learning_rate": 1.3890398654792803e-06, + "loss": 0.3417, + "num_input_tokens_seen": 3316896, + "step": 17395 + }, + { + "epoch": 9.043659043659044, + "grad_norm": 0.39027485251426697, + "learning_rate": 1.3815954547464565e-06, + "loss": 0.2883, + "num_input_tokens_seen": 3317952, + "step": 17400 + }, + { + "epoch": 9.046257796257796, + "grad_norm": 0.6020687222480774, + "learning_rate": 1.3741704794872024e-06, + "loss": 0.253, + "num_input_tokens_seen": 3318944, + "step": 17405 + }, + { + "epoch": 9.048856548856548, + "grad_norm": 0.43913906812667847, + "learning_rate": 1.3667649458114857e-06, + "loss": 0.2804, + "num_input_tokens_seen": 3319840, + "step": 17410 + }, + { + "epoch": 9.051455301455302, + "grad_norm": 0.7802557945251465, + "learning_rate": 1.3593788598132928e-06, + "loss": 0.2765, + "num_input_tokens_seen": 3320768, + "step": 17415 + }, + { + "epoch": 9.054054054054054, + "grad_norm": 0.4412871301174164, + "learning_rate": 1.3520122275705871e-06, + "loss": 0.2659, + "num_input_tokens_seen": 3321728, + "step": 17420 + }, + { + "epoch": 9.056652806652806, + "grad_norm": 0.6902355551719666, + "learning_rate": 1.344665055145347e-06, + "loss": 0.2711, + "num_input_tokens_seen": 3322720, + "step": 17425 + }, + { + "epoch": 9.059251559251559, + "grad_norm": 0.9093404412269592, + "learning_rate": 1.3373373485835227e-06, + "loss": 0.2324, + "num_input_tokens_seen": 3323744, + "step": 17430 + }, + { + "epoch": 9.061850311850312, + "grad_norm": 0.5930798649787903, + "learning_rate": 1.3300291139150461e-06, + "loss": 0.2412, + "num_input_tokens_seen": 3324672, + "step": 17435 + }, + { + "epoch": 9.064449064449065, + "grad_norm": 0.16552338004112244, + "learning_rate": 1.3227403571538398e-06, + "loss": 0.2678, + "num_input_tokens_seen": 3325600, + "step": 17440 + }, + { + "epoch": 9.067047817047817, + "grad_norm": 0.33837804198265076, + "learning_rate": 1.3154710842977703e-06, + "loss": 0.2618, + "num_input_tokens_seen": 3326528, + "step": 17445 + }, + { + "epoch": 9.06964656964657, + "grad_norm": 0.58681720495224, + "learning_rate": 1.3082213013286993e-06, + "loss": 0.249, + "num_input_tokens_seen": 3327424, + "step": 17450 + }, + { + "epoch": 9.072245322245323, + "grad_norm": 0.48353344202041626, + "learning_rate": 1.3009910142124354e-06, + "loss": 0.2307, + "num_input_tokens_seen": 3328352, + "step": 17455 + }, + { + "epoch": 9.074844074844075, + "grad_norm": 0.410655677318573, + "learning_rate": 1.2937802288987499e-06, + "loss": 0.3105, + "num_input_tokens_seen": 3329248, + "step": 17460 + }, + { + "epoch": 9.077442827442827, + "grad_norm": 0.23255430161952972, + "learning_rate": 1.286588951321363e-06, + "loss": 0.2133, + "num_input_tokens_seen": 3330208, + "step": 17465 + }, + { + "epoch": 9.08004158004158, + "grad_norm": 0.1810254156589508, + "learning_rate": 1.2794171873979439e-06, + "loss": 0.256, + "num_input_tokens_seen": 3331232, + "step": 17470 + }, + { + "epoch": 9.082640332640333, + "grad_norm": 0.4944901764392853, + "learning_rate": 1.272264943030102e-06, + "loss": 0.2974, + "num_input_tokens_seen": 3332096, + "step": 17475 + }, + { + "epoch": 9.085239085239085, + "grad_norm": 0.15789233148097992, + "learning_rate": 1.2651322241033825e-06, + "loss": 0.2744, + "num_input_tokens_seen": 3333024, + "step": 17480 + }, + { + "epoch": 9.087837837837839, + "grad_norm": 0.2764982581138611, + "learning_rate": 1.2580190364872706e-06, + "loss": 0.2204, + "num_input_tokens_seen": 3333920, + "step": 17485 + }, + { + "epoch": 9.09043659043659, + "grad_norm": 0.09917490929365158, + "learning_rate": 1.2509253860351732e-06, + "loss": 0.2676, + "num_input_tokens_seen": 3334912, + "step": 17490 + }, + { + "epoch": 9.093035343035343, + "grad_norm": 0.5055806040763855, + "learning_rate": 1.2438512785844237e-06, + "loss": 0.265, + "num_input_tokens_seen": 3335840, + "step": 17495 + }, + { + "epoch": 9.095634095634095, + "grad_norm": 0.3252499997615814, + "learning_rate": 1.236796719956268e-06, + "loss": 0.3056, + "num_input_tokens_seen": 3336768, + "step": 17500 + }, + { + "epoch": 9.098232848232849, + "grad_norm": 0.30877235531806946, + "learning_rate": 1.229761715955874e-06, + "loss": 0.2879, + "num_input_tokens_seen": 3337824, + "step": 17505 + }, + { + "epoch": 9.1008316008316, + "grad_norm": 0.11621136963367462, + "learning_rate": 1.2227462723723077e-06, + "loss": 0.2835, + "num_input_tokens_seen": 3338720, + "step": 17510 + }, + { + "epoch": 9.103430353430353, + "grad_norm": 0.3723784387111664, + "learning_rate": 1.2157503949785487e-06, + "loss": 0.259, + "num_input_tokens_seen": 3339808, + "step": 17515 + }, + { + "epoch": 9.106029106029107, + "grad_norm": 0.5092584490776062, + "learning_rate": 1.2087740895314697e-06, + "loss": 0.3054, + "num_input_tokens_seen": 3340768, + "step": 17520 + }, + { + "epoch": 9.108627858627859, + "grad_norm": 0.4067954421043396, + "learning_rate": 1.201817361771837e-06, + "loss": 0.2671, + "num_input_tokens_seen": 3341696, + "step": 17525 + }, + { + "epoch": 9.111226611226611, + "grad_norm": 0.2736160457134247, + "learning_rate": 1.1948802174243158e-06, + "loss": 0.2254, + "num_input_tokens_seen": 3342656, + "step": 17530 + }, + { + "epoch": 9.113825363825363, + "grad_norm": 0.19481131434440613, + "learning_rate": 1.187962662197442e-06, + "loss": 0.2514, + "num_input_tokens_seen": 3343648, + "step": 17535 + }, + { + "epoch": 9.116424116424117, + "grad_norm": 0.46727702021598816, + "learning_rate": 1.181064701783649e-06, + "loss": 0.2788, + "num_input_tokens_seen": 3344576, + "step": 17540 + }, + { + "epoch": 9.119022869022869, + "grad_norm": 0.8230842351913452, + "learning_rate": 1.174186341859221e-06, + "loss": 0.3247, + "num_input_tokens_seen": 3345600, + "step": 17545 + }, + { + "epoch": 9.121621621621621, + "grad_norm": 0.6071925163269043, + "learning_rate": 1.1673275880843382e-06, + "loss": 0.2632, + "num_input_tokens_seen": 3346528, + "step": 17550 + }, + { + "epoch": 9.124220374220375, + "grad_norm": 0.3531988561153412, + "learning_rate": 1.1604884461030392e-06, + "loss": 0.2875, + "num_input_tokens_seen": 3347424, + "step": 17555 + }, + { + "epoch": 9.126819126819127, + "grad_norm": 0.6572548151016235, + "learning_rate": 1.1536689215432106e-06, + "loss": 0.2744, + "num_input_tokens_seen": 3348352, + "step": 17560 + }, + { + "epoch": 9.129417879417879, + "grad_norm": 0.38222452998161316, + "learning_rate": 1.1468690200166193e-06, + "loss": 0.2523, + "num_input_tokens_seen": 3349184, + "step": 17565 + }, + { + "epoch": 9.132016632016631, + "grad_norm": 0.7229334712028503, + "learning_rate": 1.1400887471188614e-06, + "loss": 0.3121, + "num_input_tokens_seen": 3350176, + "step": 17570 + }, + { + "epoch": 9.134615384615385, + "grad_norm": 0.3287067711353302, + "learning_rate": 1.1333281084294045e-06, + "loss": 0.2586, + "num_input_tokens_seen": 3351168, + "step": 17575 + }, + { + "epoch": 9.137214137214137, + "grad_norm": 0.40318915247917175, + "learning_rate": 1.1265871095115315e-06, + "loss": 0.2918, + "num_input_tokens_seen": 3352064, + "step": 17580 + }, + { + "epoch": 9.13981288981289, + "grad_norm": 0.6470275521278381, + "learning_rate": 1.1198657559123888e-06, + "loss": 0.2446, + "num_input_tokens_seen": 3352928, + "step": 17585 + }, + { + "epoch": 9.142411642411643, + "grad_norm": 0.37650004029273987, + "learning_rate": 1.1131640531629377e-06, + "loss": 0.2755, + "num_input_tokens_seen": 3353824, + "step": 17590 + }, + { + "epoch": 9.145010395010395, + "grad_norm": 0.6441230177879333, + "learning_rate": 1.1064820067779897e-06, + "loss": 0.2402, + "num_input_tokens_seen": 3354752, + "step": 17595 + }, + { + "epoch": 9.147609147609147, + "grad_norm": 0.3998930752277374, + "learning_rate": 1.0998196222561568e-06, + "loss": 0.2433, + "num_input_tokens_seen": 3355680, + "step": 17600 + }, + { + "epoch": 9.1502079002079, + "grad_norm": 0.3974587321281433, + "learning_rate": 1.093176905079893e-06, + "loss": 0.269, + "num_input_tokens_seen": 3356672, + "step": 17605 + }, + { + "epoch": 9.152806652806653, + "grad_norm": 0.4248654842376709, + "learning_rate": 1.0865538607154557e-06, + "loss": 0.2634, + "num_input_tokens_seen": 3357536, + "step": 17610 + }, + { + "epoch": 9.155405405405405, + "grad_norm": 0.2663780450820923, + "learning_rate": 1.0799504946129135e-06, + "loss": 0.2058, + "num_input_tokens_seen": 3358496, + "step": 17615 + }, + { + "epoch": 9.158004158004157, + "grad_norm": 0.6029179096221924, + "learning_rate": 1.0733668122061503e-06, + "loss": 0.2512, + "num_input_tokens_seen": 3359360, + "step": 17620 + }, + { + "epoch": 9.160602910602911, + "grad_norm": 0.3733580410480499, + "learning_rate": 1.0668028189128431e-06, + "loss": 0.2759, + "num_input_tokens_seen": 3360320, + "step": 17625 + }, + { + "epoch": 9.163201663201663, + "grad_norm": 0.31925007700920105, + "learning_rate": 1.0602585201344772e-06, + "loss": 0.2282, + "num_input_tokens_seen": 3361184, + "step": 17630 + }, + { + "epoch": 9.165800415800415, + "grad_norm": 0.4602401554584503, + "learning_rate": 1.053733921256317e-06, + "loss": 0.2758, + "num_input_tokens_seen": 3362112, + "step": 17635 + }, + { + "epoch": 9.16839916839917, + "grad_norm": 0.539225697517395, + "learning_rate": 1.0472290276474312e-06, + "loss": 0.2941, + "num_input_tokens_seen": 3363104, + "step": 17640 + }, + { + "epoch": 9.170997920997921, + "grad_norm": 0.3857838809490204, + "learning_rate": 1.0407438446606633e-06, + "loss": 0.2847, + "num_input_tokens_seen": 3364032, + "step": 17645 + }, + { + "epoch": 9.173596673596673, + "grad_norm": 0.6367378830909729, + "learning_rate": 1.034278377632636e-06, + "loss": 0.2028, + "num_input_tokens_seen": 3364992, + "step": 17650 + }, + { + "epoch": 9.176195426195425, + "grad_norm": 0.6372125744819641, + "learning_rate": 1.0278326318837571e-06, + "loss": 0.3039, + "num_input_tokens_seen": 3365920, + "step": 17655 + }, + { + "epoch": 9.17879417879418, + "grad_norm": 0.41472768783569336, + "learning_rate": 1.0214066127181953e-06, + "loss": 0.2791, + "num_input_tokens_seen": 3366848, + "step": 17660 + }, + { + "epoch": 9.181392931392931, + "grad_norm": 0.4123001992702484, + "learning_rate": 1.0150003254238983e-06, + "loss": 0.2608, + "num_input_tokens_seen": 3367776, + "step": 17665 + }, + { + "epoch": 9.183991683991684, + "grad_norm": 0.4833027124404907, + "learning_rate": 1.0086137752725655e-06, + "loss": 0.2812, + "num_input_tokens_seen": 3368704, + "step": 17670 + }, + { + "epoch": 9.186590436590437, + "grad_norm": 0.23921316862106323, + "learning_rate": 1.0022469675196572e-06, + "loss": 0.2341, + "num_input_tokens_seen": 3369696, + "step": 17675 + }, + { + "epoch": 9.18918918918919, + "grad_norm": 0.6593016386032104, + "learning_rate": 9.958999074043935e-07, + "loss": 0.3155, + "num_input_tokens_seen": 3370528, + "step": 17680 + }, + { + "epoch": 9.191787941787942, + "grad_norm": 0.19571660459041595, + "learning_rate": 9.895726001497352e-07, + "loss": 0.274, + "num_input_tokens_seen": 3371424, + "step": 17685 + }, + { + "epoch": 9.194386694386694, + "grad_norm": 0.7119560837745667, + "learning_rate": 9.83265050962398e-07, + "loss": 0.3156, + "num_input_tokens_seen": 3372416, + "step": 17690 + }, + { + "epoch": 9.196985446985448, + "grad_norm": 0.410797655582428, + "learning_rate": 9.769772650328328e-07, + "loss": 0.2826, + "num_input_tokens_seen": 3373344, + "step": 17695 + }, + { + "epoch": 9.1995841995842, + "grad_norm": 0.6726171970367432, + "learning_rate": 9.707092475352285e-07, + "loss": 0.267, + "num_input_tokens_seen": 3374304, + "step": 17700 + }, + { + "epoch": 9.202182952182952, + "grad_norm": 0.6466044783592224, + "learning_rate": 9.644610036275093e-07, + "loss": 0.2582, + "num_input_tokens_seen": 3375232, + "step": 17705 + }, + { + "epoch": 9.204781704781706, + "grad_norm": 0.26436111330986023, + "learning_rate": 9.58232538451323e-07, + "loss": 0.2517, + "num_input_tokens_seen": 3376192, + "step": 17710 + }, + { + "epoch": 9.207380457380458, + "grad_norm": 0.42042702436447144, + "learning_rate": 9.520238571320423e-07, + "loss": 0.2428, + "num_input_tokens_seen": 3377152, + "step": 17715 + }, + { + "epoch": 9.20997920997921, + "grad_norm": 0.4332435131072998, + "learning_rate": 9.458349647787662e-07, + "loss": 0.234, + "num_input_tokens_seen": 3378112, + "step": 17720 + }, + { + "epoch": 9.212577962577962, + "grad_norm": 0.6922247409820557, + "learning_rate": 9.396658664843017e-07, + "loss": 0.2408, + "num_input_tokens_seen": 3379072, + "step": 17725 + }, + { + "epoch": 9.215176715176716, + "grad_norm": 0.685303807258606, + "learning_rate": 9.335165673251739e-07, + "loss": 0.2656, + "num_input_tokens_seen": 3380000, + "step": 17730 + }, + { + "epoch": 9.217775467775468, + "grad_norm": 0.26052579283714294, + "learning_rate": 9.273870723616129e-07, + "loss": 0.2669, + "num_input_tokens_seen": 3381024, + "step": 17735 + }, + { + "epoch": 9.22037422037422, + "grad_norm": 0.2852136492729187, + "learning_rate": 9.212773866375424e-07, + "loss": 0.29, + "num_input_tokens_seen": 3381984, + "step": 17740 + }, + { + "epoch": 9.222972972972974, + "grad_norm": 0.6102067828178406, + "learning_rate": 9.151875151806044e-07, + "loss": 0.2554, + "num_input_tokens_seen": 3382944, + "step": 17745 + }, + { + "epoch": 9.225571725571726, + "grad_norm": 0.38371407985687256, + "learning_rate": 9.091174630021182e-07, + "loss": 0.2509, + "num_input_tokens_seen": 3383936, + "step": 17750 + }, + { + "epoch": 9.228170478170478, + "grad_norm": 0.6299558281898499, + "learning_rate": 9.030672350971076e-07, + "loss": 0.2247, + "num_input_tokens_seen": 3384864, + "step": 17755 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.4556545317173004, + "learning_rate": 8.970368364442705e-07, + "loss": 0.2364, + "num_input_tokens_seen": 3385856, + "step": 17760 + }, + { + "epoch": 9.233367983367984, + "grad_norm": 0.2071264684200287, + "learning_rate": 8.910262720059959e-07, + "loss": 0.3094, + "num_input_tokens_seen": 3386848, + "step": 17765 + }, + { + "epoch": 9.235966735966736, + "grad_norm": 0.3840556740760803, + "learning_rate": 8.850355467283494e-07, + "loss": 0.2889, + "num_input_tokens_seen": 3387776, + "step": 17770 + }, + { + "epoch": 9.238565488565488, + "grad_norm": 0.549126386642456, + "learning_rate": 8.790646655410684e-07, + "loss": 0.2616, + "num_input_tokens_seen": 3388800, + "step": 17775 + }, + { + "epoch": 9.241164241164242, + "grad_norm": 0.12618768215179443, + "learning_rate": 8.731136333575668e-07, + "loss": 0.2915, + "num_input_tokens_seen": 3389856, + "step": 17780 + }, + { + "epoch": 9.243762993762994, + "grad_norm": 0.5545233488082886, + "learning_rate": 8.671824550749164e-07, + "loss": 0.2417, + "num_input_tokens_seen": 3390880, + "step": 17785 + }, + { + "epoch": 9.246361746361746, + "grad_norm": 0.16257798671722412, + "learning_rate": 8.612711355738601e-07, + "loss": 0.2733, + "num_input_tokens_seen": 3391840, + "step": 17790 + }, + { + "epoch": 9.248960498960498, + "grad_norm": 1.0075355768203735, + "learning_rate": 8.553796797187902e-07, + "loss": 0.2817, + "num_input_tokens_seen": 3392768, + "step": 17795 + }, + { + "epoch": 9.251559251559252, + "grad_norm": 0.5709313750267029, + "learning_rate": 8.495080923577619e-07, + "loss": 0.2089, + "num_input_tokens_seen": 3393728, + "step": 17800 + }, + { + "epoch": 9.254158004158004, + "grad_norm": 0.596480131149292, + "learning_rate": 8.436563783224744e-07, + "loss": 0.2475, + "num_input_tokens_seen": 3394656, + "step": 17805 + }, + { + "epoch": 9.256756756756756, + "grad_norm": 0.40103426575660706, + "learning_rate": 8.378245424282755e-07, + "loss": 0.3328, + "num_input_tokens_seen": 3395648, + "step": 17810 + }, + { + "epoch": 9.25935550935551, + "grad_norm": 0.1970403641462326, + "learning_rate": 8.320125894741598e-07, + "loss": 0.2713, + "num_input_tokens_seen": 3396640, + "step": 17815 + }, + { + "epoch": 9.261954261954262, + "grad_norm": 0.6212409138679504, + "learning_rate": 8.262205242427462e-07, + "loss": 0.2017, + "num_input_tokens_seen": 3397600, + "step": 17820 + }, + { + "epoch": 9.264553014553014, + "grad_norm": 0.2418156862258911, + "learning_rate": 8.204483515003081e-07, + "loss": 0.2325, + "num_input_tokens_seen": 3398528, + "step": 17825 + }, + { + "epoch": 9.267151767151766, + "grad_norm": 0.6379526853561401, + "learning_rate": 8.146960759967348e-07, + "loss": 0.2094, + "num_input_tokens_seen": 3399616, + "step": 17830 + }, + { + "epoch": 9.26975051975052, + "grad_norm": 0.6233708262443542, + "learning_rate": 8.089637024655483e-07, + "loss": 0.2549, + "num_input_tokens_seen": 3400544, + "step": 17835 + }, + { + "epoch": 9.272349272349272, + "grad_norm": 0.41276460886001587, + "learning_rate": 8.032512356238864e-07, + "loss": 0.2779, + "num_input_tokens_seen": 3401504, + "step": 17840 + }, + { + "epoch": 9.274948024948024, + "grad_norm": 0.2125103771686554, + "learning_rate": 7.975586801725194e-07, + "loss": 0.2224, + "num_input_tokens_seen": 3402464, + "step": 17845 + }, + { + "epoch": 9.277546777546778, + "grad_norm": 0.6665888428688049, + "learning_rate": 7.91886040795814e-07, + "loss": 0.2682, + "num_input_tokens_seen": 3403392, + "step": 17850 + }, + { + "epoch": 9.28014553014553, + "grad_norm": 0.985129177570343, + "learning_rate": 7.862333221617668e-07, + "loss": 0.2612, + "num_input_tokens_seen": 3404384, + "step": 17855 + }, + { + "epoch": 9.282744282744282, + "grad_norm": 0.6230641603469849, + "learning_rate": 7.806005289219737e-07, + "loss": 0.2731, + "num_input_tokens_seen": 3405376, + "step": 17860 + }, + { + "epoch": 9.285343035343036, + "grad_norm": 0.5433678030967712, + "learning_rate": 7.749876657116295e-07, + "loss": 0.256, + "num_input_tokens_seen": 3406304, + "step": 17865 + }, + { + "epoch": 9.287941787941788, + "grad_norm": 0.22102338075637817, + "learning_rate": 7.693947371495313e-07, + "loss": 0.306, + "num_input_tokens_seen": 3407232, + "step": 17870 + }, + { + "epoch": 9.29054054054054, + "grad_norm": 0.4976428151130676, + "learning_rate": 7.638217478380782e-07, + "loss": 0.2295, + "num_input_tokens_seen": 3408128, + "step": 17875 + }, + { + "epoch": 9.293139293139292, + "grad_norm": 0.3312537372112274, + "learning_rate": 7.582687023632545e-07, + "loss": 0.2658, + "num_input_tokens_seen": 3409120, + "step": 17880 + }, + { + "epoch": 9.295738045738046, + "grad_norm": 0.2901025116443634, + "learning_rate": 7.527356052946327e-07, + "loss": 0.2749, + "num_input_tokens_seen": 3410048, + "step": 17885 + }, + { + "epoch": 9.298336798336798, + "grad_norm": 0.41099992394447327, + "learning_rate": 7.47222461185379e-07, + "loss": 0.3182, + "num_input_tokens_seen": 3411040, + "step": 17890 + }, + { + "epoch": 9.30093555093555, + "grad_norm": 0.41717973351478577, + "learning_rate": 7.417292745722282e-07, + "loss": 0.2746, + "num_input_tokens_seen": 3412000, + "step": 17895 + }, + { + "epoch": 9.303534303534304, + "grad_norm": 0.1894095540046692, + "learning_rate": 7.362560499755006e-07, + "loss": 0.2754, + "num_input_tokens_seen": 3412992, + "step": 17900 + }, + { + "epoch": 9.306133056133056, + "grad_norm": 0.2818840444087982, + "learning_rate": 7.30802791899085e-07, + "loss": 0.2855, + "num_input_tokens_seen": 3413952, + "step": 17905 + }, + { + "epoch": 9.308731808731808, + "grad_norm": 0.41036468744277954, + "learning_rate": 7.253695048304394e-07, + "loss": 0.3008, + "num_input_tokens_seen": 3414944, + "step": 17910 + }, + { + "epoch": 9.31133056133056, + "grad_norm": 0.21233218908309937, + "learning_rate": 7.199561932405952e-07, + "loss": 0.2305, + "num_input_tokens_seen": 3415808, + "step": 17915 + }, + { + "epoch": 9.313929313929314, + "grad_norm": 0.1113169938325882, + "learning_rate": 7.145628615841365e-07, + "loss": 0.2707, + "num_input_tokens_seen": 3416736, + "step": 17920 + }, + { + "epoch": 9.316528066528067, + "grad_norm": 0.6829268932342529, + "learning_rate": 7.091895142992133e-07, + "loss": 0.2987, + "num_input_tokens_seen": 3417600, + "step": 17925 + }, + { + "epoch": 9.319126819126819, + "grad_norm": 0.4168505370616913, + "learning_rate": 7.038361558075273e-07, + "loss": 0.2524, + "num_input_tokens_seen": 3418528, + "step": 17930 + }, + { + "epoch": 9.321725571725572, + "grad_norm": 0.551282525062561, + "learning_rate": 6.985027905143299e-07, + "loss": 0.3042, + "num_input_tokens_seen": 3419456, + "step": 17935 + }, + { + "epoch": 9.324324324324325, + "grad_norm": 0.6531422734260559, + "learning_rate": 6.931894228084268e-07, + "loss": 0.2524, + "num_input_tokens_seen": 3420320, + "step": 17940 + }, + { + "epoch": 9.326923076923077, + "grad_norm": 0.38804391026496887, + "learning_rate": 6.878960570621568e-07, + "loss": 0.2689, + "num_input_tokens_seen": 3421280, + "step": 17945 + }, + { + "epoch": 9.329521829521829, + "grad_norm": 0.22942419350147247, + "learning_rate": 6.826226976314104e-07, + "loss": 0.2583, + "num_input_tokens_seen": 3422208, + "step": 17950 + }, + { + "epoch": 9.332120582120583, + "grad_norm": 0.6169215440750122, + "learning_rate": 6.773693488556083e-07, + "loss": 0.2173, + "num_input_tokens_seen": 3423232, + "step": 17955 + }, + { + "epoch": 9.334719334719335, + "grad_norm": 0.3385472297668457, + "learning_rate": 6.721360150577089e-07, + "loss": 0.2627, + "num_input_tokens_seen": 3424160, + "step": 17960 + }, + { + "epoch": 9.337318087318087, + "grad_norm": 0.3898667097091675, + "learning_rate": 6.669227005441953e-07, + "loss": 0.2656, + "num_input_tokens_seen": 3425088, + "step": 17965 + }, + { + "epoch": 9.33991683991684, + "grad_norm": 0.29234305024147034, + "learning_rate": 6.617294096050802e-07, + "loss": 0.2774, + "num_input_tokens_seen": 3426176, + "step": 17970 + }, + { + "epoch": 9.342515592515593, + "grad_norm": 0.6060370802879333, + "learning_rate": 6.565561465138953e-07, + "loss": 0.2609, + "num_input_tokens_seen": 3427168, + "step": 17975 + }, + { + "epoch": 9.345114345114345, + "grad_norm": 0.43095746636390686, + "learning_rate": 6.514029155276962e-07, + "loss": 0.2566, + "num_input_tokens_seen": 3428192, + "step": 17980 + }, + { + "epoch": 9.347713097713097, + "grad_norm": 0.3382585346698761, + "learning_rate": 6.46269720887055e-07, + "loss": 0.2611, + "num_input_tokens_seen": 3429120, + "step": 17985 + }, + { + "epoch": 9.35031185031185, + "grad_norm": 0.7671194672584534, + "learning_rate": 6.411565668160507e-07, + "loss": 0.2923, + "num_input_tokens_seen": 3430016, + "step": 17990 + }, + { + "epoch": 9.352910602910603, + "grad_norm": 0.5185756683349609, + "learning_rate": 6.360634575222762e-07, + "loss": 0.2749, + "num_input_tokens_seen": 3430944, + "step": 17995 + }, + { + "epoch": 9.355509355509355, + "grad_norm": 0.3312532603740692, + "learning_rate": 6.309903971968262e-07, + "loss": 0.2755, + "num_input_tokens_seen": 3431904, + "step": 18000 + }, + { + "epoch": 9.358108108108109, + "grad_norm": 0.41646286845207214, + "learning_rate": 6.259373900142945e-07, + "loss": 0.2759, + "num_input_tokens_seen": 3432832, + "step": 18005 + }, + { + "epoch": 9.36070686070686, + "grad_norm": 0.5698254704475403, + "learning_rate": 6.209044401327801e-07, + "loss": 0.2581, + "num_input_tokens_seen": 3433824, + "step": 18010 + }, + { + "epoch": 9.363305613305613, + "grad_norm": 0.4245671033859253, + "learning_rate": 6.158915516938729e-07, + "loss": 0.2461, + "num_input_tokens_seen": 3434784, + "step": 18015 + }, + { + "epoch": 9.365904365904367, + "grad_norm": 0.5153218507766724, + "learning_rate": 6.108987288226536e-07, + "loss": 0.2803, + "num_input_tokens_seen": 3435776, + "step": 18020 + }, + { + "epoch": 9.368503118503119, + "grad_norm": 0.3421570658683777, + "learning_rate": 6.059259756276969e-07, + "loss": 0.2904, + "num_input_tokens_seen": 3436736, + "step": 18025 + }, + { + "epoch": 9.371101871101871, + "grad_norm": 0.2652986943721771, + "learning_rate": 6.009732962010544e-07, + "loss": 0.239, + "num_input_tokens_seen": 3437792, + "step": 18030 + }, + { + "epoch": 9.373700623700623, + "grad_norm": 0.36208751797676086, + "learning_rate": 5.960406946182634e-07, + "loss": 0.2826, + "num_input_tokens_seen": 3438688, + "step": 18035 + }, + { + "epoch": 9.376299376299377, + "grad_norm": 1.0130611658096313, + "learning_rate": 5.91128174938338e-07, + "loss": 0.2944, + "num_input_tokens_seen": 3439616, + "step": 18040 + }, + { + "epoch": 9.378898128898129, + "grad_norm": 0.48931050300598145, + "learning_rate": 5.862357412037666e-07, + "loss": 0.2827, + "num_input_tokens_seen": 3440576, + "step": 18045 + }, + { + "epoch": 9.381496881496881, + "grad_norm": 0.3130832016468048, + "learning_rate": 5.813633974405153e-07, + "loss": 0.2331, + "num_input_tokens_seen": 3441504, + "step": 18050 + }, + { + "epoch": 9.384095634095633, + "grad_norm": 0.24472378194332123, + "learning_rate": 5.765111476580043e-07, + "loss": 0.1984, + "num_input_tokens_seen": 3442432, + "step": 18055 + }, + { + "epoch": 9.386694386694387, + "grad_norm": 0.2936934232711792, + "learning_rate": 5.716789958491342e-07, + "loss": 0.2688, + "num_input_tokens_seen": 3443360, + "step": 18060 + }, + { + "epoch": 9.38929313929314, + "grad_norm": 0.26168203353881836, + "learning_rate": 5.668669459902576e-07, + "loss": 0.2733, + "num_input_tokens_seen": 3444320, + "step": 18065 + }, + { + "epoch": 9.391891891891891, + "grad_norm": 0.6726421117782593, + "learning_rate": 5.620750020411847e-07, + "loss": 0.277, + "num_input_tokens_seen": 3445312, + "step": 18070 + }, + { + "epoch": 9.394490644490645, + "grad_norm": 0.38618186116218567, + "learning_rate": 5.573031679451863e-07, + "loss": 0.2581, + "num_input_tokens_seen": 3446304, + "step": 18075 + }, + { + "epoch": 9.397089397089397, + "grad_norm": 0.20977406203746796, + "learning_rate": 5.525514476289823e-07, + "loss": 0.2345, + "num_input_tokens_seen": 3447168, + "step": 18080 + }, + { + "epoch": 9.39968814968815, + "grad_norm": 0.38811811804771423, + "learning_rate": 5.478198450027422e-07, + "loss": 0.2677, + "num_input_tokens_seen": 3448096, + "step": 18085 + }, + { + "epoch": 9.402286902286903, + "grad_norm": 0.37513241171836853, + "learning_rate": 5.431083639600737e-07, + "loss": 0.2286, + "num_input_tokens_seen": 3449024, + "step": 18090 + }, + { + "epoch": 9.404885654885655, + "grad_norm": 0.2907707989215851, + "learning_rate": 5.384170083780421e-07, + "loss": 0.2502, + "num_input_tokens_seen": 3449952, + "step": 18095 + }, + { + "epoch": 9.407484407484407, + "grad_norm": 0.42615339159965515, + "learning_rate": 5.337457821171316e-07, + "loss": 0.2579, + "num_input_tokens_seen": 3450848, + "step": 18100 + }, + { + "epoch": 9.41008316008316, + "grad_norm": 0.8238784074783325, + "learning_rate": 5.290946890212756e-07, + "loss": 0.2619, + "num_input_tokens_seen": 3451904, + "step": 18105 + }, + { + "epoch": 9.412681912681913, + "grad_norm": 1.0069117546081543, + "learning_rate": 5.244637329178403e-07, + "loss": 0.3209, + "num_input_tokens_seen": 3452800, + "step": 18110 + }, + { + "epoch": 9.415280665280665, + "grad_norm": 0.6038723587989807, + "learning_rate": 5.198529176176109e-07, + "loss": 0.2311, + "num_input_tokens_seen": 3453728, + "step": 18115 + }, + { + "epoch": 9.417879417879417, + "grad_norm": 0.6705043911933899, + "learning_rate": 5.152622469148133e-07, + "loss": 0.2425, + "num_input_tokens_seen": 3454688, + "step": 18120 + }, + { + "epoch": 9.420478170478171, + "grad_norm": 0.6683027148246765, + "learning_rate": 5.10691724587084e-07, + "loss": 0.2493, + "num_input_tokens_seen": 3455648, + "step": 18125 + }, + { + "epoch": 9.423076923076923, + "grad_norm": 0.6189332008361816, + "learning_rate": 5.061413543954868e-07, + "loss": 0.2461, + "num_input_tokens_seen": 3456704, + "step": 18130 + }, + { + "epoch": 9.425675675675675, + "grad_norm": 0.22615131735801697, + "learning_rate": 5.016111400844958e-07, + "loss": 0.2176, + "num_input_tokens_seen": 3457696, + "step": 18135 + }, + { + "epoch": 9.428274428274428, + "grad_norm": 0.4639214873313904, + "learning_rate": 4.971010853820069e-07, + "loss": 0.2475, + "num_input_tokens_seen": 3458720, + "step": 18140 + }, + { + "epoch": 9.430873180873181, + "grad_norm": 0.6414378881454468, + "learning_rate": 4.926111939993206e-07, + "loss": 0.2683, + "num_input_tokens_seen": 3459648, + "step": 18145 + }, + { + "epoch": 9.433471933471933, + "grad_norm": 0.49336037039756775, + "learning_rate": 4.881414696311482e-07, + "loss": 0.2572, + "num_input_tokens_seen": 3460608, + "step": 18150 + }, + { + "epoch": 9.436070686070686, + "grad_norm": 0.13656817376613617, + "learning_rate": 4.83691915955603e-07, + "loss": 0.2405, + "num_input_tokens_seen": 3461536, + "step": 18155 + }, + { + "epoch": 9.43866943866944, + "grad_norm": 0.5222653150558472, + "learning_rate": 4.792625366342062e-07, + "loss": 0.258, + "num_input_tokens_seen": 3462464, + "step": 18160 + }, + { + "epoch": 9.441268191268192, + "grad_norm": 0.5963772535324097, + "learning_rate": 4.7485333531187003e-07, + "loss": 0.276, + "num_input_tokens_seen": 3463360, + "step": 18165 + }, + { + "epoch": 9.443866943866944, + "grad_norm": 0.22277478873729706, + "learning_rate": 4.7046431561690307e-07, + "loss": 0.2131, + "num_input_tokens_seen": 3464256, + "step": 18170 + }, + { + "epoch": 9.446465696465696, + "grad_norm": 0.6215074062347412, + "learning_rate": 4.6609548116101354e-07, + "loss": 0.2341, + "num_input_tokens_seen": 3465216, + "step": 18175 + }, + { + "epoch": 9.44906444906445, + "grad_norm": 0.5781880021095276, + "learning_rate": 4.6174683553928954e-07, + "loss": 0.2875, + "num_input_tokens_seen": 3466112, + "step": 18180 + }, + { + "epoch": 9.451663201663202, + "grad_norm": 0.3588712215423584, + "learning_rate": 4.574183823302186e-07, + "loss": 0.2447, + "num_input_tokens_seen": 3467072, + "step": 18185 + }, + { + "epoch": 9.454261954261954, + "grad_norm": 0.5110021233558655, + "learning_rate": 4.531101250956571e-07, + "loss": 0.248, + "num_input_tokens_seen": 3468032, + "step": 18190 + }, + { + "epoch": 9.456860706860708, + "grad_norm": 0.6003345251083374, + "learning_rate": 4.4882206738085243e-07, + "loss": 0.2258, + "num_input_tokens_seen": 3468992, + "step": 18195 + }, + { + "epoch": 9.45945945945946, + "grad_norm": 0.699407160282135, + "learning_rate": 4.445542127144292e-07, + "loss": 0.2898, + "num_input_tokens_seen": 3470016, + "step": 18200 + }, + { + "epoch": 9.462058212058212, + "grad_norm": 0.15645304322242737, + "learning_rate": 4.403065646083809e-07, + "loss": 0.2684, + "num_input_tokens_seen": 3471008, + "step": 18205 + }, + { + "epoch": 9.464656964656964, + "grad_norm": 0.4526180922985077, + "learning_rate": 4.360791265580783e-07, + "loss": 0.234, + "num_input_tokens_seen": 3472032, + "step": 18210 + }, + { + "epoch": 9.467255717255718, + "grad_norm": 0.4725095331668854, + "learning_rate": 4.318719020422607e-07, + "loss": 0.2463, + "num_input_tokens_seen": 3472992, + "step": 18215 + }, + { + "epoch": 9.46985446985447, + "grad_norm": 0.6440339684486389, + "learning_rate": 4.276848945230366e-07, + "loss": 0.2391, + "num_input_tokens_seen": 3473952, + "step": 18220 + }, + { + "epoch": 9.472453222453222, + "grad_norm": 0.37334010004997253, + "learning_rate": 4.235181074458694e-07, + "loss": 0.2997, + "num_input_tokens_seen": 3474912, + "step": 18225 + }, + { + "epoch": 9.475051975051976, + "grad_norm": 0.24213474988937378, + "learning_rate": 4.193715442395885e-07, + "loss": 0.256, + "num_input_tokens_seen": 3475872, + "step": 18230 + }, + { + "epoch": 9.477650727650728, + "grad_norm": 0.5864493250846863, + "learning_rate": 4.152452083163866e-07, + "loss": 0.2286, + "num_input_tokens_seen": 3476864, + "step": 18235 + }, + { + "epoch": 9.48024948024948, + "grad_norm": 0.4109950065612793, + "learning_rate": 4.111391030718004e-07, + "loss": 0.2915, + "num_input_tokens_seen": 3477920, + "step": 18240 + }, + { + "epoch": 9.482848232848234, + "grad_norm": 0.37168484926223755, + "learning_rate": 4.07053231884727e-07, + "loss": 0.2338, + "num_input_tokens_seen": 3478880, + "step": 18245 + }, + { + "epoch": 9.485446985446986, + "grad_norm": 0.4828430712223053, + "learning_rate": 4.0298759811741026e-07, + "loss": 0.2801, + "num_input_tokens_seen": 3479968, + "step": 18250 + }, + { + "epoch": 9.488045738045738, + "grad_norm": 0.6597934365272522, + "learning_rate": 3.989422051154407e-07, + "loss": 0.2225, + "num_input_tokens_seen": 3481024, + "step": 18255 + }, + { + "epoch": 9.49064449064449, + "grad_norm": 0.7216570973396301, + "learning_rate": 3.949170562077553e-07, + "loss": 0.2376, + "num_input_tokens_seen": 3482016, + "step": 18260 + }, + { + "epoch": 9.493243243243244, + "grad_norm": 0.4620494544506073, + "learning_rate": 3.909121547066297e-07, + "loss": 0.2833, + "num_input_tokens_seen": 3482976, + "step": 18265 + }, + { + "epoch": 9.495841995841996, + "grad_norm": 0.5688343644142151, + "learning_rate": 3.8692750390767196e-07, + "loss": 0.2637, + "num_input_tokens_seen": 3483904, + "step": 18270 + }, + { + "epoch": 9.498440748440748, + "grad_norm": 0.6109029650688171, + "learning_rate": 3.8296310708984264e-07, + "loss": 0.2494, + "num_input_tokens_seen": 3484800, + "step": 18275 + }, + { + "epoch": 9.5, + "eval_loss": 0.24974730610847473, + "eval_runtime": 7.9256, + "eval_samples_per_second": 108.005, + "eval_steps_per_second": 27.001, + "num_input_tokens_seen": 3485344, + "step": 18278 + }, + { + "epoch": 9.5010395010395, + "grad_norm": 0.7443499565124512, + "learning_rate": 3.7901896751541545e-07, + "loss": 0.2585, + "num_input_tokens_seen": 3485728, + "step": 18280 + }, + { + "epoch": 9.503638253638254, + "grad_norm": 0.6624563932418823, + "learning_rate": 3.750950884300108e-07, + "loss": 0.233, + "num_input_tokens_seen": 3486720, + "step": 18285 + }, + { + "epoch": 9.506237006237006, + "grad_norm": 0.5057051777839661, + "learning_rate": 3.71191473062571e-07, + "loss": 0.2521, + "num_input_tokens_seen": 3487680, + "step": 18290 + }, + { + "epoch": 9.508835758835758, + "grad_norm": 0.36316558718681335, + "learning_rate": 3.6730812462535404e-07, + "loss": 0.2349, + "num_input_tokens_seen": 3488640, + "step": 18295 + }, + { + "epoch": 9.511434511434512, + "grad_norm": 0.453091025352478, + "learning_rate": 3.6344504631395934e-07, + "loss": 0.2359, + "num_input_tokens_seen": 3489632, + "step": 18300 + }, + { + "epoch": 9.514033264033264, + "grad_norm": 0.6554409265518188, + "learning_rate": 3.5960224130728857e-07, + "loss": 0.2436, + "num_input_tokens_seen": 3490528, + "step": 18305 + }, + { + "epoch": 9.516632016632016, + "grad_norm": 0.1804627776145935, + "learning_rate": 3.5577971276757325e-07, + "loss": 0.3032, + "num_input_tokens_seen": 3491424, + "step": 18310 + }, + { + "epoch": 9.51923076923077, + "grad_norm": 0.2731804847717285, + "learning_rate": 3.519774638403472e-07, + "loss": 0.2596, + "num_input_tokens_seen": 3492256, + "step": 18315 + }, + { + "epoch": 9.521829521829522, + "grad_norm": 0.5628616213798523, + "learning_rate": 3.481954976544716e-07, + "loss": 0.3007, + "num_input_tokens_seen": 3493152, + "step": 18320 + }, + { + "epoch": 9.524428274428274, + "grad_norm": 0.2772600054740906, + "learning_rate": 3.44433817322104e-07, + "loss": 0.2834, + "num_input_tokens_seen": 3494048, + "step": 18325 + }, + { + "epoch": 9.527027027027026, + "grad_norm": 0.46239256858825684, + "learning_rate": 3.406924259387101e-07, + "loss": 0.2406, + "num_input_tokens_seen": 3495040, + "step": 18330 + }, + { + "epoch": 9.52962577962578, + "grad_norm": 0.2462340146303177, + "learning_rate": 3.369713265830715e-07, + "loss": 0.2414, + "num_input_tokens_seen": 3496064, + "step": 18335 + }, + { + "epoch": 9.532224532224532, + "grad_norm": 0.3919811248779297, + "learning_rate": 3.3327052231725276e-07, + "loss": 0.255, + "num_input_tokens_seen": 3496992, + "step": 18340 + }, + { + "epoch": 9.534823284823284, + "grad_norm": 0.2471795678138733, + "learning_rate": 3.2959001618664e-07, + "loss": 0.2418, + "num_input_tokens_seen": 3497952, + "step": 18345 + }, + { + "epoch": 9.537422037422038, + "grad_norm": 0.7675511240959167, + "learning_rate": 3.2592981121989384e-07, + "loss": 0.2347, + "num_input_tokens_seen": 3498912, + "step": 18350 + }, + { + "epoch": 9.54002079002079, + "grad_norm": 0.6506656408309937, + "learning_rate": 3.222899104289856e-07, + "loss": 0.2645, + "num_input_tokens_seen": 3499840, + "step": 18355 + }, + { + "epoch": 9.542619542619542, + "grad_norm": 0.7350687384605408, + "learning_rate": 3.18670316809172e-07, + "loss": 0.3277, + "num_input_tokens_seen": 3500768, + "step": 18360 + }, + { + "epoch": 9.545218295218294, + "grad_norm": 0.5742272138595581, + "learning_rate": 3.150710333389983e-07, + "loss": 0.3109, + "num_input_tokens_seen": 3501760, + "step": 18365 + }, + { + "epoch": 9.547817047817048, + "grad_norm": 0.5492591261863708, + "learning_rate": 3.114920629802981e-07, + "loss": 0.2982, + "num_input_tokens_seen": 3502784, + "step": 18370 + }, + { + "epoch": 9.5504158004158, + "grad_norm": 0.3237125277519226, + "learning_rate": 3.0793340867818763e-07, + "loss": 0.2516, + "num_input_tokens_seen": 3503680, + "step": 18375 + }, + { + "epoch": 9.553014553014552, + "grad_norm": 0.14759546518325806, + "learning_rate": 3.04395073361069e-07, + "loss": 0.2223, + "num_input_tokens_seen": 3504640, + "step": 18380 + }, + { + "epoch": 9.555613305613306, + "grad_norm": 0.6814732551574707, + "learning_rate": 3.008770599406213e-07, + "loss": 0.3066, + "num_input_tokens_seen": 3505504, + "step": 18385 + }, + { + "epoch": 9.558212058212058, + "grad_norm": 0.3048241138458252, + "learning_rate": 2.973793713118039e-07, + "loss": 0.214, + "num_input_tokens_seen": 3506432, + "step": 18390 + }, + { + "epoch": 9.56081081081081, + "grad_norm": 0.5753053426742554, + "learning_rate": 2.9390201035284226e-07, + "loss": 0.2611, + "num_input_tokens_seen": 3507328, + "step": 18395 + }, + { + "epoch": 9.563409563409563, + "grad_norm": 0.2291644960641861, + "learning_rate": 2.904449799252418e-07, + "loss": 0.2533, + "num_input_tokens_seen": 3508192, + "step": 18400 + }, + { + "epoch": 9.566008316008316, + "grad_norm": 0.25726836919784546, + "learning_rate": 2.870082828737797e-07, + "loss": 0.2544, + "num_input_tokens_seen": 3509088, + "step": 18405 + }, + { + "epoch": 9.568607068607069, + "grad_norm": 0.29776531457901, + "learning_rate": 2.8359192202649376e-07, + "loss": 0.2647, + "num_input_tokens_seen": 3510112, + "step": 18410 + }, + { + "epoch": 9.57120582120582, + "grad_norm": 0.06916343420743942, + "learning_rate": 2.8019590019469633e-07, + "loss": 0.2476, + "num_input_tokens_seen": 3510976, + "step": 18415 + }, + { + "epoch": 9.573804573804575, + "grad_norm": 0.23419931530952454, + "learning_rate": 2.7682022017295197e-07, + "loss": 0.2228, + "num_input_tokens_seen": 3511936, + "step": 18420 + }, + { + "epoch": 9.576403326403327, + "grad_norm": 0.5744374394416809, + "learning_rate": 2.734648847390997e-07, + "loss": 0.243, + "num_input_tokens_seen": 3512896, + "step": 18425 + }, + { + "epoch": 9.579002079002079, + "grad_norm": 0.38981154561042786, + "learning_rate": 2.7012989665421706e-07, + "loss": 0.2099, + "num_input_tokens_seen": 3513792, + "step": 18430 + }, + { + "epoch": 9.58160083160083, + "grad_norm": 0.30255207419395447, + "learning_rate": 2.6681525866266157e-07, + "loss": 0.2314, + "num_input_tokens_seen": 3514816, + "step": 18435 + }, + { + "epoch": 9.584199584199585, + "grad_norm": 0.22918939590454102, + "learning_rate": 2.635209734920291e-07, + "loss": 0.2011, + "num_input_tokens_seen": 3515840, + "step": 18440 + }, + { + "epoch": 9.586798336798337, + "grad_norm": 0.7492639422416687, + "learning_rate": 2.602470438531679e-07, + "loss": 0.3124, + "num_input_tokens_seen": 3516704, + "step": 18445 + }, + { + "epoch": 9.589397089397089, + "grad_norm": 0.1386294811964035, + "learning_rate": 2.5699347244018404e-07, + "loss": 0.2375, + "num_input_tokens_seen": 3517664, + "step": 18450 + }, + { + "epoch": 9.591995841995843, + "grad_norm": 0.4687719941139221, + "learning_rate": 2.537602619304247e-07, + "loss": 0.2884, + "num_input_tokens_seen": 3518688, + "step": 18455 + }, + { + "epoch": 9.594594594594595, + "grad_norm": 0.5998404026031494, + "learning_rate": 2.5054741498448386e-07, + "loss": 0.2286, + "num_input_tokens_seen": 3519648, + "step": 18460 + }, + { + "epoch": 9.597193347193347, + "grad_norm": 0.14413122832775116, + "learning_rate": 2.4735493424619394e-07, + "loss": 0.2699, + "num_input_tokens_seen": 3520576, + "step": 18465 + }, + { + "epoch": 9.5997920997921, + "grad_norm": 0.17900440096855164, + "learning_rate": 2.4418282234263957e-07, + "loss": 0.2567, + "num_input_tokens_seen": 3521536, + "step": 18470 + }, + { + "epoch": 9.602390852390853, + "grad_norm": 0.3697381317615509, + "learning_rate": 2.410310818841299e-07, + "loss": 0.2374, + "num_input_tokens_seen": 3522528, + "step": 18475 + }, + { + "epoch": 9.604989604989605, + "grad_norm": 0.27199864387512207, + "learning_rate": 2.3789971546422374e-07, + "loss": 0.2634, + "num_input_tokens_seen": 3523520, + "step": 18480 + }, + { + "epoch": 9.607588357588357, + "grad_norm": 0.5874216556549072, + "learning_rate": 2.3478872565969867e-07, + "loss": 0.2574, + "num_input_tokens_seen": 3524480, + "step": 18485 + }, + { + "epoch": 9.61018711018711, + "grad_norm": 0.32591554522514343, + "learning_rate": 2.316981150305847e-07, + "loss": 0.3252, + "num_input_tokens_seen": 3525408, + "step": 18490 + }, + { + "epoch": 9.612785862785863, + "grad_norm": 0.3220791518688202, + "learning_rate": 2.2862788612012244e-07, + "loss": 0.2755, + "num_input_tokens_seen": 3526304, + "step": 18495 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 0.37822413444519043, + "learning_rate": 2.255780414547909e-07, + "loss": 0.2495, + "num_input_tokens_seen": 3527232, + "step": 18500 + }, + { + "epoch": 9.617983367983367, + "grad_norm": 0.317420095205307, + "learning_rate": 2.2254858354429364e-07, + "loss": 0.2801, + "num_input_tokens_seen": 3528224, + "step": 18505 + }, + { + "epoch": 9.620582120582121, + "grad_norm": 0.5886354446411133, + "learning_rate": 2.19539514881556e-07, + "loss": 0.2494, + "num_input_tokens_seen": 3529120, + "step": 18510 + }, + { + "epoch": 9.623180873180873, + "grad_norm": 0.4477730393409729, + "learning_rate": 2.165508379427278e-07, + "loss": 0.2987, + "num_input_tokens_seen": 3530048, + "step": 18515 + }, + { + "epoch": 9.625779625779625, + "grad_norm": 0.3739985227584839, + "learning_rate": 2.1358255518717786e-07, + "loss": 0.2507, + "num_input_tokens_seen": 3531040, + "step": 18520 + }, + { + "epoch": 9.628378378378379, + "grad_norm": 0.5861207246780396, + "learning_rate": 2.106346690574912e-07, + "loss": 0.2094, + "num_input_tokens_seen": 3532096, + "step": 18525 + }, + { + "epoch": 9.630977130977131, + "grad_norm": 0.5701107978820801, + "learning_rate": 2.0770718197946625e-07, + "loss": 0.233, + "num_input_tokens_seen": 3533088, + "step": 18530 + }, + { + "epoch": 9.633575883575883, + "grad_norm": 0.20972570776939392, + "learning_rate": 2.0480009636212327e-07, + "loss": 0.2596, + "num_input_tokens_seen": 3534080, + "step": 18535 + }, + { + "epoch": 9.636174636174637, + "grad_norm": 0.3098584711551666, + "learning_rate": 2.0191341459768475e-07, + "loss": 0.2583, + "num_input_tokens_seen": 3534976, + "step": 18540 + }, + { + "epoch": 9.638773388773389, + "grad_norm": 0.2780306339263916, + "learning_rate": 1.9904713906159224e-07, + "loss": 0.2756, + "num_input_tokens_seen": 3535936, + "step": 18545 + }, + { + "epoch": 9.641372141372141, + "grad_norm": 0.5229358077049255, + "learning_rate": 1.9620127211248672e-07, + "loss": 0.2278, + "num_input_tokens_seen": 3536896, + "step": 18550 + }, + { + "epoch": 9.643970893970893, + "grad_norm": 0.590140163898468, + "learning_rate": 1.9337581609222277e-07, + "loss": 0.2445, + "num_input_tokens_seen": 3537856, + "step": 18555 + }, + { + "epoch": 9.646569646569647, + "grad_norm": 0.43243369460105896, + "learning_rate": 1.9057077332584883e-07, + "loss": 0.2647, + "num_input_tokens_seen": 3538752, + "step": 18560 + }, + { + "epoch": 9.6491683991684, + "grad_norm": 0.24380013346672058, + "learning_rate": 1.8778614612162404e-07, + "loss": 0.2724, + "num_input_tokens_seen": 3539744, + "step": 18565 + }, + { + "epoch": 9.651767151767151, + "grad_norm": 0.15851661562919617, + "learning_rate": 1.850219367710071e-07, + "loss": 0.2252, + "num_input_tokens_seen": 3540640, + "step": 18570 + }, + { + "epoch": 9.654365904365905, + "grad_norm": 0.3474172055721283, + "learning_rate": 1.8227814754865068e-07, + "loss": 0.1944, + "num_input_tokens_seen": 3541536, + "step": 18575 + }, + { + "epoch": 9.656964656964657, + "grad_norm": 0.3900673985481262, + "learning_rate": 1.7955478071240706e-07, + "loss": 0.2358, + "num_input_tokens_seen": 3542432, + "step": 18580 + }, + { + "epoch": 9.65956340956341, + "grad_norm": 0.21324650943279266, + "learning_rate": 1.7685183850331965e-07, + "loss": 0.2451, + "num_input_tokens_seen": 3543392, + "step": 18585 + }, + { + "epoch": 9.662162162162161, + "grad_norm": 0.7378517985343933, + "learning_rate": 1.7416932314562872e-07, + "loss": 0.2353, + "num_input_tokens_seen": 3544352, + "step": 18590 + }, + { + "epoch": 9.664760914760915, + "grad_norm": 0.3596142828464508, + "learning_rate": 1.7150723684676572e-07, + "loss": 0.2393, + "num_input_tokens_seen": 3545280, + "step": 18595 + }, + { + "epoch": 9.667359667359667, + "grad_norm": 0.1590615063905716, + "learning_rate": 1.6886558179734225e-07, + "loss": 0.2707, + "num_input_tokens_seen": 3546240, + "step": 18600 + }, + { + "epoch": 9.66995841995842, + "grad_norm": 0.49209901690483093, + "learning_rate": 1.662443601711694e-07, + "loss": 0.2327, + "num_input_tokens_seen": 3547200, + "step": 18605 + }, + { + "epoch": 9.672557172557173, + "grad_norm": 0.3766759932041168, + "learning_rate": 1.6364357412523845e-07, + "loss": 0.2993, + "num_input_tokens_seen": 3548224, + "step": 18610 + }, + { + "epoch": 9.675155925155925, + "grad_norm": 0.572062611579895, + "learning_rate": 1.6106322579972077e-07, + "loss": 0.2528, + "num_input_tokens_seen": 3549216, + "step": 18615 + }, + { + "epoch": 9.677754677754677, + "grad_norm": 0.598198413848877, + "learning_rate": 1.585033173179734e-07, + "loss": 0.2658, + "num_input_tokens_seen": 3550208, + "step": 18620 + }, + { + "epoch": 9.68035343035343, + "grad_norm": 0.2921161651611328, + "learning_rate": 1.5596385078653353e-07, + "loss": 0.3102, + "num_input_tokens_seen": 3551168, + "step": 18625 + }, + { + "epoch": 9.682952182952183, + "grad_norm": 0.24963603913784027, + "learning_rate": 1.5344482829511842e-07, + "loss": 0.2671, + "num_input_tokens_seen": 3552160, + "step": 18630 + }, + { + "epoch": 9.685550935550935, + "grad_norm": 0.19811686873435974, + "learning_rate": 1.5094625191661715e-07, + "loss": 0.2852, + "num_input_tokens_seen": 3553152, + "step": 18635 + }, + { + "epoch": 9.688149688149688, + "grad_norm": 0.6159384250640869, + "learning_rate": 1.4846812370709617e-07, + "loss": 0.2785, + "num_input_tokens_seen": 3554080, + "step": 18640 + }, + { + "epoch": 9.690748440748441, + "grad_norm": 0.29077157378196716, + "learning_rate": 1.4601044570579647e-07, + "loss": 0.2745, + "num_input_tokens_seen": 3555040, + "step": 18645 + }, + { + "epoch": 9.693347193347194, + "grad_norm": 0.4158150255680084, + "learning_rate": 1.4357321993513084e-07, + "loss": 0.2583, + "num_input_tokens_seen": 3556032, + "step": 18650 + }, + { + "epoch": 9.695945945945946, + "grad_norm": 0.34541717171669006, + "learning_rate": 1.4115644840067833e-07, + "loss": 0.2727, + "num_input_tokens_seen": 3557024, + "step": 18655 + }, + { + "epoch": 9.698544698544698, + "grad_norm": 0.34536707401275635, + "learning_rate": 1.3876013309118697e-07, + "loss": 0.2682, + "num_input_tokens_seen": 3557984, + "step": 18660 + }, + { + "epoch": 9.701143451143452, + "grad_norm": 0.4566798806190491, + "learning_rate": 1.363842759785794e-07, + "loss": 0.2469, + "num_input_tokens_seen": 3558976, + "step": 18665 + }, + { + "epoch": 9.703742203742204, + "grad_norm": 0.42601194977760315, + "learning_rate": 1.3402887901793338e-07, + "loss": 0.2534, + "num_input_tokens_seen": 3559872, + "step": 18670 + }, + { + "epoch": 9.706340956340956, + "grad_norm": 0.22519707679748535, + "learning_rate": 1.316939441474957e-07, + "loss": 0.2719, + "num_input_tokens_seen": 3560800, + "step": 18675 + }, + { + "epoch": 9.70893970893971, + "grad_norm": 0.4312206208705902, + "learning_rate": 1.2937947328867106e-07, + "loss": 0.2664, + "num_input_tokens_seen": 3561760, + "step": 18680 + }, + { + "epoch": 9.711538461538462, + "grad_norm": 0.4202684462070465, + "learning_rate": 1.270854683460304e-07, + "loss": 0.2885, + "num_input_tokens_seen": 3562688, + "step": 18685 + }, + { + "epoch": 9.714137214137214, + "grad_norm": 0.40043336153030396, + "learning_rate": 1.2481193120729427e-07, + "loss": 0.223, + "num_input_tokens_seen": 3563744, + "step": 18690 + }, + { + "epoch": 9.716735966735968, + "grad_norm": 0.442580908536911, + "learning_rate": 1.2255886374334946e-07, + "loss": 0.2884, + "num_input_tokens_seen": 3564672, + "step": 18695 + }, + { + "epoch": 9.71933471933472, + "grad_norm": 0.5790236592292786, + "learning_rate": 1.203262678082323e-07, + "loss": 0.2341, + "num_input_tokens_seen": 3565600, + "step": 18700 + }, + { + "epoch": 9.721933471933472, + "grad_norm": 0.4618585407733917, + "learning_rate": 1.1811414523913711e-07, + "loss": 0.2322, + "num_input_tokens_seen": 3566624, + "step": 18705 + }, + { + "epoch": 9.724532224532224, + "grad_norm": 0.6193612813949585, + "learning_rate": 1.1592249785641052e-07, + "loss": 0.2369, + "num_input_tokens_seen": 3567584, + "step": 18710 + }, + { + "epoch": 9.727130977130978, + "grad_norm": 0.41462433338165283, + "learning_rate": 1.1375132746354322e-07, + "loss": 0.3308, + "num_input_tokens_seen": 3568480, + "step": 18715 + }, + { + "epoch": 9.72972972972973, + "grad_norm": 0.33880797028541565, + "learning_rate": 1.1160063584718661e-07, + "loss": 0.2341, + "num_input_tokens_seen": 3569408, + "step": 18720 + }, + { + "epoch": 9.732328482328482, + "grad_norm": 0.5403966307640076, + "learning_rate": 1.0947042477713332e-07, + "loss": 0.227, + "num_input_tokens_seen": 3570368, + "step": 18725 + }, + { + "epoch": 9.734927234927234, + "grad_norm": 0.26210588216781616, + "learning_rate": 1.0736069600632281e-07, + "loss": 0.2922, + "num_input_tokens_seen": 3571328, + "step": 18730 + }, + { + "epoch": 9.737525987525988, + "grad_norm": 0.4112553298473358, + "learning_rate": 1.0527145127084136e-07, + "loss": 0.2243, + "num_input_tokens_seen": 3572384, + "step": 18735 + }, + { + "epoch": 9.74012474012474, + "grad_norm": 0.5155070424079895, + "learning_rate": 1.032026922899193e-07, + "loss": 0.2677, + "num_input_tokens_seen": 3573376, + "step": 18740 + }, + { + "epoch": 9.742723492723492, + "grad_norm": 0.5089104771614075, + "learning_rate": 1.0115442076592541e-07, + "loss": 0.2508, + "num_input_tokens_seen": 3574304, + "step": 18745 + }, + { + "epoch": 9.745322245322246, + "grad_norm": 0.3908654451370239, + "learning_rate": 9.912663838437808e-08, + "loss": 0.2631, + "num_input_tokens_seen": 3575328, + "step": 18750 + }, + { + "epoch": 9.747920997920998, + "grad_norm": 0.6880642175674438, + "learning_rate": 9.711934681392587e-08, + "loss": 0.232, + "num_input_tokens_seen": 3576288, + "step": 18755 + }, + { + "epoch": 9.75051975051975, + "grad_norm": 0.5098856091499329, + "learning_rate": 9.513254770636137e-08, + "loss": 0.2597, + "num_input_tokens_seen": 3577312, + "step": 18760 + }, + { + "epoch": 9.753118503118504, + "grad_norm": 0.33457478880882263, + "learning_rate": 9.31662426966129e-08, + "loss": 0.2204, + "num_input_tokens_seen": 3578240, + "step": 18765 + }, + { + "epoch": 9.755717255717256, + "grad_norm": 0.3115820288658142, + "learning_rate": 9.122043340273889e-08, + "loss": 0.2716, + "num_input_tokens_seen": 3579168, + "step": 18770 + }, + { + "epoch": 9.758316008316008, + "grad_norm": 0.41022351384162903, + "learning_rate": 8.929512142594187e-08, + "loss": 0.2155, + "num_input_tokens_seen": 3580096, + "step": 18775 + }, + { + "epoch": 9.76091476091476, + "grad_norm": 0.36792388558387756, + "learning_rate": 8.739030835055173e-08, + "loss": 0.1968, + "num_input_tokens_seen": 3580992, + "step": 18780 + }, + { + "epoch": 9.763513513513514, + "grad_norm": 0.24501033127307892, + "learning_rate": 8.550599574402574e-08, + "loss": 0.2659, + "num_input_tokens_seen": 3581984, + "step": 18785 + }, + { + "epoch": 9.766112266112266, + "grad_norm": 0.25329089164733887, + "learning_rate": 8.364218515695965e-08, + "loss": 0.2565, + "num_input_tokens_seen": 3582912, + "step": 18790 + }, + { + "epoch": 9.768711018711018, + "grad_norm": 0.7947752475738525, + "learning_rate": 8.179887812307386e-08, + "loss": 0.2671, + "num_input_tokens_seen": 3583872, + "step": 18795 + }, + { + "epoch": 9.771309771309772, + "grad_norm": 0.3491280674934387, + "learning_rate": 7.99760761592161e-08, + "loss": 0.2307, + "num_input_tokens_seen": 3584768, + "step": 18800 + }, + { + "epoch": 9.773908523908524, + "grad_norm": 0.4437684118747711, + "learning_rate": 7.817378076536153e-08, + "loss": 0.2718, + "num_input_tokens_seen": 3585664, + "step": 18805 + }, + { + "epoch": 9.776507276507276, + "grad_norm": 0.470745325088501, + "learning_rate": 7.63919934246099e-08, + "loss": 0.2453, + "num_input_tokens_seen": 3586624, + "step": 18810 + }, + { + "epoch": 9.779106029106028, + "grad_norm": 0.2404901385307312, + "learning_rate": 7.463071560318835e-08, + "loss": 0.2596, + "num_input_tokens_seen": 3587616, + "step": 18815 + }, + { + "epoch": 9.781704781704782, + "grad_norm": 0.4437086582183838, + "learning_rate": 7.288994875044308e-08, + "loss": 0.3123, + "num_input_tokens_seen": 3588544, + "step": 18820 + }, + { + "epoch": 9.784303534303534, + "grad_norm": 0.29080504179000854, + "learning_rate": 7.116969429883935e-08, + "loss": 0.2599, + "num_input_tokens_seen": 3589472, + "step": 18825 + }, + { + "epoch": 9.786902286902286, + "grad_norm": 0.6450490355491638, + "learning_rate": 6.946995366397257e-08, + "loss": 0.2172, + "num_input_tokens_seen": 3590432, + "step": 18830 + }, + { + "epoch": 9.78950103950104, + "grad_norm": 0.34151551127433777, + "learning_rate": 6.779072824454614e-08, + "loss": 0.261, + "num_input_tokens_seen": 3591424, + "step": 18835 + }, + { + "epoch": 9.792099792099792, + "grad_norm": 0.7566419839859009, + "learning_rate": 6.6132019422388e-08, + "loss": 0.2757, + "num_input_tokens_seen": 3592448, + "step": 18840 + }, + { + "epoch": 9.794698544698544, + "grad_norm": 0.5122021436691284, + "learning_rate": 6.449382856244246e-08, + "loss": 0.2679, + "num_input_tokens_seen": 3593472, + "step": 18845 + }, + { + "epoch": 9.797297297297296, + "grad_norm": 0.7868977785110474, + "learning_rate": 6.287615701277005e-08, + "loss": 0.2554, + "num_input_tokens_seen": 3594496, + "step": 18850 + }, + { + "epoch": 9.79989604989605, + "grad_norm": 0.3592422604560852, + "learning_rate": 6.127900610454207e-08, + "loss": 0.2092, + "num_input_tokens_seen": 3595456, + "step": 18855 + }, + { + "epoch": 9.802494802494802, + "grad_norm": 0.2117585688829422, + "learning_rate": 5.970237715204885e-08, + "loss": 0.2939, + "num_input_tokens_seen": 3596352, + "step": 18860 + }, + { + "epoch": 9.805093555093555, + "grad_norm": 0.7884005904197693, + "learning_rate": 5.814627145269147e-08, + "loss": 0.2817, + "num_input_tokens_seen": 3597344, + "step": 18865 + }, + { + "epoch": 9.807692307692308, + "grad_norm": 0.7204837799072266, + "learning_rate": 5.661069028697896e-08, + "loss": 0.3313, + "num_input_tokens_seen": 3598272, + "step": 18870 + }, + { + "epoch": 9.81029106029106, + "grad_norm": 0.18933753669261932, + "learning_rate": 5.509563491853942e-08, + "loss": 0.2513, + "num_input_tokens_seen": 3599200, + "step": 18875 + }, + { + "epoch": 9.812889812889813, + "grad_norm": 0.5633370876312256, + "learning_rate": 5.3601106594097784e-08, + "loss": 0.1963, + "num_input_tokens_seen": 3600224, + "step": 18880 + }, + { + "epoch": 9.815488565488565, + "grad_norm": 0.16775870323181152, + "learning_rate": 5.2127106543498063e-08, + "loss": 0.2091, + "num_input_tokens_seen": 3601120, + "step": 18885 + }, + { + "epoch": 9.818087318087318, + "grad_norm": 0.3302549123764038, + "learning_rate": 5.0673635979686665e-08, + "loss": 0.2394, + "num_input_tokens_seen": 3602080, + "step": 18890 + }, + { + "epoch": 9.82068607068607, + "grad_norm": 0.3253507912158966, + "learning_rate": 4.924069609872073e-08, + "loss": 0.2571, + "num_input_tokens_seen": 3603072, + "step": 18895 + }, + { + "epoch": 9.823284823284823, + "grad_norm": 0.291252076625824, + "learning_rate": 4.7828288079757035e-08, + "loss": 0.2986, + "num_input_tokens_seen": 3604032, + "step": 18900 + }, + { + "epoch": 9.825883575883577, + "grad_norm": 0.29953309893608093, + "learning_rate": 4.643641308505753e-08, + "loss": 0.251, + "num_input_tokens_seen": 3604960, + "step": 18905 + }, + { + "epoch": 9.828482328482329, + "grad_norm": 0.5966617465019226, + "learning_rate": 4.50650722599949e-08, + "loss": 0.229, + "num_input_tokens_seen": 3605856, + "step": 18910 + }, + { + "epoch": 9.83108108108108, + "grad_norm": 0.41585415601730347, + "learning_rate": 4.3714266733035914e-08, + "loss": 0.2275, + "num_input_tokens_seen": 3606816, + "step": 18915 + }, + { + "epoch": 9.833679833679835, + "grad_norm": 0.699379026889801, + "learning_rate": 4.238399761574974e-08, + "loss": 0.2475, + "num_input_tokens_seen": 3607776, + "step": 18920 + }, + { + "epoch": 9.836278586278587, + "grad_norm": 0.47404199838638306, + "learning_rate": 4.10742660028135e-08, + "loss": 0.2915, + "num_input_tokens_seen": 3608736, + "step": 18925 + }, + { + "epoch": 9.838877338877339, + "grad_norm": 0.6480771899223328, + "learning_rate": 3.978507297199285e-08, + "loss": 0.249, + "num_input_tokens_seen": 3609728, + "step": 18930 + }, + { + "epoch": 9.84147609147609, + "grad_norm": 0.5409592986106873, + "learning_rate": 3.851641958416696e-08, + "loss": 0.2304, + "num_input_tokens_seen": 3610720, + "step": 18935 + }, + { + "epoch": 9.844074844074845, + "grad_norm": 0.4933767020702362, + "learning_rate": 3.7268306883297966e-08, + "loss": 0.2498, + "num_input_tokens_seen": 3611648, + "step": 18940 + }, + { + "epoch": 9.846673596673597, + "grad_norm": 0.4000359773635864, + "learning_rate": 3.604073589645596e-08, + "loss": 0.2587, + "num_input_tokens_seen": 3612608, + "step": 18945 + }, + { + "epoch": 9.849272349272349, + "grad_norm": 0.26893168687820435, + "learning_rate": 3.4833707633799565e-08, + "loss": 0.2574, + "num_input_tokens_seen": 3613600, + "step": 18950 + }, + { + "epoch": 9.851871101871101, + "grad_norm": 0.4104331433773041, + "learning_rate": 3.3647223088589805e-08, + "loss": 0.2937, + "num_input_tokens_seen": 3614560, + "step": 18955 + }, + { + "epoch": 9.854469854469855, + "grad_norm": 0.5743434429168701, + "learning_rate": 3.248128323717625e-08, + "loss": 0.251, + "num_input_tokens_seen": 3615488, + "step": 18960 + }, + { + "epoch": 9.857068607068607, + "grad_norm": 0.3323560953140259, + "learning_rate": 3.133588903900808e-08, + "loss": 0.2481, + "num_input_tokens_seen": 3616448, + "step": 18965 + }, + { + "epoch": 9.859667359667359, + "grad_norm": 0.3969305753707886, + "learning_rate": 3.021104143662301e-08, + "loss": 0.2342, + "num_input_tokens_seen": 3617312, + "step": 18970 + }, + { + "epoch": 9.862266112266113, + "grad_norm": 0.6437161564826965, + "learning_rate": 2.910674135565561e-08, + "loss": 0.2949, + "num_input_tokens_seen": 3618272, + "step": 18975 + }, + { + "epoch": 9.864864864864865, + "grad_norm": 0.5226194858551025, + "learning_rate": 2.8022989704826196e-08, + "loss": 0.2707, + "num_input_tokens_seen": 3619136, + "step": 18980 + }, + { + "epoch": 9.867463617463617, + "grad_norm": 0.44593095779418945, + "learning_rate": 2.6959787375949174e-08, + "loss": 0.241, + "num_input_tokens_seen": 3620160, + "step": 18985 + }, + { + "epoch": 9.87006237006237, + "grad_norm": 0.22915233671665192, + "learning_rate": 2.5917135243930245e-08, + "loss": 0.2455, + "num_input_tokens_seen": 3621088, + "step": 18990 + }, + { + "epoch": 9.872661122661123, + "grad_norm": 0.23071518540382385, + "learning_rate": 2.4895034166760865e-08, + "loss": 0.3202, + "num_input_tokens_seen": 3622048, + "step": 18995 + }, + { + "epoch": 9.875259875259875, + "grad_norm": 0.39794284105300903, + "learning_rate": 2.389348498552657e-08, + "loss": 0.2429, + "num_input_tokens_seen": 3622976, + "step": 19000 + }, + { + "epoch": 9.877858627858627, + "grad_norm": 0.20056761801242828, + "learning_rate": 2.2912488524393095e-08, + "loss": 0.2704, + "num_input_tokens_seen": 3623968, + "step": 19005 + }, + { + "epoch": 9.880457380457381, + "grad_norm": 0.7986275553703308, + "learning_rate": 2.1952045590620253e-08, + "loss": 0.3275, + "num_input_tokens_seen": 3624960, + "step": 19010 + }, + { + "epoch": 9.883056133056133, + "grad_norm": 0.11231900006532669, + "learning_rate": 2.101215697455361e-08, + "loss": 0.2599, + "num_input_tokens_seen": 3625952, + "step": 19015 + }, + { + "epoch": 9.885654885654885, + "grad_norm": 0.28798186779022217, + "learning_rate": 2.0092823449618935e-08, + "loss": 0.2286, + "num_input_tokens_seen": 3626944, + "step": 19020 + }, + { + "epoch": 9.888253638253639, + "grad_norm": 0.3333415389060974, + "learning_rate": 1.9194045772336077e-08, + "loss": 0.2803, + "num_input_tokens_seen": 3627968, + "step": 19025 + }, + { + "epoch": 9.890852390852391, + "grad_norm": 0.4493728280067444, + "learning_rate": 1.831582468229953e-08, + "loss": 0.2683, + "num_input_tokens_seen": 3628928, + "step": 19030 + }, + { + "epoch": 9.893451143451143, + "grad_norm": 0.4157513976097107, + "learning_rate": 1.7458160902197872e-08, + "loss": 0.22, + "num_input_tokens_seen": 3629888, + "step": 19035 + }, + { + "epoch": 9.896049896049895, + "grad_norm": 0.5109702348709106, + "learning_rate": 1.6621055137797105e-08, + "loss": 0.3089, + "num_input_tokens_seen": 3630944, + "step": 19040 + }, + { + "epoch": 9.89864864864865, + "grad_norm": 0.36473262310028076, + "learning_rate": 1.5804508077946202e-08, + "loss": 0.2275, + "num_input_tokens_seen": 3631904, + "step": 19045 + }, + { + "epoch": 9.901247401247401, + "grad_norm": 0.6023039817810059, + "learning_rate": 1.500852039458267e-08, + "loss": 0.2618, + "num_input_tokens_seen": 3632832, + "step": 19050 + }, + { + "epoch": 9.903846153846153, + "grad_norm": 0.6158295273780823, + "learning_rate": 1.4233092742713116e-08, + "loss": 0.1758, + "num_input_tokens_seen": 3633760, + "step": 19055 + }, + { + "epoch": 9.906444906444907, + "grad_norm": 0.43336406350135803, + "learning_rate": 1.3478225760441e-08, + "loss": 0.2905, + "num_input_tokens_seen": 3634752, + "step": 19060 + }, + { + "epoch": 9.90904365904366, + "grad_norm": 0.6825036406517029, + "learning_rate": 1.2743920068938874e-08, + "loss": 0.2631, + "num_input_tokens_seen": 3635680, + "step": 19065 + }, + { + "epoch": 9.911642411642411, + "grad_norm": 0.5754250288009644, + "learning_rate": 1.203017627246228e-08, + "loss": 0.1893, + "num_input_tokens_seen": 3636672, + "step": 19070 + }, + { + "epoch": 9.914241164241163, + "grad_norm": 0.26934608817100525, + "learning_rate": 1.1336994958349723e-08, + "loss": 0.2565, + "num_input_tokens_seen": 3637632, + "step": 19075 + }, + { + "epoch": 9.916839916839917, + "grad_norm": 0.5518823862075806, + "learning_rate": 1.0664376697017142e-08, + "loss": 0.2463, + "num_input_tokens_seen": 3638592, + "step": 19080 + }, + { + "epoch": 9.91943866943867, + "grad_norm": 0.2028670459985733, + "learning_rate": 1.0012322041960676e-08, + "loss": 0.2523, + "num_input_tokens_seen": 3639552, + "step": 19085 + }, + { + "epoch": 9.922037422037421, + "grad_norm": 0.13783946633338928, + "learning_rate": 9.38083152974556e-09, + "loss": 0.2567, + "num_input_tokens_seen": 3640480, + "step": 19090 + }, + { + "epoch": 9.924636174636175, + "grad_norm": 0.11896271258592606, + "learning_rate": 8.76990568003111e-09, + "loss": 0.2261, + "num_input_tokens_seen": 3641472, + "step": 19095 + }, + { + "epoch": 9.927234927234927, + "grad_norm": 0.25669750571250916, + "learning_rate": 8.17954499554019e-09, + "loss": 0.2806, + "num_input_tokens_seen": 3642464, + "step": 19100 + }, + { + "epoch": 9.92983367983368, + "grad_norm": 0.79314124584198, + "learning_rate": 7.609749962081413e-09, + "loss": 0.257, + "num_input_tokens_seen": 3643456, + "step": 19105 + }, + { + "epoch": 9.932432432432432, + "grad_norm": 0.41440871357917786, + "learning_rate": 7.060521048532498e-09, + "loss": 0.2734, + "num_input_tokens_seen": 3644352, + "step": 19110 + }, + { + "epoch": 9.935031185031185, + "grad_norm": 0.5282419323921204, + "learning_rate": 6.5318587068541325e-09, + "loss": 0.25, + "num_input_tokens_seen": 3645344, + "step": 19115 + }, + { + "epoch": 9.937629937629938, + "grad_norm": 0.2063898742198944, + "learning_rate": 6.023763372076108e-09, + "loss": 0.2383, + "num_input_tokens_seen": 3646336, + "step": 19120 + }, + { + "epoch": 9.94022869022869, + "grad_norm": 0.5390633940696716, + "learning_rate": 5.536235462313965e-09, + "loss": 0.3058, + "num_input_tokens_seen": 3647296, + "step": 19125 + }, + { + "epoch": 9.942827442827443, + "grad_norm": 0.5007284879684448, + "learning_rate": 5.069275378746796e-09, + "loss": 0.241, + "num_input_tokens_seen": 3648224, + "step": 19130 + }, + { + "epoch": 9.945426195426196, + "grad_norm": 0.5803914070129395, + "learning_rate": 4.622883505636666e-09, + "loss": 0.2468, + "num_input_tokens_seen": 3649152, + "step": 19135 + }, + { + "epoch": 9.948024948024948, + "grad_norm": 0.46244171261787415, + "learning_rate": 4.197060210317516e-09, + "loss": 0.2391, + "num_input_tokens_seen": 3650048, + "step": 19140 + }, + { + "epoch": 9.950623700623701, + "grad_norm": 0.45073097944259644, + "learning_rate": 3.791805843195162e-09, + "loss": 0.3219, + "num_input_tokens_seen": 3650944, + "step": 19145 + }, + { + "epoch": 9.953222453222454, + "grad_norm": 0.2708904445171356, + "learning_rate": 3.4071207377500693e-09, + "loss": 0.2518, + "num_input_tokens_seen": 3651936, + "step": 19150 + }, + { + "epoch": 9.955821205821206, + "grad_norm": 0.4809665381908417, + "learning_rate": 3.043005210542904e-09, + "loss": 0.2813, + "num_input_tokens_seen": 3652864, + "step": 19155 + }, + { + "epoch": 9.958419958419958, + "grad_norm": 0.3984449505805969, + "learning_rate": 2.6994595612006566e-09, + "loss": 0.2783, + "num_input_tokens_seen": 3653856, + "step": 19160 + }, + { + "epoch": 9.961018711018712, + "grad_norm": 0.13449101150035858, + "learning_rate": 2.376484072424967e-09, + "loss": 0.2675, + "num_input_tokens_seen": 3654784, + "step": 19165 + }, + { + "epoch": 9.963617463617464, + "grad_norm": 0.19283053278923035, + "learning_rate": 2.074079009989349e-09, + "loss": 0.2758, + "num_input_tokens_seen": 3655712, + "step": 19170 + }, + { + "epoch": 9.966216216216216, + "grad_norm": 0.3747500479221344, + "learning_rate": 1.7922446227447432e-09, + "loss": 0.2441, + "num_input_tokens_seen": 3656704, + "step": 19175 + }, + { + "epoch": 9.96881496881497, + "grad_norm": 0.5904030203819275, + "learning_rate": 1.5309811426056364e-09, + "loss": 0.2548, + "num_input_tokens_seen": 3657664, + "step": 19180 + }, + { + "epoch": 9.971413721413722, + "grad_norm": 0.7485054135322571, + "learning_rate": 1.2902887845722688e-09, + "loss": 0.2695, + "num_input_tokens_seen": 3658656, + "step": 19185 + }, + { + "epoch": 9.974012474012474, + "grad_norm": 0.268167644739151, + "learning_rate": 1.070167746702877e-09, + "loss": 0.2381, + "num_input_tokens_seen": 3659616, + "step": 19190 + }, + { + "epoch": 9.976611226611226, + "grad_norm": 0.5937804579734802, + "learning_rate": 8.70618210138674e-10, + "loss": 0.2297, + "num_input_tokens_seen": 3660640, + "step": 19195 + }, + { + "epoch": 9.97920997920998, + "grad_norm": 0.5605359673500061, + "learning_rate": 6.916403390844206e-10, + "loss": 0.231, + "num_input_tokens_seen": 3661568, + "step": 19200 + }, + { + "epoch": 9.981808731808732, + "grad_norm": 0.43724310398101807, + "learning_rate": 5.332342808223034e-10, + "loss": 0.2461, + "num_input_tokens_seen": 3662560, + "step": 19205 + }, + { + "epoch": 9.984407484407484, + "grad_norm": 0.46338406205177307, + "learning_rate": 3.9540016570083215e-10, + "loss": 0.2594, + "num_input_tokens_seen": 3663520, + "step": 19210 + }, + { + "epoch": 9.987006237006238, + "grad_norm": 0.31585589051246643, + "learning_rate": 2.7813810714871767e-10, + "loss": 0.2462, + "num_input_tokens_seen": 3664416, + "step": 19215 + }, + { + "epoch": 9.98960498960499, + "grad_norm": 0.2658061683177948, + "learning_rate": 1.8144820165544307e-10, + "loss": 0.2073, + "num_input_tokens_seen": 3665312, + "step": 19220 + }, + { + "epoch": 9.992203742203742, + "grad_norm": 0.3769450783729553, + "learning_rate": 1.0533052878791694e-10, + "loss": 0.2718, + "num_input_tokens_seen": 3666272, + "step": 19225 + }, + { + "epoch": 9.994802494802494, + "grad_norm": 0.24269090592861176, + "learning_rate": 4.978515118214677e-11, + "loss": 0.2291, + "num_input_tokens_seen": 3667200, + "step": 19230 + }, + { + "epoch": 9.997401247401248, + "grad_norm": 0.44502007961273193, + "learning_rate": 1.4812114548790057e-11, + "loss": 0.2725, + "num_input_tokens_seen": 3668256, + "step": 19235 + }, + { + "epoch": 10.0, + "grad_norm": 0.5981783866882324, + "learning_rate": 4.114476648275911e-13, + "loss": 0.2594, + "num_input_tokens_seen": 3669168, + "step": 19240 + }, + { + "epoch": 10.0, + "eval_loss": 0.24947991967201233, + "eval_runtime": 7.9251, + "eval_samples_per_second": 108.011, + "eval_steps_per_second": 27.003, + "num_input_tokens_seen": 3669168, + "step": 19240 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 3669168, + "step": 19240, + "total_flos": 1.6522097669347738e+17, + "train_loss": 0.6069137412675205, + "train_runtime": 1773.4362, + "train_samples_per_second": 43.39, + "train_steps_per_second": 10.849 + } + ], + "logging_steps": 5, + "max_steps": 19240, + "num_input_tokens_seen": 3669168, + "num_train_epochs": 10, + "save_steps": 962, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6522097669347738e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}